You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
544 lines
20 KiB
544 lines
20 KiB
# $Id: tableparser.py 7898 2015-05-29 20:49:28Z milde $ |
|
# Author: David Goodger <goodger@python.org> |
|
# Copyright: This module has been placed in the public domain. |
|
|
|
""" |
|
This module defines table parser classes,which parse plaintext-graphic tables |
|
and produce a well-formed data structure suitable for building a CALS table. |
|
|
|
:Classes: |
|
- `GridTableParser`: Parse fully-formed tables represented with a grid. |
|
- `SimpleTableParser`: Parse simple tables, delimited by top & bottom |
|
borders. |
|
|
|
:Exception class: `TableMarkupError` |
|
|
|
:Function: |
|
`update_dict_of_lists()`: Merge two dictionaries containing list values. |
|
""" |
|
|
|
__docformat__ = 'reStructuredText' |
|
|
|
|
|
import re |
|
import sys |
|
from docutils import DataError |
|
from docutils.utils import strip_combining_chars |
|
|
|
|
|
class TableMarkupError(DataError): |
|
|
|
""" |
|
Raise if there is any problem with table markup. |
|
|
|
The keyword argument `offset` denotes the offset of the problem |
|
from the table's start line. |
|
""" |
|
|
|
def __init__(self, *args, **kwargs): |
|
self.offset = kwargs.pop('offset', 0) |
|
DataError.__init__(self, *args) |
|
|
|
|
|
class TableParser: |
|
|
|
""" |
|
Abstract superclass for the common parts of the syntax-specific parsers. |
|
""" |
|
|
|
head_body_separator_pat = None |
|
"""Matches the row separator between head rows and body rows.""" |
|
|
|
double_width_pad_char = '\x00' |
|
"""Padding character for East Asian double-width text.""" |
|
|
|
def parse(self, block): |
|
""" |
|
Analyze the text `block` and return a table data structure. |
|
|
|
Given a plaintext-graphic table in `block` (list of lines of text; no |
|
whitespace padding), parse the table, construct and return the data |
|
necessary to construct a CALS table or equivalent. |
|
|
|
Raise `TableMarkupError` if there is any problem with the markup. |
|
""" |
|
self.setup(block) |
|
self.find_head_body_sep() |
|
self.parse_table() |
|
structure = self.structure_from_cells() |
|
return structure |
|
|
|
def find_head_body_sep(self): |
|
"""Look for a head/body row separator line; store the line index.""" |
|
for i in range(len(self.block)): |
|
line = self.block[i] |
|
if self.head_body_separator_pat.match(line): |
|
if self.head_body_sep: |
|
raise TableMarkupError( |
|
'Multiple head/body row separators ' |
|
'(table lines %s and %s); only one allowed.' |
|
% (self.head_body_sep+1, i+1), offset=i) |
|
else: |
|
self.head_body_sep = i |
|
self.block[i] = line.replace('=', '-') |
|
if self.head_body_sep == 0 or self.head_body_sep == (len(self.block) |
|
- 1): |
|
raise TableMarkupError('The head/body row separator may not be ' |
|
'the first or last line of the table.', |
|
offset=i) |
|
|
|
|
|
class GridTableParser(TableParser): |
|
|
|
""" |
|
Parse a grid table using `parse()`. |
|
|
|
Here's an example of a grid table:: |
|
|
|
+------------------------+------------+----------+----------+ |
|
| Header row, column 1 | Header 2 | Header 3 | Header 4 | |
|
+========================+============+==========+==========+ |
|
| body row 1, column 1 | column 2 | column 3 | column 4 | |
|
+------------------------+------------+----------+----------+ |
|
| body row 2 | Cells may span columns. | |
|
+------------------------+------------+---------------------+ |
|
| body row 3 | Cells may | - Table cells | |
|
+------------------------+ span rows. | - contain | |
|
| body row 4 | | - body elements. | |
|
+------------------------+------------+---------------------+ |
|
|
|
Intersections use '+', row separators use '-' (except for one optional |
|
head/body row separator, which uses '='), and column separators use '|'. |
|
|
|
Passing the above table to the `parse()` method will result in the |
|
following data structure:: |
|
|
|
([24, 12, 10, 10], |
|
[[(0, 0, 1, ['Header row, column 1']), |
|
(0, 0, 1, ['Header 2']), |
|
(0, 0, 1, ['Header 3']), |
|
(0, 0, 1, ['Header 4'])]], |
|
[[(0, 0, 3, ['body row 1, column 1']), |
|
(0, 0, 3, ['column 2']), |
|
(0, 0, 3, ['column 3']), |
|
(0, 0, 3, ['column 4'])], |
|
[(0, 0, 5, ['body row 2']), |
|
(0, 2, 5, ['Cells may span columns.']), |
|
None, |
|
None], |
|
[(0, 0, 7, ['body row 3']), |
|
(1, 0, 7, ['Cells may', 'span rows.', '']), |
|
(1, 1, 7, ['- Table cells', '- contain', '- body elements.']), |
|
None], |
|
[(0, 0, 9, ['body row 4']), None, None, None]]) |
|
|
|
The first item is a list containing column widths (colspecs). The second |
|
item is a list of head rows, and the third is a list of body rows. Each |
|
row contains a list of cells. Each cell is either None (for a cell unused |
|
because of another cell's span), or a tuple. A cell tuple contains four |
|
items: the number of extra rows used by the cell in a vertical span |
|
(morerows); the number of extra columns used by the cell in a horizontal |
|
span (morecols); the line offset of the first line of the cell contents; |
|
and the cell contents, a list of lines of text. |
|
""" |
|
|
|
head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$') |
|
|
|
def setup(self, block): |
|
self.block = block[:] # make a copy; it may be modified |
|
self.block.disconnect() # don't propagate changes to parent |
|
self.bottom = len(block) - 1 |
|
self.right = len(block[0]) - 1 |
|
self.head_body_sep = None |
|
self.done = [-1] * len(block[0]) |
|
self.cells = [] |
|
self.rowseps = {0: [0]} |
|
self.colseps = {0: [0]} |
|
|
|
def parse_table(self): |
|
""" |
|
Start with a queue of upper-left corners, containing the upper-left |
|
corner of the table itself. Trace out one rectangular cell, remember |
|
it, and add its upper-right and lower-left corners to the queue of |
|
potential upper-left corners of further cells. Process the queue in |
|
top-to-bottom order, keeping track of how much of each text column has |
|
been seen. |
|
|
|
We'll end up knowing all the row and column boundaries, cell positions |
|
and their dimensions. |
|
""" |
|
corners = [(0, 0)] |
|
while corners: |
|
top, left = corners.pop(0) |
|
if top == self.bottom or left == self.right \ |
|
or top <= self.done[left]: |
|
continue |
|
result = self.scan_cell(top, left) |
|
if not result: |
|
continue |
|
bottom, right, rowseps, colseps = result |
|
update_dict_of_lists(self.rowseps, rowseps) |
|
update_dict_of_lists(self.colseps, colseps) |
|
self.mark_done(top, left, bottom, right) |
|
cellblock = self.block.get_2D_block(top + 1, left + 1, |
|
bottom, right) |
|
cellblock.disconnect() # lines in cell can't sync with parent |
|
cellblock.replace(self.double_width_pad_char, '') |
|
self.cells.append((top, left, bottom, right, cellblock)) |
|
corners.extend([(top, right), (bottom, left)]) |
|
corners.sort() |
|
if not self.check_parse_complete(): |
|
raise TableMarkupError('Malformed table; parse incomplete.') |
|
|
|
def mark_done(self, top, left, bottom, right): |
|
"""For keeping track of how much of each text column has been seen.""" |
|
before = top - 1 |
|
after = bottom - 1 |
|
for col in range(left, right): |
|
assert self.done[col] == before |
|
self.done[col] = after |
|
|
|
def check_parse_complete(self): |
|
"""Each text column should have been completely seen.""" |
|
last = self.bottom - 1 |
|
for col in range(self.right): |
|
if self.done[col] != last: |
|
return False |
|
return True |
|
|
|
def scan_cell(self, top, left): |
|
"""Starting at the top-left corner, start tracing out a cell.""" |
|
assert self.block[top][left] == '+' |
|
result = self.scan_right(top, left) |
|
return result |
|
|
|
def scan_right(self, top, left): |
|
""" |
|
Look for the top-right corner of the cell, and make note of all column |
|
boundaries ('+'). |
|
""" |
|
colseps = {} |
|
line = self.block[top] |
|
for i in range(left + 1, self.right + 1): |
|
if line[i] == '+': |
|
colseps[i] = [top] |
|
result = self.scan_down(top, left, i) |
|
if result: |
|
bottom, rowseps, newcolseps = result |
|
update_dict_of_lists(colseps, newcolseps) |
|
return bottom, i, rowseps, colseps |
|
elif line[i] != '-': |
|
return None |
|
return None |
|
|
|
def scan_down(self, top, left, right): |
|
""" |
|
Look for the bottom-right corner of the cell, making note of all row |
|
boundaries. |
|
""" |
|
rowseps = {} |
|
for i in range(top + 1, self.bottom + 1): |
|
if self.block[i][right] == '+': |
|
rowseps[i] = [right] |
|
result = self.scan_left(top, left, i, right) |
|
if result: |
|
newrowseps, colseps = result |
|
update_dict_of_lists(rowseps, newrowseps) |
|
return i, rowseps, colseps |
|
elif self.block[i][right] != '|': |
|
return None |
|
return None |
|
|
|
def scan_left(self, top, left, bottom, right): |
|
""" |
|
Noting column boundaries, look for the bottom-left corner of the cell. |
|
It must line up with the starting point. |
|
""" |
|
colseps = {} |
|
line = self.block[bottom] |
|
for i in range(right - 1, left, -1): |
|
if line[i] == '+': |
|
colseps[i] = [bottom] |
|
elif line[i] != '-': |
|
return None |
|
if line[left] != '+': |
|
return None |
|
result = self.scan_up(top, left, bottom, right) |
|
if result is not None: |
|
rowseps = result |
|
return rowseps, colseps |
|
return None |
|
|
|
def scan_up(self, top, left, bottom, right): |
|
""" |
|
Noting row boundaries, see if we can return to the starting point. |
|
""" |
|
rowseps = {} |
|
for i in range(bottom - 1, top, -1): |
|
if self.block[i][left] == '+': |
|
rowseps[i] = [left] |
|
elif self.block[i][left] != '|': |
|
return None |
|
return rowseps |
|
|
|
def structure_from_cells(self): |
|
""" |
|
From the data collected by `scan_cell()`, convert to the final data |
|
structure. |
|
""" |
|
rowseps = self.rowseps.keys() # list of row boundaries |
|
rowseps.sort() |
|
rowindex = {} |
|
for i in range(len(rowseps)): |
|
rowindex[rowseps[i]] = i # row boundary -> row number mapping |
|
colseps = self.colseps.keys() # list of column boundaries |
|
colseps.sort() |
|
colindex = {} |
|
for i in range(len(colseps)): |
|
colindex[colseps[i]] = i # column boundary -> col number map |
|
colspecs = [(colseps[i] - colseps[i - 1] - 1) |
|
for i in range(1, len(colseps))] # list of column widths |
|
# prepare an empty table with the correct number of rows & columns |
|
onerow = [None for i in range(len(colseps) - 1)] |
|
rows = [onerow[:] for i in range(len(rowseps) - 1)] |
|
# keep track of # of cells remaining; should reduce to zero |
|
remaining = (len(rowseps) - 1) * (len(colseps) - 1) |
|
for top, left, bottom, right, block in self.cells: |
|
rownum = rowindex[top] |
|
colnum = colindex[left] |
|
assert rows[rownum][colnum] is None, ( |
|
'Cell (row %s, column %s) already used.' |
|
% (rownum + 1, colnum + 1)) |
|
morerows = rowindex[bottom] - rownum - 1 |
|
morecols = colindex[right] - colnum - 1 |
|
remaining -= (morerows + 1) * (morecols + 1) |
|
# write the cell into the table |
|
rows[rownum][colnum] = (morerows, morecols, top + 1, block) |
|
assert remaining == 0, 'Unused cells remaining.' |
|
if self.head_body_sep: # separate head rows from body rows |
|
numheadrows = rowindex[self.head_body_sep] |
|
headrows = rows[:numheadrows] |
|
bodyrows = rows[numheadrows:] |
|
else: |
|
headrows = [] |
|
bodyrows = rows |
|
return (colspecs, headrows, bodyrows) |
|
|
|
|
|
class SimpleTableParser(TableParser): |
|
|
|
""" |
|
Parse a simple table using `parse()`. |
|
|
|
Here's an example of a simple table:: |
|
|
|
===== ===== |
|
col 1 col 2 |
|
===== ===== |
|
1 Second column of row 1. |
|
2 Second column of row 2. |
|
Second line of paragraph. |
|
3 - Second column of row 3. |
|
|
|
- Second item in bullet |
|
list (row 3, column 2). |
|
4 is a span |
|
------------ |
|
5 |
|
===== ===== |
|
|
|
Top and bottom borders use '=', column span underlines use '-', column |
|
separation is indicated with spaces. |
|
|
|
Passing the above table to the `parse()` method will result in the |
|
following data structure, whose interpretation is the same as for |
|
`GridTableParser`:: |
|
|
|
([5, 25], |
|
[[(0, 0, 1, ['col 1']), |
|
(0, 0, 1, ['col 2'])]], |
|
[[(0, 0, 3, ['1']), |
|
(0, 0, 3, ['Second column of row 1.'])], |
|
[(0, 0, 4, ['2']), |
|
(0, 0, 4, ['Second column of row 2.', |
|
'Second line of paragraph.'])], |
|
[(0, 0, 6, ['3']), |
|
(0, 0, 6, ['- Second column of row 3.', |
|
'', |
|
'- Second item in bullet', |
|
' list (row 3, column 2).'])], |
|
[(0, 1, 10, ['4 is a span'])], |
|
[(0, 0, 12, ['5']), |
|
(0, 0, 12, [''])]]) |
|
""" |
|
|
|
head_body_separator_pat = re.compile('=[ =]*$') |
|
span_pat = re.compile('-[ -]*$') |
|
|
|
def setup(self, block): |
|
self.block = block[:] # make a copy; it will be modified |
|
self.block.disconnect() # don't propagate changes to parent |
|
# Convert top & bottom borders to column span underlines: |
|
self.block[0] = self.block[0].replace('=', '-') |
|
self.block[-1] = self.block[-1].replace('=', '-') |
|
self.head_body_sep = None |
|
self.columns = [] |
|
self.border_end = None |
|
self.table = [] |
|
self.done = [-1] * len(block[0]) |
|
self.rowseps = {0: [0]} |
|
self.colseps = {0: [0]} |
|
|
|
def parse_table(self): |
|
""" |
|
First determine the column boundaries from the top border, then |
|
process rows. Each row may consist of multiple lines; accumulate |
|
lines until a row is complete. Call `self.parse_row` to finish the |
|
job. |
|
""" |
|
# Top border must fully describe all table columns. |
|
self.columns = self.parse_columns(self.block[0], 0) |
|
self.border_end = self.columns[-1][1] |
|
firststart, firstend = self.columns[0] |
|
offset = 1 # skip top border |
|
start = 1 |
|
text_found = None |
|
while offset < len(self.block): |
|
line = self.block[offset] |
|
if self.span_pat.match(line): |
|
# Column span underline or border; row is complete. |
|
self.parse_row(self.block[start:offset], start, |
|
(line.rstrip(), offset)) |
|
start = offset + 1 |
|
text_found = None |
|
elif line[firststart:firstend].strip(): |
|
# First column not blank, therefore it's a new row. |
|
if text_found and offset != start: |
|
self.parse_row(self.block[start:offset], start) |
|
start = offset |
|
text_found = 1 |
|
elif not text_found: |
|
start = offset + 1 |
|
offset += 1 |
|
|
|
def parse_columns(self, line, offset): |
|
""" |
|
Given a column span underline, return a list of (begin, end) pairs. |
|
""" |
|
cols = [] |
|
end = 0 |
|
while True: |
|
begin = line.find('-', end) |
|
end = line.find(' ', begin) |
|
if begin < 0: |
|
break |
|
if end < 0: |
|
end = len(line) |
|
cols.append((begin, end)) |
|
if self.columns: |
|
if cols[-1][1] != self.border_end: |
|
raise TableMarkupError('Column span incomplete in table ' |
|
'line %s.' % (offset+1), |
|
offset=offset) |
|
# Allow for an unbounded rightmost column: |
|
cols[-1] = (cols[-1][0], self.columns[-1][1]) |
|
return cols |
|
|
|
def init_row(self, colspec, offset): |
|
i = 0 |
|
cells = [] |
|
for start, end in colspec: |
|
morecols = 0 |
|
try: |
|
assert start == self.columns[i][0] |
|
while end != self.columns[i][1]: |
|
i += 1 |
|
morecols += 1 |
|
except (AssertionError, IndexError): |
|
raise TableMarkupError('Column span alignment problem ' |
|
'in table line %s.' % (offset+2), |
|
offset=offset+1) |
|
cells.append([0, morecols, offset, []]) |
|
i += 1 |
|
return cells |
|
|
|
def parse_row(self, lines, start, spanline=None): |
|
""" |
|
Given the text `lines` of a row, parse it and append to `self.table`. |
|
|
|
The row is parsed according to the current column spec (either |
|
`spanline` if provided or `self.columns`). For each column, extract |
|
text from each line, and check for text in column margins. Finally, |
|
adjust for insignificant whitespace. |
|
""" |
|
if not (lines or spanline): |
|
# No new row, just blank lines. |
|
return |
|
if spanline: |
|
columns = self.parse_columns(*spanline) |
|
span_offset = spanline[1] |
|
else: |
|
columns = self.columns[:] |
|
span_offset = start |
|
self.check_columns(lines, start, columns) |
|
row = self.init_row(columns, start) |
|
for i in range(len(columns)): |
|
start, end = columns[i] |
|
cellblock = lines.get_2D_block(0, start, len(lines), end) |
|
cellblock.disconnect() # lines in cell can't sync with parent |
|
cellblock.replace(self.double_width_pad_char, '') |
|
row[i][3] = cellblock |
|
self.table.append(row) |
|
|
|
def check_columns(self, lines, first_line, columns): |
|
""" |
|
Check for text in column margins and text overflow in the last column. |
|
Raise TableMarkupError if anything but whitespace is in column margins. |
|
Adjust the end value for the last column if there is text overflow. |
|
""" |
|
# "Infinite" value for a dummy last column's beginning, used to |
|
# check for text overflow: |
|
columns.append((sys.maxint, None)) |
|
lastcol = len(columns) - 2 |
|
# combining characters do not contribute to the column width |
|
lines = [strip_combining_chars(line) for line in lines] |
|
|
|
for i in range(len(columns) - 1): |
|
start, end = columns[i] |
|
nextstart = columns[i+1][0] |
|
offset = 0 |
|
for line in lines: |
|
if i == lastcol and line[end:].strip(): |
|
text = line[start:].rstrip() |
|
new_end = start + len(text) |
|
main_start, main_end = self.columns[-1] |
|
columns[i] = (start, max(main_end, new_end)) |
|
if new_end > main_end: |
|
self.columns[-1] = (main_start, new_end) |
|
elif line[end:nextstart].strip(): |
|
raise TableMarkupError('Text in column margin ' |
|
'in table line %s.' % (first_line+offset+1), |
|
offset=first_line+offset) |
|
offset += 1 |
|
columns.pop() |
|
|
|
def structure_from_cells(self): |
|
colspecs = [end - start for start, end in self.columns] |
|
first_body_row = 0 |
|
if self.head_body_sep: |
|
for i in range(len(self.table)): |
|
if self.table[i][0][2] > self.head_body_sep: |
|
first_body_row = i |
|
break |
|
return (colspecs, self.table[:first_body_row], |
|
self.table[first_body_row:]) |
|
|
|
|
|
def update_dict_of_lists(master, newdata): |
|
""" |
|
Extend the list values of `master` with those from `newdata`. |
|
|
|
Both parameters must be dictionaries containing list values. |
|
""" |
|
for key, values in newdata.items(): |
|
master.setdefault(key, []).extend(values)
|
|
|