1# $Id$
2# Author: David Goodger <goodger@python.org>
3# Copyright: This module has been placed in the public domain.
4
5"""
6This module defines table parser classes,which parse plaintext-graphic tables
7and produce a well-formed data structure suitable for building a CALS table.
8
9:Classes:
10 - `GridTableParser`: Parse fully-formed tables represented with a grid.
11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
12 borders.
13
14:Exception class: `TableMarkupError`
15
16:Function:
17 `update_dict_of_lists()`: Merge two dictionaries containing list values.
18"""
19
20__docformat__ = 'reStructuredText'
21
22
23import re
24import sys
25from docutils import DataError
26from docutils.utils import strip_combining_chars
27
28
29class TableMarkupError(DataError):
30
31 """
32 Raise if there is any problem with table markup.
33
34 The keyword argument `offset` denotes the offset of the problem
35 from the table's start line.
36 """
37
38 def __init__(self, *args, **kwargs):
39 self.offset = kwargs.pop('offset', 0)
40 DataError.__init__(self, *args)
41
42
43class TableParser:
44
45 """
46 Abstract superclass for the common parts of the syntax-specific parsers.
47 """
48
49 head_body_separator_pat = None
50 """Matches the row separator between head rows and body rows."""
51
52 double_width_pad_char = '\x00'
53 """Padding character for East Asian double-width text."""
54
55 def parse(self, block):
56 """
57 Analyze the text `block` and return a table data structure.
58
59 Given a plaintext-graphic table in `block` (list of lines of text; no
60 whitespace padding), parse the table, construct and return the data
61 necessary to construct a CALS table or equivalent.
62
63 Raise `TableMarkupError` if there is any problem with the markup.
64 """
65 self.setup(block)
66 self.find_head_body_sep()
67 self.parse_table()
68 return self.structure_from_cells()
69
70 def find_head_body_sep(self):
71 """Look for a head/body row separator line; store the line index."""
72 for i in range(len(self.block)):
73 line = self.block[i]
74 if self.head_body_separator_pat.match(line):
75 if self.head_body_sep:
76 raise TableMarkupError(
77 'Multiple head/body row separators '
78 '(table lines %s and %s); only one allowed.'
79 % (self.head_body_sep+1, i+1), offset=i)
80 else:
81 self.head_body_sep = i
82 self.block[i] = line.replace('=', '-')
83 if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
84 - 1):
85 raise TableMarkupError('The head/body row separator may not be '
86 'the first or last line of the table.',
87 offset=i)
88
89
90class GridTableParser(TableParser):
91
92 """
93 Parse a grid table using `parse()`.
94
95 Here's an example of a grid table::
96
97 +------------------------+------------+----------+----------+
98 | Header row, column 1 | Header 2 | Header 3 | Header 4 |
99 +========================+============+==========+==========+
100 | body row 1, column 1 | column 2 | column 3 | column 4 |
101 +------------------------+------------+----------+----------+
102 | body row 2 | Cells may span columns. |
103 +------------------------+------------+---------------------+
104 | body row 3 | Cells may | - Table cells |
105 +------------------------+ span rows. | - contain |
106 | body row 4 | | - body elements. |
107 +------------------------+------------+---------------------+
108
109 Intersections use '+', row separators use '-' (except for one optional
110 head/body row separator, which uses '='), and column separators use '|'.
111
112 Passing the above table to the `parse()` method will result in the
113 following data structure::
114
115 ([24, 12, 10, 10],
116 [[(0, 0, 1, ['Header row, column 1']),
117 (0, 0, 1, ['Header 2']),
118 (0, 0, 1, ['Header 3']),
119 (0, 0, 1, ['Header 4'])]],
120 [[(0, 0, 3, ['body row 1, column 1']),
121 (0, 0, 3, ['column 2']),
122 (0, 0, 3, ['column 3']),
123 (0, 0, 3, ['column 4'])],
124 [(0, 0, 5, ['body row 2']),
125 (0, 2, 5, ['Cells may span columns.']),
126 None,
127 None],
128 [(0, 0, 7, ['body row 3']),
129 (1, 0, 7, ['Cells may', 'span rows.', '']),
130 (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
131 None],
132 [(0, 0, 9, ['body row 4']), None, None, None]])
133
134 The first item is a list containing column widths (colspecs). The second
135 item is a list of head rows, and the third is a list of body rows. Each
136 row contains a list of cells. Each cell is either None (for a cell unused
137 because of another cell's span), or a tuple. A cell tuple contains four
138 items: the number of extra rows used by the cell in a vertical span
139 (morerows); the number of extra columns used by the cell in a horizontal
140 span (morecols); the line offset of the first line of the cell contents;
141 and the cell contents, a list of lines of text.
142 """
143
144 head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')
145
146 def setup(self, block):
147 self.block = block[:] # make a copy; it may be modified
148 self.block.disconnect() # don't propagate changes to parent
149 self.bottom = len(block) - 1
150 self.right = len(block[0]) - 1
151 self.head_body_sep = None
152 self.done = [-1] * len(block[0])
153 self.cells = []
154 self.rowseps = {0: [0]}
155 self.colseps = {0: [0]}
156
157 def parse_table(self):
158 """
159 Start with a queue of upper-left corners, containing the upper-left
160 corner of the table itself. Trace out one rectangular cell, remember
161 it, and add its upper-right and lower-left corners to the queue of
162 potential upper-left corners of further cells. Process the queue in
163 top-to-bottom order, keeping track of how much of each text column has
164 been seen.
165
166 We'll end up knowing all the row and column boundaries, cell positions
167 and their dimensions.
168 """
169 corners = [(0, 0)]
170 while corners:
171 top, left = corners.pop(0)
172 if (top == self.bottom
173 or left == self.right
174 or top <= self.done[left]):
175 continue
176 result = self.scan_cell(top, left)
177 if not result:
178 continue
179 bottom, right, rowseps, colseps = result
180 update_dict_of_lists(self.rowseps, rowseps)
181 update_dict_of_lists(self.colseps, colseps)
182 self.mark_done(top, left, bottom, right)
183 cellblock = self.block.get_2D_block(top + 1, left + 1,
184 bottom, right)
185 cellblock.disconnect() # lines in cell can't sync with parent
186 cellblock.replace(self.double_width_pad_char, '')
187 self.cells.append((top, left, bottom, right, cellblock))
188 corners.extend([(top, right), (bottom, left)])
189 corners.sort()
190 if not self.check_parse_complete():
191 raise TableMarkupError('Malformed table; parse incomplete.')
192
193 def mark_done(self, top, left, bottom, right):
194 """For keeping track of how much of each text column has been seen."""
195 before = top - 1
196 after = bottom - 1
197 for col in range(left, right):
198 assert self.done[col] == before
199 self.done[col] = after
200
201 def check_parse_complete(self):
202 """Each text column should have been completely seen."""
203 last = self.bottom - 1
204 for col in range(self.right):
205 if self.done[col] != last:
206 return False
207 return True
208
209 def scan_cell(self, top, left):
210 """Starting at the top-left corner, start tracing out a cell."""
211 assert self.block[top][left] == '+'
212 return self.scan_right(top, left)
213
214 def scan_right(self, top, left):
215 """
216 Look for the top-right corner of the cell, and make note of all column
217 boundaries ('+').
218 """
219 colseps = {}
220 line = self.block[top]
221 for i in range(left + 1, self.right + 1):
222 if line[i] == '+':
223 colseps[i] = [top]
224 result = self.scan_down(top, left, i)
225 if result:
226 bottom, rowseps, newcolseps = result
227 update_dict_of_lists(colseps, newcolseps)
228 return bottom, i, rowseps, colseps
229 elif line[i] != '-':
230 return None
231 return None
232
233 def scan_down(self, top, left, right):
234 """
235 Look for the bottom-right corner of the cell, making note of all row
236 boundaries.
237 """
238 rowseps = {}
239 for i in range(top + 1, self.bottom + 1):
240 if self.block[i][right] == '+':
241 rowseps[i] = [right]
242 result = self.scan_left(top, left, i, right)
243 if result:
244 newrowseps, colseps = result
245 update_dict_of_lists(rowseps, newrowseps)
246 return i, rowseps, colseps
247 elif self.block[i][right] != '|':
248 return None
249 return None
250
251 def scan_left(self, top, left, bottom, right):
252 """
253 Noting column boundaries, look for the bottom-left corner of the cell.
254 It must line up with the starting point.
255 """
256 colseps = {}
257 line = self.block[bottom]
258 for i in range(right - 1, left, -1):
259 if line[i] == '+':
260 colseps[i] = [bottom]
261 elif line[i] != '-':
262 return None
263 if line[left] != '+':
264 return None
265 result = self.scan_up(top, left, bottom, right)
266 if result is not None:
267 rowseps = result
268 return rowseps, colseps
269 return None
270
271 def scan_up(self, top, left, bottom, right):
272 """
273 Noting row boundaries, see if we can return to the starting point.
274 """
275 rowseps = {}
276 for i in range(bottom - 1, top, -1):
277 if self.block[i][left] == '+':
278 rowseps[i] = [left]
279 elif self.block[i][left] != '|':
280 return None
281 return rowseps
282
283 def structure_from_cells(self):
284 """
285 From the data collected by `scan_cell()`, convert to the final data
286 structure.
287 """
288 rowseps = sorted(self.rowseps.keys()) # list of row boundaries
289 rowindex = {}
290 for i in range(len(rowseps)):
291 rowindex[rowseps[i]] = i # row boundary -> row number mapping
292 colseps = sorted(self.colseps.keys()) # list of column boundaries
293 colindex = {}
294 for i in range(len(colseps)):
295 colindex[colseps[i]] = i # column boundary -> col number map
296 colspecs = [(colseps[i] - colseps[i - 1] - 1)
297 for i in range(1, len(colseps))] # list of column widths
298 # prepare an empty table with the correct number of rows & columns
299 onerow = [None for i in range(len(colseps) - 1)]
300 rows = [onerow[:] for i in range(len(rowseps) - 1)]
301 # keep track of # of cells remaining; should reduce to zero
302 remaining = (len(rowseps) - 1) * (len(colseps) - 1)
303 for top, left, bottom, right, block in self.cells:
304 rownum = rowindex[top]
305 colnum = colindex[left]
306 assert rows[rownum][colnum] is None, (
307 'Cell (row %s, column %s) already used.'
308 % (rownum + 1, colnum + 1))
309 morerows = rowindex[bottom] - rownum - 1
310 morecols = colindex[right] - colnum - 1
311 remaining -= (morerows + 1) * (morecols + 1)
312 # write the cell into the table
313 rows[rownum][colnum] = (morerows, morecols, top + 1, block)
314 assert remaining == 0, 'Unused cells remaining.'
315 if self.head_body_sep: # separate head rows from body rows
316 numheadrows = rowindex[self.head_body_sep]
317 headrows = rows[:numheadrows]
318 bodyrows = rows[numheadrows:]
319 else:
320 headrows = []
321 bodyrows = rows
322 return colspecs, headrows, bodyrows
323
324
325class SimpleTableParser(TableParser):
326
327 """
328 Parse a simple table using `parse()`.
329
330 Here's an example of a simple table::
331
332 ===== =====
333 col 1 col 2
334 ===== =====
335 1 Second column of row 1.
336 2 Second column of row 2.
337 Second line of paragraph.
338 3 - Second column of row 3.
339
340 - Second item in bullet
341 list (row 3, column 2).
342 4 is a span
343 ------------
344 5
345 ===== =====
346
347 Top and bottom borders use '=', column span underlines use '-', column
348 separation is indicated with spaces.
349
350 Passing the above table to the `parse()` method will result in the
351 following data structure, whose interpretation is the same as for
352 `GridTableParser`::
353
354 ([5, 25],
355 [[(0, 0, 1, ['col 1']),
356 (0, 0, 1, ['col 2'])]],
357 [[(0, 0, 3, ['1']),
358 (0, 0, 3, ['Second column of row 1.'])],
359 [(0, 0, 4, ['2']),
360 (0, 0, 4, ['Second column of row 2.',
361 'Second line of paragraph.'])],
362 [(0, 0, 6, ['3']),
363 (0, 0, 6, ['- Second column of row 3.',
364 '',
365 '- Second item in bullet',
366 ' list (row 3, column 2).'])],
367 [(0, 1, 10, ['4 is a span'])],
368 [(0, 0, 12, ['5']),
369 (0, 0, 12, [''])]])
370 """
371
372 head_body_separator_pat = re.compile('=[ =]*$')
373 span_pat = re.compile('-[ -]*$')
374
375 def setup(self, block):
376 self.block = block[:] # make a copy; it will be modified
377 self.block.disconnect() # don't propagate changes to parent
378 # Convert top & bottom borders to column span underlines:
379 self.block[0] = self.block[0].replace('=', '-')
380 self.block[-1] = self.block[-1].replace('=', '-')
381 self.head_body_sep = None
382 self.columns = []
383 self.border_end = None
384 self.table = []
385 self.done = [-1] * len(block[0])
386 self.rowseps = {0: [0]}
387 self.colseps = {0: [0]}
388
389 def parse_table(self):
390 """
391 First determine the column boundaries from the top border, then
392 process rows. Each row may consist of multiple lines; accumulate
393 lines until a row is complete. Call `self.parse_row` to finish the
394 job.
395 """
396 # Top border must fully describe all table columns.
397 self.columns = self.parse_columns(self.block[0], 0)
398 self.border_end = self.columns[-1][1]
399 firststart, firstend = self.columns[0]
400 offset = 1 # skip top border
401 start = 1
402 text_found = None
403 while offset < len(self.block):
404 line = self.block[offset]
405 if self.span_pat.match(line):
406 # Column span underline or border; row is complete.
407 self.parse_row(self.block[start:offset], start,
408 (line.rstrip(), offset))
409 start = offset + 1
410 text_found = None
411 elif line[firststart:firstend].strip():
412 # First column not blank, therefore it's a new row.
413 if text_found and offset != start:
414 self.parse_row(self.block[start:offset], start)
415 start = offset
416 text_found = 1
417 elif not text_found:
418 start = offset + 1
419 offset += 1
420
421 def parse_columns(self, line, offset):
422 """
423 Given a column span underline, return a list of (begin, end) pairs.
424 """
425 cols = []
426 end = 0
427 while True:
428 begin = line.find('-', end)
429 end = line.find(' ', begin)
430 if begin < 0:
431 break
432 if end < 0:
433 end = len(line)
434 cols.append((begin, end))
435 if self.columns:
436 if cols[-1][1] != self.border_end:
437 raise TableMarkupError('Column span incomplete in table '
438 'line %s.' % (offset+1),
439 offset=offset)
440 # Allow for an unbounded rightmost column:
441 cols[-1] = (cols[-1][0], self.columns[-1][1])
442 return cols
443
444 def init_row(self, colspec, offset):
445 i = 0
446 cells = []
447 for start, end in colspec:
448 morecols = 0
449 try:
450 assert start == self.columns[i][0]
451 while end != self.columns[i][1]:
452 i += 1
453 morecols += 1
454 except (AssertionError, IndexError):
455 raise TableMarkupError('Column span alignment problem '
456 'in table line %s.' % (offset+2),
457 offset=offset+1)
458 cells.append([0, morecols, offset, []])
459 i += 1
460 return cells
461
462 def parse_row(self, lines, start, spanline=None):
463 """
464 Given the text `lines` of a row, parse it and append to `self.table`.
465
466 The row is parsed according to the current column spec (either
467 `spanline` if provided or `self.columns`). For each column, extract
468 text from each line, and check for text in column margins. Finally,
469 adjust for insignificant whitespace.
470 """
471 if not (lines or spanline):
472 # No new row, just blank lines.
473 return
474 if spanline:
475 columns = self.parse_columns(*spanline)
476 else:
477 columns = self.columns[:]
478 self.check_columns(lines, start, columns)
479 row = self.init_row(columns, start)
480 for i in range(len(columns)):
481 start, end = columns[i]
482 cellblock = lines.get_2D_block(0, start, len(lines), end)
483 cellblock.disconnect() # lines in cell can't sync with parent
484 cellblock.replace(self.double_width_pad_char, '')
485 row[i][3] = cellblock
486 self.table.append(row)
487
488 def check_columns(self, lines, first_line, columns):
489 """
490 Check for text in column margins and text overflow in the last column.
491 Raise TableMarkupError if anything but whitespace is in column margins.
492 Adjust the end value for the last column if there is text overflow.
493 """
494 # "Infinite" value for a dummy last column's beginning, used to
495 # check for text overflow:
496 columns.append((sys.maxsize, None))
497 lastcol = len(columns) - 2
498 # combining characters do not contribute to the column width
499 lines = [strip_combining_chars(line) for line in lines]
500
501 for i in range(len(columns) - 1):
502 start, end = columns[i]
503 nextstart = columns[i+1][0]
504 offset = 0
505 for line in lines:
506 if i == lastcol and line[end:].strip():
507 text = line[start:].rstrip()
508 new_end = start + len(text)
509 main_start, main_end = self.columns[-1]
510 columns[i] = (start, max(main_end, new_end))
511 if new_end > main_end:
512 self.columns[-1] = (main_start, new_end)
513 elif line[end:nextstart].strip():
514 raise TableMarkupError('Text in column margin in table '
515 'line %s.' % (first_line+offset+1),
516 offset=first_line+offset)
517 offset += 1
518 columns.pop()
519
520 def structure_from_cells(self):
521 colspecs = [end - start for start, end in self.columns]
522 first_body_row = 0
523 if self.head_body_sep:
524 for i in range(len(self.table)):
525 if self.table[i][0][2] > self.head_body_sep:
526 first_body_row = i
527 break
528 return (colspecs, self.table[:first_body_row],
529 self.table[first_body_row:])
530
531
532def update_dict_of_lists(master, newdata):
533 """
534 Extend the list values of `master` with those from `newdata`.
535
536 Both parameters must be dictionaries containing list values.
537 """
538 for key, values in newdata.items():
539 master.setdefault(key, []).extend(values)