1# $Id$
2# Author: David Goodger <goodger@python.org>
3# Copyright: This module has been placed in the public domain.
4
5"""
6This module defines table parser classes,which parse plaintext-graphic tables
7and produce a well-formed data structure suitable for building a CALS table.
8
9:Classes:
10 - `GridTableParser`: Parse fully-formed tables represented with a grid.
11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
12 borders.
13
14:Exception class: `TableMarkupError`
15
16:Function:
17 `update_dict_of_lists()`: Merge two dictionaries containing list values.
18"""
19
20from __future__ import annotations
21
22__docformat__ = 'reStructuredText'
23
24import re
25import sys
26from docutils import DataError
27from docutils.utils import strip_combining_chars
28
29
30class TableMarkupError(DataError):
31
32 """
33 Raise if there is any problem with table markup.
34
35 The keyword argument `offset` denotes the offset of the problem
36 from the table's start line.
37 """
38
39 def __init__(self, *args, **kwargs) -> None:
40 self.offset = kwargs.pop('offset', 0)
41 DataError.__init__(self, *args)
42
43
44class TableParser:
45
46 """
47 Abstract superclass for the common parts of the syntax-specific parsers.
48 """
49
50 head_body_separator_pat = None
51 """Matches the row separator between head rows and body rows."""
52
53 double_width_pad_char = '\x00'
54 """Padding character for East Asian double-width text."""
55
56 def parse(self, block):
57 """
58 Analyze the text `block` and return a table data structure.
59
60 Given a plaintext-graphic table in `block` (list of lines of text; no
61 whitespace padding), parse the table, construct and return the data
62 necessary to construct a CALS table or equivalent.
63
64 Raise `TableMarkupError` if there is any problem with the markup.
65 """
66 self.setup(block)
67 self.find_head_body_sep()
68 self.parse_table()
69 return self.structure_from_cells()
70
71 def find_head_body_sep(self):
72 """Look for a head/body row separator line; store the line index."""
73 for i in range(len(self.block)):
74 line = self.block[i]
75 if self.head_body_separator_pat.match(line):
76 if self.head_body_sep:
77 raise TableMarkupError(
78 'Multiple head/body row separators '
79 '(table lines %s and %s); only one allowed.'
80 % (self.head_body_sep+1, i+1), offset=i)
81 else:
82 self.head_body_sep = i
83 self.block[i] = line.replace('=', '-')
84 if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
85 - 1):
86 raise TableMarkupError('The head/body row separator may not be '
87 'the first or last line of the table.',
88 offset=i)
89
90
91class GridTableParser(TableParser):
92
93 """
94 Parse a grid table using `parse()`.
95
96 Here's an example of a grid table::
97
98 +------------------------+------------+----------+----------+
99 | Header row, column 1 | Header 2 | Header 3 | Header 4 |
100 +========================+============+==========+==========+
101 | body row 1, column 1 | column 2 | column 3 | column 4 |
102 +------------------------+------------+----------+----------+
103 | body row 2 | Cells may span columns. |
104 +------------------------+------------+---------------------+
105 | body row 3 | Cells may | - Table cells |
106 +------------------------+ span rows. | - contain |
107 | body row 4 | | - body elements. |
108 +------------------------+------------+---------------------+
109
110 Intersections use '+', row separators use '-' (except for one optional
111 head/body row separator, which uses '='), and column separators use '|'.
112
113 Passing the above table to the `parse()` method will result in the
114 following data structure::
115
116 ([24, 12, 10, 10],
117 [[(0, 0, 1, ['Header row, column 1']),
118 (0, 0, 1, ['Header 2']),
119 (0, 0, 1, ['Header 3']),
120 (0, 0, 1, ['Header 4'])]],
121 [[(0, 0, 3, ['body row 1, column 1']),
122 (0, 0, 3, ['column 2']),
123 (0, 0, 3, ['column 3']),
124 (0, 0, 3, ['column 4'])],
125 [(0, 0, 5, ['body row 2']),
126 (0, 2, 5, ['Cells may span columns.']),
127 None,
128 None],
129 [(0, 0, 7, ['body row 3']),
130 (1, 0, 7, ['Cells may', 'span rows.', '']),
131 (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
132 None],
133 [(0, 0, 9, ['body row 4']), None, None, None]])
134
135 The first item is a list containing column widths (colspecs). The second
136 item is a list of head rows, and the third is a list of body rows. Each
137 row contains a list of cells. Each cell is either None (for a cell unused
138 because of another cell's span), or a tuple. A cell tuple contains four
139 items: the number of extra rows used by the cell in a vertical span
140 (morerows); the number of extra columns used by the cell in a horizontal
141 span (morecols); the line offset of the first line of the cell contents;
142 and the cell contents, a list of lines of text.
143 """
144
145 head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')
146
147 def setup(self, block) -> None:
148 self.block = block[:] # make a copy; it may be modified
149 self.block.disconnect() # don't propagate changes to parent
150 self.bottom = len(block) - 1
151 self.right = len(block[0]) - 1
152 self.head_body_sep = None
153 self.done = [-1] * len(block[0])
154 self.cells = []
155 self.rowseps = {0: [0]}
156 self.colseps = {0: [0]}
157
158 def parse_table(self):
159 """
160 Start with a queue of upper-left corners, containing the upper-left
161 corner of the table itself. Trace out one rectangular cell, remember
162 it, and add its upper-right and lower-left corners to the queue of
163 potential upper-left corners of further cells. Process the queue in
164 top-to-bottom order, keeping track of how much of each text column has
165 been seen.
166
167 We'll end up knowing all the row and column boundaries, cell positions
168 and their dimensions.
169 """
170 corners = [(0, 0)]
171 while corners:
172 top, left = corners.pop(0)
173 if (top == self.bottom
174 or left == self.right
175 or top <= self.done[left]):
176 continue
177 result = self.scan_cell(top, left)
178 if not result:
179 continue
180 bottom, right, rowseps, colseps = result
181 update_dict_of_lists(self.rowseps, rowseps)
182 update_dict_of_lists(self.colseps, colseps)
183 self.mark_done(top, left, bottom, right)
184 cellblock = self.block.get_2D_block(top + 1, left + 1,
185 bottom, right)
186 cellblock.disconnect() # lines in cell can't sync with parent
187 cellblock.replace(self.double_width_pad_char, '')
188 self.cells.append((top, left, bottom, right, cellblock))
189 corners.extend([(top, right), (bottom, left)])
190 corners.sort()
191 if not self.check_parse_complete():
192 raise TableMarkupError('Malformed table; parse incomplete.')
193
194 def mark_done(self, top, left, bottom, right) -> None:
195 """For keeping track of how much of each text column has been seen."""
196 before = top - 1
197 after = bottom - 1
198 for col in range(left, right):
199 assert self.done[col] == before
200 self.done[col] = after
201
202 def check_parse_complete(self) -> bool:
203 """Each text column should have been completely seen."""
204 last = self.bottom - 1
205 for col in range(self.right):
206 if self.done[col] != last:
207 return False
208 return True
209
210 def scan_cell(self, top, left):
211 """Starting at the top-left corner, start tracing out a cell."""
212 assert self.block[top][left] == '+'
213 return self.scan_right(top, left)
214
215 def scan_right(self, top, left):
216 """
217 Look for the top-right corner of the cell, and make note of all column
218 boundaries ('+').
219 """
220 colseps = {}
221 line = self.block[top]
222 for i in range(left + 1, self.right + 1):
223 if line[i] == '+':
224 colseps[i] = [top]
225 result = self.scan_down(top, left, i)
226 if result:
227 bottom, rowseps, newcolseps = result
228 update_dict_of_lists(colseps, newcolseps)
229 return bottom, i, rowseps, colseps
230 elif line[i] != '-':
231 return None
232 return None
233
234 def scan_down(self, top, left, right):
235 """
236 Look for the bottom-right corner of the cell, making note of all row
237 boundaries.
238 """
239 rowseps = {}
240 for i in range(top + 1, self.bottom + 1):
241 if self.block[i][right] == '+':
242 rowseps[i] = [right]
243 result = self.scan_left(top, left, i, right)
244 if result:
245 newrowseps, colseps = result
246 update_dict_of_lists(rowseps, newrowseps)
247 return i, rowseps, colseps
248 elif self.block[i][right] != '|':
249 return None
250 return None
251
252 def scan_left(self, top, left, bottom, right):
253 """
254 Noting column boundaries, look for the bottom-left corner of the cell.
255 It must line up with the starting point.
256 """
257 colseps = {}
258 line = self.block[bottom]
259 for i in range(right - 1, left, -1):
260 if line[i] == '+':
261 colseps[i] = [bottom]
262 elif line[i] != '-':
263 return None
264 if line[left] != '+':
265 return None
266 result = self.scan_up(top, left, bottom, right)
267 if result is not None:
268 rowseps = result
269 return rowseps, colseps
270 return None
271
272 def scan_up(self, top, left, bottom, right):
273 """
274 Noting row boundaries, see if we can return to the starting point.
275 """
276 rowseps = {}
277 for i in range(bottom - 1, top, -1):
278 if self.block[i][left] == '+':
279 rowseps[i] = [left]
280 elif self.block[i][left] != '|':
281 return None
282 return rowseps
283
284 def structure_from_cells(self):
285 """
286 From the data collected by `scan_cell()`, convert to the final data
287 structure.
288 """
289 rowseps = sorted(self.rowseps.keys()) # list of row boundaries
290 rowindex = {}
291 for i in range(len(rowseps)):
292 rowindex[rowseps[i]] = i # row boundary -> row number mapping
293 colseps = sorted(self.colseps.keys()) # list of column boundaries
294 colindex = {}
295 for i in range(len(colseps)):
296 colindex[colseps[i]] = i # column boundary -> col number map
297 colspecs = [(colseps[i] - colseps[i - 1] - 1)
298 for i in range(1, len(colseps))] # list of column widths
299 # prepare an empty table with the correct number of rows & columns
300 onerow = [None for i in range(len(colseps) - 1)]
301 rows = [onerow[:] for i in range(len(rowseps) - 1)]
302 # keep track of # of cells remaining; should reduce to zero
303 remaining = (len(rowseps) - 1) * (len(colseps) - 1)
304 for top, left, bottom, right, block in self.cells:
305 rownum = rowindex[top]
306 colnum = colindex[left]
307 assert rows[rownum][colnum] is None, (
308 'Cell (row %s, column %s) already used.'
309 % (rownum + 1, colnum + 1))
310 morerows = rowindex[bottom] - rownum - 1
311 morecols = colindex[right] - colnum - 1
312 remaining -= (morerows + 1) * (morecols + 1)
313 # write the cell into the table
314 rows[rownum][colnum] = (morerows, morecols, top + 1, block)
315 assert remaining == 0, 'Unused cells remaining.'
316 if self.head_body_sep: # separate head rows from body rows
317 numheadrows = rowindex[self.head_body_sep]
318 headrows = rows[:numheadrows]
319 bodyrows = rows[numheadrows:]
320 else:
321 headrows = []
322 bodyrows = rows
323 return colspecs, headrows, bodyrows
324
325
326class SimpleTableParser(TableParser):
327
328 """
329 Parse a simple table using `parse()`.
330
331 Here's an example of a simple table::
332
333 ===== =====
334 col 1 col 2
335 ===== =====
336 1 Second column of row 1.
337 2 Second column of row 2.
338 Second line of paragraph.
339 3 - Second column of row 3.
340
341 - Second item in bullet
342 list (row 3, column 2).
343 4 is a span
344 ------------
345 5
346 ===== =====
347
348 Top and bottom borders use '=', column span underlines use '-', column
349 separation is indicated with spaces.
350
351 Passing the above table to the `parse()` method will result in the
352 following data structure, whose interpretation is the same as for
353 `GridTableParser`::
354
355 ([5, 25],
356 [[(0, 0, 1, ['col 1']),
357 (0, 0, 1, ['col 2'])]],
358 [[(0, 0, 3, ['1']),
359 (0, 0, 3, ['Second column of row 1.'])],
360 [(0, 0, 4, ['2']),
361 (0, 0, 4, ['Second column of row 2.',
362 'Second line of paragraph.'])],
363 [(0, 0, 6, ['3']),
364 (0, 0, 6, ['- Second column of row 3.',
365 '',
366 '- Second item in bullet',
367 ' list (row 3, column 2).'])],
368 [(0, 1, 10, ['4 is a span'])],
369 [(0, 0, 12, ['5']),
370 (0, 0, 12, [''])]])
371 """
372
373 head_body_separator_pat = re.compile('=[ =]*$')
374 span_pat = re.compile('-[ -]*$')
375
376 def setup(self, block) -> None:
377 self.block = block[:] # make a copy; it will be modified
378 self.block.disconnect() # don't propagate changes to parent
379 # Convert top & bottom borders to column span underlines:
380 self.block[0] = self.block[0].replace('=', '-')
381 self.block[-1] = self.block[-1].replace('=', '-')
382 self.head_body_sep = None
383 self.columns = []
384 self.border_end = None
385 self.table = []
386 self.done = [-1] * len(block[0])
387 self.rowseps = {0: [0]}
388 self.colseps = {0: [0]}
389
390 def parse_table(self) -> None:
391 """
392 First determine the column boundaries from the top border, then
393 process rows. Each row may consist of multiple lines; accumulate
394 lines until a row is complete. Call `self.parse_row` to finish the
395 job.
396 """
397 # Top border must fully describe all table columns.
398 self.columns = self.parse_columns(self.block[0], 0)
399 self.border_end = self.columns[-1][1]
400 firststart, firstend = self.columns[0]
401 offset = 1 # skip top border
402 start = 1
403 text_found = None
404 while offset < len(self.block):
405 line = self.block[offset]
406 if self.span_pat.match(line):
407 # Column span underline or border; row is complete.
408 self.parse_row(self.block[start:offset], start,
409 (line.rstrip(), offset))
410 start = offset + 1
411 text_found = None
412 elif line[firststart:firstend].strip():
413 # First column not blank, therefore it's a new row.
414 if text_found and offset != start:
415 self.parse_row(self.block[start:offset], start)
416 start = offset
417 text_found = 1
418 elif not text_found:
419 start = offset + 1
420 offset += 1
421
422 def parse_columns(self, line, offset):
423 """
424 Given a column span underline, return a list of (begin, end) pairs.
425 """
426 cols = []
427 end = 0
428 while True:
429 begin = line.find('-', end)
430 end = line.find(' ', begin)
431 if begin < 0:
432 break
433 if end < 0:
434 end = len(line)
435 cols.append((begin, end))
436 if self.columns:
437 if cols[-1][1] != self.border_end:
438 raise TableMarkupError('Column span incomplete in table '
439 'line %s.' % (offset+1),
440 offset=offset)
441 # Allow for an unbounded rightmost column:
442 cols[-1] = (cols[-1][0], self.columns[-1][1])
443 return cols
444
445 def init_row(self, colspec, offset):
446 i = 0
447 cells = []
448 for start, end in colspec:
449 morecols = 0
450 try:
451 assert start == self.columns[i][0]
452 while end != self.columns[i][1]:
453 i += 1
454 morecols += 1
455 except (AssertionError, IndexError):
456 raise TableMarkupError('Column span alignment problem '
457 'in table line %s.' % (offset+2),
458 offset=offset+1)
459 cells.append([0, morecols, offset, []])
460 i += 1
461 return cells
462
463 def parse_row(self, lines, start, spanline=None) -> None:
464 """
465 Given the text `lines` of a row, parse it and append to `self.table`.
466
467 The row is parsed according to the current column spec (either
468 `spanline` if provided or `self.columns`). For each column, extract
469 text from each line, and check for text in column margins. Finally,
470 adjust for insignificant whitespace.
471 """
472 if not (lines or spanline):
473 # No new row, just blank lines.
474 return
475 if spanline:
476 columns = self.parse_columns(*spanline)
477 else:
478 columns = self.columns[:]
479 self.check_columns(lines, start, columns)
480 row = self.init_row(columns, start)
481 for i in range(len(columns)):
482 start, end = columns[i]
483 cellblock = lines.get_2D_block(0, start, len(lines), end)
484 cellblock.disconnect() # lines in cell can't sync with parent
485 cellblock.replace(self.double_width_pad_char, '')
486 row[i][3] = cellblock
487 self.table.append(row)
488
489 def check_columns(self, lines, first_line, columns):
490 """
491 Check for text in column margins and text overflow in the last column.
492 Raise TableMarkupError if anything but whitespace is in column margins.
493 Adjust the end value for the last column if there is text overflow.
494 """
495 # "Infinite" value for a dummy last column's beginning, used to
496 # check for text overflow:
497 columns.append((sys.maxsize, None))
498 lastcol = len(columns) - 2
499 # combining characters do not contribute to the column width
500 lines = [strip_combining_chars(line) for line in lines]
501
502 for i in range(len(columns) - 1):
503 start, end = columns[i]
504 nextstart = columns[i+1][0]
505 offset = 0
506 for line in lines:
507 if i == lastcol and line[end:].strip():
508 text = line[start:].rstrip()
509 new_end = start + len(text)
510 main_start, main_end = self.columns[-1]
511 columns[i] = (start, max(main_end, new_end))
512 if new_end > main_end:
513 self.columns[-1] = (main_start, new_end)
514 elif line[end:nextstart].strip():
515 raise TableMarkupError('Text in column margin in table '
516 'line %s.' % (first_line+offset+1),
517 offset=first_line+offset)
518 offset += 1
519 columns.pop()
520
521 def structure_from_cells(self):
522 colspecs = [end - start for start, end in self.columns]
523 first_body_row = 0
524 if self.head_body_sep:
525 for i in range(len(self.table)):
526 if self.table[i][0][2] > self.head_body_sep:
527 first_body_row = i
528 break
529 return (colspecs, self.table[:first_body_row],
530 self.table[first_body_row:])
531
532
533def update_dict_of_lists(master, newdata) -> None:
534 """
535 Extend the list values of `master` with those from `newdata`.
536
537 Both parameters must be dictionaries containing list values.
538 """
539 for key, values in newdata.items():
540 master.setdefault(key, []).extend(values)