Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/docutils/parsers/rst/tableparser.py: 50%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

268 statements  

1# $Id$ 

2# Author: David Goodger <goodger@python.org> 

3# Copyright: This module has been placed in the public domain. 

4 

5""" 

6This module defines table parser classes,which parse plaintext-graphic tables 

7and produce a well-formed data structure suitable for building a CALS table. 

8 

9:Classes: 

10 - `GridTableParser`: Parse fully-formed tables represented with a grid. 

11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom 

12 borders. 

13 

14:Exception class: `TableMarkupError` 

15 

16:Function: 

17 `update_dict_of_lists()`: Merge two dictionaries containing list values. 

18""" 

19 

20__docformat__ = 'reStructuredText' 

21 

22 

23import re 

24import sys 

25from docutils import DataError 

26from docutils.utils import strip_combining_chars 

27 

28 

29class TableMarkupError(DataError): 

30 

31 """ 

32 Raise if there is any problem with table markup. 

33 

34 The keyword argument `offset` denotes the offset of the problem 

35 from the table's start line. 

36 """ 

37 

38 def __init__(self, *args, **kwargs): 

39 self.offset = kwargs.pop('offset', 0) 

40 DataError.__init__(self, *args) 

41 

42 

43class TableParser: 

44 

45 """ 

46 Abstract superclass for the common parts of the syntax-specific parsers. 

47 """ 

48 

49 head_body_separator_pat = None 

50 """Matches the row separator between head rows and body rows.""" 

51 

52 double_width_pad_char = '\x00' 

53 """Padding character for East Asian double-width text.""" 

54 

55 def parse(self, block): 

56 """ 

57 Analyze the text `block` and return a table data structure. 

58 

59 Given a plaintext-graphic table in `block` (list of lines of text; no 

60 whitespace padding), parse the table, construct and return the data 

61 necessary to construct a CALS table or equivalent. 

62 

63 Raise `TableMarkupError` if there is any problem with the markup. 

64 """ 

65 self.setup(block) 

66 self.find_head_body_sep() 

67 self.parse_table() 

68 return self.structure_from_cells() 

69 

70 def find_head_body_sep(self): 

71 """Look for a head/body row separator line; store the line index.""" 

72 for i in range(len(self.block)): 

73 line = self.block[i] 

74 if self.head_body_separator_pat.match(line): 

75 if self.head_body_sep: 

76 raise TableMarkupError( 

77 'Multiple head/body row separators ' 

78 '(table lines %s and %s); only one allowed.' 

79 % (self.head_body_sep+1, i+1), offset=i) 

80 else: 

81 self.head_body_sep = i 

82 self.block[i] = line.replace('=', '-') 

83 if self.head_body_sep == 0 or self.head_body_sep == (len(self.block) 

84 - 1): 

85 raise TableMarkupError('The head/body row separator may not be ' 

86 'the first or last line of the table.', 

87 offset=i) 

88 

89 

90class GridTableParser(TableParser): 

91 

92 """ 

93 Parse a grid table using `parse()`. 

94 

95 Here's an example of a grid table:: 

96 

97 +------------------------+------------+----------+----------+ 

98 | Header row, column 1 | Header 2 | Header 3 | Header 4 | 

99 +========================+============+==========+==========+ 

100 | body row 1, column 1 | column 2 | column 3 | column 4 | 

101 +------------------------+------------+----------+----------+ 

102 | body row 2 | Cells may span columns. | 

103 +------------------------+------------+---------------------+ 

104 | body row 3 | Cells may | - Table cells | 

105 +------------------------+ span rows. | - contain | 

106 | body row 4 | | - body elements. | 

107 +------------------------+------------+---------------------+ 

108 

109 Intersections use '+', row separators use '-' (except for one optional 

110 head/body row separator, which uses '='), and column separators use '|'. 

111 

112 Passing the above table to the `parse()` method will result in the 

113 following data structure:: 

114 

115 ([24, 12, 10, 10], 

116 [[(0, 0, 1, ['Header row, column 1']), 

117 (0, 0, 1, ['Header 2']), 

118 (0, 0, 1, ['Header 3']), 

119 (0, 0, 1, ['Header 4'])]], 

120 [[(0, 0, 3, ['body row 1, column 1']), 

121 (0, 0, 3, ['column 2']), 

122 (0, 0, 3, ['column 3']), 

123 (0, 0, 3, ['column 4'])], 

124 [(0, 0, 5, ['body row 2']), 

125 (0, 2, 5, ['Cells may span columns.']), 

126 None, 

127 None], 

128 [(0, 0, 7, ['body row 3']), 

129 (1, 0, 7, ['Cells may', 'span rows.', '']), 

130 (1, 1, 7, ['- Table cells', '- contain', '- body elements.']), 

131 None], 

132 [(0, 0, 9, ['body row 4']), None, None, None]]) 

133 

134 The first item is a list containing column widths (colspecs). The second 

135 item is a list of head rows, and the third is a list of body rows. Each 

136 row contains a list of cells. Each cell is either None (for a cell unused 

137 because of another cell's span), or a tuple. A cell tuple contains four 

138 items: the number of extra rows used by the cell in a vertical span 

139 (morerows); the number of extra columns used by the cell in a horizontal 

140 span (morecols); the line offset of the first line of the cell contents; 

141 and the cell contents, a list of lines of text. 

142 """ 

143 

144 head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$') 

145 

146 def setup(self, block): 

147 self.block = block[:] # make a copy; it may be modified 

148 self.block.disconnect() # don't propagate changes to parent 

149 self.bottom = len(block) - 1 

150 self.right = len(block[0]) - 1 

151 self.head_body_sep = None 

152 self.done = [-1] * len(block[0]) 

153 self.cells = [] 

154 self.rowseps = {0: [0]} 

155 self.colseps = {0: [0]} 

156 

157 def parse_table(self): 

158 """ 

159 Start with a queue of upper-left corners, containing the upper-left 

160 corner of the table itself. Trace out one rectangular cell, remember 

161 it, and add its upper-right and lower-left corners to the queue of 

162 potential upper-left corners of further cells. Process the queue in 

163 top-to-bottom order, keeping track of how much of each text column has 

164 been seen. 

165 

166 We'll end up knowing all the row and column boundaries, cell positions 

167 and their dimensions. 

168 """ 

169 corners = [(0, 0)] 

170 while corners: 

171 top, left = corners.pop(0) 

172 if (top == self.bottom 

173 or left == self.right 

174 or top <= self.done[left]): 

175 continue 

176 result = self.scan_cell(top, left) 

177 if not result: 

178 continue 

179 bottom, right, rowseps, colseps = result 

180 update_dict_of_lists(self.rowseps, rowseps) 

181 update_dict_of_lists(self.colseps, colseps) 

182 self.mark_done(top, left, bottom, right) 

183 cellblock = self.block.get_2D_block(top + 1, left + 1, 

184 bottom, right) 

185 cellblock.disconnect() # lines in cell can't sync with parent 

186 cellblock.replace(self.double_width_pad_char, '') 

187 self.cells.append((top, left, bottom, right, cellblock)) 

188 corners.extend([(top, right), (bottom, left)]) 

189 corners.sort() 

190 if not self.check_parse_complete(): 

191 raise TableMarkupError('Malformed table; parse incomplete.') 

192 

193 def mark_done(self, top, left, bottom, right): 

194 """For keeping track of how much of each text column has been seen.""" 

195 before = top - 1 

196 after = bottom - 1 

197 for col in range(left, right): 

198 assert self.done[col] == before 

199 self.done[col] = after 

200 

201 def check_parse_complete(self): 

202 """Each text column should have been completely seen.""" 

203 last = self.bottom - 1 

204 for col in range(self.right): 

205 if self.done[col] != last: 

206 return False 

207 return True 

208 

209 def scan_cell(self, top, left): 

210 """Starting at the top-left corner, start tracing out a cell.""" 

211 assert self.block[top][left] == '+' 

212 return self.scan_right(top, left) 

213 

214 def scan_right(self, top, left): 

215 """ 

216 Look for the top-right corner of the cell, and make note of all column 

217 boundaries ('+'). 

218 """ 

219 colseps = {} 

220 line = self.block[top] 

221 for i in range(left + 1, self.right + 1): 

222 if line[i] == '+': 

223 colseps[i] = [top] 

224 result = self.scan_down(top, left, i) 

225 if result: 

226 bottom, rowseps, newcolseps = result 

227 update_dict_of_lists(colseps, newcolseps) 

228 return bottom, i, rowseps, colseps 

229 elif line[i] != '-': 

230 return None 

231 return None 

232 

233 def scan_down(self, top, left, right): 

234 """ 

235 Look for the bottom-right corner of the cell, making note of all row 

236 boundaries. 

237 """ 

238 rowseps = {} 

239 for i in range(top + 1, self.bottom + 1): 

240 if self.block[i][right] == '+': 

241 rowseps[i] = [right] 

242 result = self.scan_left(top, left, i, right) 

243 if result: 

244 newrowseps, colseps = result 

245 update_dict_of_lists(rowseps, newrowseps) 

246 return i, rowseps, colseps 

247 elif self.block[i][right] != '|': 

248 return None 

249 return None 

250 

251 def scan_left(self, top, left, bottom, right): 

252 """ 

253 Noting column boundaries, look for the bottom-left corner of the cell. 

254 It must line up with the starting point. 

255 """ 

256 colseps = {} 

257 line = self.block[bottom] 

258 for i in range(right - 1, left, -1): 

259 if line[i] == '+': 

260 colseps[i] = [bottom] 

261 elif line[i] != '-': 

262 return None 

263 if line[left] != '+': 

264 return None 

265 result = self.scan_up(top, left, bottom, right) 

266 if result is not None: 

267 rowseps = result 

268 return rowseps, colseps 

269 return None 

270 

271 def scan_up(self, top, left, bottom, right): 

272 """ 

273 Noting row boundaries, see if we can return to the starting point. 

274 """ 

275 rowseps = {} 

276 for i in range(bottom - 1, top, -1): 

277 if self.block[i][left] == '+': 

278 rowseps[i] = [left] 

279 elif self.block[i][left] != '|': 

280 return None 

281 return rowseps 

282 

283 def structure_from_cells(self): 

284 """ 

285 From the data collected by `scan_cell()`, convert to the final data 

286 structure. 

287 """ 

288 rowseps = sorted(self.rowseps.keys()) # list of row boundaries 

289 rowindex = {} 

290 for i in range(len(rowseps)): 

291 rowindex[rowseps[i]] = i # row boundary -> row number mapping 

292 colseps = sorted(self.colseps.keys()) # list of column boundaries 

293 colindex = {} 

294 for i in range(len(colseps)): 

295 colindex[colseps[i]] = i # column boundary -> col number map 

296 colspecs = [(colseps[i] - colseps[i - 1] - 1) 

297 for i in range(1, len(colseps))] # list of column widths 

298 # prepare an empty table with the correct number of rows & columns 

299 onerow = [None for i in range(len(colseps) - 1)] 

300 rows = [onerow[:] for i in range(len(rowseps) - 1)] 

301 # keep track of # of cells remaining; should reduce to zero 

302 remaining = (len(rowseps) - 1) * (len(colseps) - 1) 

303 for top, left, bottom, right, block in self.cells: 

304 rownum = rowindex[top] 

305 colnum = colindex[left] 

306 assert rows[rownum][colnum] is None, ( 

307 'Cell (row %s, column %s) already used.' 

308 % (rownum + 1, colnum + 1)) 

309 morerows = rowindex[bottom] - rownum - 1 

310 morecols = colindex[right] - colnum - 1 

311 remaining -= (morerows + 1) * (morecols + 1) 

312 # write the cell into the table 

313 rows[rownum][colnum] = (morerows, morecols, top + 1, block) 

314 assert remaining == 0, 'Unused cells remaining.' 

315 if self.head_body_sep: # separate head rows from body rows 

316 numheadrows = rowindex[self.head_body_sep] 

317 headrows = rows[:numheadrows] 

318 bodyrows = rows[numheadrows:] 

319 else: 

320 headrows = [] 

321 bodyrows = rows 

322 return colspecs, headrows, bodyrows 

323 

324 

325class SimpleTableParser(TableParser): 

326 

327 """ 

328 Parse a simple table using `parse()`. 

329 

330 Here's an example of a simple table:: 

331 

332 ===== ===== 

333 col 1 col 2 

334 ===== ===== 

335 1 Second column of row 1. 

336 2 Second column of row 2. 

337 Second line of paragraph. 

338 3 - Second column of row 3. 

339 

340 - Second item in bullet 

341 list (row 3, column 2). 

342 4 is a span 

343 ------------ 

344 5 

345 ===== ===== 

346 

347 Top and bottom borders use '=', column span underlines use '-', column 

348 separation is indicated with spaces. 

349 

350 Passing the above table to the `parse()` method will result in the 

351 following data structure, whose interpretation is the same as for 

352 `GridTableParser`:: 

353 

354 ([5, 25], 

355 [[(0, 0, 1, ['col 1']), 

356 (0, 0, 1, ['col 2'])]], 

357 [[(0, 0, 3, ['1']), 

358 (0, 0, 3, ['Second column of row 1.'])], 

359 [(0, 0, 4, ['2']), 

360 (0, 0, 4, ['Second column of row 2.', 

361 'Second line of paragraph.'])], 

362 [(0, 0, 6, ['3']), 

363 (0, 0, 6, ['- Second column of row 3.', 

364 '', 

365 '- Second item in bullet', 

366 ' list (row 3, column 2).'])], 

367 [(0, 1, 10, ['4 is a span'])], 

368 [(0, 0, 12, ['5']), 

369 (0, 0, 12, [''])]]) 

370 """ 

371 

372 head_body_separator_pat = re.compile('=[ =]*$') 

373 span_pat = re.compile('-[ -]*$') 

374 

375 def setup(self, block): 

376 self.block = block[:] # make a copy; it will be modified 

377 self.block.disconnect() # don't propagate changes to parent 

378 # Convert top & bottom borders to column span underlines: 

379 self.block[0] = self.block[0].replace('=', '-') 

380 self.block[-1] = self.block[-1].replace('=', '-') 

381 self.head_body_sep = None 

382 self.columns = [] 

383 self.border_end = None 

384 self.table = [] 

385 self.done = [-1] * len(block[0]) 

386 self.rowseps = {0: [0]} 

387 self.colseps = {0: [0]} 

388 

389 def parse_table(self): 

390 """ 

391 First determine the column boundaries from the top border, then 

392 process rows. Each row may consist of multiple lines; accumulate 

393 lines until a row is complete. Call `self.parse_row` to finish the 

394 job. 

395 """ 

396 # Top border must fully describe all table columns. 

397 self.columns = self.parse_columns(self.block[0], 0) 

398 self.border_end = self.columns[-1][1] 

399 firststart, firstend = self.columns[0] 

400 offset = 1 # skip top border 

401 start = 1 

402 text_found = None 

403 while offset < len(self.block): 

404 line = self.block[offset] 

405 if self.span_pat.match(line): 

406 # Column span underline or border; row is complete. 

407 self.parse_row(self.block[start:offset], start, 

408 (line.rstrip(), offset)) 

409 start = offset + 1 

410 text_found = None 

411 elif line[firststart:firstend].strip(): 

412 # First column not blank, therefore it's a new row. 

413 if text_found and offset != start: 

414 self.parse_row(self.block[start:offset], start) 

415 start = offset 

416 text_found = 1 

417 elif not text_found: 

418 start = offset + 1 

419 offset += 1 

420 

421 def parse_columns(self, line, offset): 

422 """ 

423 Given a column span underline, return a list of (begin, end) pairs. 

424 """ 

425 cols = [] 

426 end = 0 

427 while True: 

428 begin = line.find('-', end) 

429 end = line.find(' ', begin) 

430 if begin < 0: 

431 break 

432 if end < 0: 

433 end = len(line) 

434 cols.append((begin, end)) 

435 if self.columns: 

436 if cols[-1][1] != self.border_end: 

437 raise TableMarkupError('Column span incomplete in table ' 

438 'line %s.' % (offset+1), 

439 offset=offset) 

440 # Allow for an unbounded rightmost column: 

441 cols[-1] = (cols[-1][0], self.columns[-1][1]) 

442 return cols 

443 

444 def init_row(self, colspec, offset): 

445 i = 0 

446 cells = [] 

447 for start, end in colspec: 

448 morecols = 0 

449 try: 

450 assert start == self.columns[i][0] 

451 while end != self.columns[i][1]: 

452 i += 1 

453 morecols += 1 

454 except (AssertionError, IndexError): 

455 raise TableMarkupError('Column span alignment problem ' 

456 'in table line %s.' % (offset+2), 

457 offset=offset+1) 

458 cells.append([0, morecols, offset, []]) 

459 i += 1 

460 return cells 

461 

462 def parse_row(self, lines, start, spanline=None): 

463 """ 

464 Given the text `lines` of a row, parse it and append to `self.table`. 

465 

466 The row is parsed according to the current column spec (either 

467 `spanline` if provided or `self.columns`). For each column, extract 

468 text from each line, and check for text in column margins. Finally, 

469 adjust for insignificant whitespace. 

470 """ 

471 if not (lines or spanline): 

472 # No new row, just blank lines. 

473 return 

474 if spanline: 

475 columns = self.parse_columns(*spanline) 

476 else: 

477 columns = self.columns[:] 

478 self.check_columns(lines, start, columns) 

479 row = self.init_row(columns, start) 

480 for i in range(len(columns)): 

481 start, end = columns[i] 

482 cellblock = lines.get_2D_block(0, start, len(lines), end) 

483 cellblock.disconnect() # lines in cell can't sync with parent 

484 cellblock.replace(self.double_width_pad_char, '') 

485 row[i][3] = cellblock 

486 self.table.append(row) 

487 

488 def check_columns(self, lines, first_line, columns): 

489 """ 

490 Check for text in column margins and text overflow in the last column. 

491 Raise TableMarkupError if anything but whitespace is in column margins. 

492 Adjust the end value for the last column if there is text overflow. 

493 """ 

494 # "Infinite" value for a dummy last column's beginning, used to 

495 # check for text overflow: 

496 columns.append((sys.maxsize, None)) 

497 lastcol = len(columns) - 2 

498 # combining characters do not contribute to the column width 

499 lines = [strip_combining_chars(line) for line in lines] 

500 

501 for i in range(len(columns) - 1): 

502 start, end = columns[i] 

503 nextstart = columns[i+1][0] 

504 offset = 0 

505 for line in lines: 

506 if i == lastcol and line[end:].strip(): 

507 text = line[start:].rstrip() 

508 new_end = start + len(text) 

509 main_start, main_end = self.columns[-1] 

510 columns[i] = (start, max(main_end, new_end)) 

511 if new_end > main_end: 

512 self.columns[-1] = (main_start, new_end) 

513 elif line[end:nextstart].strip(): 

514 raise TableMarkupError('Text in column margin in table ' 

515 'line %s.' % (first_line+offset+1), 

516 offset=first_line+offset) 

517 offset += 1 

518 columns.pop() 

519 

520 def structure_from_cells(self): 

521 colspecs = [end - start for start, end in self.columns] 

522 first_body_row = 0 

523 if self.head_body_sep: 

524 for i in range(len(self.table)): 

525 if self.table[i][0][2] > self.head_body_sep: 

526 first_body_row = i 

527 break 

528 return (colspecs, self.table[:first_body_row], 

529 self.table[first_body_row:]) 

530 

531 

532def update_dict_of_lists(master, newdata): 

533 """ 

534 Extend the list values of `master` with those from `newdata`. 

535 

536 Both parameters must be dictionaries containing list values. 

537 """ 

538 for key, values in newdata.items(): 

539 master.setdefault(key, []).extend(values)