Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/docutils/parsers/rst/tableparser.py: 50%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

274 statements  

1# $Id$ 

2# Author: David Goodger <goodger@python.org> 

3# Copyright: This module has been placed in the public domain. 

4 

5""" 

6This module defines table parser classes, which parse plaintext-graphic tables

7and produce a well-formed data structure suitable for building a CALS table. 

8 

9:Classes: 

10 - `GridTableParser`: Parse fully-formed tables represented with a grid. 

11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom 

12 borders. 

13 

14:Exception class: `TableMarkupError` 

15 

16:Function: 

17 `update_dict_of_lists()`: Merge two dictionaries containing list values. 

18""" 

19 

20from __future__ import annotations 

21 

22__docformat__ = 'reStructuredText' 

23 

24import re 

25import sys 

26from docutils import DataError 

27from docutils.utils import strip_combining_chars 

28 

29 

class TableMarkupError(DataError):

    """
    Signal a problem with the markup of a table.

    The optional keyword argument `offset` gives the position of the
    problem as a line offset from the table's start line.  It is stored on
    the instance as `self.offset` and defaults to 0.  Positional arguments
    are handed on to `DataError`; other keyword arguments are ignored.
    """

    def __init__(self, *args, **kwargs) -> None:
        # `offset` is consumed here; the base class only receives `args`.
        self.offset = kwargs.pop('offset', 0)
        super().__init__(*args)

42 

43 

class TableParser:

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    `parse()` drives the work and relies on `setup()`, `parse_table()` and
    `structure_from_cells()`, none of which is defined here.  It also
    relies on `head_body_separator_pat` being set to a compiled pattern.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        The index of the single matching line is stored in
        `self.head_body_sep` (which `setup()` initializes to None), and the
        line itself is rewritten in `self.block` with '=' replaced by '-'.

        Raise `TableMarkupError` if more than one line matches, or if the
        separator is the first or last line of the table.  The error's
        `offset` is the index of the offending line.
        """
        for i, line in enumerate(self.block):
            if not self.head_body_separator_pat.match(line):
                continue
            # Compare against None: index 0 is a valid (if illegal) hit and
            # must not be mistaken for "no separator seen yet".
            if self.head_body_sep is not None:
                raise TableMarkupError(
                    'Multiple head/body row separators '
                    '(table lines %s and %s); only one allowed.'
                    % (self.head_body_sep+1, i+1), offset=i)
            self.head_body_sep = i
            self.block[i] = line.replace('=', '-')
        if self.head_body_sep in (0, len(self.block) - 1):
            # Report the separator's own line, not the last line scanned.
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.',
                                   offset=self.head_body_sep)

89 

90 

class GridTableParser(TableParser):

    """
    Parser for grid tables; call `parse()` to use it.

    An example of a grid table::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    '+' marks intersections, '-' marks row separators (the one optional
    head/body row separator uses '=' instead), and '|' marks column
    separators.

    Feeding the table above to `parse()` yields this data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    Item one is the list of column widths (colspecs), item two the list of
    head rows, item three the list of body rows.  A row is a list of cells.
    A cell is None when its slot is covered by another cell's span;
    otherwise it is a 4-tuple: extra rows covered by a vertical span
    (morerows), extra columns covered by a horizontal span (morecols), the
    line offset of the first line of the cell contents, and the cell
    contents as a list of text lines.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block) -> None:
        """Initialize the parsing state from `block` (a list of lines)."""
        self.block = block[:]           # private copy; it may be modified
        self.block.disconnect()         # keep changes away from the parent
        self.bottom = len(block) - 1    # index of the last line
        self.right = len(block[0]) - 1  # index of the last text column
        self.head_body_sep = None
        # Per text column: last line index already covered by a cell.
        self.done = [-1] * len(block[0])
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Trace out every cell of the table.

        A queue of candidate upper-left corners is seeded with the table's
        own upper-left corner.  For each candidate, one rectangular cell is
        traced and recorded, and the cell's upper-right and lower-left
        corners are queued as further candidates.  The queue is processed
        in top-to-bottom order while `self.done` tracks how far down each
        text column has been covered.

        Afterwards all row and column boundaries, cell positions and cell
        dimensions are known.  Raise `TableMarkupError` if some text column
        was not covered completely.
        """
        # the block with combining characters removed:
        self.stripped_block = [strip_combining_chars(text)
                               for text in self.block]
        pending = [(0, 0)]
        while pending:
            top, left = pending.pop(0)
            on_edge = top == self.bottom or left == self.right
            if on_edge or top <= self.done[left]:
                continue
            traced = self.scan_cell(top, left)
            if not traced:
                continue
            bottom, right, new_rowseps, new_colseps = traced
            update_dict_of_lists(self.rowseps, new_rowseps)
            update_dict_of_lists(self.colseps, new_colseps)
            self.mark_done(top, left, bottom, right)
            content = self.block.get_2D_block(top + 1, left + 1,
                                              bottom, right)
            content.disconnect()    # cell lines must not sync with parent
            content.replace(self.double_width_pad_char, '')
            self.cells.append((top, left, bottom, right, content))
            pending.append((top, right))
            pending.append((bottom, left))
            pending.sort()
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right) -> None:
        """Record in `self.done` that columns left..right-1 reach `bottom`."""
        expected = top - 1
        covered = bottom - 1
        for column in range(left, right):
            assert self.done[column] == expected
            self.done[column] = covered

    def check_parse_complete(self) -> bool:
        """Return True if every text column was covered down to the end."""
        final = self.bottom - 1
        return all(self.done[column] == final
                   for column in range(self.right))

    def scan_cell(self, top, left):
        """
        Trace a cell whose upper-left corner is at (`top`, `left`).

        Return `(bottom, right, rowseps, colseps)` or None if no cell could
        be traced from this corner.
        """
        assert self.stripped_block[top][left] == '+'
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Walk right along line `top` looking for the cell's top-right corner.

        Every '+' passed on the way is noted as a column boundary.  Return
        `(bottom, right, rowseps, colseps)` for the first '+' from which the
        rest of the cell can be traced, else None.
        """
        found_colseps = {}
        border = self.stripped_block[top]
        for column in range(left + 1, self.right + 1):
            char = border[column]
            if char == '-':
                continue
            if char != '+':
                return None
            found_colseps[column] = [top]
            traced = self.scan_down(top, left, column)
            if traced:
                bottom, rowseps, more_colseps = traced
                update_dict_of_lists(found_colseps, more_colseps)
                return bottom, column, rowseps, found_colseps
        return None

    def scan_down(self, top, left, right):
        """
        Walk down text column `right` looking for the bottom-right corner.

        Every '+' passed on the way is noted as a row boundary.  Return
        `(bottom, rowseps, colseps)` for the first '+' from which the rest
        of the cell can be traced, else None.
        """
        found_rowseps = {}
        for row in range(top + 1, self.bottom + 1):
            char = self.stripped_block[row][right]
            if char == '|':
                continue
            if char != '+':
                return None
            found_rowseps[row] = [right]
            traced = self.scan_left(top, left, row, right)
            if traced:
                more_rowseps, colseps = traced
                update_dict_of_lists(found_rowseps, more_rowseps)
                return row, found_rowseps, colseps
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Walk left along line `bottom` looking for the bottom-left corner.

        Column boundaries are noted on the way.  The corner has to be a '+'
        directly below the starting point.  Return `(rowseps, colseps)` or
        None.
        """
        found_colseps = {}
        border = self.stripped_block[bottom]
        for column in range(right - 1, left, -1):
            char = border[column]
            if char == '+':
                found_colseps[column] = [bottom]
            elif char != '-':
                return None
        if border[left] != '+':
            return None
        rowseps = self.scan_up(top, left, bottom, right)
        if rowseps is None:
            return None
        return rowseps, found_colseps

    def scan_up(self, top, left, bottom, right):
        """
        Walk up text column `left` back towards the starting point.

        Return the row boundaries noted on the way (a dict, possibly
        empty), or None if the left edge is broken.
        """
        found_rowseps = {}
        for row in range(bottom - 1, top, -1):
            char = self.stripped_block[row][left]
            if char == '+':
                found_rowseps[row] = [left]
            elif char != '|':
                return None
        return found_rowseps

    def structure_from_cells(self):
        """
        Build the final `(colspecs, headrows, bodyrows)` structure from the
        cells and boundaries collected by `parse_table()`.
        """
        row_bounds = sorted(self.rowseps)   # list of row boundaries
        # row boundary -> row number mapping
        rowindex = {bound: number for number, bound in enumerate(row_bounds)}
        col_bounds = sorted(self.colseps)   # list of column boundaries
        # column boundary -> column number mapping
        colindex = {bound: number for number, bound in enumerate(col_bounds)}
        # list of column widths
        colspecs = [after - before - 1
                    for before, after in zip(col_bounds, col_bounds[1:])]
        # an empty table with the correct number of rows & columns
        numrows = len(row_bounds) - 1
        numcols = len(col_bounds) - 1
        rows = [[None] * numcols for _ in range(numrows)]
        # number of grid slots not yet accounted for; should reach zero
        remaining = numrows * numcols
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                  'Cell (row %s, column %s) already used.'
                  % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # write the cell into the table
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if not self.head_body_sep:
            return colspecs, [], rows
        # separate head rows from body rows
        numheadrows = rowindex[self.head_body_sep]
        return colspecs, rows[:numheadrows], rows[numheadrows:]

327 

328 

class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile('=[ =]*$')
    span_pat = re.compile('-[ -]*$')

    def setup(self, block) -> None:
        """Initialize the parsing state from `block` (a list of lines)."""
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []       # (begin, end) pairs, set by `parse_table()`
        self.border_end = None  # end of the last column in the top border
        self.table = []         # rows appended by `parse_row()`
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self) -> None:
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        start = 1               # first line of the row being accumulated
        text_found = False      # has the current row any text yet?
        for offset in range(1, len(self.block)):    # skip top border
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = False
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = True
            elif not text_found:
                # Skip lines with an empty first column before a row starts.
                start = offset + 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        `offset` is the index of `line` in the table; it is only used for
        error reporting.  Once `self.columns` is set, raise
        `TableMarkupError` if the last span does not end where the top
        border ends.
        """
        cols = []
        end = 0
        while True:
            begin = line.find('-', end)
            if begin < 0:
                break
            end = line.find(' ', begin)
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete in table '
                                       'line %s.' % (offset+1),
                                       offset=offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells for a row laid out by `colspec`.

        `colspec` is a list of (begin, end) pairs; `offset` is the line
        offset of the row.  Each cell is a list `[0, morecols, offset, []]`
        where `morecols` counts the extra table columns the cell spans.

        Raise `TableMarkupError` if a span does not begin and end on
        boundaries of `self.columns`.
        """
        i = 0
        cells = []
        for start, end in colspec:
            morecols = 0
            # Explicit checks rather than `assert`, so the validation also
            # runs when Python is started with -O.
            try:
                aligned = start == self.columns[i][0]
                while aligned and end != self.columns[i][1]:
                    i += 1
                    morecols += 1
            except IndexError:
                # Ran past the last table column without finding `end`.
                aligned = False
            if not aligned:
                raise TableMarkupError('Column span alignment problem '
                                       'in table line %s.' % (offset+2),
                                       offset=offset+1)
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None) -> None:
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.

        `start` is the line offset of the first of `lines`; `spanline`, if
        given, is a `(text, offset)` pair.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for i, (begin, end) in enumerate(columns):
            cellblock = lines.get_2D_block(0, begin, len(lines), end)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            row[i][3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is updated in place; `self.columns[-1]` is widened too
        when the overflow reaches beyond its current end.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow:
        columns.append((sys.maxsize, None))
        lastcol = len(columns) - 2
        # combining characters do not contribute to the column width
        lines = [strip_combining_chars(line) for line in lines]

        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            for offset, line in enumerate(lines):
                if i == lastcol and line[end:].strip():
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    main_start, main_end = self.columns[-1]
                    columns[i] = (start, max(main_end, new_end))
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin in table '
                                           'line %s.' % (first_line+offset+1),
                                           offset=first_line+offset)
        columns.pop()   # remove the dummy column again

    def structure_from_cells(self):
        """
        Return `(colspecs, headrows, bodyrows)` built from `self.table`.

        The body starts at the first row whose line offset lies after
        `self.head_body_sep`; without a separator all rows are body rows.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            for i, row in enumerate(self.table):
                if row[0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])

534 

535 

def update_dict_of_lists(master, newdata) -> None:
    """
    Append the list values found in `newdata` to those in `master`.

    Both arguments have to be dictionaries whose values are lists.  Keys
    missing from `master` are added with a new list; `master` is changed
    in place and nothing is returned.
    """
    for key, values in newdata.items():
        if key not in master:
            master[key] = []
        master[key].extend(values)