Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/docutils/parsers/rst/tableparser.py: 51%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

273 statements  

1# $Id$ 

2# Author: David Goodger <goodger@python.org> 

3# Copyright: This module has been placed in the public domain. 

4 

5""" 

6This module defines table parser classes,which parse plaintext-graphic tables 

7and produce a well-formed data structure suitable for building a CALS table. 

8 

9:Classes: 

10 - `GridTableParser`: Parse fully-formed tables represented with a grid. 

11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom 

12 borders. 

13 

14:Exception class: `TableMarkupError` 

15 

16:Function: 

17 `update_dict_of_lists()`: Merge two dictionaries containing list values. 

18""" 

19 

20from __future__ import annotations 

21 

22__docformat__ = 'reStructuredText' 

23 

24import re 

25import sys 

26from docutils import DataError 

27from docutils.utils import strip_combining_chars 

28 

29 

30class TableMarkupError(DataError): 

31 

32 """ 

33 Raise if there is any problem with table markup. 

34 

35 The keyword argument `offset` denotes the offset of the problem 

36 from the table's start line. 

37 """ 

38 

39 def __init__(self, *args, **kwargs) -> None: 

40 self.offset = kwargs.pop('offset', 0) 

41 DataError.__init__(self, *args) 

42 

43 

44class TableParser: 

45 

46 """ 

47 Abstract superclass for the common parts of the syntax-specific parsers. 

48 """ 

49 

50 head_body_separator_pat = None 

51 """Matches the row separator between head rows and body rows.""" 

52 

53 double_width_pad_char = '\x00' 

54 """Padding character for East Asian double-width text.""" 

55 

56 def parse(self, block): 

57 """ 

58 Analyze the text `block` and return a table data structure. 

59 

60 Given a plaintext-graphic table in `block` (list of lines of text; no 

61 whitespace padding), parse the table, construct and return the data 

62 necessary to construct a CALS table or equivalent. 

63 

64 Raise `TableMarkupError` if there is any problem with the markup. 

65 """ 

66 self.setup(block) 

67 self.find_head_body_sep() 

68 self.parse_table() 

69 return self.structure_from_cells() 

70 

71 def find_head_body_sep(self): 

72 """Look for a head/body row separator line; store the line index.""" 

73 for i in range(len(self.block)): 

74 line = self.block[i] 

75 if self.head_body_separator_pat.match(line): 

76 if self.head_body_sep: 

77 raise TableMarkupError( 

78 'Multiple head/body row separators ' 

79 '(table lines %s and %s); only one allowed.' 

80 % (self.head_body_sep+1, i+1), offset=i) 

81 else: 

82 self.head_body_sep = i 

83 self.block[i] = line.replace('=', '-') 

84 if self.head_body_sep == 0 or self.head_body_sep == (len(self.block) 

85 - 1): 

86 raise TableMarkupError('The head/body row separator may not be ' 

87 'the first or last line of the table.', 

88 offset=i) 

89 

90 

91class GridTableParser(TableParser): 

92 

93 """ 

94 Parse a grid table using `parse()`. 

95 

96 Here's an example of a grid table:: 

97 

98 +------------------------+------------+----------+----------+ 

99 | Header row, column 1 | Header 2 | Header 3 | Header 4 | 

100 +========================+============+==========+==========+ 

101 | body row 1, column 1 | column 2 | column 3 | column 4 | 

102 +------------------------+------------+----------+----------+ 

103 | body row 2 | Cells may span columns. | 

104 +------------------------+------------+---------------------+ 

105 | body row 3 | Cells may | - Table cells | 

106 +------------------------+ span rows. | - contain | 

107 | body row 4 | | - body elements. | 

108 +------------------------+------------+---------------------+ 

109 

110 Intersections use '+', row separators use '-' (except for one optional 

111 head/body row separator, which uses '='), and column separators use '|'. 

112 

113 Passing the above table to the `parse()` method will result in the 

114 following data structure:: 

115 

116 ([24, 12, 10, 10], 

117 [[(0, 0, 1, ['Header row, column 1']), 

118 (0, 0, 1, ['Header 2']), 

119 (0, 0, 1, ['Header 3']), 

120 (0, 0, 1, ['Header 4'])]], 

121 [[(0, 0, 3, ['body row 1, column 1']), 

122 (0, 0, 3, ['column 2']), 

123 (0, 0, 3, ['column 3']), 

124 (0, 0, 3, ['column 4'])], 

125 [(0, 0, 5, ['body row 2']), 

126 (0, 2, 5, ['Cells may span columns.']), 

127 None, 

128 None], 

129 [(0, 0, 7, ['body row 3']), 

130 (1, 0, 7, ['Cells may', 'span rows.', '']), 

131 (1, 1, 7, ['- Table cells', '- contain', '- body elements.']), 

132 None], 

133 [(0, 0, 9, ['body row 4']), None, None, None]]) 

134 

135 The first item is a list containing column widths (colspecs). The second 

136 item is a list of head rows, and the third is a list of body rows. Each 

137 row contains a list of cells. Each cell is either None (for a cell unused 

138 because of another cell's span), or a tuple. A cell tuple contains four 

139 items: the number of extra rows used by the cell in a vertical span 

140 (morerows); the number of extra columns used by the cell in a horizontal 

141 span (morecols); the line offset of the first line of the cell contents; 

142 and the cell contents, a list of lines of text. 

143 """ 

144 

145 head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$') 

146 

147 def setup(self, block) -> None: 

148 self.block = block[:] # make a copy; it may be modified 

149 self.block.disconnect() # don't propagate changes to parent 

150 self.bottom = len(block) - 1 

151 self.right = len(block[0]) - 1 

152 self.head_body_sep = None 

153 self.done = [-1] * len(block[0]) 

154 self.cells = [] 

155 self.rowseps = {0: [0]} 

156 self.colseps = {0: [0]} 

157 

158 def parse_table(self): 

159 """ 

160 Start with a queue of upper-left corners, containing the upper-left 

161 corner of the table itself. Trace out one rectangular cell, remember 

162 it, and add its upper-right and lower-left corners to the queue of 

163 potential upper-left corners of further cells. Process the queue in 

164 top-to-bottom order, keeping track of how much of each text column has 

165 been seen. 

166 

167 We'll end up knowing all the row and column boundaries, cell positions 

168 and their dimensions. 

169 """ 

170 corners = [(0, 0)] 

171 while corners: 

172 top, left = corners.pop(0) 

173 if (top == self.bottom 

174 or left == self.right 

175 or top <= self.done[left]): 

176 continue 

177 result = self.scan_cell(top, left) 

178 if not result: 

179 continue 

180 bottom, right, rowseps, colseps = result 

181 update_dict_of_lists(self.rowseps, rowseps) 

182 update_dict_of_lists(self.colseps, colseps) 

183 self.mark_done(top, left, bottom, right) 

184 cellblock = self.block.get_2D_block(top + 1, left + 1, 

185 bottom, right) 

186 cellblock.disconnect() # lines in cell can't sync with parent 

187 cellblock.replace(self.double_width_pad_char, '') 

188 self.cells.append((top, left, bottom, right, cellblock)) 

189 corners.extend([(top, right), (bottom, left)]) 

190 corners.sort() 

191 if not self.check_parse_complete(): 

192 raise TableMarkupError('Malformed table; parse incomplete.') 

193 

194 def mark_done(self, top, left, bottom, right) -> None: 

195 """For keeping track of how much of each text column has been seen.""" 

196 before = top - 1 

197 after = bottom - 1 

198 for col in range(left, right): 

199 assert self.done[col] == before 

200 self.done[col] = after 

201 

202 def check_parse_complete(self) -> bool: 

203 """Each text column should have been completely seen.""" 

204 last = self.bottom - 1 

205 for col in range(self.right): 

206 if self.done[col] != last: 

207 return False 

208 return True 

209 

210 def scan_cell(self, top, left): 

211 """Starting at the top-left corner, start tracing out a cell.""" 

212 assert self.block[top][left] == '+' 

213 return self.scan_right(top, left) 

214 

215 def scan_right(self, top, left): 

216 """ 

217 Look for the top-right corner of the cell, and make note of all column 

218 boundaries ('+'). 

219 """ 

220 colseps = {} 

221 line = self.block[top] 

222 for i in range(left + 1, self.right + 1): 

223 if line[i] == '+': 

224 colseps[i] = [top] 

225 result = self.scan_down(top, left, i) 

226 if result: 

227 bottom, rowseps, newcolseps = result 

228 update_dict_of_lists(colseps, newcolseps) 

229 return bottom, i, rowseps, colseps 

230 elif line[i] != '-': 

231 return None 

232 return None 

233 

234 def scan_down(self, top, left, right): 

235 """ 

236 Look for the bottom-right corner of the cell, making note of all row 

237 boundaries. 

238 """ 

239 rowseps = {} 

240 for i in range(top + 1, self.bottom + 1): 

241 if self.block[i][right] == '+': 

242 rowseps[i] = [right] 

243 result = self.scan_left(top, left, i, right) 

244 if result: 

245 newrowseps, colseps = result 

246 update_dict_of_lists(rowseps, newrowseps) 

247 return i, rowseps, colseps 

248 elif self.block[i][right] != '|': 

249 return None 

250 return None 

251 

252 def scan_left(self, top, left, bottom, right): 

253 """ 

254 Noting column boundaries, look for the bottom-left corner of the cell. 

255 It must line up with the starting point. 

256 """ 

257 colseps = {} 

258 line = self.block[bottom] 

259 for i in range(right - 1, left, -1): 

260 if line[i] == '+': 

261 colseps[i] = [bottom] 

262 elif line[i] != '-': 

263 return None 

264 if line[left] != '+': 

265 return None 

266 result = self.scan_up(top, left, bottom, right) 

267 if result is not None: 

268 rowseps = result 

269 return rowseps, colseps 

270 return None 

271 

272 def scan_up(self, top, left, bottom, right): 

273 """ 

274 Noting row boundaries, see if we can return to the starting point. 

275 """ 

276 rowseps = {} 

277 for i in range(bottom - 1, top, -1): 

278 if self.block[i][left] == '+': 

279 rowseps[i] = [left] 

280 elif self.block[i][left] != '|': 

281 return None 

282 return rowseps 

283 

284 def structure_from_cells(self): 

285 """ 

286 From the data collected by `scan_cell()`, convert to the final data 

287 structure. 

288 """ 

289 rowseps = sorted(self.rowseps.keys()) # list of row boundaries 

290 rowindex = {} 

291 for i in range(len(rowseps)): 

292 rowindex[rowseps[i]] = i # row boundary -> row number mapping 

293 colseps = sorted(self.colseps.keys()) # list of column boundaries 

294 colindex = {} 

295 for i in range(len(colseps)): 

296 colindex[colseps[i]] = i # column boundary -> col number map 

297 colspecs = [(colseps[i] - colseps[i - 1] - 1) 

298 for i in range(1, len(colseps))] # list of column widths 

299 # prepare an empty table with the correct number of rows & columns 

300 onerow = [None for i in range(len(colseps) - 1)] 

301 rows = [onerow[:] for i in range(len(rowseps) - 1)] 

302 # keep track of # of cells remaining; should reduce to zero 

303 remaining = (len(rowseps) - 1) * (len(colseps) - 1) 

304 for top, left, bottom, right, block in self.cells: 

305 rownum = rowindex[top] 

306 colnum = colindex[left] 

307 assert rows[rownum][colnum] is None, ( 

308 'Cell (row %s, column %s) already used.' 

309 % (rownum + 1, colnum + 1)) 

310 morerows = rowindex[bottom] - rownum - 1 

311 morecols = colindex[right] - colnum - 1 

312 remaining -= (morerows + 1) * (morecols + 1) 

313 # write the cell into the table 

314 rows[rownum][colnum] = (morerows, morecols, top + 1, block) 

315 assert remaining == 0, 'Unused cells remaining.' 

316 if self.head_body_sep: # separate head rows from body rows 

317 numheadrows = rowindex[self.head_body_sep] 

318 headrows = rows[:numheadrows] 

319 bodyrows = rows[numheadrows:] 

320 else: 

321 headrows = [] 

322 bodyrows = rows 

323 return colspecs, headrows, bodyrows 

324 

325 

326class SimpleTableParser(TableParser): 

327 

328 """ 

329 Parse a simple table using `parse()`. 

330 

331 Here's an example of a simple table:: 

332 

333 ===== ===== 

334 col 1 col 2 

335 ===== ===== 

336 1 Second column of row 1. 

337 2 Second column of row 2. 

338 Second line of paragraph. 

339 3 - Second column of row 3. 

340 

341 - Second item in bullet 

342 list (row 3, column 2). 

343 4 is a span 

344 ------------ 

345 5 

346 ===== ===== 

347 

348 Top and bottom borders use '=', column span underlines use '-', column 

349 separation is indicated with spaces. 

350 

351 Passing the above table to the `parse()` method will result in the 

352 following data structure, whose interpretation is the same as for 

353 `GridTableParser`:: 

354 

355 ([5, 25], 

356 [[(0, 0, 1, ['col 1']), 

357 (0, 0, 1, ['col 2'])]], 

358 [[(0, 0, 3, ['1']), 

359 (0, 0, 3, ['Second column of row 1.'])], 

360 [(0, 0, 4, ['2']), 

361 (0, 0, 4, ['Second column of row 2.', 

362 'Second line of paragraph.'])], 

363 [(0, 0, 6, ['3']), 

364 (0, 0, 6, ['- Second column of row 3.', 

365 '', 

366 '- Second item in bullet', 

367 ' list (row 3, column 2).'])], 

368 [(0, 1, 10, ['4 is a span'])], 

369 [(0, 0, 12, ['5']), 

370 (0, 0, 12, [''])]]) 

371 """ 

372 

373 head_body_separator_pat = re.compile('=[ =]*$') 

374 span_pat = re.compile('-[ -]*$') 

375 

376 def setup(self, block) -> None: 

377 self.block = block[:] # make a copy; it will be modified 

378 self.block.disconnect() # don't propagate changes to parent 

379 # Convert top & bottom borders to column span underlines: 

380 self.block[0] = self.block[0].replace('=', '-') 

381 self.block[-1] = self.block[-1].replace('=', '-') 

382 self.head_body_sep = None 

383 self.columns = [] 

384 self.border_end = None 

385 self.table = [] 

386 self.done = [-1] * len(block[0]) 

387 self.rowseps = {0: [0]} 

388 self.colseps = {0: [0]} 

389 

390 def parse_table(self) -> None: 

391 """ 

392 First determine the column boundaries from the top border, then 

393 process rows. Each row may consist of multiple lines; accumulate 

394 lines until a row is complete. Call `self.parse_row` to finish the 

395 job. 

396 """ 

397 # Top border must fully describe all table columns. 

398 self.columns = self.parse_columns(self.block[0], 0) 

399 self.border_end = self.columns[-1][1] 

400 firststart, firstend = self.columns[0] 

401 offset = 1 # skip top border 

402 start = 1 

403 text_found = None 

404 while offset < len(self.block): 

405 line = self.block[offset] 

406 if self.span_pat.match(line): 

407 # Column span underline or border; row is complete. 

408 self.parse_row(self.block[start:offset], start, 

409 (line.rstrip(), offset)) 

410 start = offset + 1 

411 text_found = None 

412 elif line[firststart:firstend].strip(): 

413 # First column not blank, therefore it's a new row. 

414 if text_found and offset != start: 

415 self.parse_row(self.block[start:offset], start) 

416 start = offset 

417 text_found = 1 

418 elif not text_found: 

419 start = offset + 1 

420 offset += 1 

421 

422 def parse_columns(self, line, offset): 

423 """ 

424 Given a column span underline, return a list of (begin, end) pairs. 

425 """ 

426 cols = [] 

427 end = 0 

428 while True: 

429 begin = line.find('-', end) 

430 end = line.find(' ', begin) 

431 if begin < 0: 

432 break 

433 if end < 0: 

434 end = len(line) 

435 cols.append((begin, end)) 

436 if self.columns: 

437 if cols[-1][1] != self.border_end: 

438 raise TableMarkupError('Column span incomplete in table ' 

439 'line %s.' % (offset+1), 

440 offset=offset) 

441 # Allow for an unbounded rightmost column: 

442 cols[-1] = (cols[-1][0], self.columns[-1][1]) 

443 return cols 

444 

445 def init_row(self, colspec, offset): 

446 i = 0 

447 cells = [] 

448 for start, end in colspec: 

449 morecols = 0 

450 try: 

451 assert start == self.columns[i][0] 

452 while end != self.columns[i][1]: 

453 i += 1 

454 morecols += 1 

455 except (AssertionError, IndexError): 

456 raise TableMarkupError('Column span alignment problem ' 

457 'in table line %s.' % (offset+2), 

458 offset=offset+1) 

459 cells.append([0, morecols, offset, []]) 

460 i += 1 

461 return cells 

462 

463 def parse_row(self, lines, start, spanline=None) -> None: 

464 """ 

465 Given the text `lines` of a row, parse it and append to `self.table`. 

466 

467 The row is parsed according to the current column spec (either 

468 `spanline` if provided or `self.columns`). For each column, extract 

469 text from each line, and check for text in column margins. Finally, 

470 adjust for insignificant whitespace. 

471 """ 

472 if not (lines or spanline): 

473 # No new row, just blank lines. 

474 return 

475 if spanline: 

476 columns = self.parse_columns(*spanline) 

477 else: 

478 columns = self.columns[:] 

479 self.check_columns(lines, start, columns) 

480 row = self.init_row(columns, start) 

481 for i in range(len(columns)): 

482 start, end = columns[i] 

483 cellblock = lines.get_2D_block(0, start, len(lines), end) 

484 cellblock.disconnect() # lines in cell can't sync with parent 

485 cellblock.replace(self.double_width_pad_char, '') 

486 row[i][3] = cellblock 

487 self.table.append(row) 

488 

489 def check_columns(self, lines, first_line, columns): 

490 """ 

491 Check for text in column margins and text overflow in the last column. 

492 Raise TableMarkupError if anything but whitespace is in column margins. 

493 Adjust the end value for the last column if there is text overflow. 

494 """ 

495 # "Infinite" value for a dummy last column's beginning, used to 

496 # check for text overflow: 

497 columns.append((sys.maxsize, None)) 

498 lastcol = len(columns) - 2 

499 # combining characters do not contribute to the column width 

500 lines = [strip_combining_chars(line) for line in lines] 

501 

502 for i in range(len(columns) - 1): 

503 start, end = columns[i] 

504 nextstart = columns[i+1][0] 

505 offset = 0 

506 for line in lines: 

507 if i == lastcol and line[end:].strip(): 

508 text = line[start:].rstrip() 

509 new_end = start + len(text) 

510 main_start, main_end = self.columns[-1] 

511 columns[i] = (start, max(main_end, new_end)) 

512 if new_end > main_end: 

513 self.columns[-1] = (main_start, new_end) 

514 elif line[end:nextstart].strip(): 

515 raise TableMarkupError('Text in column margin in table ' 

516 'line %s.' % (first_line+offset+1), 

517 offset=first_line+offset) 

518 offset += 1 

519 columns.pop() 

520 

521 def structure_from_cells(self): 

522 colspecs = [end - start for start, end in self.columns] 

523 first_body_row = 0 

524 if self.head_body_sep: 

525 for i in range(len(self.table)): 

526 if self.table[i][0][2] > self.head_body_sep: 

527 first_body_row = i 

528 break 

529 return (colspecs, self.table[:first_body_row], 

530 self.table[first_body_row:]) 

531 

532 

533def update_dict_of_lists(master, newdata) -> None: 

534 """ 

535 Extend the list values of `master` with those from `newdata`. 

536 

537 Both parameters must be dictionaries containing list values. 

538 """ 

539 for key, values in newdata.items(): 

540 master.setdefault(key, []).extend(values)