Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/docutils/parsers/rst/tableparser.py: 50%

1# $Id$

2# Author: David Goodger <goodger@python.org>

3# Copyright: This module has been placed in the public domain.

5"""

6This module defines table parser classes, which parse plaintext-graphic tables

7and produce a well-formed data structure suitable for building a CALS table.

9:Classes:

10 - `GridTableParser`: Parse fully-formed tables represented with a grid.

11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom

12 borders.

14:Exception class: `TableMarkupError`

16:Function:

17 `update_dict_of_lists()`: Merge two dictionaries containing list values.

18"""

20from __future__ import annotations

22__docformat__ = 'reStructuredText'

24import re

25import sys

26from docutils import DataError

27from docutils.utils import strip_combining_chars

class TableMarkupError(DataError):

    """
    Signal a defect in the markup of a table.

    An optional keyword argument `offset` records how far below the table's
    start line the defect lies; it defaults to 0.
    """

    def __init__(self, *args, **kwargs) -> None:
        # Only `offset` is consumed here.  The positional arguments go to
        # `DataError`; any other keyword arguments are dropped.
        self.offset = kwargs.get('offset', 0)
        super().__init__(*args)

class TableParser:

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    `parse()` calls `setup()`, `parse_table()` and `structure_from_cells()`,
    none of which is defined here; subclasses supply them together with a
    compiled `head_body_separator_pat`.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        The index of the matching line is stored in `self.head_body_sep` and
        the line itself is rewritten in `self.block` with every '=' replaced
        by '-', so that it looks like an ordinary row separator afterwards.

        Raise `TableMarkupError` if more than one separator is found, or if
        the separator is the first or the last line of the table.
        """
        for i, line in enumerate(self.block):
            if not self.head_body_separator_pat.match(line):
                continue
            # Compare with None, not truthiness: a separator stored at
            # index 0 must still count as "already found".
            if self.head_body_sep is not None:
                raise TableMarkupError(
                    'Multiple head/body row separators '
                    '(table lines %s and %s); only one allowed.'
                    % (self.head_body_sep+1, i+1), offset=i)
            self.head_body_sep = i
            self.block[i] = line.replace('=', '-')
        if self.head_body_sep in (0, len(self.block) - 1):
            # Report the separator's own line as the problem location.
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.',
                                   offset=self.head_body_sep)

class GridTableParser(TableParser):

    """
    Parse a grid table using `parse()`.

    Here's an example of a grid table::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    Intersections use '+', row separators use '-' (except for one optional
    head/body row separator, which uses '='), and column separators use '|'.

    Passing the above table to the `parse()` method will result in the
    following data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    The first item is a list containing column widths (colspecs). The second
    item is a list of head rows, and the third is a list of body rows. Each
    row contains a list of cells. Each cell is either None (for a cell unused
    because of another cell's span), or a tuple. A cell tuple contains four
    items: the number of extra rows used by the cell in a vertical span
    (morerows); the number of extra columns used by the cell in a horizontal
    span (morecols); the line offset of the first line of the cell contents;
    and the cell contents, a list of lines of text.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block) -> None:
        """Reset the parser state for a new table given as `block`."""
        self.block = block[:]           # make a copy; it may be modified
        self.block.disconnect()         # don't propagate changes to parent
        self.bottom = len(block) - 1
        self.right = len(block[0]) - 1
        self.head_body_sep = None
        # per text column: index of the last line already covered by a cell
        self.done = [-1] * len(block[0])
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Start with a queue of upper-left corners, containing the upper-left
        corner of the table itself. Trace out one rectangular cell, remember
        it, and add its upper-right and lower-left corners to the queue of
        potential upper-left corners of further cells. Process the queue in
        top-to-bottom order, keeping track of how much of each text column has
        been seen.

        We'll end up knowing all the row and column boundaries, cell positions
        and their dimensions.

        Raise `TableMarkupError` if the table could not be traced completely.
        """
        # a copy of the block without combining characters:
        self.stripped_block = [strip_combining_chars(line)
                               for line in self.block]
        corners = [(0, 0)]
        while corners:
            top, left = corners.pop(0)
            if (top == self.bottom
                or left == self.right
                or top <= self.done[left]):
                # on the table's bottom/right edge, or already covered
                continue
            result = self.scan_cell(top, left)
            if not result:
                continue
            bottom, right, rowseps, colseps = result
            update_dict_of_lists(self.rowseps, rowseps)
            update_dict_of_lists(self.colseps, colseps)
            self.mark_done(top, left, bottom, right)
            cellblock = self.block.get_2D_block(top + 1, left + 1,
                                                bottom, right)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            self.cells.append((top, left, bottom, right, cellblock))
            corners.extend([(top, right), (bottom, left)])
            corners.sort()              # keep top-to-bottom processing order
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right) -> None:
        """
        For keeping track of how much of each text column has been seen.

        Raise `TableMarkupError` if a text column between `left` and `right`
        was not seen exactly up to the line above `top`.
        """
        before = top - 1
        after = bottom - 1
        for col in range(left, right):
            # An explicit check instead of `assert`: it must also run
            # under "python -O" and report a markup error.
            if self.done[col] != before:
                raise TableMarkupError(
                    'Malformed table; cell boundaries are inconsistent.',
                    offset=top)
            self.done[col] = after

    def check_parse_complete(self) -> bool:
        """Each text column should have been completely seen."""
        last = self.bottom - 1
        return all(self.done[col] == last for col in range(self.right))

    def scan_cell(self, top, left):
        """
        Starting at the top-left corner, start tracing out a cell.

        Return the result of `scan_right()`: a tuple
        ``(bottom, right, rowseps, colseps)`` or None if no cell starts here.
        Raise `TableMarkupError` if there is no '+' at the given position.
        """
        if self.stripped_block[top][left] != '+':
            raise TableMarkupError(
                'Malformed table; no cell corner ("+") found where one '
                'was expected.', offset=top)
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Look for the top-right corner of the cell, and make note of all column
        boundaries ('+').
        """
        colseps = {}
        line = self.stripped_block[top]
        for i in range(left + 1, self.right + 1):
            if line[i] == '+':
                colseps[i] = [top]
                result = self.scan_down(top, left, i)
                if result:
                    bottom, rowseps, newcolseps = result
                    update_dict_of_lists(colseps, newcolseps)
                    return bottom, i, rowseps, colseps
            elif line[i] != '-':
                return None
        return None

    def scan_down(self, top, left, right):
        """
        Look for the bottom-right corner of the cell, making note of all row
        boundaries.
        """
        rowseps = {}
        for i in range(top + 1, self.bottom + 1):
            char = self.stripped_block[i][right]
            if char == '+':
                rowseps[i] = [right]
                result = self.scan_left(top, left, i, right)
                if result:
                    newrowseps, colseps = result
                    update_dict_of_lists(rowseps, newrowseps)
                    return i, rowseps, colseps
            elif char != '|':
                return None
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Noting column boundaries, look for the bottom-left corner of the cell.
        It must line up with the starting point.
        """
        colseps = {}
        line = self.stripped_block[bottom]
        for i in range(right - 1, left, -1):
            if line[i] == '+':
                colseps[i] = [bottom]
            elif line[i] != '-':
                return None
        if line[left] != '+':
            return None
        rowseps = self.scan_up(top, left, bottom, right)
        if rowseps is None:
            return None
        return rowseps, colseps

    def scan_up(self, top, left, bottom, right):
        """
        Noting row boundaries, see if we can return to the starting point.
        """
        rowseps = {}
        for i in range(bottom - 1, top, -1):
            char = self.stripped_block[i][left]
            if char == '+':
                rowseps[i] = [left]
            elif char != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        From the data collected by `scan_cell()`, convert to the final data
        structure.

        Return a tuple ``(colspecs, headrows, bodyrows)``.  Raise
        `TableMarkupError` if two cells claim the same table position or if
        the cells do not fill the table completely.
        """
        rowseps = sorted(self.rowseps)      # list of row boundaries
        # row boundary -> row number mapping
        rowindex = {sep: num for num, sep in enumerate(rowseps)}
        colseps = sorted(self.colseps)      # list of column boundaries
        # column boundary -> column number mapping
        colindex = {sep: num for num, sep in enumerate(colseps)}
        colspecs = [(colseps[i] - colseps[i - 1] - 1)
                    for i in range(1, len(colseps))]    # list of column widths
        # prepare an empty table with the correct number of rows & columns
        numrows = len(rowseps) - 1
        numcols = len(colseps) - 1
        rows = [[None] * numcols for _ in range(numrows)]
        # keep track of # of cells remaining; should reduce to zero
        remaining = numrows * numcols
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            if rows[rownum][colnum] is not None:
                raise TableMarkupError(
                    'Cell (row %s, column %s) already used.'
                    % (rownum + 1, colnum + 1), offset=top)
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # write the cell into the table
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        if remaining != 0:
            raise TableMarkupError('Unused cells remaining.')
        if self.head_body_sep is not None:  # separate head rows from body rows
            numheadrows = rowindex[self.head_body_sep]
            headrows = rows[:numheadrows]
            bodyrows = rows[numheadrows:]
        else:
            headrows = []
            bodyrows = rows
        return colspecs, headrows, bodyrows

327

328

class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile('=[ =]*$')
    span_pat = re.compile('-[ -]*$')

    def setup(self, block) -> None:
        """Reset the parser state for a new table given as `block`."""
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self) -> None:
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        start = 1                       # first line of the pending row
        text_found = False
        for offset in range(1, len(self.block)):    # skip top border
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = False
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = True
            elif not text_found:
                # Nothing but blank first-column lines so far: skip them.
                start = offset + 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        `offset` is the index of `line` in the table; it is only used for
        error reporting.  Once the table's columns are known, raise
        `TableMarkupError` if the underline does not end where the top
        border ends.
        """
        cols = []
        end = 0
        while True:
            begin = line.find('-', end)
            if begin < 0:
                break
            end = line.find(' ', begin)
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete in table '
                                       'line %s.' % (offset+1),
                                       offset=offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells, one for each (start, end) in `colspec`.

        Every cell is a list ``[0, morecols, offset, []]``, where `morecols`
        is the number of additional table columns the cell spans.

        Raise `TableMarkupError` if a span in `colspec` does not start and
        end on boundaries of the table's columns.
        """
        i = 0
        cells = []
        for start, end in colspec:
            morecols = 0
            try:
                # An explicit comparison instead of `assert`: the check
                # must survive "python -O".
                aligned = start == self.columns[i][0]
                while aligned and end != self.columns[i][1]:
                    i += 1
                    morecols += 1
            except IndexError:
                # ran past the last table column
                aligned = False
            if not aligned:
                raise TableMarkupError('Column span alignment problem '
                                       'in table line %s.' % (offset+2),
                                       offset=offset+1)
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None) -> None:
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for cell, (col_start, col_end) in zip(row, columns):
            cellblock = lines.get_2D_block(0, col_start, len(lines), col_end)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            cell[3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is modified in place; `self.columns` is widened as well
        when text overflows the table's last column.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow:
        columns.append((sys.maxsize, None))
        lastcol = len(columns) - 2
        # combining characters do not contribute to the column width
        lines = [strip_combining_chars(line) for line in lines]

        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            for offset, line in enumerate(lines):
                if i == lastcol and line[end:].strip():
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    main_start, main_end = self.columns[-1]
                    columns[i] = (start, max(main_end, new_end))
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin in table '
                                           'line %s.' % (first_line+offset+1),
                                           offset=first_line+offset)
        columns.pop()                   # remove the dummy column again

    def structure_from_cells(self):
        """
        Return the parsed table as ``(colspecs, headrows, bodyrows)``.

        The rows of `self.table` are split at the first row whose line
        offset lies after the head/body separator, if there is one.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            for i, row in enumerate(self.table):
                if row[0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])

534

535

def update_dict_of_lists(master, newdata) -> None:
    """
    Merge the list values of `newdata` into `master`, in place.

    Both arguments are dictionaries whose values are lists.  For a key that
    `master` already has, the new items are appended to the existing list;
    for any other key, `master` receives a new list holding the items.
    """
    for key in newdata:
        additions = newdata[key]
        if key in master:
            master[key].extend(additions)
        else:
            master[key] = list(additions)