Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/docutils/parsers/rst/tableparser.py: 51%

1# $Id$

2# Author: David Goodger <goodger@python.org>

3# Copyright: This module has been placed in the public domain.

5"""

6This module defines table parser classes,which parse plaintext-graphic tables

7and produce a well-formed data structure suitable for building a CALS table.

9:Classes:

10 - `GridTableParser`: Parse fully-formed tables represented with a grid.

11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom

12 borders.

14:Exception class: `TableMarkupError`

16:Function:

17 `update_dict_of_lists()`: Merge two dictionaries containing list values.

18"""

20from __future__ import annotations

22__docformat__ = 'reStructuredText'

24import re

25import sys

26from docutils import DataError

27from docutils.utils import strip_combining_chars

30class TableMarkupError(DataError):

32 """

33 Raise if there is any problem with table markup.

35 The keyword argument `offset` denotes the offset of the problem

36 from the table's start line.

37 """

39 def __init__(self, *args, **kwargs) -> None:

40 self.offset = kwargs.pop('offset', 0)

41 DataError.__init__(self, *args)

44class TableParser:

46 """

47 Abstract superclass for the common parts of the syntax-specific parsers.

48 """

50 head_body_separator_pat = None

51 """Matches the row separator between head rows and body rows."""

53 double_width_pad_char = '\x00'

54 """Padding character for East Asian double-width text."""

56 def parse(self, block):

57 """

58 Analyze the text `block` and return a table data structure.

60 Given a plaintext-graphic table in `block` (list of lines of text; no

61 whitespace padding), parse the table, construct and return the data

62 necessary to construct a CALS table or equivalent.

64 Raise `TableMarkupError` if there is any problem with the markup.

65 """

66 self.setup(block)

67 self.find_head_body_sep()

68 self.parse_table()

69 return self.structure_from_cells()

71 def find_head_body_sep(self):

72 """Look for a head/body row separator line; store the line index."""

73 for i in range(len(self.block)):

74 line = self.block[i]

75 if self.head_body_separator_pat.match(line):

76 if self.head_body_sep:

77 raise TableMarkupError(

78 'Multiple head/body row separators '

79 '(table lines %s and %s); only one allowed.'

80 % (self.head_body_sep+1, i+1), offset=i)

81 else:

82 self.head_body_sep = i

83 self.block[i] = line.replace('=', '-')

84 if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)

85 - 1):

86 raise TableMarkupError('The head/body row separator may not be '

87 'the first or last line of the table.',

88 offset=i)

91class GridTableParser(TableParser):

93 """

94 Parse a grid table using `parse()`.

96 Here's an example of a grid table::

98 +------------------------+------------+----------+----------+

100 +========================+============+==========+==========+

102 +------------------------+------------+----------+----------+

103 | body row 2 | Cells may span columns. |

104 +------------------------+------------+---------------------+

105 | body row 3 | Cells may | - Table cells |

106 +------------------------+ span rows. | - contain |

107 | body row 4 | | - body elements. |

108 +------------------------+------------+---------------------+

109

110 Intersections use '+', row separators use '-' (except for one optional

111 head/body row separator, which uses '='), and column separators use '|'.

112

113 Passing the above table to the `parse()` method will result in the

114 following data structure::

115

116 ([24, 12, 10, 10],

117 [[(0, 0, 1, ['Header row, column 1']),

118 (0, 0, 1, ['Header 2']),

119 (0, 0, 1, ['Header 3']),

120 (0, 0, 1, ['Header 4'])]],

121 [[(0, 0, 3, ['body row 1, column 1']),

122 (0, 0, 3, ['column 2']),

123 (0, 0, 3, ['column 3']),

124 (0, 0, 3, ['column 4'])],

125 [(0, 0, 5, ['body row 2']),

126 (0, 2, 5, ['Cells may span columns.']),

127 None,

128 None],

129 [(0, 0, 7, ['body row 3']),

130 (1, 0, 7, ['Cells may', 'span rows.', '']),

131 (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),

132 None],

133 [(0, 0, 9, ['body row 4']), None, None, None]])

134

135 The first item is a list containing column widths (colspecs). The second

136 item is a list of head rows, and the third is a list of body rows. Each

137 row contains a list of cells. Each cell is either None (for a cell unused

138 because of another cell's span), or a tuple. A cell tuple contains four

139 items: the number of extra rows used by the cell in a vertical span

140 (morerows); the number of extra columns used by the cell in a horizontal

141 span (morecols); the line offset of the first line of the cell contents;

142 and the cell contents, a list of lines of text.

143 """

144

145 head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

146

147 def setup(self, block) -> None:

148 self.block = block[:] # make a copy; it may be modified

149 self.block.disconnect() # don't propagate changes to parent

150 self.bottom = len(block) - 1

151 self.right = len(block[0]) - 1

152 self.head_body_sep = None

153 self.done = [-1] * len(block[0])

154 self.cells = []

155 self.rowseps = {0: [0]}

156 self.colseps = {0: [0]}

157

158 def parse_table(self):

159 """

160 Start with a queue of upper-left corners, containing the upper-left

161 corner of the table itself. Trace out one rectangular cell, remember

162 it, and add its upper-right and lower-left corners to the queue of

163 potential upper-left corners of further cells. Process the queue in

164 top-to-bottom order, keeping track of how much of each text column has

165 been seen.

166

167 We'll end up knowing all the row and column boundaries, cell positions

168 and their dimensions.

169 """

170 corners = [(0, 0)]

171 while corners:

172 top, left = corners.pop(0)

173 if (top == self.bottom

174 or left == self.right

175 or top <= self.done[left]):

176 continue

177 result = self.scan_cell(top, left)

178 if not result:

179 continue

180 bottom, right, rowseps, colseps = result

181 update_dict_of_lists(self.rowseps, rowseps)

182 update_dict_of_lists(self.colseps, colseps)

183 self.mark_done(top, left, bottom, right)

184 cellblock = self.block.get_2D_block(top + 1, left + 1,

185 bottom, right)

186 cellblock.disconnect() # lines in cell can't sync with parent

187 cellblock.replace(self.double_width_pad_char, '')

188 self.cells.append((top, left, bottom, right, cellblock))

189 corners.extend([(top, right), (bottom, left)])

190 corners.sort()

191 if not self.check_parse_complete():

192 raise TableMarkupError('Malformed table; parse incomplete.')

193

194 def mark_done(self, top, left, bottom, right) -> None:

195 """For keeping track of how much of each text column has been seen."""

196 before = top - 1

197 after = bottom - 1

198 for col in range(left, right):

199 assert self.done[col] == before

200 self.done[col] = after

201

202 def check_parse_complete(self) -> bool:

203 """Each text column should have been completely seen."""

204 last = self.bottom - 1

205 for col in range(self.right):

206 if self.done[col] != last:

207 return False

208 return True

209

210 def scan_cell(self, top, left):

211 """Starting at the top-left corner, start tracing out a cell."""

212 assert self.block[top][left] == '+'

213 return self.scan_right(top, left)

214

215 def scan_right(self, top, left):

216 """

217 Look for the top-right corner of the cell, and make note of all column

218 boundaries ('+').

219 """

220 colseps = {}

221 line = self.block[top]

222 for i in range(left + 1, self.right + 1):

223 if line[i] == '+':

224 colseps[i] = [top]

225 result = self.scan_down(top, left, i)

226 if result:

227 bottom, rowseps, newcolseps = result

228 update_dict_of_lists(colseps, newcolseps)

229 return bottom, i, rowseps, colseps

230 elif line[i] != '-':

231 return None

232 return None

233

234 def scan_down(self, top, left, right):

235 """

236 Look for the bottom-right corner of the cell, making note of all row

237 boundaries.

238 """

239 rowseps = {}

240 for i in range(top + 1, self.bottom + 1):

241 if self.block[i][right] == '+':

242 rowseps[i] = [right]

243 result = self.scan_left(top, left, i, right)

244 if result:

245 newrowseps, colseps = result

246 update_dict_of_lists(rowseps, newrowseps)

247 return i, rowseps, colseps

248 elif self.block[i][right] != '|':

249 return None

250 return None

251

252 def scan_left(self, top, left, bottom, right):

253 """

254 Noting column boundaries, look for the bottom-left corner of the cell.

255 It must line up with the starting point.

256 """

257 colseps = {}

258 line = self.block[bottom]

259 for i in range(right - 1, left, -1):

260 if line[i] == '+':

261 colseps[i] = [bottom]

262 elif line[i] != '-':

263 return None

264 if line[left] != '+':

265 return None

266 result = self.scan_up(top, left, bottom, right)

267 if result is not None:

268 rowseps = result

269 return rowseps, colseps

270 return None

271

272 def scan_up(self, top, left, bottom, right):

273 """

274 Noting row boundaries, see if we can return to the starting point.

275 """

276 rowseps = {}

277 for i in range(bottom - 1, top, -1):

278 if self.block[i][left] == '+':

279 rowseps[i] = [left]

280 elif self.block[i][left] != '|':

281 return None

282 return rowseps

283

284 def structure_from_cells(self):

285 """

286 From the data collected by `scan_cell()`, convert to the final data

287 structure.

288 """

289 rowseps = sorted(self.rowseps.keys()) # list of row boundaries

290 rowindex = {}

291 for i in range(len(rowseps)):

292 rowindex[rowseps[i]] = i # row boundary -> row number mapping

293 colseps = sorted(self.colseps.keys()) # list of column boundaries

294 colindex = {}

295 for i in range(len(colseps)):

296 colindex[colseps[i]] = i # column boundary -> col number map

297 colspecs = [(colseps[i] - colseps[i - 1] - 1)

298 for i in range(1, len(colseps))] # list of column widths

299 # prepare an empty table with the correct number of rows & columns

300 onerow = [None for i in range(len(colseps) - 1)]

301 rows = [onerow[:] for i in range(len(rowseps) - 1)]

302 # keep track of # of cells remaining; should reduce to zero

303 remaining = (len(rowseps) - 1) * (len(colseps) - 1)

304 for top, left, bottom, right, block in self.cells:

305 rownum = rowindex[top]

306 colnum = colindex[left]

307 assert rows[rownum][colnum] is None, (

308 'Cell (row %s, column %s) already used.'

309 % (rownum + 1, colnum + 1))

310 morerows = rowindex[bottom] - rownum - 1

311 morecols = colindex[right] - colnum - 1

312 remaining -= (morerows + 1) * (morecols + 1)

313 # write the cell into the table

314 rows[rownum][colnum] = (morerows, morecols, top + 1, block)

315 assert remaining == 0, 'Unused cells remaining.'

316 if self.head_body_sep: # separate head rows from body rows

317 numheadrows = rowindex[self.head_body_sep]

318 headrows = rows[:numheadrows]

319 bodyrows = rows[numheadrows:]

320 else:

321 headrows = []

322 bodyrows = rows

323 return colspecs, headrows, bodyrows

324

325

326class SimpleTableParser(TableParser):

327

328 """

329 Parse a simple table using `parse()`.

330

331 Here's an example of a simple table::

332

333 ===== =====

334 col 1 col 2

335 ===== =====

336 1 Second column of row 1.

337 2 Second column of row 2.

338 Second line of paragraph.

339 3 - Second column of row 3.

340

341 - Second item in bullet

342 list (row 3, column 2).

343 4 is a span

344 ------------

345 5

346 ===== =====

347

348 Top and bottom borders use '=', column span underlines use '-', column

349 separation is indicated with spaces.

350

351 Passing the above table to the `parse()` method will result in the

352 following data structure, whose interpretation is the same as for

353 `GridTableParser`::

354

355 ([5, 25],

356 [[(0, 0, 1, ['col 1']),

357 (0, 0, 1, ['col 2'])]],

358 [[(0, 0, 3, ['1']),

359 (0, 0, 3, ['Second column of row 1.'])],

360 [(0, 0, 4, ['2']),

361 (0, 0, 4, ['Second column of row 2.',

362 'Second line of paragraph.'])],

363 [(0, 0, 6, ['3']),

364 (0, 0, 6, ['- Second column of row 3.',

365 '',

366 '- Second item in bullet',

367 ' list (row 3, column 2).'])],

368 [(0, 1, 10, ['4 is a span'])],

369 [(0, 0, 12, ['5']),

370 (0, 0, 12, [''])]])

371 """

372

373 head_body_separator_pat = re.compile('=[ =]*$')

374 span_pat = re.compile('-[ -]*$')

375

376 def setup(self, block) -> None:

377 self.block = block[:] # make a copy; it will be modified

378 self.block.disconnect() # don't propagate changes to parent

379 # Convert top & bottom borders to column span underlines:

380 self.block[0] = self.block[0].replace('=', '-')

381 self.block[-1] = self.block[-1].replace('=', '-')

382 self.head_body_sep = None

383 self.columns = []

384 self.border_end = None

385 self.table = []

386 self.done = [-1] * len(block[0])

387 self.rowseps = {0: [0]}

388 self.colseps = {0: [0]}

389

390 def parse_table(self) -> None:

391 """

392 First determine the column boundaries from the top border, then

393 process rows. Each row may consist of multiple lines; accumulate

394 lines until a row is complete. Call `self.parse_row` to finish the

395 job.

396 """

397 # Top border must fully describe all table columns.

398 self.columns = self.parse_columns(self.block[0], 0)

399 self.border_end = self.columns[-1][1]

400 firststart, firstend = self.columns[0]

401 offset = 1 # skip top border

402 start = 1

403 text_found = None

404 while offset < len(self.block):

405 line = self.block[offset]

406 if self.span_pat.match(line):

407 # Column span underline or border; row is complete.

408 self.parse_row(self.block[start:offset], start,

409 (line.rstrip(), offset))

410 start = offset + 1

411 text_found = None

412 elif line[firststart:firstend].strip():

413 # First column not blank, therefore it's a new row.

414 if text_found and offset != start:

415 self.parse_row(self.block[start:offset], start)

416 start = offset

417 text_found = 1

418 elif not text_found:

419 start = offset + 1

420 offset += 1

421

422 def parse_columns(self, line, offset):

423 """

424 Given a column span underline, return a list of (begin, end) pairs.

425 """

426 cols = []

427 end = 0

428 while True:

429 begin = line.find('-', end)

430 end = line.find(' ', begin)

431 if begin < 0:

432 break

433 if end < 0:

434 end = len(line)

435 cols.append((begin, end))

436 if self.columns:

437 if cols[-1][1] != self.border_end:

438 raise TableMarkupError('Column span incomplete in table '

439 'line %s.' % (offset+1),

440 offset=offset)

441 # Allow for an unbounded rightmost column:

442 cols[-1] = (cols[-1][0], self.columns[-1][1])

443 return cols

444

445 def init_row(self, colspec, offset):

446 i = 0

447 cells = []

448 for start, end in colspec:

449 morecols = 0

450 try:

451 assert start == self.columns[i][0]

452 while end != self.columns[i][1]:

453 i += 1

454 morecols += 1

455 except (AssertionError, IndexError):

456 raise TableMarkupError('Column span alignment problem '

457 'in table line %s.' % (offset+2),

458 offset=offset+1)

459 cells.append([0, morecols, offset, []])

460 i += 1

461 return cells

462

463 def parse_row(self, lines, start, spanline=None) -> None:

464 """

465 Given the text `lines` of a row, parse it and append to `self.table`.

466

467 The row is parsed according to the current column spec (either

468 `spanline` if provided or `self.columns`). For each column, extract

469 text from each line, and check for text in column margins. Finally,

470 adjust for insignificant whitespace.

471 """

472 if not (lines or spanline):

473 # No new row, just blank lines.

474 return

475 if spanline:

476 columns = self.parse_columns(*spanline)

477 else:

478 columns = self.columns[:]

479 self.check_columns(lines, start, columns)

480 row = self.init_row(columns, start)

481 for i in range(len(columns)):

482 start, end = columns[i]

483 cellblock = lines.get_2D_block(0, start, len(lines), end)

484 cellblock.disconnect() # lines in cell can't sync with parent

485 cellblock.replace(self.double_width_pad_char, '')

486 row[i][3] = cellblock

487 self.table.append(row)

488

489 def check_columns(self, lines, first_line, columns):

490 """

491 Check for text in column margins and text overflow in the last column.

492 Raise TableMarkupError if anything but whitespace is in column margins.

493 Adjust the end value for the last column if there is text overflow.

494 """

495 # "Infinite" value for a dummy last column's beginning, used to

496 # check for text overflow:

497 columns.append((sys.maxsize, None))

498 lastcol = len(columns) - 2

499 # combining characters do not contribute to the column width

500 lines = [strip_combining_chars(line) for line in lines]

501

502 for i in range(len(columns) - 1):

503 start, end = columns[i]

504 nextstart = columns[i+1][0]

505 offset = 0

506 for line in lines:

507 if i == lastcol and line[end:].strip():

508 text = line[start:].rstrip()

509 new_end = start + len(text)

510 main_start, main_end = self.columns[-1]

511 columns[i] = (start, max(main_end, new_end))

512 if new_end > main_end:

513 self.columns[-1] = (main_start, new_end)

514 elif line[end:nextstart].strip():

515 raise TableMarkupError('Text in column margin in table '

516 'line %s.' % (first_line+offset+1),

517 offset=first_line+offset)

518 offset += 1

519 columns.pop()

520

521 def structure_from_cells(self):

522 colspecs = [end - start for start, end in self.columns]

523 first_body_row = 0

524 if self.head_body_sep:

525 for i in range(len(self.table)):

526 if self.table[i][0][2] > self.head_body_sep:

527 first_body_row = i

528 break

529 return (colspecs, self.table[:first_body_row],

530 self.table[first_body_row:])

531

532

533def update_dict_of_lists(master, newdata) -> None:

534 """

535 Extend the list values of `master` with those from `newdata`.

536

537 Both parameters must be dictionaries containing list values.

538 """

539 for key, values in newdata.items():

540 master.setdefault(key, []).extend(values)