Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/docutils/parsers/rst/tableparser.py: 50%

1# $Id$

2# Author: David Goodger <goodger@python.org>

3# Copyright: This module has been placed in the public domain.

5"""

6This module defines table parser classes,which parse plaintext-graphic tables

7and produce a well-formed data structure suitable for building a CALS table.

9:Classes:

10 - `GridTableParser`: Parse fully-formed tables represented with a grid.

11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom

12 borders.

14:Exception class: `TableMarkupError`

16:Function:

17 `update_dict_of_lists()`: Merge two dictionaries containing list values.

18"""

20__docformat__ = 'reStructuredText'

23import re

24import sys

25from docutils import DataError

26from docutils.utils import strip_combining_chars

29class TableMarkupError(DataError):

31 """

32 Raise if there is any problem with table markup.

34 The keyword argument `offset` denotes the offset of the problem

35 from the table's start line.

36 """

38 def __init__(self, *args, **kwargs):

39 self.offset = kwargs.pop('offset', 0)

40 DataError.__init__(self, *args)

43class TableParser:

45 """

46 Abstract superclass for the common parts of the syntax-specific parsers.

47 """

49 head_body_separator_pat = None

50 """Matches the row separator between head rows and body rows."""

52 double_width_pad_char = '\x00'

53 """Padding character for East Asian double-width text."""

55 def parse(self, block):

56 """

57 Analyze the text `block` and return a table data structure.

59 Given a plaintext-graphic table in `block` (list of lines of text; no

60 whitespace padding), parse the table, construct and return the data

61 necessary to construct a CALS table or equivalent.

63 Raise `TableMarkupError` if there is any problem with the markup.

64 """

65 self.setup(block)

66 self.find_head_body_sep()

67 self.parse_table()

68 return self.structure_from_cells()

70 def find_head_body_sep(self):

71 """Look for a head/body row separator line; store the line index."""

72 for i in range(len(self.block)):

73 line = self.block[i]

74 if self.head_body_separator_pat.match(line):

75 if self.head_body_sep:

76 raise TableMarkupError(

77 'Multiple head/body row separators '

78 '(table lines %s and %s); only one allowed.'

79 % (self.head_body_sep+1, i+1), offset=i)

80 else:

81 self.head_body_sep = i

82 self.block[i] = line.replace('=', '-')

83 if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)

84 - 1):

85 raise TableMarkupError('The head/body row separator may not be '

86 'the first or last line of the table.',

87 offset=i)

90class GridTableParser(TableParser):

92 """

93 Parse a grid table using `parse()`.

95 Here's an example of a grid table::

97 +------------------------+------------+----------+----------+

99 +========================+============+==========+==========+

101 +------------------------+------------+----------+----------+

102 | body row 2 | Cells may span columns. |

103 +------------------------+------------+---------------------+

104 | body row 3 | Cells may | - Table cells |

105 +------------------------+ span rows. | - contain |

106 | body row 4 | | - body elements. |

107 +------------------------+------------+---------------------+

108

109 Intersections use '+', row separators use '-' (except for one optional

110 head/body row separator, which uses '='), and column separators use '|'.

111

112 Passing the above table to the `parse()` method will result in the

113 following data structure::

114

115 ([24, 12, 10, 10],

116 [[(0, 0, 1, ['Header row, column 1']),

117 (0, 0, 1, ['Header 2']),

118 (0, 0, 1, ['Header 3']),

119 (0, 0, 1, ['Header 4'])]],

120 [[(0, 0, 3, ['body row 1, column 1']),

121 (0, 0, 3, ['column 2']),

122 (0, 0, 3, ['column 3']),

123 (0, 0, 3, ['column 4'])],

124 [(0, 0, 5, ['body row 2']),

125 (0, 2, 5, ['Cells may span columns.']),

126 None,

127 None],

128 [(0, 0, 7, ['body row 3']),

129 (1, 0, 7, ['Cells may', 'span rows.', '']),

130 (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),

131 None],

132 [(0, 0, 9, ['body row 4']), None, None, None]])

133

134 The first item is a list containing column widths (colspecs). The second

135 item is a list of head rows, and the third is a list of body rows. Each

136 row contains a list of cells. Each cell is either None (for a cell unused

137 because of another cell's span), or a tuple. A cell tuple contains four

138 items: the number of extra rows used by the cell in a vertical span

139 (morerows); the number of extra columns used by the cell in a horizontal

140 span (morecols); the line offset of the first line of the cell contents;

141 and the cell contents, a list of lines of text.

142 """

143

144 head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

145

146 def setup(self, block):

147 self.block = block[:] # make a copy; it may be modified

148 self.block.disconnect() # don't propagate changes to parent

149 self.bottom = len(block) - 1

150 self.right = len(block[0]) - 1

151 self.head_body_sep = None

152 self.done = [-1] * len(block[0])

153 self.cells = []

154 self.rowseps = {0: [0]}

155 self.colseps = {0: [0]}

156

157 def parse_table(self):

158 """

159 Start with a queue of upper-left corners, containing the upper-left

160 corner of the table itself. Trace out one rectangular cell, remember

161 it, and add its upper-right and lower-left corners to the queue of

162 potential upper-left corners of further cells. Process the queue in

163 top-to-bottom order, keeping track of how much of each text column has

164 been seen.

165

166 We'll end up knowing all the row and column boundaries, cell positions

167 and their dimensions.

168 """

169 corners = [(0, 0)]

170 while corners:

171 top, left = corners.pop(0)

172 if (top == self.bottom

173 or left == self.right

174 or top <= self.done[left]):

175 continue

176 result = self.scan_cell(top, left)

177 if not result:

178 continue

179 bottom, right, rowseps, colseps = result

180 update_dict_of_lists(self.rowseps, rowseps)

181 update_dict_of_lists(self.colseps, colseps)

182 self.mark_done(top, left, bottom, right)

183 cellblock = self.block.get_2D_block(top + 1, left + 1,

184 bottom, right)

185 cellblock.disconnect() # lines in cell can't sync with parent

186 cellblock.replace(self.double_width_pad_char, '')

187 self.cells.append((top, left, bottom, right, cellblock))

188 corners.extend([(top, right), (bottom, left)])

189 corners.sort()

190 if not self.check_parse_complete():

191 raise TableMarkupError('Malformed table; parse incomplete.')

192

193 def mark_done(self, top, left, bottom, right):

194 """For keeping track of how much of each text column has been seen."""

195 before = top - 1

196 after = bottom - 1

197 for col in range(left, right):

198 assert self.done[col] == before

199 self.done[col] = after

200

201 def check_parse_complete(self):

202 """Each text column should have been completely seen."""

203 last = self.bottom - 1

204 for col in range(self.right):

205 if self.done[col] != last:

206 return False

207 return True

208

209 def scan_cell(self, top, left):

210 """Starting at the top-left corner, start tracing out a cell."""

211 assert self.block[top][left] == '+'

212 return self.scan_right(top, left)

213

214 def scan_right(self, top, left):

215 """

216 Look for the top-right corner of the cell, and make note of all column

217 boundaries ('+').

218 """

219 colseps = {}

220 line = self.block[top]

221 for i in range(left + 1, self.right + 1):

222 if line[i] == '+':

223 colseps[i] = [top]

224 result = self.scan_down(top, left, i)

225 if result:

226 bottom, rowseps, newcolseps = result

227 update_dict_of_lists(colseps, newcolseps)

228 return bottom, i, rowseps, colseps

229 elif line[i] != '-':

230 return None

231 return None

232

233 def scan_down(self, top, left, right):

234 """

235 Look for the bottom-right corner of the cell, making note of all row

236 boundaries.

237 """

238 rowseps = {}

239 for i in range(top + 1, self.bottom + 1):

240 if self.block[i][right] == '+':

241 rowseps[i] = [right]

242 result = self.scan_left(top, left, i, right)

243 if result:

244 newrowseps, colseps = result

245 update_dict_of_lists(rowseps, newrowseps)

246 return i, rowseps, colseps

247 elif self.block[i][right] != '|':

248 return None

249 return None

250

251 def scan_left(self, top, left, bottom, right):

252 """

253 Noting column boundaries, look for the bottom-left corner of the cell.

254 It must line up with the starting point.

255 """

256 colseps = {}

257 line = self.block[bottom]

258 for i in range(right - 1, left, -1):

259 if line[i] == '+':

260 colseps[i] = [bottom]

261 elif line[i] != '-':

262 return None

263 if line[left] != '+':

264 return None

265 result = self.scan_up(top, left, bottom, right)

266 if result is not None:

267 rowseps = result

268 return rowseps, colseps

269 return None

270

271 def scan_up(self, top, left, bottom, right):

272 """

273 Noting row boundaries, see if we can return to the starting point.

274 """

275 rowseps = {}

276 for i in range(bottom - 1, top, -1):

277 if self.block[i][left] == '+':

278 rowseps[i] = [left]

279 elif self.block[i][left] != '|':

280 return None

281 return rowseps

282

283 def structure_from_cells(self):

284 """

285 From the data collected by `scan_cell()`, convert to the final data

286 structure.

287 """

288 rowseps = sorted(self.rowseps.keys()) # list of row boundaries

289 rowindex = {}

290 for i in range(len(rowseps)):

291 rowindex[rowseps[i]] = i # row boundary -> row number mapping

292 colseps = sorted(self.colseps.keys()) # list of column boundaries

293 colindex = {}

294 for i in range(len(colseps)):

295 colindex[colseps[i]] = i # column boundary -> col number map

296 colspecs = [(colseps[i] - colseps[i - 1] - 1)

297 for i in range(1, len(colseps))] # list of column widths

298 # prepare an empty table with the correct number of rows & columns

299 onerow = [None for i in range(len(colseps) - 1)]

300 rows = [onerow[:] for i in range(len(rowseps) - 1)]

301 # keep track of # of cells remaining; should reduce to zero

302 remaining = (len(rowseps) - 1) * (len(colseps) - 1)

303 for top, left, bottom, right, block in self.cells:

304 rownum = rowindex[top]

305 colnum = colindex[left]

306 assert rows[rownum][colnum] is None, (

307 'Cell (row %s, column %s) already used.'

308 % (rownum + 1, colnum + 1))

309 morerows = rowindex[bottom] - rownum - 1

310 morecols = colindex[right] - colnum - 1

311 remaining -= (morerows + 1) * (morecols + 1)

312 # write the cell into the table

313 rows[rownum][colnum] = (morerows, morecols, top + 1, block)

314 assert remaining == 0, 'Unused cells remaining.'

315 if self.head_body_sep: # separate head rows from body rows

316 numheadrows = rowindex[self.head_body_sep]

317 headrows = rows[:numheadrows]

318 bodyrows = rows[numheadrows:]

319 else:

320 headrows = []

321 bodyrows = rows

322 return colspecs, headrows, bodyrows

323

324

325class SimpleTableParser(TableParser):

326

327 """

328 Parse a simple table using `parse()`.

329

330 Here's an example of a simple table::

331

332 ===== =====

333 col 1 col 2

334 ===== =====

335 1 Second column of row 1.

336 2 Second column of row 2.

337 Second line of paragraph.

338 3 - Second column of row 3.

339

340 - Second item in bullet

341 list (row 3, column 2).

342 4 is a span

343 ------------

344 5

345 ===== =====

346

347 Top and bottom borders use '=', column span underlines use '-', column

348 separation is indicated with spaces.

349

350 Passing the above table to the `parse()` method will result in the

351 following data structure, whose interpretation is the same as for

352 `GridTableParser`::

353

354 ([5, 25],

355 [[(0, 0, 1, ['col 1']),

356 (0, 0, 1, ['col 2'])]],

357 [[(0, 0, 3, ['1']),

358 (0, 0, 3, ['Second column of row 1.'])],

359 [(0, 0, 4, ['2']),

360 (0, 0, 4, ['Second column of row 2.',

361 'Second line of paragraph.'])],

362 [(0, 0, 6, ['3']),

363 (0, 0, 6, ['- Second column of row 3.',

364 '',

365 '- Second item in bullet',

366 ' list (row 3, column 2).'])],

367 [(0, 1, 10, ['4 is a span'])],

368 [(0, 0, 12, ['5']),

369 (0, 0, 12, [''])]])

370 """

371

372 head_body_separator_pat = re.compile('=[ =]*$')

373 span_pat = re.compile('-[ -]*$')

374

375 def setup(self, block):

376 self.block = block[:] # make a copy; it will be modified

377 self.block.disconnect() # don't propagate changes to parent

378 # Convert top & bottom borders to column span underlines:

379 self.block[0] = self.block[0].replace('=', '-')

380 self.block[-1] = self.block[-1].replace('=', '-')

381 self.head_body_sep = None

382 self.columns = []

383 self.border_end = None

384 self.table = []

385 self.done = [-1] * len(block[0])

386 self.rowseps = {0: [0]}

387 self.colseps = {0: [0]}

388

389 def parse_table(self):

390 """

391 First determine the column boundaries from the top border, then

392 process rows. Each row may consist of multiple lines; accumulate

393 lines until a row is complete. Call `self.parse_row` to finish the

394 job.

395 """

396 # Top border must fully describe all table columns.

397 self.columns = self.parse_columns(self.block[0], 0)

398 self.border_end = self.columns[-1][1]

399 firststart, firstend = self.columns[0]

400 offset = 1 # skip top border

401 start = 1

402 text_found = None

403 while offset < len(self.block):

404 line = self.block[offset]

405 if self.span_pat.match(line):

406 # Column span underline or border; row is complete.

407 self.parse_row(self.block[start:offset], start,

408 (line.rstrip(), offset))

409 start = offset + 1

410 text_found = None

411 elif line[firststart:firstend].strip():

412 # First column not blank, therefore it's a new row.

413 if text_found and offset != start:

414 self.parse_row(self.block[start:offset], start)

415 start = offset

416 text_found = 1

417 elif not text_found:

418 start = offset + 1

419 offset += 1

420

421 def parse_columns(self, line, offset):

422 """

423 Given a column span underline, return a list of (begin, end) pairs.

424 """

425 cols = []

426 end = 0

427 while True:

428 begin = line.find('-', end)

429 end = line.find(' ', begin)

430 if begin < 0:

431 break

432 if end < 0:

433 end = len(line)

434 cols.append((begin, end))

435 if self.columns:

436 if cols[-1][1] != self.border_end:

437 raise TableMarkupError('Column span incomplete in table '

438 'line %s.' % (offset+1),

439 offset=offset)

440 # Allow for an unbounded rightmost column:

441 cols[-1] = (cols[-1][0], self.columns[-1][1])

442 return cols

443

444 def init_row(self, colspec, offset):

445 i = 0

446 cells = []

447 for start, end in colspec:

448 morecols = 0

449 try:

450 assert start == self.columns[i][0]

451 while end != self.columns[i][1]:

452 i += 1

453 morecols += 1

454 except (AssertionError, IndexError):

455 raise TableMarkupError('Column span alignment problem '

456 'in table line %s.' % (offset+2),

457 offset=offset+1)

458 cells.append([0, morecols, offset, []])

459 i += 1

460 return cells

461

462 def parse_row(self, lines, start, spanline=None):

463 """

464 Given the text `lines` of a row, parse it and append to `self.table`.

465

466 The row is parsed according to the current column spec (either

467 `spanline` if provided or `self.columns`). For each column, extract

468 text from each line, and check for text in column margins. Finally,

469 adjust for insignificant whitespace.

470 """

471 if not (lines or spanline):

472 # No new row, just blank lines.

473 return

474 if spanline:

475 columns = self.parse_columns(*spanline)

476 else:

477 columns = self.columns[:]

478 self.check_columns(lines, start, columns)

479 row = self.init_row(columns, start)

480 for i in range(len(columns)):

481 start, end = columns[i]

482 cellblock = lines.get_2D_block(0, start, len(lines), end)

483 cellblock.disconnect() # lines in cell can't sync with parent

484 cellblock.replace(self.double_width_pad_char, '')

485 row[i][3] = cellblock

486 self.table.append(row)

487

488 def check_columns(self, lines, first_line, columns):

489 """

490 Check for text in column margins and text overflow in the last column.

491 Raise TableMarkupError if anything but whitespace is in column margins.

492 Adjust the end value for the last column if there is text overflow.

493 """

494 # "Infinite" value for a dummy last column's beginning, used to

495 # check for text overflow:

496 columns.append((sys.maxsize, None))

497 lastcol = len(columns) - 2

498 # combining characters do not contribute to the column width

499 lines = [strip_combining_chars(line) for line in lines]

500

501 for i in range(len(columns) - 1):

502 start, end = columns[i]

503 nextstart = columns[i+1][0]

504 offset = 0

505 for line in lines:

506 if i == lastcol and line[end:].strip():

507 text = line[start:].rstrip()

508 new_end = start + len(text)

509 main_start, main_end = self.columns[-1]

510 columns[i] = (start, max(main_end, new_end))

511 if new_end > main_end:

512 self.columns[-1] = (main_start, new_end)

513 elif line[end:nextstart].strip():

514 raise TableMarkupError('Text in column margin in table '

515 'line %s.' % (first_line+offset+1),

516 offset=first_line+offset)

517 offset += 1

518 columns.pop()

519

520 def structure_from_cells(self):

521 colspecs = [end - start for start, end in self.columns]

522 first_body_row = 0

523 if self.head_body_sep:

524 for i in range(len(self.table)):

525 if self.table[i][0][2] > self.head_body_sep:

526 first_body_row = i

527 break

528 return (colspecs, self.table[:first_body_row],

529 self.table[first_body_row:])

530

531

532def update_dict_of_lists(master, newdata):

533 """

534 Extend the list values of `master` with those from `newdata`.

535

536 Both parameters must be dictionaries containing list values.

537 """

538 for key, values in newdata.items():

539 master.setdefault(key, []).extend(values)