1""" 

2:mod:`pandas.io.html` is a module containing functionality for dealing with 

3HTML IO. 

4 

5""" 

6 

7from __future__ import annotations 

8 

9from collections import abc 

10import numbers 

11import re 

12from typing import ( 

13 TYPE_CHECKING, 

14 Iterable, 

15 Literal, 

16 Pattern, 

17 Sequence, 

18 cast, 

19) 

20 

21from pandas._libs import lib 

22from pandas._typing import ( 

23 BaseBuffer, 

24 DtypeBackend, 

25 FilePath, 

26 ReadBuffer, 

27) 

28from pandas.compat._optional import import_optional_dependency 

29from pandas.errors import ( 

30 AbstractMethodError, 

31 EmptyDataError, 

32) 

33from pandas.util._validators import check_dtype_backend 

34 

35from pandas.core.dtypes.common import is_list_like 

36 

37from pandas import isna 

38from pandas.core.indexes.base import Index 

39from pandas.core.indexes.multi import MultiIndex 

40from pandas.core.series import Series 

41 

42from pandas.io.common import ( 

43 file_exists, 

44 get_handle, 

45 is_url, 

46 stringify_path, 

47 urlopen, 

48 validate_header_arg, 

49) 

50from pandas.io.formats.printing import pprint_thing 

51from pandas.io.parsers import TextParser 

52 

53if TYPE_CHECKING: 

54 from pandas import DataFrame 

55 

56_IMPORTS = False 

57_HAS_BS4 = False 

58_HAS_LXML = False 

59_HAS_HTML5LIB = False 



def _importers() -> None:
    # Import the optional parser backends lazily, on first use.

    global _IMPORTS
    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
    bs4 = import_optional_dependency("bs4", errors="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency("html5lib", errors="ignore")
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
    """
    return regex.sub(" ", s.strip())
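
# A quick illustration (a hedged sketch, not from the original source): runs
# of newlines or two-or-more whitespace characters collapse to a single
# space, and the ends are stripped:
#
#     >>> _remove_whitespace("  Total \r\n  revenue   (USD)  ")
#     'Total revenue (USD)'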


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
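
# Behavior sketch (illustrative values, not from the original source):
#
#     >>> _get_skiprows(slice(0, 10, 2))
#     [0, 2, 4, 6, 8]
#     >>> _get_skiprows(3)        # an int passes through unchanged
#     3
#     >>> _get_skiprows(None)     # None means "skip nothing"
#     0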


def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        with get_handle(obj, "r", encoding=encoding) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
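
# Note on the fallthrough above (a hedged illustration): a plain string that
# is neither a URL nor an existing file path is treated as raw HTML and
# returned unchanged:
#
#     >>> _read("<table><tr><td>1</td></tr></table>", encoding=None)
#     '<table><tr><td>1</td></tr></table>'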


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        generator of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj):
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
               - Put all rows into body
               - Move rows from top of body to header only if
                 all elements inside row are <th>
               - Move rows from bottom of body to footer only if
                 all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer
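
    # Worked example of the header inference above (hypothetical markup):
    # given a table with no <thead>,
    #
    #     <tr><th>A</th><th>B</th></tr>
    #     <tr><td>1</td><td>2</td></tr>
    #
    # the first row is all-<th>, so it is moved into the header, giving
    # header=[["A", "B"]], body=[["1", "2"]], footer=[].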

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        text: str | tuple
        remainder: list[
            tuple[int, str | tuple, int]
        ] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                if self.extract_links in ("all", section):
                    href = self._href_getter(td)
                    text = (text, href)
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
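
    # Span expansion sketch (hypothetical markup): a colspan copies a cell's
    # text across columns, and a rowspan carries it down to later rows.
    #
    #     <tr><td rowspan="2">A</td><td colspan="2">B</td></tr>
    #     <tr><td>C</td><td>D</td></tr>
    #
    # expands to [["A", "B", "B"], ["A", "C", "D"]].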

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(string=match) is not None:
                result.append(table)
            unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _href_getter(self, obj) -> str | None:
        a = obj.find("a", href=True)
        return None if not a else a["href"]

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding

        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)

        for br in soup.find_all("br"):
            br.replace_with("\n" + br.text)

        return soup


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"
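
# Illustration (hypothetical attrs; ``repr`` supplies the quoting):
#
#     >>> _build_xpath_expr({"id": "main", "class_": "wide"})
#     "[@id='main' and @class='wide']"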


_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # GH 49929
        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

        for br in r.xpath("*//br"):
            br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body) -> None:
    data = [len(elem) for elem in body]
    lens = Series(data)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
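
# Padding sketch for _expand_elements (illustrative): rows shorter than the
# longest row are right-padded with empty strings, in place.
#
#     >>> rows = [["a", "b", "c"], ["d"]]
#     >>> _expand_elements(rows)
#     >>> rows
#     [['a', 'b', 'c'], ['d', '', '']]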


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()
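
# Header inference sketch for _data_to_frame (illustrative): one parsed
# header row gives header=0; several rows (a MultiIndex candidate) give the
# indices of the non-empty rows, e.g. head=[["A", "B"], ["", ""]] yields
# header=[0] because the all-empty second row is ignored.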


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    return _valid_parsers[flavor]


def _print_as_set(s) -> str:
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
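
# Normalization sketch (illustrative): _validate_flavor always hands back a
# tuple of candidate parser flavors.
#
#     >>> _validate_flavor(None)
#     ('lxml', 'bs4')
#     >>> _validate_flavor("bs4")
#     ('bs4',)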


def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret


def read_html(
    io: FilePath | ReadBuffer[str],
    *,
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> list[DataFrame]:
1044 r""" 

1045 Read HTML tables into a ``list`` of ``DataFrame`` objects. 

1046 

1047 Parameters 

1048 ---------- 

1049 io : str, path object, or file-like object 

1050 String, path object (implementing ``os.PathLike[str]``), or file-like 

1051 object implementing a string ``read()`` function. 

1052 The string can represent a URL or the HTML itself. Note that 

1053 lxml only accepts the http, ftp and file url protocols. If you have a 

1054 URL that starts with ``'https'`` you might try removing the ``'s'``. 

1055 

1056 match : str or compiled regular expression, optional 

1057 The set of tables containing text matching this regex or string will be 

1058 returned. Unless the HTML is extremely simple you will probably need to 

1059 pass a non-empty string here. Defaults to '.+' (match any non-empty 

1060 string). The default value will return all tables contained on a page. 

1061 This value is converted to a regular expression so that there is 

1062 consistent behavior between Beautiful Soup and lxml. 

1063 

1064 flavor : str, optional 

1065 The parsing engine to use. 'bs4' and 'html5lib' are synonymous with 

1066 each other, they are both there for backwards compatibility. The 

1067 default of ``None`` tries to use ``lxml`` to parse and if that fails it 

1068 falls back on ``bs4`` + ``html5lib``. 

1069 

1070 header : int or list-like, optional 

1071 The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to 

1072 make the columns headers. 

1073 

1074 index_col : int or list-like, optional 

1075 The column (or list of columns) to use to create the index. 

1076 

1077 skiprows : int, list-like or slice, optional 

1078 Number of rows to skip after parsing the column integer. 0-based. If a 

1079 sequence of integers or a slice is given, will skip the rows indexed by 

1080 that sequence. Note that a single element sequence means 'skip the nth 

1081 row' whereas an integer means 'skip n rows'. 

1082 

1083 attrs : dict, optional 

1084 This is a dictionary of attributes that you can pass to use to identify 

1085 the table in the HTML. These are not checked for validity before being 

1086 passed to lxml or Beautiful Soup. However, these attributes must be 

1087 valid HTML table attributes to work correctly. For example, :: 

1088 

1089 attrs = {'id': 'table'} 

1090 

1091 is a valid attribute dictionary because the 'id' HTML tag attribute is 

1092 a valid HTML attribute for *any* HTML tag as per `this document 

1093 <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. :: 

1094 

1095 attrs = {'asdf': 'table'} 

1096 

1097 is *not* a valid attribute dictionary because 'asdf' is not a valid 

1098 HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 

1099 table attributes can be found `here 

1100 <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A 

1101 working draft of the HTML 5 spec can be found `here 

1102 <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the 

1103 latest information on table attributes for the modern web. 

1104 

1105 parse_dates : bool, optional 

1106 See :func:`~read_csv` for more details. 

1107 

1108 thousands : str, optional 

1109 Separator to use to parse thousands. Defaults to ``','``. 

1110 

1111 encoding : str, optional 

1112 The encoding used to decode the web page. Defaults to ``None``.``None`` 

1113 preserves the previous encoding behavior, which depends on the 

1114 underlying parser library (e.g., the parser library will try to use 

1115 the encoding provided by the document). 

1116 

1117 decimal : str, default '.' 

1118 Character to recognize as decimal point (e.g. use ',' for European 

1119 data). 

1120 

1121 converters : dict, default None 

1122 Dict of functions for converting values in certain columns. Keys can 

1123 either be integers or column labels, values are functions that take one 

1124 input argument, the cell (not column) content, and return the 

1125 transformed content. 

1126 

1127 na_values : iterable, default None 

1128 Custom NA values. 

1129 

1130 keep_default_na : bool, default True 

1131 If na_values are specified and keep_default_na is False the default NaN 

1132 values are overridden, otherwise they're appended to. 

1133 

1134 displayed_only : bool, default True 

1135 Whether elements with "display: none" should be parsed. 

1136 

1137 extract_links : {None, "all", "header", "body", "footer"} 

1138 Table elements in the specified section(s) with <a> tags will have their 

1139 href extracted. 

1140 

1141 .. versionadded:: 1.5.0 

1142 

1143 dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames 

1144 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy 

1145 arrays, nullable dtypes are used for all dtypes that have a nullable 

1146 implementation when "numpy_nullable" is set, pyarrow is used for all 

1147 dtypes if "pyarrow" is set. 

1148 

1149 The dtype_backends are still experimential. 

1150 

1151 .. versionadded:: 2.0 

1152 

1153 Returns 

1154 ------- 

1155 dfs 

1156 A list of DataFrames. 

1157 

1158 See Also 

1159 -------- 

1160 read_csv : Read a comma-separated values (csv) file into DataFrame. 

1161 

1162 Notes 

1163 ----- 

1164 Before using this function you should read the :ref:`gotchas about the 

1165 HTML parsing libraries <io.html.gotchas>`. 

1166 

1167 Expect to do some cleanup after you call this function. For example, you 

1168 might need to manually assign column names if the column names are 

1169 converted to NaN when you pass the `header=0` argument. We try to assume as 

1170 little as possible about the structure of the table and push the 

1171 idiosyncrasies of the HTML contained in the table to the user. 

1172 

1173 This function searches for ``<table>`` elements and only for ``<tr>`` 

1174 and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>`` 

1175 element in the table. ``<td>`` stands for "table data". This function 

1176 attempts to properly handle ``colspan`` and ``rowspan`` attributes. 

1177 If the function has a ``<thead>`` argument, it is used to construct 

1178 the header, otherwise the function attempts to find the header within 

1179 the body (by putting rows with only ``<th>`` elements into the header). 

1180 

1181 Similar to :func:`~read_csv` the `header` argument is applied 

1182 **after** `skiprows` is applied. 

1183 

1184 This function will *always* return a list of :class:`DataFrame` *or* 

1185 it will fail, e.g., it will *not* return an empty list. 

1186 

1187 Examples 

1188 -------- 

1189 See the :ref:`read_html documentation in the IO section of the docs 

1190 <io.read_html>` for some examples of reading in HTML tables. 

1191 """ 

    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )
    validate_header_arg(header)
    check_dtype_backend(dtype_backend)

    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
        dtype_backend=dtype_backend,
    )
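
# Minimal usage sketch (hedged; literal-HTML input shown for brevity, and
# behavior for literal strings may differ across pandas versions):
#
#     >>> import pandas as pd
#     >>> html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
#     >>> pd.read_html(html)[0]
#        a
#     0  1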