1""" 

2:mod:`pandas.io.html` is a module containing functionality for dealing with 

3HTML IO. 

4 

5""" 

6 

7from __future__ import annotations 

8 

9from collections import abc 

10import numbers 

11import re 

12from re import Pattern 

13from typing import ( 

14 TYPE_CHECKING, 

15 Literal, 

16 cast, 

17) 

18import warnings 

19 

20from pandas._libs import lib 

21from pandas.compat._optional import import_optional_dependency 

22from pandas.errors import ( 

23 AbstractMethodError, 

24 EmptyDataError, 

25) 

26from pandas.util._decorators import doc 

27from pandas.util._exceptions import find_stack_level 

28from pandas.util._validators import check_dtype_backend 

29 

30from pandas.core.dtypes.common import is_list_like 

31 

32from pandas import isna 

33from pandas.core.indexes.base import Index 

34from pandas.core.indexes.multi import MultiIndex 

35from pandas.core.series import Series 

36from pandas.core.shared_docs import _shared_docs 

37 

38from pandas.io.common import ( 

39 file_exists, 

40 get_handle, 

41 is_file_like, 

42 is_fsspec_url, 

43 is_url, 

44 stringify_path, 

45 validate_header_arg, 

46) 

47from pandas.io.formats.printing import pprint_thing 

48from pandas.io.parsers import TextParser 

49 

50if TYPE_CHECKING: 

51 from collections.abc import ( 

52 Iterable, 

53 Sequence, 

54 ) 

55 

56 from pandas._typing import ( 

57 BaseBuffer, 

58 DtypeBackend, 

59 FilePath, 

60 HTMLFlavors, 

61 ReadBuffer, 

62 StorageOptions, 

63 ) 

64 

65 from pandas import DataFrame 

66 

67############# 

68# READ HTML # 

69############# 

70_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") 

71 

72 

73def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str: 

74 """ 

75 Replace extra whitespace inside of a string with a single space. 

76 

77 Parameters 

78 ---------- 

79 s : str or unicode 

80 The string from which to remove extra whitespace. 

81 regex : re.Pattern 

82 The regular expression to use to remove extra whitespace. 

83 

84 Returns 

85 ------- 

86 subd : str or unicode 

87 `s` with all extra whitespace replaced with a single space. 
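
    Examples
    --------
    A minimal sketch of the expected behavior:

    >>> _remove_whitespace("  hello   world  ")
    'hello world'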

88 """ 

89 return regex.sub(" ", s.strip()) 

90 

91 

92def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]: 

93 """ 

94 Get an iterator given an integer, slice or container. 

95 

96 Parameters 

97 ---------- 

98 skiprows : int, slice, container 

99 The iterator to use to skip rows; can also be a slice. 

100 

101 Raises 

102 ------ 

103 TypeError 

104 * If `skiprows` is not a slice, integer, or Container 

105 

106 Returns 

107 ------- 

108 it : iterable 

109 A proper iterator to use to skip rows of a DataFrame. 
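
    Examples
    --------
    A few illustrative cases:

    >>> _get_skiprows(slice(0, 5, 2))
    [0, 2, 4]
    >>> _get_skiprows(3)
    3
    >>> _get_skiprows(None)
    0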

110 """ 

111 if isinstance(skiprows, slice): 

112 start, step = skiprows.start or 0, skiprows.step or 1 

113 return list(range(start, skiprows.stop, step)) 

114 elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): 

115 return cast("int | Sequence[int]", skiprows) 

116 elif skiprows is None: 

117 return 0 

118 raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") 

119 

120 

121def _read( 

122 obj: FilePath | BaseBuffer, 

123 encoding: str | None, 

124 storage_options: StorageOptions | None, 

125) -> str | bytes: 

126 """ 

127 Try to read from a url, file or string. 

128 

129 Parameters 

130 ---------- 

    obj : str, unicode, path object, or file-like object
        The source to read: a URL, an existing file path, an object with a
        ``read`` method, or a raw string/bytes of HTML.
    encoding : str or None
        Encoding used when opening `obj` via ``get_handle``.
    storage_options : StorageOptions or None
        Extra options (e.g. for remote/fsspec URLs) forwarded to
        ``get_handle``.

    Returns
    -------
    raw_text : str or bytes
136 """ 

137 text: str | bytes 

138 if ( 

139 is_url(obj) 

140 or hasattr(obj, "read") 

141 or (isinstance(obj, str) and file_exists(obj)) 

142 ): 

143 with get_handle( 

144 obj, "r", encoding=encoding, storage_options=storage_options 

145 ) as handles: 

146 text = handles.handle.read() 

147 elif isinstance(obj, (str, bytes)): 

148 text = obj 

149 else: 

150 raise TypeError(f"Cannot read object of type '{type(obj).__name__}'") 

151 return text 

152 

153 

154class _HtmlFrameParser: 

155 """ 

156 Base class for parsers that parse HTML into DataFrames. 

157 

158 Parameters 

159 ---------- 

160 io : str or file-like 

161 This can be either a string of raw HTML, a valid URL using the HTTP, 

162 FTP, or FILE protocols or a file-like object. 

163 

164 match : str or regex 

165 The text to match in the document. 


    attrs : dict
        Dictionary of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
        storage_options: StorageOptions = None,
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links
        self.storage_options = storage_options

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj) -> str | None:
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, document, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        document : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag) -> bool:
        """
        Return whether an individual DOM node matches a tag

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
        header, body, and footer, otherwise:
            - Put all rows into body
            - Move rows from top of body to header only if
              all elements inside row are <th>
            - Move rows from bottom of body to footer only if
              all elements inside row are <th>
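
        For example, in a table with no <thead> whose first row is
        ``<tr><th>A</th></tr>``, that row consists solely of <th> cells
        and is therefore moved from the body into the header.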

449 """ 

450 header_rows = self._parse_thead_tr(table_html) 

451 body_rows = self._parse_tbody_tr(table_html) 

452 footer_rows = self._parse_tfoot_tr(table_html) 

453 

454 def row_is_all_th(row): 

455 return all(self._equals_tag(t, "th") for t in self._parse_td(row)) 

456 

457 if not header_rows: 

            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
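
        For example, a ``<td colspan="2">x</td>`` cell contributes ``x``
        twice to its row, and a ``rowspan="2"`` cell is carried over into
        the same column position of the following row.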

492 """ 

493 all_texts = [] # list of rows, each a list of str 

494 text: str | tuple 

495 remainder: list[ 

496 tuple[int, str | tuple, int] 

497 ] = [] # list of (index, text, nrows) 

498 

499 for tr in rows: 

500 texts = [] # the output for this row 

501 next_remainder = [] 

502 

503 index = 0 

504 tds = self._parse_td(tr) 

505 for td in tds: 

506 # Append texts from previous rows with rowspan>1 that come 

507 # before this <td> 

508 while remainder and remainder[0][0] <= index: 

509 prev_i, prev_text, prev_rowspan = remainder.pop(0) 

510 texts.append(prev_text) 

511 if prev_rowspan > 1: 

512 next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) 

513 index += 1 

514 

515 # Append the text from this <td>, colspan times 

516 text = _remove_whitespace(self._text_getter(td)) 

517 if self.extract_links in ("all", section): 

518 href = self._href_getter(td) 

519 text = (text, href) 

520 rowspan = int(self._attr_getter(td, "rowspan") or 1) 

521 colspan = int(self._attr_getter(td, "colspan") or 1) 

522 

523 for _ in range(colspan): 

524 texts.append(text) 

525 if rowspan > 1: 

526 next_remainder.append((index, text, rowspan - 1)) 

527 index += 1 

528 

529 # Append texts from previous rows at the final position 

530 for prev_i, prev_text, prev_rowspan in remainder: 

531 texts.append(prev_text) 

532 if prev_rowspan > 1: 

533 next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) 

534 

535 all_texts.append(texts) 

536 remainder = next_remainder 

537 

538 # Append rows that only appear because the previous row had non-1 

539 # rowspan 

540 while remainder: 

541 next_remainder = [] 

542 texts = [] 

543 for prev_i, prev_text, prev_rowspan in remainder: 

544 texts.append(prev_text) 

545 if prev_rowspan > 1: 

546 next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) 

547 all_texts.append(texts) 

548 remainder = next_remainder 

549 

550 return all_texts 

551 

552 def _handle_hidden_tables(self, tbl_list, attr_name: str): 

553 """ 

554 Return list of tables, potentially removing hidden elements 

555 

556 Parameters 

557 ---------- 

558 tbl_list : list of node-like 

559 Type of list elements will vary depending upon parser used 

560 attr_name : str 

561 Name of the accessor for retrieving HTML attributes 

562 

563 Returns 

564 ------- 

565 list of node-like 

566 Return type matches `tbl_list` 

567 """ 

568 if not self.displayed_only: 

569 return tbl_list 

570 

571 return [ 

572 x 

573 for x in tbl_list 

574 if "display:none" 

575 not in getattr(x, attr_name).get("style", "").replace(" ", "") 

576 ] 

577 

578 

579class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): 

580 """ 

581 HTML to DataFrame parser that uses BeautifulSoup under the hood. 

582 

583 See Also 

584 -------- 

585 pandas.io.html._HtmlFrameParser 

586 pandas.io.html._LxmlFrameParser 

587 

588 Notes 

589 ----- 

590 Documentation strings for this class are in the base class 

591 :class:`pandas.io.html._HtmlFrameParser`. 

592 """ 

593 

594 def _parse_tables(self, document, match, attrs): 

595 element_name = "table" 

596 tables = document.find_all(element_name, attrs=attrs) 

597 if not tables: 

598 raise ValueError("No tables found") 

599 

600 result = [] 

601 unique_tables = set() 

602 tables = self._handle_hidden_tables(tables, "attrs") 

603 

604 for table in tables: 

605 if self.displayed_only: 

606 for elem in table.find_all("style"): 

607 elem.decompose() 

608 

609 for elem in table.find_all(style=re.compile(r"display:\s*none")): 

610 elem.decompose() 

611 

612 if table not in unique_tables and table.find(string=match) is not None: 

613 result.append(table) 

614 unique_tables.add(table) 

615 if not result: 

616 raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") 

617 return result 

618 

619 def _href_getter(self, obj) -> str | None: 

620 a = obj.find("a", href=True) 

621 return None if not a else a["href"] 

622 

623 def _text_getter(self, obj): 

624 return obj.text 

625 

626 def _equals_tag(self, obj, tag) -> bool: 

627 return obj.name == tag 

628 

629 def _parse_td(self, row): 

630 return row.find_all(("td", "th"), recursive=False) 

631 

632 def _parse_thead_tr(self, table): 

633 return table.select("thead tr") 

634 

635 def _parse_tbody_tr(self, table): 

636 from_tbody = table.select("tbody tr") 

637 from_root = table.find_all("tr", recursive=False) 

638 # HTML spec: at most one of these lists has content 

639 return from_tbody + from_root 

640 

641 def _parse_tfoot_tr(self, table): 

642 return table.select("tfoot tr") 

643 

644 def _setup_build_doc(self): 

645 raw_text = _read(self.io, self.encoding, self.storage_options) 

646 if not raw_text: 

647 raise ValueError(f"No text parsed from document: {self.io}") 

648 return raw_text 

649 

650 def _build_doc(self): 

651 from bs4 import BeautifulSoup 

652 

653 bdoc = self._setup_build_doc() 

654 if isinstance(bdoc, bytes) and self.encoding is not None: 

655 udoc = bdoc.decode(self.encoding) 

656 from_encoding = None 

657 else: 

658 udoc = bdoc 

659 from_encoding = self.encoding 

660 

661 soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) 

662 

663 for br in soup.find_all("br"): 

664 br.replace_with("\n" + br.text) 

665 

666 return soup 

667 

668 

669def _build_xpath_expr(attrs) -> str: 

670 """ 

671 Build an xpath expression to simulate bs4's ability to pass in kwargs to 

672 search for attributes when using the lxml parser. 

673 

674 Parameters 

675 ---------- 

676 attrs : dict 

677 A dict of HTML attributes. These are NOT checked for validity. 

678 

679 Returns 

680 ------- 

681 expr : unicode 

682 An XPath expression that checks for the given HTML attributes. 
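
    Examples
    --------
    A minimal illustration (``class_`` is renamed to ``class`` on output):

    >>> _build_xpath_expr({"id": "table", "class_": "wide"})
    "[@id='table' and @class='wide']"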

683 """ 

684 # give class attribute as class_ because class is a python keyword 

685 if "class_" in attrs: 

686 attrs["class"] = attrs.pop("class_") 

687 

688 s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()]) 

689 return f"[{s}]" 

690 

691 

692_re_namespace = {"re": "http://exslt.org/regular-expressions"} 

693 

694 

695class _LxmlFrameParser(_HtmlFrameParser): 

696 """ 

697 HTML to DataFrame parser that uses lxml under the hood. 

698 

699 Warning 

700 ------- 

701 This parser can only handle HTTP, FTP, and FILE urls. 

702 

703 See Also 

704 -------- 

705 _HtmlFrameParser 

    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, document, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # GH 49929
        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = document.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//style"):
                    elem.drop_tree()
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.drop_tree()
        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag) -> bool:
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with get_handle(
                    self.io, "r", storage_options=self.storage_options
                ) as f:
                    r = parse(f.handle, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

            for br in r.xpath("*//br"):
                br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body) -> None:
    data = [len(elem) for elem in body]
    lens = Series(data)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
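
# Illustration: given body = [["a"], ["b", "c"]], _expand_elements pads the
# first row in place to ["a", ""] so that every row has the same length.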


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : {"lxml", "html5lib", "bs4"} or None
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
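
    Examples
    --------
    A sketch, assuming the optional lxml dependency is installed:

    >>> _parser_dispatch("lxml").__name__
    '_LxmlFrameParser'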

907 """ 

908 valid_parsers = list(_valid_parsers.keys()) 

909 if flavor not in valid_parsers: 

910 raise ValueError( 

911 f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}" 

912 ) 

913 

914 if flavor in ("bs4", "html5lib"): 

915 import_optional_dependency("html5lib") 

916 import_optional_dependency("bs4") 

917 else: 

918 import_optional_dependency("lxml.etree") 

919 return _valid_parsers[flavor] 

920 

921 

922def _print_as_set(s) -> str: 

923 arg = ", ".join([pprint_thing(el) for el in s]) 

924 return f"{{{arg}}}" 
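
# Illustration: _print_as_set({"lxml"}) renders as "{lxml}"; element order
# is arbitrary for sets with more than one member.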


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
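
# Illustration: _validate_flavor(None) -> ("lxml", "bs4") and
# _validate_flavor("bs4") -> ("bs4",), while a flavor set containing no
# valid entry raises ValueError.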


def _parse(
    flavor,
    io,
    match,
    attrs,
    encoding,
    displayed_only,
    extract_links,
    storage_options,
    **kwargs,
):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(
            io,
            compiled_match,
            attrs,
            encoding,
            displayed_only,
            extract_links,
            storage_options,
        )

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret


@doc(storage_options=_shared_docs["storage_options"])
def read_html(
    io: FilePath | ReadBuffer[str],
    *,
    match: str | Pattern = ".+",
    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    storage_options: StorageOptions = None,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

        .. deprecated:: 2.1.0
            Passing html literal strings is deprecated.
            Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
        The parsing engine (or list of parsing engines) to use. 'bs4' and
        'html5lib' are synonymous with each other, they are both there for
        backwards compatibility. The default of ``None`` tries to use ``lxml``
        to parse and if that fails it falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {{'id': 'table'}}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {{'asdf': 'table'}}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on
        the underlying parser library (e.g., the parser library will try to
        use the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether to parse only the elements that are displayed; elements
        styled with "display: none" are skipped when this is True.

    extract_links : {{None, "all", "header", "body", "footer"}}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    {storage_options}

        .. versionadded:: 2.1.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and, within each table,
    only for ``<tr>`` rows and the ``<td>`` and ``<th>`` cells they contain
    (``<td>`` stands for "table data"). This function attempts to properly
    handle ``colspan`` and ``rowspan`` attributes. If the table has a
    ``<thead>``, it is used to construct the header, otherwise the function
    attempts to find the header within the body (by putting rows with only
    ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
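
    A minimal sketch, assuming ``import pandas as pd`` and wrapping the
    literal HTML in ``StringIO`` per the deprecation note above:

    >>> from io import StringIO
    >>> html = "<table><tr><th>A</th></tr><tr><td>1</td></tr></table>"
    >>> pd.read_html(StringIO(html))[0]
       A
    0  1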

1204 """ 

1205 # Type check here. We don't want to parse only to fail because of an 

1206 # invalid value of an integer skiprows. 

1207 if isinstance(skiprows, numbers.Integral) and skiprows < 0: 

1208 raise ValueError( 

1209 "cannot skip rows starting from the end of the " 

1210 "data (you passed a negative value)" 

1211 ) 

1212 if extract_links not in [None, "header", "footer", "body", "all"]: 

1213 raise ValueError( 

1214 "`extract_links` must be one of " 

1215 '{None, "header", "footer", "body", "all"}, got ' 

1216 f'"{extract_links}"' 

1217 ) 

1218 

1219 validate_header_arg(header) 

1220 check_dtype_backend(dtype_backend) 

1221 

1222 io = stringify_path(io) 

1223 

1224 if isinstance(io, str) and not any( 

1225 [ 

1226 is_file_like(io), 

1227 file_exists(io), 

1228 is_url(io), 

1229 is_fsspec_url(io), 

1230 ] 

1231 ): 

1232 warnings.warn( 

1233 "Passing literal html to 'read_html' is deprecated and " 

1234 "will be removed in a future version. To read from a " 

1235 "literal string, wrap it in a 'StringIO' object.", 

1236 FutureWarning, 

1237 stacklevel=find_stack_level(), 

1238 ) 

1239 

1240 return _parse( 

1241 flavor=flavor, 

1242 io=io, 

1243 match=match, 

1244 header=header, 

1245 index_col=index_col, 

1246 skiprows=skiprows, 

1247 parse_dates=parse_dates, 

1248 thousands=thousands, 

1249 attrs=attrs, 

1250 encoding=encoding, 

1251 decimal=decimal, 

1252 converters=converters, 

1253 na_values=na_values, 

1254 keep_default_na=keep_default_na, 

1255 displayed_only=displayed_only, 

1256 extract_links=extract_links, 

1257 dtype_backend=dtype_backend, 

1258 storage_options=storage_options, 

1259 )