1"""
2:mod:`pandas.io.html` is a module containing functionality for dealing with
3HTML IO.
5"""

from __future__ import annotations

from collections import abc
import numbers
import re
from typing import (
    TYPE_CHECKING,
    Iterable,
    Literal,
    Pattern,
    Sequence,
    cast,
)

from pandas._libs import lib
from pandas._typing import (
    BaseBuffer,
    DtypeBackend,
    FilePath,
    ReadBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    AbstractMethodError,
    EmptyDataError,
)
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import is_list_like

from pandas import isna
from pandas.core.indexes.base import Index
from pandas.core.indexes.multi import MultiIndex
from pandas.core.series import Series

from pandas.io.common import (
    file_exists,
    get_handle,
    is_url,
    stringify_path,
    urlopen,
    validate_header_arg,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser

if TYPE_CHECKING:
    from pandas import DataFrame

_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False


def _importers() -> None:
    # import things we need
    # but make this done on a first use basis

    global _IMPORTS
    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
    bs4 = import_optional_dependency("bs4", errors="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency("html5lib", errors="ignore")
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
    """
    return regex.sub(" ", s.strip())
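

# A tiny illustration (not in the original source) of how the regex behaves:
# runs of newlines and runs of two or more whitespace characters each collapse
# to a single space.
#
#   >>> _remove_whitespace("  hello   world\nagain  ")
#   'hello world again'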


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
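

# A quick sketch (not in the original source) of the three accepted forms:
#
#   >>> _get_skiprows(slice(2, 10, 2))
#   [2, 4, 6, 8]
#   >>> _get_skiprows(3)
#   3
#   >>> _get_skiprows(None)
#   0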


def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        with get_handle(obj, "r", encoding=encoding) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj):
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
               - Put all rows into body
               - Move rows from top of body to header only if
                 all elements inside row are <th>
               - Move rows from bottom of body to footer only if
                 all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer
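
    # Illustrative sketch (not in the original source): for a table with no
    # <thead>, such as
    #
    #   <table>
    #     <tr><th>a</th><th>b</th></tr>
    #     <tr><td>1</td><td>2</td></tr>
    #   </table>
    #
    # the leading all-<th> row is promoted to the header, so this method
    # returns roughly ([["a", "b"]], [["1", "2"]], []).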

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        text: str | tuple
        remainder: list[
            tuple[int, str | tuple, int]
        ] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                if self.extract_links in ("all", section):
                    href = self._href_getter(td)
                    text = (text, href)
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
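
    # A worked example (not in the original source): given the rows
    #
    #   <tr><td rowspan="2">A</td><td>B</td></tr>
    #   <tr><td>C</td></tr>
    #
    # the "A" cell is carried into the second row via `remainder`, producing
    # [["A", "B"], ["A", "C"]].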

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]
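

# A descriptive note (not in the original source): the filter above strips
# spaces before matching, so "display: none", "display:none", and
# "display : none" are all treated as hidden when displayed_only is True.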


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(string=match) is not None:
                result.append(table)
                unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _href_getter(self, obj) -> str | None:
        a = obj.find("a", href=True)
        return None if not a else a["href"]

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding

        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)

        for br in soup.find_all("br"):
            br.replace_with("\n" + br.text)

        return soup


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"
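

# An illustrative call (not in the original source); note that ``class_`` is
# rewritten to ``class``:
#
#   >>> _build_xpath_expr({"id": "main", "class_": "wide"})
#   "[@id='main' and @class='wide']"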


_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # GH 49929
        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

        for br in r.xpath("*//br"):
            br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body) -> None:
    data = [len(elem) for elem in body]
    lens = Series(data)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
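

# Illustrative behavior (not in the original source): rows shorter than the
# longest row are right-padded with empty strings, in place.
#
#   >>> body = [["a", "b", "c"], ["d"]]
#   >>> _expand_elements(body)
#   >>> body
#   [['a', 'b', 'c'], ['d', '', '']]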


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    return _valid_parsers[flavor]
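

# Dispatch at a glance (illustrative only, assuming _importers() has already
# run and the optional dependencies are installed):
#
#   >>> _parser_dispatch("lxml")
#   <class 'pandas.io.html._LxmlFrameParser'>
#   >>> _parser_dispatch("bs4")
#   <class 'pandas.io.html._BeautifulSoupHtml5LibFrameParser'>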


def _print_as_set(s) -> str:
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
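

# Normalization examples (illustrative only): a missing flavor expands to the
# default fallback chain, and a single string becomes a one-element tuple.
#
#   >>> _validate_flavor(None)
#   ('lxml', 'bs4')
#   >>> _validate_flavor("bs4")
#   ('bs4',)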


def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret


def read_html(
    io: FilePath | ReadBuffer[str],
    *,
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``. ``None``
        preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>``, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, i.e., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )
    validate_header_arg(header)
    check_dtype_backend(dtype_backend)

    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
        dtype_backend=dtype_backend,
    )
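

# A minimal usage sketch (illustrative, not part of the module); the HTML
# literal below is made up for the example.
#
#   >>> import pandas as pd
#   >>> html = """
#   ... <table>
#   ...   <tr><th>name</th><th>value</th></tr>
#   ...   <tr><td>a</td><td>1</td></tr>
#   ... </table>
#   ... """
#   >>> [df] = pd.read_html(html)
#   >>> df
#     name  value
#   0    a      1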