1"""
2:mod:`pandas.io.html` is a module containing functionality for dealing with
3HTML IO.
5"""
7from __future__ import annotations
9from collections import abc
10import numbers
11import re
12from re import Pattern
13from typing import (
14 TYPE_CHECKING,
15 Literal,
16 cast,
17)
18import warnings
20from pandas._libs import lib
21from pandas.compat._optional import import_optional_dependency
22from pandas.errors import (
23 AbstractMethodError,
24 EmptyDataError,
25)
26from pandas.util._decorators import doc
27from pandas.util._exceptions import find_stack_level
28from pandas.util._validators import check_dtype_backend
30from pandas.core.dtypes.common import is_list_like
32from pandas import isna
33from pandas.core.indexes.base import Index
34from pandas.core.indexes.multi import MultiIndex
35from pandas.core.series import Series
36from pandas.core.shared_docs import _shared_docs
38from pandas.io.common import (
39 file_exists,
40 get_handle,
41 is_file_like,
42 is_fsspec_url,
43 is_url,
44 stringify_path,
45 validate_header_arg,
46)
47from pandas.io.formats.printing import pprint_thing
48from pandas.io.parsers import TextParser
50if TYPE_CHECKING:
51 from collections.abc import (
52 Iterable,
53 Sequence,
54 )
56 from pandas._typing import (
57 BaseBuffer,
58 DtypeBackend,
59 FilePath,
60 HTMLFlavors,
61 ReadBuffer,
62 StorageOptions,
63 )
65 from pandas import DataFrame
67#############
68# READ HTML #
69#############
70_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
73def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
74 """
75 Replace extra whitespace inside of a string with a single space.
77 Parameters
78 ----------
79 s : str or unicode
80 The string from which to remove extra whitespace.
81 regex : re.Pattern
82 The regular expression to use to remove extra whitespace.
84 Returns
85 -------
86 subd : str or unicode
87 `s` with all extra whitespace replaced with a single space.
88 """
89 return regex.sub(" ", s.strip())
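
# A minimal illustration (doctest-style, not executed at import time) of how
# _remove_whitespace collapses newlines and runs of whitespace:
#
#   >>> _remove_whitespace("  foo\r\n  bar   baz ")
#   'foo bar baz'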


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
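
# Sketch of the normalization performed above (illustrative only):
#
#   >>> _get_skiprows(slice(0, 6, 2))
#   [0, 2, 4]
#   >>> _get_skiprows(3)
#   3
#   >>> _get_skiprows(None)
#   0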


def _read(
    obj: FilePath | BaseBuffer,
    encoding: str | None,
    storage_options: StorageOptions | None,
) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        with get_handle(
            obj, "r", encoding=encoding, storage_options=storage_options
        ) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
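
# Sketch of the dispatch above (illustrative only): URLs, file-like objects,
# and paths to existing files are opened via get_handle and read; a raw
# str/bytes blob of HTML passes through unchanged; anything else raises
# TypeError.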


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
        storage_options: StorageOptions = None,
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links
        self.storage_options = storage_options

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj) -> str | None:
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, document, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        document : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag) -> bool:
        """
        Return whether an individual DOM node matches a tag.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
        header, body, and footer, otherwise:
          - Put all rows into body
          - Move rows from top of body to header only if
            all elements inside row are <th>
          - Move rows from bottom of body to footer only if
            all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer
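
    # Illustrative sketch (not executed): for a table with no <thead>, e.g.
    #
    #   <table>
    #     <tr><th>a</th><th>b</th></tr>
    #     <tr><td>1</td><td>2</td></tr>
    #   </table>
    #
    # the top all-<th> row is promoted, so _parse_thead_tbody_tfoot returns
    # header=[["a", "b"]], body=[["1", "2"]], footer=[].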

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        text: str | tuple
        remainder: list[
            tuple[int, str | tuple, int]
        ] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                if self.extract_links in ("all", section):
                    href = self._href_getter(td)
                    text = (text, href)
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
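
    # Illustrative sketch (not executed): a header cell with colspan=2, e.g.
    # <tr><th colspan="2">AB</th><th>C</th></tr>, expands to
    # ["AB", "AB", "C"]; a cell with rowspan=2 is likewise copied into the
    # same column position of the following row via `remainder`.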

    def _handle_hidden_tables(self, tbl_list, attr_name: str):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]
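
    # Sketch of the filtering above (illustrative only): when
    # displayed_only=True, a table whose style attribute contains
    # "display: none" (in any spacing) is dropped, since spaces are stripped
    # before the substring test; `attr_name` is "attrs" for BeautifulSoup
    # nodes and "attrib" for lxml nodes.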


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def _parse_tables(self, document, match, attrs):
        element_name = "table"
        tables = document.find_all(element_name, attrs=attrs)
        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all("style"):
                    elem.decompose()

                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(string=match) is not None:
                result.append(table)
            unique_tables.add(table)
        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _href_getter(self, obj) -> str | None:
        a = obj.find("a", href=True)
        return None if not a else a["href"]

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag) -> bool:
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding, self.storage_options)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding

        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)

        for br in soup.find_all("br"):
            br.replace_with("\n" + br.text)

        return soup


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"


_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, document, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # GH 49929
        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = document.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//style"):
                    elem.drop_tree()
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.drop_tree()
        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag) -> bool:
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with get_handle(
                    self.io, "r", storage_options=self.storage_options
                ) as f:
                    r = parse(f.handle, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

            for br in r.xpath("*//br"):
                br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body) -> None:
    data = [len(elem) for elem in body]
    lens = Series(data)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
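
# Sketch of the in-place padding performed by _expand_elements (illustrative
# only); short rows are padded with empty strings to the longest row's length:
#
#   >>> rows = [["a", "b", "c"], ["d"]]
#   >>> _expand_elements(rows)
#   >>> rows
#   [['a', 'b', 'c'], ['d', '', '']]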


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : {{"lxml", "html5lib", "bs4"}} or None
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        import_optional_dependency("html5lib")
        import_optional_dependency("bs4")
    else:
        import_optional_dependency("lxml.etree")
    return _valid_parsers[flavor]


def _print_as_set(s) -> str:
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
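
# Sketch of the flavor normalization above (illustrative only):
#
#   >>> _validate_flavor(None)
#   ('lxml', 'bs4')
#   >>> _validate_flavor("bs4")
#   ('bs4',)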


def _parse(
    flavor,
    io,
    match,
    attrs,
    encoding,
    displayed_only,
    extract_links,
    storage_options,
    **kwargs,
):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(
            io,
            compiled_match,
            attrs,
            encoding,
            displayed_only,
            extract_links,
            storage_options,
        )

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret


@doc(storage_options=_shared_docs["storage_options"])
def read_html(
    io: FilePath | ReadBuffer[str],
    *,
    match: str | Pattern = ".+",
    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    storage_options: StorageOptions = None,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

        .. deprecated:: 2.1.0
            Passing html literal strings is deprecated.
            Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
        The parsing engine (or list of parsing engines) to use. 'bs4' and
        'html5lib' are synonymous with each other, they are both there for
        backwards compatibility. The default of ``None`` tries to use ``lxml``
        to parse and if that fails it falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {{'id': 'table'}}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {{'asdf': 'table'}}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    extract_links : {{None, "all", "header", "body", "footer"}}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    {storage_options}

        .. versionadded:: 2.1.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>`` element, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, i.e., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )

    validate_header_arg(header)
    check_dtype_backend(dtype_backend)

    io = stringify_path(io)

    if isinstance(io, str) and not any(
        [
            is_file_like(io),
            file_exists(io),
            is_url(io),
            is_fsspec_url(io),
        ]
    ):
        warnings.warn(
            "Passing literal html to 'read_html' is deprecated and "
            "will be removed in a future version. To read from a "
            "literal string, wrap it in a 'StringIO' object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
        dtype_backend=dtype_backend,
        storage_options=storage_options,
    )
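
# A minimal usage sketch (doctest-style, not executed at import time); the
# HTML snippet here is hypothetical. Literal markup is wrapped in StringIO
# because passing raw HTML strings directly is deprecated (see above):
#
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
#   >>> dfs = pd.read_html(StringIO(html))
#   >>> dfs[0]
#      a
#   0  1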