1"""
2:mod:`pandas.io.html` is a module containing functionality for dealing with
3HTML IO.
5"""
7from __future__ import annotations
9from collections import abc
10import numbers
11import re
12from re import Pattern
13from typing import (
14 TYPE_CHECKING,
15 Literal,
16 cast,
17)
18import warnings
20from pandas._libs import lib
21from pandas.compat._optional import import_optional_dependency
22from pandas.errors import (
23 AbstractMethodError,
24 EmptyDataError,
25)
26from pandas.util._decorators import doc
27from pandas.util._exceptions import find_stack_level
28from pandas.util._validators import check_dtype_backend
30from pandas.core.dtypes.common import is_list_like
32from pandas import isna
33from pandas.core.indexes.base import Index
34from pandas.core.indexes.multi import MultiIndex
35from pandas.core.series import Series
36from pandas.core.shared_docs import _shared_docs
38from pandas.io.common import (
39 file_exists,
40 get_handle,
41 is_file_like,
42 is_fsspec_url,
43 is_url,
44 stringify_path,
45 validate_header_arg,
46)
47from pandas.io.formats.printing import pprint_thing
48from pandas.io.parsers import TextParser
50if TYPE_CHECKING:
51 from collections.abc import (
52 Iterable,
53 Sequence,
54 )
56 from pandas._typing import (
57 BaseBuffer,
58 DtypeBackend,
59 FilePath,
60 HTMLFlavors,
61 ReadBuffer,
62 StorageOptions,
63 )
65 from pandas import DataFrame
67#############
68# READ HTML #
69#############
70_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
73def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
74 """
75 Replace extra whitespace inside of a string with a single space.
77 Parameters
78 ----------
79 s : str or unicode
80 The string from which to remove extra whitespace.
81 regex : re.Pattern
82 The regular expression to use to remove extra whitespace.
84 Returns
85 -------
86 subd : str or unicode
87 `s` with all extra whitespace replaced with a single space.
88 """
89 return regex.sub(" ", s.strip())
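
# A minimal illustration (doctest-style, not executed at import time) of how
# _remove_whitespace collapses newlines and runs of whitespace:
#
#   >>> _remove_whitespace("  foo\r\n  bar   baz ")
#   'foo bar baz'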


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
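
# Sketch of the normalization performed above (illustrative only):
#
#   >>> _get_skiprows(slice(0, 6, 2))
#   [0, 2, 4]
#   >>> _get_skiprows(3)
#   3
#   >>> _get_skiprows(None)
#   0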


def _read(
    obj: FilePath | BaseBuffer,
    encoding: str | None,
    storage_options: StorageOptions | None,
) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        with get_handle(
            obj, "r", encoding=encoding, storage_options=storage_options
        ) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
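
# Sketch of the dispatch above (illustrative only): URLs, file-like objects,
# and paths to existing files are opened via get_handle and read; a raw
# str/bytes blob of HTML passes through unchanged; anything else raises
# TypeError.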


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
        storage_options: StorageOptions = None,
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links
        self.storage_options = storage_options

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj) -> str | None:
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, document, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        document : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag) -> bool:
        """
        Return whether an individual DOM node matches a tag.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
        header, body, and footer, otherwise:
          - Put all rows into body
          - Move rows from top of body to header only if
            all elements inside row are <th>
          - Move rows from bottom of body to footer only if
            all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer
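
    # Illustrative sketch (not executed): for a table with no <thead>, e.g.
    #
    #   <table>
    #     <tr><th>a</th><th>b</th></tr>
    #     <tr><td>1</td><td>2</td></tr>
    #   </table>
    #
    # the top all-<th> row is promoted, so _parse_thead_tbody_tfoot returns
    # header=[["a", "b"]], body=[["1", "2"]], footer=[].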

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        text: str | tuple
        remainder: list[
            tuple[int, str | tuple, int]
        ] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                if self.extract_links in ("all", section):
                    href = self._href_getter(td)
                    text = (text, href)
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
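
    # Illustrative sketch (not executed): a header cell with colspan=2, e.g.
    # <tr><th colspan="2">AB</th><th>C</th></tr>, expands to
    # ["AB", "AB", "C"]; a cell with rowspan=2 is likewise copied into the
    # same column position of the following row via `remainder`.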

    def _handle_hidden_tables(self, tbl_list, attr_name: str):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]
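
    # Sketch of the filtering above (illustrative only): when
    # displayed_only=True, a table whose style attribute contains
    # "display: none" (in any spacing) is dropped, since spaces are stripped
    # before the substring test; `attr_name` is "attrs" for BeautifulSoup
    # nodes and "attrib" for lxml nodes.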


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def _parse_tables(self, document, match, attrs):
        element_name = "table"
        tables = document.find_all(element_name, attrs=attrs)
        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all("style"):
                    elem.decompose()

                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(string=match) is not None:
                result.append(table)
            unique_tables.add(table)
        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _href_getter(self, obj) -> str | None:
        a = obj.find("a", href=True)
        return None if not a else a["href"]

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag) -> bool:
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding, self.storage_options)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding

        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)

        for br in soup.find_all("br"):
            br.replace_with("\n" + br.text)

        return soup


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"


_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, document, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # GH 49929
        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = document.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//style"):
                    elem.drop_tree()
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.drop_tree()
        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag) -> bool:
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with get_handle(
                    self.io, "r", storage_options=self.storage_options
                ) as f:
                    r = parse(f.handle, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

            for br in r.xpath("*//br"):
                br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body) -> None:
    data = [len(elem) for elem in body]
    lens = Series(data)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
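
# Sketch of the in-place padding performed by _expand_elements (illustrative
# only); short rows are padded with empty strings to the longest row's length:
#
#   >>> rows = [["a", "b", "c"], ["d"]]
#   >>> _expand_elements(rows)
#   >>> rows
#   [['a', 'b', 'c'], ['d', '', '']]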


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : {{"lxml", "html5lib", "bs4"}} or None
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        import_optional_dependency("html5lib")
        import_optional_dependency("bs4")
    else:
        import_optional_dependency("lxml.etree")
    return _valid_parsers[flavor]


def _print_as_set(s) -> str:
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
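
# Sketch of the flavor normalization above (illustrative only):
#
#   >>> _validate_flavor(None)
#   ('lxml', 'bs4')
#   >>> _validate_flavor("bs4")
#   ('bs4',)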


def _parse(
    flavor,
    io,
    match,
    attrs,
    encoding,
    displayed_only,
    extract_links,
    storage_options,
    **kwargs,
):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(
            io,
            compiled_match,
            attrs,
            encoding,
            displayed_only,
            extract_links,
            storage_options,
        )

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret


@doc(storage_options=_shared_docs["storage_options"])
def read_html(
    io: FilePath | ReadBuffer[str],
    *,
    match: str | Pattern = ".+",
    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    storage_options: StorageOptions = None,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

        .. deprecated:: 2.1.0
            Passing html literal strings is deprecated.
            Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
        The parsing engine (or list of parsing engines) to use. 'bs4' and
        'html5lib' are synonymous with each other, they are both there for
        backwards compatibility. The default of ``None`` tries to use ``lxml``
        to parse and if that fails it falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {{'id': 'table'}}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {{'asdf': 'table'}}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    extract_links : {{None, "all", "header", "body", "footer"}}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    {storage_options}

        .. versionadded:: 2.1.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>`` element, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, i.e., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )

    validate_header_arg(header)
    check_dtype_backend(dtype_backend)

    io = stringify_path(io)

    if isinstance(io, str) and not any(
        [
            is_file_like(io),
            file_exists(io),
            is_url(io),
            is_fsspec_url(io),
        ]
    ):
        warnings.warn(
            "Passing literal html to 'read_html' is deprecated and "
            "will be removed in a future version. To read from a "
            "literal string, wrap it in a 'StringIO' object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
        dtype_backend=dtype_backend,
        storage_options=storage_options,
    )
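
# A minimal usage sketch (doctest-style, not executed at import time); the
# HTML snippet here is hypothetical. Literal markup is wrapped in StringIO
# because passing raw HTML strings directly is deprecated (see above):
#
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
#   >>> dfs = pd.read_html(StringIO(html))
#   >>> dfs[0]
#      a
#   0  1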