1"""
2:mod:`pandas.io.html` is a module containing functionality for dealing with
3HTML IO.
5"""

from __future__ import annotations

from collections import abc
import numbers
import re
from typing import (
    TYPE_CHECKING,
    Iterable,
    Literal,
    Pattern,
    Sequence,
    cast,
)

from pandas._libs import lib
from pandas._typing import (
    BaseBuffer,
    DtypeBackend,
    FilePath,
    ReadBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    AbstractMethodError,
    EmptyDataError,
)
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import is_list_like

from pandas import isna
from pandas.core.indexes.base import Index
from pandas.core.indexes.multi import MultiIndex
from pandas.core.series import Series

from pandas.io.common import (
    file_exists,
    get_handle,
    is_url,
    stringify_path,
    urlopen,
    validate_header_arg,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser

if TYPE_CHECKING:
    from pandas import DataFrame

_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False


def _importers() -> None:
    # import things we need
    # but make this done on a first use basis

    global _IMPORTS
    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
    bs4 = import_optional_dependency("bs4", errors="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency("html5lib", errors="ignore")
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
    """
    return regex.sub(" ", s.strip())
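

# A tiny illustration (not in the original source) of how the regex behaves:
# runs of newlines and runs of two or more whitespace characters each collapse
# to a single space.
#
#   >>> _remove_whitespace("  hello   world\nagain  ")
#   'hello world again'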


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
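

# A quick sketch (not in the original source) of the three accepted forms:
#
#   >>> _get_skiprows(slice(2, 10, 2))
#   [2, 4, 6, 8]
#   >>> _get_skiprows(3)
#   3
#   >>> _get_skiprows(None)
#   0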


def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        with get_handle(obj, "r", encoding=encoding) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj):
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
               - Put all rows into body
               - Move rows from top of body to header only if
                 all elements inside row are <th>
               - Move rows from bottom of body to footer only if
                 all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer
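
    # Illustrative sketch (not in the original source): for a table with no
    # <thead>, such as
    #
    #   <table>
    #     <tr><th>a</th><th>b</th></tr>
    #     <tr><td>1</td><td>2</td></tr>
    #   </table>
    #
    # the leading all-<th> row is promoted to the header, so this method
    # returns roughly ([["a", "b"]], [["1", "2"]], []).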

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        text: str | tuple
        remainder: list[
            tuple[int, str | tuple, int]
        ] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                if self.extract_links in ("all", section):
                    href = self._href_getter(td)
                    text = (text, href)
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
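
    # A worked example (not in the original source): given the rows
    #
    #   <tr><td rowspan="2">A</td><td>B</td></tr>
    #   <tr><td>C</td></tr>
    #
    # the "A" cell is carried into the second row via `remainder`, producing
    # [["A", "B"], ["A", "C"]].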

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]
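

# A descriptive note (not in the original source): the filter above strips
# spaces before matching, so "display: none", "display:none", and
# "display : none" are all treated as hidden when displayed_only is True.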


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(string=match) is not None:
                result.append(table)
                unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _href_getter(self, obj) -> str | None:
        a = obj.find("a", href=True)
        return None if not a else a["href"]

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding

        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)

        for br in soup.find_all("br"):
            br.replace_with("\n" + br.text)

        return soup


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"
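

# An illustrative call (not in the original source); note that ``class_`` is
# rewritten to ``class``:
#
#   >>> _build_xpath_expr({"id": "main", "class_": "wide"})
#   "[@id='main' and @class='wide']"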


_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # GH 49929
        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

        for br in r.xpath("*//br"):
            br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body) -> None:
    data = [len(elem) for elem in body]
    lens = Series(data)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
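

# Illustrative behavior (not in the original source): rows shorter than the
# longest row are right-padded with empty strings, in place.
#
#   >>> body = [["a", "b", "c"], ["d"]]
#   >>> _expand_elements(body)
#   >>> body
#   [['a', 'b', 'c'], ['d', '', '']]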


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    return _valid_parsers[flavor]
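

# Dispatch at a glance (illustrative only, assuming _importers() has already
# run and the optional dependencies are installed):
#
#   >>> _parser_dispatch("lxml")
#   <class 'pandas.io.html._LxmlFrameParser'>
#   >>> _parser_dispatch("bs4")
#   <class 'pandas.io.html._BeautifulSoupHtml5LibFrameParser'>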


def _print_as_set(s) -> str:
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
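

# Normalization examples (illustrative only): a missing flavor expands to the
# default fallback chain, and a single string becomes a one-element tuple.
#
#   >>> _validate_flavor(None)
#   ('lxml', 'bs4')
#   >>> _validate_flavor("bs4")
#   ('bs4',)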


def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret


def read_html(
    io: FilePath | ReadBuffer[str],
    *,
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``. ``None``
        preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>``, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, i.e., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )
    validate_header_arg(header)
    check_dtype_backend(dtype_backend)

    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
        dtype_backend=dtype_backend,
    )
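

# A minimal usage sketch (illustrative, not part of the module); the HTML
# literal below is made up for the example.
#
#   >>> import pandas as pd
#   >>> html = """
#   ... <table>
#   ...   <tr><th>name</th><th>value</th></tr>
#   ...   <tr><td>a</td><td>1</td></tr>
#   ... </table>
#   ... """
#   >>> [df] = pd.read_html(html)
#   >>> df
#     name  value
#   0    a      1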