Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/xml.py: 17%
Shortcuts on this page:
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2:mod:`pandas.io.xml` is a module for reading XML.
3"""
5from __future__ import annotations
7import io
8from typing import (
9 Any,
10 Callable,
11 Sequence,
12)
14from pandas._libs import lib
15from pandas._typing import (
16 TYPE_CHECKING,
17 CompressionOptions,
18 ConvertersArg,
19 DtypeArg,
20 DtypeBackend,
21 FilePath,
22 ParseDatesArg,
23 ReadBuffer,
24 StorageOptions,
25 XMLParsers,
26)
27from pandas.compat._optional import import_optional_dependency
28from pandas.errors import (
29 AbstractMethodError,
30 ParserError,
31)
32from pandas.util._decorators import doc
33from pandas.util._validators import check_dtype_backend
35from pandas.core.dtypes.common import is_list_like
37from pandas.core.shared_docs import _shared_docs
39from pandas.io.common import (
40 file_exists,
41 get_handle,
42 infer_compression,
43 is_fsspec_url,
44 is_url,
45 stringify_path,
46)
47from pandas.io.parsers import TextParser
49if TYPE_CHECKING:
50 from xml.etree.ElementTree import Element
52 from lxml import etree
54 from pandas import DataFrame
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The XPath expression to parse required set of nodes for
        migration to `Data Frame`. `etree` supports limited XPath.

    namespaces : dict
        The namespaces defined in XML document (`xmlns:namespace='URI')
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified `xpath`.

    attrs_only : bool
        Parse only the attributes at the specified `xpath`.

    names : list
        Column names for Data Frame of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        `etree` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:`
    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_iterparse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        # NOTE(review): is_style is set but never read in this class; presumably
        # a leftover flag — confirm before removing.
        self.is_style = None
        self.compression = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in xpath, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        if self.elems_only:
            if self.names:
                # Keep the row element's own text (if non-whitespace) plus its
                # children, renamed positionally by self.names.
                dicts = [
                    {
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {
                        ch.tag: ch.text.strip() if ch.text else None
                        for ch in el.findall("*")
                    }
                    for el in elems
                ]

        elif self.attrs_only:
            dicts = [
                {k: v.strip() if v else None for k, v in el.attrib.items()}
                for el in elems
            ]

        else:
            if self.names:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]

            else:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            ch.tag: ch.text.strip() if ch.text else None
                            for ch in el.findall("*")
                        },
                    }
                    for el in elems
                ]

        # Strip namespace URIs: ElementTree spells qualified tags as "{uri}tag".
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Union of all keys (order-preserving); pad rows missing a key with None.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If `iterparse` is not a dict or its dict value is not list-like.
        ParserError
            * If `path_or_buffer` is not a physical file on disk or file-like object.
            * If no data is returned from selected items in `iterparse`.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # iterparse requires a plain, uncompressed file on local disk; reject
        # buffers, URLs, raw XML strings and compressed paths up front.
        if (not hasattr(self.path_or_buffer, "read")) and (
            not isinstance(self.path_or_buffer, str)
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or self.path_or_buffer.startswith(("<?xml", "<"))
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        # Duplicate names in the descendant list force positional renaming below.
        iterparse_repeats = len(self.iterparse[row_node]) != len(
            set(self.iterparse[row_node])
        )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names and iterparse_repeats:
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text.strip() if elem.text else None
                            if elem_val not in row.values() and nm not in row:
                                row[nm] = elem_val

                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text.strip() if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Free memory for finished elements; the getprevious/getparent
                # dance additionally drops processed siblings for lxml elements
                # (stdlib Elements lack getprevious, so only clear() applies).
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate xpath.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpah is not supported or issues with namespaces.

        ValueError
            * If xpah does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If value is not a list and less then length of nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | etree._Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse XML data into a list of row dicts.

        Validates xpath and names, then dispatches to tree parsing or
        memory-efficient iterparsing depending on ``self.iterparse``.
        """
        from xml.etree.ElementTree import iterparse

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Notes
        -----
        `etree` supports limited XPath. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
            children = [ch for el in elems for ch in el.findall("*")]
            attrs = {k: v for el in elems for k, v in el.attrib.items()}

            # findall always returns a list (never None), so an empty result
            # falls through to the children/attrs emptiness checks below.
            if self.elems_only and children == []:
                raise ValueError(msg)
            if self.attrs_only and attrs == {}:
                raise ValueError(msg)
            if children == [] and attrs == {}:
                raise ValueError(msg)

        except (KeyError, SyntaxError) as err:
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            ) from err

        return elems

    def _validate_names(self) -> None:
        """
        Check that ``names`` is list-like and covers the child elements.

        Raises
        ------
        ValueError
            If names has fewer entries than the row's child elements.
        TypeError
            If names is not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                # Compare against None explicitly: an Element with no children
                # is falsy, so `if parent` would wrongly discard a found but
                # childless node (and Element truth-testing is deprecated).
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Build an ElementTree root from path, buffer, or raw XML string.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            if self.stylesheet:
                # XSLT runs on the parsed doc; xpath then applies to the output.
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        # Evaluate xpath and reject results with nothing to tabulate.
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = [ch for el in elems for ch in el.xpath("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        if elems == []:
            raise ValueError(msg)

        if elems != []:
            if self.elems_only and children == []:
                raise ValueError(msg)
            if self.attrs_only and attrs == {}:
                raise ValueError(msg)
            if children == [] and attrs == {}:
                raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        # names must be list-like and at least as long as the first row's
        # children (or the iterparse descendant list).
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                # lxml cannot parse from a text buffer directly; re-encode the
                # text, which requires an explicit encoding.
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                document = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                document = parse(xml_data, parser=curr_parser)

        return document

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        am ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # Only path-like strings are opened: raw XML strings (starting with "<")
    # pass through untouched. The original also re-tested
    # `not isinstance(filepath_or_buffer, str)` inside the second clause,
    # which was dead code since the first clause already requires a str.
    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
        and (
            is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        )
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            # Materialize readable handles; pass non-readable handles through.
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer
def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Wrap raw XML content in an in-memory buffer.

    A ``str`` becomes a ``StringIO`` and ``bytes`` become a ``BytesIO``;
    anything else (typically an object that already supports ``read``)
    is returned unchanged.
    """
    if isinstance(data, str):
        return io.StringIO(data)
    if isinstance(data, bytes):
        return io.BytesIO(data)
    return data
742def _data_to_frame(data, **kwargs) -> DataFrame:
743 """
744 Convert parsed data to Data Frame.
746 This method will bind xml dictionary data of keys and values
747 into named columns of Data Frame using the built-in TextParser
748 class that build Data Frame and infers specific dtypes.
749 """
751 tags = next(iter(data))
752 nodes = [list(d.values()) for d in data]
754 try:
755 with TextParser(nodes, names=tags, **kwargs) as tp:
756 return tp.read()
757 except ParserError:
758 raise ParserError(
759 "XML document may be too complex for import. "
760 "Try to flatten document and use distinct "
761 "element and attribute names."
762 )
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    This method will conditionally call internal parsers:
    LxmlFrameParser and/or EtreeParser.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """
    p: _EtreeFrameParser | _LxmlFrameParser

    if parser == "lxml":
        # errors="ignore" lets us raise a tailored ImportError below instead
        # of the generic optional-dependency message.
        lxml = import_optional_dependency("lxml.etree", errors="ignore")

        if lxml is not None:
            p = _LxmlFrameParser(
                path_or_buffer,
                xpath,
                namespaces,
                elems_only,
                attrs_only,
                names,
                dtype,
                converters,
                parse_dates,
                encoding,
                stylesheet,
                iterparse,
                compression,
                storage_options,
            )
        else:
            raise ImportError("lxml not found, please install or use the etree parser.")

    elif parser == "etree":
        p = _EtreeFrameParser(
            path_or_buffer,
            xpath,
            namespaces,
            elems_only,
            attrs_only,
            names,
            dtype,
            converters,
            parse_dates,
            encoding,
            stylesheet,
            iterparse,
            compression,
            storage_options,
        )
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    *,
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    r"""
    Read XML document into a ``DataFrame`` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

    xpath : str, optional, default './\*'
        The XPath to parse required set of nodes for migration to DataFrame.
        XPath should return a collection of elements and not a single
        element. Note: The ``etree`` parser supports limited XPath
        expressions. For more complex XPath, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only :  bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names :  list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex XPath searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions is currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike xpath, descendants do not need to relate to each other but can
        exist any where in document under the repeating element. This memory-
        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
        For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimential.

        .. versionadded:: 2.0

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information in using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(xml)
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(xml, xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(xml,
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0
    """
    check_dtype_backend(dtype_backend)

    # All validation and parsing is delegated to _parse / the parser classes.
    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
        dtype_backend=dtype_backend,
    )