1"""
2:mod:``pandas.io.xml`` is a module for reading XML.
3"""
5from __future__ import annotations
7import io
8from os import PathLike
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13)
14import warnings
16from pandas._libs import lib
17from pandas.compat._optional import import_optional_dependency
18from pandas.errors import (
19 AbstractMethodError,
20 ParserError,
21)
22from pandas.util._decorators import doc
23from pandas.util._exceptions import find_stack_level
24from pandas.util._validators import check_dtype_backend
26from pandas.core.dtypes.common import is_list_like
28from pandas.core.shared_docs import _shared_docs
30from pandas.io.common import (
31 file_exists,
32 get_handle,
33 infer_compression,
34 is_file_like,
35 is_fsspec_url,
36 is_url,
37 stringify_path,
38)
39from pandas.io.parsers import TextParser
41if TYPE_CHECKING:
42 from collections.abc import Sequence
43 from xml.etree.ElementTree import Element
45 from lxml import etree
47 from pandas._typing import (
48 CompressionOptions,
49 ConvertersArg,
50 DtypeArg,
51 DtypeBackend,
52 FilePath,
53 ParseDatesArg,
54 ReadBuffer,
55 StorageOptions,
56 XMLParsers,
57 )
59 from pandas import DataFrame


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid XML ``str``, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The ``XPath`` expression to parse required set of nodes for
        migration to :class:`~pandas.DataFrame`. ``etree`` supports limited ``XPath``.

    namespaces : dict
        The namespaces defined in XML document (``xmlns:namespace='URI'``)
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified ``xpath``.

    attrs_only : bool
        Parse only the attributes at the specified ``xpath``.

    names : list
        Column names for :class:`~pandas.DataFrame` of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes.

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT.
        ``etree`` does not support XSLT but the argument is retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:
    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_iterparse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        self.is_style = None
        self.compression: CompressionOptions = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in ``xpath``, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        if self.elems_only:
            if self.names:
                dicts = [
                    {
                        **(
                            {el.tag: el.text}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {ch.tag: ch.text if ch.text else None for ch in el.findall("*")}
                    for el in elems
                ]

        elif self.attrs_only:
            dicts = [
                {k: v if v else None for k, v in el.attrib.items()} for el in elems
            ]

        elif self.names:
            dicts = [
                {
                    **el.attrib,
                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
                    **{
                        nm: ch.text if ch.text else None
                        for nm, ch in zip(self.names, el.findall("*"))
                    },
                }
                for el in elems
            ]

        else:
            dicts = [
                {
                    **el.attrib,
                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
                    **{ch.tag: ch.text if ch.text else None for ch in el.findall("*")},
                }
                for el in elems
            ]

        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read decompressed XML files on local disk for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding the entire tree in memory.

        Raises
        ------
        TypeError
            * If ``iterparse`` is not a dict or its dict value is not list-like.
        ParserError
            * If ``path_or_buffer`` is not a physical file on disk or file-like object.
            * If no data is returned from selected items in ``iterparse``.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        if (not hasattr(self.path_or_buffer, "read")) and (
            not isinstance(self.path_or_buffer, (str, PathLike))
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or (
                isinstance(self.path_or_buffer, str)
                and self.path_or_buffer.startswith(("<?xml", "<"))
            )
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        iterparse_repeats = len(self.iterparse[row_node]) != len(
            set(self.iterparse[row_node])
        )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names and iterparse_repeats:
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text if elem.text else None
                            if elem_val not in row.values() and nm not in row:
                                row[nm] = elem_val

                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate ``xpath``.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or there are issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If names is list-like but shorter than the number of child nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | etree._Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)
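

# A minimal, self-contained sketch (not part of pandas) of the row-dict shape that
# ``_parse_nodes`` produces: each row element becomes one flat dict of child tag ->
# text, with keys aligned across rows and gaps filled with None. The helper name and
# the sample XML below are illustrative assumptions only.
def _example_parse_nodes_sketch() -> list[dict[str, str | None]]:
    from xml.etree.ElementTree import fromstring

    xml = (
        "<data>"
        "<row><shape>square</shape><sides>4.0</sides></row>"
        "<row><shape>circle</shape></row>"
        "</data>"
    )
    elems = fromstring(xml).findall("./row")
    dicts = [
        {ch.tag: ch.text if ch.text else None for ch in el.findall("*")}
        for el in elems
    ]
    # Align keys across rows, filling missing children with None, mirroring the method above.
    keys = list(dict.fromkeys(k for d in dicts for k in d))
    return [{k: d.get(k) for k in keys} for d in dicts]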


class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        from xml.etree.ElementTree import iterparse

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Notes
        -----
        ``etree`` supports limited ``XPath``. If a user attempts a more complex
        expression, a ``SyntaxError`` will be raised.
        """

        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
            children = [ch for el in elems for ch in el.findall("*")]
            attrs = {k: v for el in elems for k, v in el.attrib.items()}

            if elems is None:
                raise ValueError(msg)

            if elems is not None:
                if self.elems_only and children == []:
                    raise ValueError(msg)
                if self.attrs_only and attrs == {}:
                    raise ValueError(msg)
                if children == [] and attrs == {}:
                    raise ValueError(msg)

        except (KeyError, SyntaxError):
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            )

        return elems

    def _validate_names(self) -> None:
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()
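

# A short sketch (illustrative, not pandas API) of the limited-XPath behavior noted
# above: with ``xml.etree.ElementTree`` a default namespace must be bound to a prefix
# in a namespaces dict before ``findall`` can match the nodes. The prefix "doc" and
# the sample URI are assumptions for the example.
def _example_etree_namespace_sketch() -> int:
    from xml.etree.ElementTree import fromstring

    xml = '<data xmlns="https://example.com"><row/><row/></data>'
    root = fromstring(xml)
    # Without the namespaces mapping this XPath would match nothing.
    rows = root.findall("doc:row", namespaces={"doc": "https://example.com"})
    return len(rows)  # 2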


class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into :class:`~pandas.DataFrame` with third-party
    full-featured XML library, ``lxml``, that supports
    ``XPath`` 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            if self.stylesheet:
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = [ch for el in elems for ch in el.xpath("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        if elems == []:
            raise ValueError(msg)

        if elems != []:
            if self.elems_only and children == []:
                raise ValueError(msg)
            if self.attrs_only and attrs == {}:
                raise ValueError(msg)
            if children == [] and attrs == {}:
                raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                document = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                document = parse(xml_data, parser=curr_parser)

        return document

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform the original xml using an XSLT script into
        an ideally flatter xml document for easier parsing and migration
        to a DataFrame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc
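

# A compact sketch (assuming lxml is installed; the XML and XSLT strings are made up)
# of the flattening step performed by ``_transform_doc`` above: an XSLT 1.0 stylesheet
# rewrites nested nodes into the shallow <row> layout this module expects.
def _example_xslt_flatten_sketch() -> bytes:
    from lxml import etree

    xml = etree.fromstring("<data><item><name>square</name></item></data>")
    xsl = etree.fromstring(
        b"""<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
              <xsl:template match="/data">
                <data><xsl:for-each select="item">
                  <row><shape><xsl:value-of select="name"/></shape></row>
                </xsl:for-each></data>
              </xsl:template>
            </xsl:stylesheet>"""
    )
    transformer = etree.XSLT(xsl)
    # e.g. b'<data><row><shape>square</shape></row></data>'
    return bytes(transformer(xml))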


def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer


def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Convert extracted raw data.

    This method will return underlying data of extracted XML content.
    The data either has a `read` attribute (e.g. a file object or a
    StringIO/BytesIO) or is a string or bytes that is an XML document.
    """

    if isinstance(data, str):
        data = io.StringIO(data)

    elif isinstance(data, bytes):
        data = io.BytesIO(data)

    return data
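

# A quick sketch (illustrative helper name) of the hand-off between the two helpers
# above: raw XML pulled from a path or buffer is plain ``str``/``bytes``, and
# ``preprocess_data`` wraps it in StringIO/BytesIO so downstream parsers can treat
# every input as a file-like object.
def _example_preprocess_sketch() -> tuple[type, type]:
    text = "<data><row><a>1</a></row></data>"
    wrapped_text = preprocess_data(text)  # -> io.StringIO
    wrapped_bytes = preprocess_data(text.encode())  # -> io.BytesIO
    return type(wrapped_text), type(wrapped_bytes)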


def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to DataFrame.

    This method will bind xml dictionary data of keys and values
    into named columns of a DataFrame using the built-in TextParser
    class that builds a DataFrame and infers specific dtypes.
    """

    tags = next(iter(data))
    nodes = [list(d.values()) for d in data]

    try:
        with TextParser(nodes, names=tags, **kwargs) as tp:
            return tp.read()
    except ParserError:
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        )
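

# A rough sketch of the shape hand-off documented above: ``parse_data`` yields a list
# of flat dicts, ``tags`` holds the column labels and ``nodes`` the row values. The
# DataFrame construction below only approximates the result; the real path goes
# through TextParser, which also infers dtypes and applies converters/parse_dates.
def _example_data_to_frame_sketch() -> DataFrame:
    import pandas as pd

    data = [
        {"shape": "square", "degrees": "360", "sides": "4.0"},
        {"shape": "circle", "degrees": "360", "sides": None},
    ]
    tags = next(iter(data))  # column labels from the first row
    nodes = [list(d.values()) for d in data]  # row values, aligned with tags
    return pd.DataFrame(nodes, columns=list(tags))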


def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    This method will conditionally call internal parsers:
    _LxmlFrameParser and/or _EtreeFrameParser.

    Raises
    ------
    ImportError
        * If lxml is not installed when selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """

    p: _EtreeFrameParser | _LxmlFrameParser

    if isinstance(path_or_buffer, str) and not any(
        [
            is_file_like(path_or_buffer),
            file_exists(path_or_buffer),
            is_url(path_or_buffer),
            is_fsspec_url(path_or_buffer),
        ]
    ):
        warnings.warn(
            "Passing literal xml to 'read_xml' is deprecated and "
            "will be removed in a future version. To read from a "
            "literal string, wrap it in a 'StringIO' object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if parser == "lxml":
        lxml = import_optional_dependency("lxml.etree", errors="ignore")

        if lxml is not None:
            p = _LxmlFrameParser(
                path_or_buffer,
                xpath,
                namespaces,
                elems_only,
                attrs_only,
                names,
                dtype,
                converters,
                parse_dates,
                encoding,
                stylesheet,
                iterparse,
                compression,
                storage_options,
            )
        else:
            raise ImportError("lxml not found, please install or use the etree parser.")

    elif parser == "etree":
        p = _EtreeFrameParser(
            path_or_buffer,
            xpath,
            namespaces,
            elems_only,
            attrs_only,
            names,
            dtype,
            converters,
            parse_dates,
            encoding,
            stylesheet,
            iterparse,
            compression,
            storage_options,
        )
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )
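

# A small sketch (helper name is illustrative) of the dispatch logic above: probe for
# lxml the same way ``_parse`` does and fall back to the standard-library parser when
# it is missing, instead of raising ImportError.
def _example_choose_parser_sketch() -> str:
    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    return "lxml" if lxml is not None else "etree"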


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    *,
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    r"""
    Read XML document into a :class:`~pandas.DataFrame` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

        .. deprecated:: 2.1.0
            Passing xml literal strings is deprecated.
            Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead.

    xpath : str, optional, default './\*'
        The ``XPath`` to parse required set of nodes for migration to
        :class:`~pandas.DataFrame`. ``XPath`` should return a collection of elements
        and not a single element. Note: The ``etree`` parser supports limited ``XPath``
        expressions. For more complex ``XPath``, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only : bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names : list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex ``XPath`` searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of the transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions are currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike ``xpath``, descendants do not need to relate to each other but
        can exist anywhere in the document under the repeating element. This
        memory-efficient method should be used for very large XML files
        (500MB, 1GB, or 5GB+). For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in the
    following format, which is the ideal fit for the two dimensions of a
    ``DataFrame`` (row by column). ::

        <root>
            <row>
                <column1>data</column1>
                <column2>data</column2>
                <column3>data</column3>
                ...
            </row>
            <row>
                ...
            </row>
            ...
        </root>

    As a file format, XML documents can be designed in any way, including the
    layout of elements and attributes, as long as they conform to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign the original document with XSLT (a special purpose
    language) into a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information on using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> from io import StringIO
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...   <row>
    ...     <shape>square</shape>
    ...     <degrees>360</degrees>
    ...     <sides>4.0</sides>
    ...   </row>
    ...   <row>
    ...     <shape>circle</shape>
    ...     <degrees>360</degrees>
    ...     <sides/>
    ...   </row>
    ...   <row>
    ...     <shape>triangle</shape>
    ...     <degrees>180</degrees>
    ...     <sides>3.0</sides>
    ...   </row>
    ... </data>'''

    >>> df = pd.read_xml(StringIO(xml))
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(StringIO(xml), xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(StringIO(xml),
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml_data = '''
    ... <data>
    ...   <row>
    ...     <index>0</index>
    ...     <a>1</a>
    ...     <b>2.5</b>
    ...     <c>True</c>
    ...     <d>a</d>
    ...     <e>2019-12-31 00:00:00</e>
    ...   </row>
    ...   <row>
    ...     <index>1</index>
    ...     <b>4.5</b>
    ...     <c>False</c>
    ...     <d>b</d>
    ...     <e>2019-12-31 00:00:00</e>
    ...   </row>
    ... </data>
    ... '''

    >>> df = pd.read_xml(StringIO(xml_data),
    ...                  dtype_backend="numpy_nullable",
    ...                  parse_dates=["e"])
    >>> df
       index     a    b      c  d          e
    0      0     1  2.5   True  a 2019-12-31
    1      1  <NA>  4.5  False  b 2019-12-31
    """
    check_dtype_backend(dtype_backend)

    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
        dtype_backend=dtype_backend,
    )
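

# A hedged end-to-end sketch of the ``iterparse`` path documented above: the file
# path "books.xml" and its element names are assumptions for illustration; the dict
# maps the repeating element to the descendant elements/attributes to collect.
def _example_read_xml_iterparse_sketch() -> DataFrame:
    import pandas as pd

    return pd.read_xml(
        "books.xml",  # hypothetical uncompressed file on local disk
        iterparse={"book": ["category", "title", "author", "year", "price"]},
        parser="lxml",
    )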