1"""
2:mod:``pandas.io.xml`` is a module for reading XML.
3"""
5from __future__ import annotations
7import io
8from os import PathLike
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13)
14import warnings
16from pandas._libs import lib
17from pandas.compat._optional import import_optional_dependency
18from pandas.errors import (
19 AbstractMethodError,
20 ParserError,
21)
22from pandas.util._decorators import doc
23from pandas.util._exceptions import find_stack_level
24from pandas.util._validators import check_dtype_backend
26from pandas.core.dtypes.common import is_list_like
28from pandas.core.shared_docs import _shared_docs
30from pandas.io.common import (
31 file_exists,
32 get_handle,
33 infer_compression,
34 is_file_like,
35 is_fsspec_url,
36 is_url,
37 stringify_path,
38)
39from pandas.io.parsers import TextParser
41if TYPE_CHECKING:
42 from collections.abc import Sequence
43 from xml.etree.ElementTree import Element
45 from lxml import etree
47 from pandas._typing import (
48 CompressionOptions,
49 ConvertersArg,
50 DtypeArg,
51 DtypeBackend,
52 FilePath,
53 ParseDatesArg,
54 ReadBuffer,
55 StorageOptions,
56 XMLParsers,
57 )
59 from pandas import DataFrame


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid XML ``str``, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The ``XPath`` expression to parse required set of nodes for
        migration to :class:`~pandas.DataFrame`. ``etree`` supports limited ``XPath``.

    namespaces : dict
        The namespaces defined in XML document (``xmlns:namespace='URI'``)
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified ``xpath``.

    attrs_only : bool
        Parse only the attributes at the specified ``xpath``.

    names : list
        Column names for :class:`~pandas.DataFrame` of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes.

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT.
        ``etree`` does not support XSLT but the argument is retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:
    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_iterparse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        self.is_style = None
        self.compression: CompressionOptions = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in ``xpath``, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        if self.elems_only:
            if self.names:
                dicts = [
                    {
                        **(
                            {el.tag: el.text}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {ch.tag: ch.text if ch.text else None for ch in el.findall("*")}
                    for el in elems
                ]

        elif self.attrs_only:
            dicts = [
                {k: v if v else None for k, v in el.attrib.items()} for el in elems
            ]

        elif self.names:
            dicts = [
                {
                    **el.attrib,
                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
                    **{
                        nm: ch.text if ch.text else None
                        for nm, ch in zip(self.names, el.findall("*"))
                    },
                }
                for el in elems
            ]

        else:
            dicts = [
                {
                    **el.attrib,
                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
                    **{ch.tag: ch.text if ch.text else None for ch in el.findall("*")},
                }
                for el in elems
            ]

        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read decompressed XML files on local disk for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding the entire tree in memory.

        Raises
        ------
        TypeError
            * If ``iterparse`` is not a dict or its dict value is not list-like.
        ParserError
            * If ``path_or_buffer`` is not a physical file on disk or file-like object.
            * If no data is returned from selected items in ``iterparse``.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        if (not hasattr(self.path_or_buffer, "read")) and (
            not isinstance(self.path_or_buffer, (str, PathLike))
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or (
                isinstance(self.path_or_buffer, str)
                and self.path_or_buffer.startswith(("<?xml", "<"))
            )
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        iterparse_repeats = len(self.iterparse[row_node]) != len(
            set(self.iterparse[row_node])
        )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names and iterparse_repeats:
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text if elem.text else None
                            if elem_val not in row.values() and nm not in row:
                                row[nm] = elem_val

                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate ``xpath``.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or there are issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If names is list-like but shorter than the number of child nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | etree._Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)
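

# A minimal, self-contained sketch (not part of pandas) of the row-dict shape that
# ``_parse_nodes`` produces: each row element becomes one flat dict of child tag ->
# text, with keys aligned across rows and gaps filled with None. The helper name and
# the sample XML below are illustrative assumptions only.
def _example_parse_nodes_sketch() -> list[dict[str, str | None]]:
    from xml.etree.ElementTree import fromstring

    xml = (
        "<data>"
        "<row><shape>square</shape><sides>4.0</sides></row>"
        "<row><shape>circle</shape></row>"
        "</data>"
    )
    elems = fromstring(xml).findall("./row")
    dicts = [
        {ch.tag: ch.text if ch.text else None for ch in el.findall("*")}
        for el in elems
    ]
    # Align keys across rows, filling missing children with None, mirroring the method above.
    keys = list(dict.fromkeys(k for d in dicts for k in d))
    return [{k: d.get(k) for k in keys} for d in dicts]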


class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        from xml.etree.ElementTree import iterparse

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Notes
        -----
        ``etree`` supports limited ``XPath``. If a user attempts a more complex
        expression, a ``SyntaxError`` will be raised.
        """

        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
            children = [ch for el in elems for ch in el.findall("*")]
            attrs = {k: v for el in elems for k, v in el.attrib.items()}

            if elems is None:
                raise ValueError(msg)

            if elems is not None:
                if self.elems_only and children == []:
                    raise ValueError(msg)
                if self.attrs_only and attrs == {}:
                    raise ValueError(msg)
                if children == [] and attrs == {}:
                    raise ValueError(msg)

        except (KeyError, SyntaxError):
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            )

        return elems

    def _validate_names(self) -> None:
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()
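

# A short sketch (illustrative, not pandas API) of the limited-XPath behavior noted
# above: with ``xml.etree.ElementTree`` a default namespace must be bound to a prefix
# in a namespaces dict before ``findall`` can match the nodes. The prefix "doc" and
# the sample URI are assumptions for the example.
def _example_etree_namespace_sketch() -> int:
    from xml.etree.ElementTree import fromstring

    xml = '<data xmlns="https://example.com"><row/><row/></data>'
    root = fromstring(xml)
    # Without the namespaces mapping this XPath would match nothing.
    rows = root.findall("doc:row", namespaces={"doc": "https://example.com"})
    return len(rows)  # 2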


class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into :class:`~pandas.DataFrame` with third-party
    full-featured XML library, ``lxml``, that supports
    ``XPath`` 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            if self.stylesheet:
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = [ch for el in elems for ch in el.xpath("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        if elems == []:
            raise ValueError(msg)

        if elems != []:
            if self.elems_only and children == []:
                raise ValueError(msg)
            if self.attrs_only and attrs == {}:
                raise ValueError(msg)
            if children == [] and attrs == {}:
                raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                document = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                document = parse(xml_data, parser=curr_parser)

        return document

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform the original xml using an XSLT script into
        an ideally flatter xml document for easier parsing and migration
        to a DataFrame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc
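

# A compact sketch (assuming lxml is installed; the XML and XSLT strings are made up)
# of the flattening step performed by ``_transform_doc`` above: an XSLT 1.0 stylesheet
# rewrites nested nodes into the shallow <row> layout this module expects.
def _example_xslt_flatten_sketch() -> bytes:
    from lxml import etree

    xml = etree.fromstring("<data><item><name>square</name></item></data>")
    xsl = etree.fromstring(
        b"""<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
              <xsl:template match="/data">
                <data><xsl:for-each select="item">
                  <row><shape><xsl:value-of select="name"/></shape></row>
                </xsl:for-each></data>
              </xsl:template>
            </xsl:stylesheet>"""
    )
    transformer = etree.XSLT(xsl)
    # e.g. b'<data><row><shape>square</shape></row></data>'
    return bytes(transformer(xml))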


def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer


def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Convert extracted raw data.

    This method will return underlying data of extracted XML content.
    The data either has a `read` attribute (e.g. a file object or a
    StringIO/BytesIO) or is a string or bytes that is an XML document.
    """

    if isinstance(data, str):
        data = io.StringIO(data)

    elif isinstance(data, bytes):
        data = io.BytesIO(data)

    return data
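

# A quick sketch (illustrative helper name) of the hand-off between the two helpers
# above: raw XML pulled from a path or buffer is plain ``str``/``bytes``, and
# ``preprocess_data`` wraps it in StringIO/BytesIO so downstream parsers can treat
# every input as a file-like object.
def _example_preprocess_sketch() -> tuple[type, type]:
    text = "<data><row><a>1</a></row></data>"
    wrapped_text = preprocess_data(text)  # -> io.StringIO
    wrapped_bytes = preprocess_data(text.encode())  # -> io.BytesIO
    return type(wrapped_text), type(wrapped_bytes)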


def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to DataFrame.

    This method will bind xml dictionary data of keys and values
    into named columns of a DataFrame using the built-in TextParser
    class that builds a DataFrame and infers specific dtypes.
    """

    tags = next(iter(data))
    nodes = [list(d.values()) for d in data]

    try:
        with TextParser(nodes, names=tags, **kwargs) as tp:
            return tp.read()
    except ParserError:
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        )
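

# A rough sketch of the shape hand-off documented above: ``parse_data`` yields a list
# of flat dicts, ``tags`` holds the column labels and ``nodes`` the row values. The
# DataFrame construction below only approximates the result; the real path goes
# through TextParser, which also infers dtypes and applies converters/parse_dates.
def _example_data_to_frame_sketch() -> DataFrame:
    import pandas as pd

    data = [
        {"shape": "square", "degrees": "360", "sides": "4.0"},
        {"shape": "circle", "degrees": "360", "sides": None},
    ]
    tags = next(iter(data))  # column labels from the first row
    nodes = [list(d.values()) for d in data]  # row values, aligned with tags
    return pd.DataFrame(nodes, columns=list(tags))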


def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    This method will conditionally call internal parsers:
    _LxmlFrameParser and/or _EtreeFrameParser.

    Raises
    ------
    ImportError
        * If lxml is not installed when selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """

    p: _EtreeFrameParser | _LxmlFrameParser

    if isinstance(path_or_buffer, str) and not any(
        [
            is_file_like(path_or_buffer),
            file_exists(path_or_buffer),
            is_url(path_or_buffer),
            is_fsspec_url(path_or_buffer),
        ]
    ):
        warnings.warn(
            "Passing literal xml to 'read_xml' is deprecated and "
            "will be removed in a future version. To read from a "
            "literal string, wrap it in a 'StringIO' object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if parser == "lxml":
        lxml = import_optional_dependency("lxml.etree", errors="ignore")

        if lxml is not None:
            p = _LxmlFrameParser(
                path_or_buffer,
                xpath,
                namespaces,
                elems_only,
                attrs_only,
                names,
                dtype,
                converters,
                parse_dates,
                encoding,
                stylesheet,
                iterparse,
                compression,
                storage_options,
            )
        else:
            raise ImportError("lxml not found, please install or use the etree parser.")

    elif parser == "etree":
        p = _EtreeFrameParser(
            path_or_buffer,
            xpath,
            namespaces,
            elems_only,
            attrs_only,
            names,
            dtype,
            converters,
            parse_dates,
            encoding,
            stylesheet,
            iterparse,
            compression,
            storage_options,
        )
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )
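

# A small sketch (helper name is illustrative) of the dispatch logic above: probe for
# lxml the same way ``_parse`` does and fall back to the standard-library parser when
# it is missing, instead of raising ImportError.
def _example_choose_parser_sketch() -> str:
    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    return "lxml" if lxml is not None else "etree"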


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    *,
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    r"""
    Read XML document into a :class:`~pandas.DataFrame` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

        .. deprecated:: 2.1.0
            Passing xml literal strings is deprecated.
            Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead.

    xpath : str, optional, default './\*'
        The ``XPath`` to parse required set of nodes for migration to
        :class:`~pandas.DataFrame`. ``XPath`` should return a collection of elements
        and not a single element. Note: The ``etree`` parser supports limited ``XPath``
        expressions. For more complex ``XPath``, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only : bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names : list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex ``XPath`` searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of the transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions are currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike ``xpath``, descendants do not need to relate to each other but
        can exist anywhere in the document under the repeating element. This
        memory-efficient method should be used for very large XML files
        (500MB, 1GB, or 5GB+). For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in the
    following format, which is the ideal fit for the two dimensions of a
    ``DataFrame`` (row by column). ::

        <root>
            <row>
                <column1>data</column1>
                <column2>data</column2>
                <column3>data</column3>
                ...
            </row>
            <row>
                ...
            </row>
            ...
        </root>

    As a file format, XML documents can be designed in any way, including the
    layout of elements and attributes, as long as they conform to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign the original document with XSLT (a special purpose
    language) into a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information on using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> from io import StringIO
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...   <row>
    ...     <shape>square</shape>
    ...     <degrees>360</degrees>
    ...     <sides>4.0</sides>
    ...   </row>
    ...   <row>
    ...     <shape>circle</shape>
    ...     <degrees>360</degrees>
    ...     <sides/>
    ...   </row>
    ...   <row>
    ...     <shape>triangle</shape>
    ...     <degrees>180</degrees>
    ...     <sides>3.0</sides>
    ...   </row>
    ... </data>'''

    >>> df = pd.read_xml(StringIO(xml))
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(StringIO(xml), xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(StringIO(xml),
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml_data = '''
    ... <data>
    ...   <row>
    ...     <index>0</index>
    ...     <a>1</a>
    ...     <b>2.5</b>
    ...     <c>True</c>
    ...     <d>a</d>
    ...     <e>2019-12-31 00:00:00</e>
    ...   </row>
    ...   <row>
    ...     <index>1</index>
    ...     <b>4.5</b>
    ...     <c>False</c>
    ...     <d>b</d>
    ...     <e>2019-12-31 00:00:00</e>
    ...   </row>
    ... </data>
    ... '''

    >>> df = pd.read_xml(StringIO(xml_data),
    ...                  dtype_backend="numpy_nullable",
    ...                  parse_dates=["e"])
    >>> df
       index     a    b      c  d          e
    0      0     1  2.5   True  a 2019-12-31
    1      1  <NA>  4.5  False  b 2019-12-31
    """
    check_dtype_backend(dtype_backend)

    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
        dtype_backend=dtype_backend,
    )
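

# A hedged end-to-end sketch of the ``iterparse`` path documented above: the file
# path "books.xml" and its element names are assumptions for illustration; the dict
# maps the repeating element to the descendant elements/attributes to collect.
def _example_read_xml_iterparse_sketch() -> DataFrame:
    import pandas as pd

    return pd.read_xml(
        "books.xml",  # hypothetical uncompressed file on local disk
        iterparse={"book": ["category", "title", "author", "year", "price"]},
        parser="lxml",
    )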