Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/xml.py: 17%
Shortcuts on this page:
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2:mod:`pandas.io.xml` is a module for reading XML.
3"""
5from __future__ import annotations
7import io
8from typing import (
9 Any,
10 Callable,
11 Sequence,
12)
14from pandas._libs import lib
15from pandas._typing import (
16 TYPE_CHECKING,
17 CompressionOptions,
18 ConvertersArg,
19 DtypeArg,
20 DtypeBackend,
21 FilePath,
22 ParseDatesArg,
23 ReadBuffer,
24 StorageOptions,
25 XMLParsers,
26)
27from pandas.compat._optional import import_optional_dependency
28from pandas.errors import (
29 AbstractMethodError,
30 ParserError,
31)
32from pandas.util._decorators import doc
33from pandas.util._validators import check_dtype_backend
35from pandas.core.dtypes.common import is_list_like
37from pandas.core.shared_docs import _shared_docs
39from pandas.io.common import (
40 file_exists,
41 get_handle,
42 infer_compression,
43 is_fsspec_url,
44 is_url,
45 stringify_path,
46)
47from pandas.io.parsers import TextParser
49if TYPE_CHECKING:
50 from xml.etree.ElementTree import Element
52 from lxml import etree
54 from pandas import DataFrame
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The XPath expression to parse required set of nodes for
        migration to `Data Frame`. `etree` supports limited XPath.

    namespaces : dict
        The namespaces defined in XML document (`xmlns:namespace='URI')
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified `xpath`.

    attrs_only : bool
        Parse only the attributes at the specified `xpath`.

    names : list
        Column names for Data Frame of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        `etree` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:`
    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_iterparse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        # NOTE(review): is_style is set but never read in this class; presumably
        # a leftover flag — confirm before removing.
        self.is_style = None
        self.compression = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in xpath, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        if self.elems_only:
            if self.names:
                # Keep the row element's own text (if non-whitespace) plus its
                # children, renamed positionally by self.names.
                dicts = [
                    {
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {
                        ch.tag: ch.text.strip() if ch.text else None
                        for ch in el.findall("*")
                    }
                    for el in elems
                ]

        elif self.attrs_only:
            dicts = [
                {k: v.strip() if v else None for k, v in el.attrib.items()}
                for el in elems
            ]

        else:
            if self.names:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]

            else:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            ch.tag: ch.text.strip() if ch.text else None
                            for ch in el.findall("*")
                        },
                    }
                    for el in elems
                ]

        # Strip namespace URIs: ElementTree spells qualified tags as "{uri}tag".
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Union of all keys (order-preserving); pad rows missing a key with None.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If `iterparse` is not a dict or its dict value is not list-like.
        ParserError
            * If `path_or_buffer` is not a physical file on disk or file-like object.
            * If no data is returned from selected items in `iterparse`.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # iterparse requires a plain, uncompressed file on local disk; reject
        # buffers, URLs, raw XML strings and compressed paths up front.
        if (not hasattr(self.path_or_buffer, "read")) and (
            not isinstance(self.path_or_buffer, str)
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or self.path_or_buffer.startswith(("<?xml", "<"))
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        # Duplicate names in the descendant list force positional renaming below.
        iterparse_repeats = len(self.iterparse[row_node]) != len(
            set(self.iterparse[row_node])
        )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names and iterparse_repeats:
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text.strip() if elem.text else None
                            if elem_val not in row.values() and nm not in row:
                                row[nm] = elem_val

                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text.strip() if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Free memory for finished elements; the getprevious/getparent
                # dance additionally drops processed siblings for lxml elements
                # (stdlib Elements lack getprevious, so only clear() applies).
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate xpath.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpah is not supported or issues with namespaces.

        ValueError
            * If xpah does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If value is not a list and less then length of nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | etree._Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse XML data into a list of row dicts.

        Validates xpath and names, then dispatches to tree parsing or
        memory-efficient iterparsing depending on ``self.iterparse``.
        """
        from xml.etree.ElementTree import iterparse

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Notes
        -----
        `etree` supports limited XPath. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
            children = [ch for el in elems for ch in el.findall("*")]
            attrs = {k: v for el in elems for k, v in el.attrib.items()}

            # findall always returns a list (never None), so an empty result
            # falls through to the children/attrs emptiness checks below.
            if self.elems_only and children == []:
                raise ValueError(msg)
            if self.attrs_only and attrs == {}:
                raise ValueError(msg)
            if children == [] and attrs == {}:
                raise ValueError(msg)

        except (KeyError, SyntaxError) as err:
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            ) from err

        return elems

    def _validate_names(self) -> None:
        """
        Check that ``names`` is list-like and covers the child elements.

        Raises
        ------
        ValueError
            If names has fewer entries than the row's child elements.
        TypeError
            If names is not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                # Compare against None explicitly: an Element with no children
                # is falsy, so `if parent` would wrongly discard a found but
                # childless node (and Element truth-testing is deprecated).
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Build an ElementTree root from path, buffer, or raw XML string.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            if self.stylesheet:
                # XSLT runs on the parsed doc; xpath then applies to the output.
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        # Evaluate xpath and reject results with nothing to tabulate.
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = [ch for el in elems for ch in el.xpath("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        if elems == []:
            raise ValueError(msg)

        if elems != []:
            if self.elems_only and children == []:
                raise ValueError(msg)
            if self.attrs_only and attrs == {}:
                raise ValueError(msg)
            if children == [] and attrs == {}:
                raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        # names must be list-like and at least as long as the first row's
        # children (or the iterparse descendant list).
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                # lxml cannot parse from a text buffer directly; re-encode the
                # text, which requires an explicit encoding.
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                document = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                document = parse(xml_data, parser=curr_parser)

        return document

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        am ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # Only path-like strings are opened: raw XML strings (starting with "<")
    # pass through untouched. The original also re-tested
    # `not isinstance(filepath_or_buffer, str)` inside the second clause,
    # which was dead code since the first clause already requires a str.
    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
        and (
            is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        )
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            # Materialize readable handles; pass non-readable handles through.
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer
def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Wrap raw XML content in an in-memory buffer.

    A ``str`` becomes a ``StringIO`` and ``bytes`` become a ``BytesIO``;
    anything else (typically an object that already supports ``read``)
    is returned unchanged.
    """
    if isinstance(data, str):
        return io.StringIO(data)
    if isinstance(data, bytes):
        return io.BytesIO(data)
    return data
742def _data_to_frame(data, **kwargs) -> DataFrame:
743 """
744 Convert parsed data to Data Frame.
746 This method will bind xml dictionary data of keys and values
747 into named columns of Data Frame using the built-in TextParser
748 class that build Data Frame and infers specific dtypes.
749 """
751 tags = next(iter(data))
752 nodes = [list(d.values()) for d in data]
754 try:
755 with TextParser(nodes, names=tags, **kwargs) as tp:
756 return tp.read()
757 except ParserError:
758 raise ParserError(
759 "XML document may be too complex for import. "
760 "Try to flatten document and use distinct "
761 "element and attribute names."
762 )
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    This method will conditionally call internal parsers:
    LxmlFrameParser and/or EtreeParser.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """
    p: _EtreeFrameParser | _LxmlFrameParser

    if parser == "lxml":
        # errors="ignore" lets us raise a tailored ImportError below instead
        # of the generic optional-dependency message.
        lxml = import_optional_dependency("lxml.etree", errors="ignore")

        if lxml is not None:
            p = _LxmlFrameParser(
                path_or_buffer,
                xpath,
                namespaces,
                elems_only,
                attrs_only,
                names,
                dtype,
                converters,
                parse_dates,
                encoding,
                stylesheet,
                iterparse,
                compression,
                storage_options,
            )
        else:
            raise ImportError("lxml not found, please install or use the etree parser.")

    elif parser == "etree":
        p = _EtreeFrameParser(
            path_or_buffer,
            xpath,
            namespaces,
            elems_only,
            attrs_only,
            names,
            dtype,
            converters,
            parse_dates,
            encoding,
            stylesheet,
            iterparse,
            compression,
            storage_options,
        )
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    *,
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    r"""
    Read XML document into a ``DataFrame`` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

    xpath : str, optional, default './\*'
        The XPath to parse required set of nodes for migration to DataFrame.
        XPath should return a collection of elements and not a single
        element. Note: The ``etree`` parser supports limited XPath
        expressions. For more complex XPath, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only :  bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names :  list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex XPath searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions is currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike xpath, descendants do not need to relate to each other but can
        exist any where in document under the repeating element. This memory-
        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
        For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimential.

        .. versionadded:: 2.0

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information in using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(xml)
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(xml, xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(xml,
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0
    """
    check_dtype_backend(dtype_backend)

    # All validation and parsing is delegated to _parse / the parser classes.
    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
        dtype_backend=dtype_backend,
    )