Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/xml.py: 17%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

247 statements  

1""" 

2:mod:``pandas.io.xml`` is a module for reading XML. 

3""" 

4 

5from __future__ import annotations 

6 

7import io 

8from os import PathLike 

9from typing import ( 

10 TYPE_CHECKING, 

11 Any, 

12 Callable, 

13) 

14import warnings 

15 

16from pandas._libs import lib 

17from pandas.compat._optional import import_optional_dependency 

18from pandas.errors import ( 

19 AbstractMethodError, 

20 ParserError, 

21) 

22from pandas.util._decorators import doc 

23from pandas.util._exceptions import find_stack_level 

24from pandas.util._validators import check_dtype_backend 

25 

26from pandas.core.dtypes.common import is_list_like 

27 

28from pandas.core.shared_docs import _shared_docs 

29 

30from pandas.io.common import ( 

31 file_exists, 

32 get_handle, 

33 infer_compression, 

34 is_file_like, 

35 is_fsspec_url, 

36 is_url, 

37 stringify_path, 

38) 

39from pandas.io.parsers import TextParser 

40 

41if TYPE_CHECKING: 

42 from collections.abc import Sequence 

43 from xml.etree.ElementTree import Element 

44 

45 from lxml import etree 

46 

47 from pandas._typing import ( 

48 CompressionOptions, 

49 ConvertersArg, 

50 DtypeArg, 

51 DtypeBackend, 

52 FilePath, 

53 ParseDatesArg, 

54 ReadBuffer, 

55 StorageOptions, 

56 XMLParsers, 

57 ) 

58 

59 from pandas import DataFrame 

60 

61 

@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid JSON ``str``, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The ``XPath`` expression to parse required set of nodes for
        migration to :class:`~pandas.DataFrame`. ``etree`` supports limited ``XPath``.

    namespaces : dict
        The namespaces defined in XML document (``xmlns:namespace='URI'``)
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified ``xpath``.

    attrs_only : bool
        Parse only the attributes at the specified ``xpath``.

    names : list
        Column names for :class:`~pandas.DataFrame` of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        ``etree`` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:
    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_iterparse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`


    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        # NOTE(review): is_style is initialized but never read or written
        # elsewhere in this module -- presumably a stylesheet flag; confirm
        # before removing.
        self.is_style = None
        self.compression: CompressionOptions = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in ``xpath``, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        if self.elems_only:
            if self.names:
                # Keep the row element's own text (when non-whitespace) and
                # pair user-supplied names positionally with child elements.
                dicts = [
                    {
                        **(
                            {el.tag: el.text}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {ch.tag: ch.text if ch.text else None for ch in el.findall("*")}
                    for el in elems
                ]

        elif self.attrs_only:
            # Empty attribute strings are normalized to None.
            dicts = [
                {k: v if v else None for k, v in el.attrib.items()} for el in elems
            ]

        elif self.names:
            dicts = [
                {
                    **el.attrib,
                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
                    **{
                        nm: ch.text if ch.text else None
                        for nm, ch in zip(self.names, el.findall("*"))
                    },
                }
                for el in elems
            ]

        else:
            dicts = [
                {
                    **el.attrib,
                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
                    **{ch.tag: ch.text if ch.text else None for ch in el.findall("*")},
                }
                for el in elems
            ]

        # Strip namespace URIs from qualified keys ("{uri}tag" -> "tag").
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Align every row on the ordered union of keys, filling gaps with None
        # so all rows have identical columns.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        # Finally rename columns positionally when user names were given.
        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If ``iterparse`` is not a dict or its dict value is not list-like.
        ParserError
            * If ``path_or_buffer`` is not a physical file on disk or file-like object.
            * If no data is returned from selected items in ``iterparse``.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # iterparse only supports uncompressed files on local disk (or a
        # file-like object): reject URLs, fsspec paths, literal XML strings,
        # and compressed inputs up front.
        if (not hasattr(self.path_or_buffer, "read")) and (
            not isinstance(self.path_or_buffer, (str, PathLike))
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or (
                isinstance(self.path_or_buffer, str)
                and self.path_or_buffer.startswith(("<?xml", "<"))
            )
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        # Repeated descendant names need special positional handling below.
        iterparse_repeats = len(self.iterparse[row_node]) != len(
            set(self.iterparse[row_node])
        )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            # Drop any namespace URI from the tag ("{uri}tag" -> "tag").
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names and iterparse_repeats:
                    # Fill columns positionally, skipping values already seen
                    # so repeated element names map to distinct columns.
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text if elem.text else None
                            if elem_val not in row.values() and nm not in row:
                                row[nm] = elem_val

                            if col in elem.attrib:
                                if elem.attrib[col] not in row.values() and nm not in row:
                                    row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Release already-parsed elements to keep memory flat;
                # getprevious/getparent exist only on lxml elements, so the
                # ancestor cleanup is skipped for the stdlib etree parser.
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        # Align rows on the ordered union of keys, filling gaps with None.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate ``xpath``.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If value is not a list and less then length of nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | etree._Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)

425 

426 

class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        Validates ``xpath`` and ``names``, then dispatches to either
        tree-based parsing or memory-efficient iterparsing.
        """
        from xml.etree.ElementTree import iterparse

        # XSLT requires lxml; the stdlib parser cannot honor a stylesheet.
        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate ``xpath`` against the parsed document.

        Raises
        ------
        SyntaxError
            * If xpath is unsupported by etree or uses an undeclared
              namespace prefix.
        ValueError
            * If xpath returns no nodes or attributes.

        Notes
        -----
        ``etree`` supports limited ``XPath``. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
            children = [ch for el in elems for ch in el.findall("*")]
            attrs = {k: v for el in elems for k, v in el.attrib.items()}

            # NOTE(review): findall returns a list per the stdlib docs, so
            # this None guard looks defensive/unreachable -- kept as-is.
            if elems is None:
                raise ValueError(msg)

            if elems is not None:
                if self.elems_only and children == []:
                    raise ValueError(msg)
                if self.attrs_only and attrs == {}:
                    raise ValueError(msg)
                if children == [] and attrs == {}:
                    raise ValueError(msg)

        except (KeyError, SyntaxError):
            # etree signals both bad expressions and unknown namespace
            # prefixes via these exceptions; re-raise with a friendlier hint.
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            )

        return elems

    def _validate_names(self) -> None:
        """
        Validate that ``names`` is list-like and covers the child elements
        found under the first matched node (or the iterparse descendants).

        Raises
        ------
        ValueError
            * If names is shorter than the number of child elements.
        TypeError
            * If names is not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Build an ElementTree root from ``raw_doc`` (path, buffer, or literal).
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()

536 

537 

class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into :class:`~pandas.DataFrame` with third-party
    full-featured XML library, ``lxml``, that supports
    ``XPath`` 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            # Apply the XSLT transformation before any xpath evaluation so
            # xpath targets the transformed document.
            if self.stylesheet:
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate ``xpath`` against the parsed (possibly transformed) document.

        Raises
        ------
        ValueError
            * If xpath returns no nodes or attributes.
        """
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = [ch for el in elems for ch in el.xpath("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        if elems == []:
            raise ValueError(msg)

        if elems != []:
            if self.elems_only and children == []:
                raise ValueError(msg)
            if self.attrs_only and attrs == {}:
                raise ValueError(msg)
            if children == [] and attrs == {}:
                raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        """
        Validate that ``names`` is list-like and covers the child elements of
        the first matched node (or the iterparse descendants).

        Raises
        ------
        ValueError
            * If names is shorter than the number of child elements.
        TypeError
            * If names is not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                # "[1]/*" limits the check to children of the first match.
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        """
        Build an lxml element tree from ``raw_doc`` (path, buffer, or literal).
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                # lxml cannot parse a str carrying an XML declaration, so
                # re-encode the text with the user-supplied encoding.
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                document = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                document = parse(xml_data, parser=curr_parser)

        return document

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        an ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc

669 

670 

def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    # Normalize path-like objects to plain strings; raw bytes pass through.
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # Only strings that do not look like literal XML markup are treated as
    # locations; anything else (buffers, literals, bytes) is returned as-is.
    if isinstance(filepath_or_buffer, str) and not filepath_or_buffer.startswith(
        ("<?xml", "<")
    ):
        if (
            is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            with get_handle(
                filepath_or_buffer,
                "r",
                encoding=encoding,
                compression=compression,
                storage_options=storage_options,
            ) as handle_obj:
                handle = handle_obj.handle
                # Read eagerly when possible so the handle can be closed here.
                if hasattr(handle, "read"):
                    filepath_or_buffer = handle.read()
                else:
                    filepath_or_buffer = handle

    return filepath_or_buffer

714 

715 

716def preprocess_data(data) -> io.StringIO | io.BytesIO: 

717 """ 

718 Convert extracted raw data. 

719 

720 This method will return underlying data of extracted XML content. 

721 The data either has a `read` attribute (e.g. a file object or a 

722 StringIO/BytesIO) or is a string or bytes that is an XML document. 

723 """ 

724 

725 if isinstance(data, str): 

726 data = io.StringIO(data) 

727 

728 elif isinstance(data, bytes): 

729 data = io.BytesIO(data) 

730 

731 return data 

732 

733 

734def _data_to_frame(data, **kwargs) -> DataFrame: 

735 """ 

736 Convert parsed data to Data Frame. 

737 

738 This method will bind xml dictionary data of keys and values 

739 into named columns of Data Frame using the built-in TextParser 

740 class that build Data Frame and infers specific dtypes. 

741 """ 

742 

743 tags = next(iter(data)) 

744 nodes = [list(d.values()) for d in data] 

745 

746 try: 

747 with TextParser(nodes, names=tags, **kwargs) as tp: 

748 return tp.read() 

749 except ParserError: 

750 raise ParserError( 

751 "XML document may be too complex for import. " 

752 "Try to flatten document and use distinct " 

753 "element and attribute names." 

754 ) 

755 

756 

def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    This method will conditionally call internal parsers:
    LxmlFrameParser and/or EtreeParser.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """

    p: _EtreeFrameParser | _LxmlFrameParser

    # A string that is not a file-like, an existing path, or a URL must be
    # literal XML -- deprecated input, warn before parsing it anyway.
    if isinstance(path_or_buffer, str) and not any(
        [
            is_file_like(path_or_buffer),
            file_exists(path_or_buffer),
            is_url(path_or_buffer),
            is_fsspec_url(path_or_buffer),
        ]
    ):
        warnings.warn(
            "Passing literal xml to 'read_xml' is deprecated and "
            "will be removed in a future version. To read from a "
            "literal string, wrap it in a 'StringIO' object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if parser == "lxml":
        # lxml is optional; fall through to an explicit ImportError so the
        # user gets an actionable message instead of a bare ModuleNotFoundError.
        lxml = import_optional_dependency("lxml.etree", errors="ignore")

        if lxml is not None:
            p = _LxmlFrameParser(
                path_or_buffer,
                xpath,
                namespaces,
                elems_only,
                attrs_only,
                names,
                dtype,
                converters,
                parse_dates,
                encoding,
                stylesheet,
                iterparse,
                compression,
                storage_options,
            )
        else:
            raise ImportError("lxml not found, please install or use the etree parser.")

    elif parser == "etree":
        p = _EtreeFrameParser(
            path_or_buffer,
            xpath,
            namespaces,
            elems_only,
            attrs_only,
            names,
            dtype,
            converters,
            parse_dates,
            encoding,
            stylesheet,
            iterparse,
            compression,
            storage_options,
        )
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )

862 

863 

864@doc( 

865 storage_options=_shared_docs["storage_options"], 

866 decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", 

867) 

868def read_xml( 

869 path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], 

870 *, 

871 xpath: str = "./*", 

872 namespaces: dict[str, str] | None = None, 

873 elems_only: bool = False, 

874 attrs_only: bool = False, 

875 names: Sequence[str] | None = None, 

876 dtype: DtypeArg | None = None, 

877 converters: ConvertersArg | None = None, 

878 parse_dates: ParseDatesArg | None = None, 

879 # encoding can not be None for lxml and StringIO input 

880 encoding: str | None = "utf-8", 

881 parser: XMLParsers = "lxml", 

882 stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, 

883 iterparse: dict[str, list[str]] | None = None, 

884 compression: CompressionOptions = "infer", 

885 storage_options: StorageOptions | None = None, 

886 dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, 

887) -> DataFrame: 

888 r""" 

889 Read XML document into a :class:`~pandas.DataFrame` object. 

890 

891 .. versionadded:: 1.3.0 

892 

893 Parameters 

894 ---------- 

895 path_or_buffer : str, path object, or file-like object 

896 String, path object (implementing ``os.PathLike[str]``), or file-like 

897 object implementing a ``read()`` function. The string can be any valid XML 

898 string or a path. The string can further be a URL. Valid URL schemes 

899 include http, ftp, s3, and file. 

900 

901 .. deprecated:: 2.1.0 

902 Passing xml literal strings is deprecated. 

903 Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead. 

904 

905 xpath : str, optional, default './\*' 

906 The ``XPath`` to parse required set of nodes for migration to 

907 :class:`~pandas.DataFrame`.``XPath`` should return a collection of elements 

908 and not a single element. Note: The ``etree`` parser supports limited ``XPath`` 

909 expressions. For more complex ``XPath``, use ``lxml`` which requires 

910 installation. 

911 

912 namespaces : dict, optional 

913 The namespaces defined in XML document as dicts with key being 

914 namespace prefix and value the URI. There is no need to include all 

915 namespaces in XML, only the ones used in ``xpath`` expression. 

916 Note: if XML document uses default namespace denoted as 

917 `xmlns='<URI>'` without a prefix, you must assign any temporary 

918 namespace prefix such as 'doc' to the URI in order to parse 

919 underlying nodes and/or attributes. For example, :: 

920 

921 namespaces = {{"doc": "https://example.com"}} 

922 

923 elems_only : bool, optional, default False 

924 Parse only the child elements at the specified ``xpath``. By default, 

925 all child elements and non-empty text nodes are returned. 

926 

927 attrs_only : bool, optional, default False 

928 Parse only the attributes at the specified ``xpath``. 

929 By default, all attributes are returned. 

930 

931 names : list-like, optional 

932 Column names for DataFrame of parsed XML data. Use this parameter to 

933 rename original element names and distinguish same named elements and 

934 attributes. 

935 

936 dtype : Type name or dict of column -> type, optional 

937 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, 

938 'c': 'Int64'}} 

939 Use `str` or `object` together with suitable `na_values` settings 

940 to preserve and not interpret dtype. 

941 If converters are specified, they will be applied INSTEAD 

942 of dtype conversion. 

943 

944 .. versionadded:: 1.5.0 

945 

946 converters : dict, optional 

947 Dict of functions for converting values in certain columns. Keys can either 

948 be integers or column labels. 

949 

950 .. versionadded:: 1.5.0 

951 

952 parse_dates : bool or list of int or names or list of lists or dict, default False 

953 Identifiers to parse index or columns to datetime. The behavior is as follows: 

954 

955 * boolean. If True -> try parsing the index. 

956 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 

957 each as a separate date column. 

958 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 

959 a single date column. 

960 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call 

961 result 'foo' 

962 

963 .. versionadded:: 1.5.0 

964 

965 encoding : str, optional, default 'utf-8' 

966 Encoding of XML document. 

967 

968 parser : {{'lxml','etree'}}, default 'lxml' 

969 Parser module to use for retrieval of data. Only 'lxml' and 

970 'etree' are supported. With 'lxml' more complex ``XPath`` searches 

971 and ability to use XSLT stylesheet are supported. 

972 

973 stylesheet : str, path object or file-like object 

974 A URL, file-like object, or a raw string containing an XSLT script. 

975 This stylesheet should flatten complex, deeply nested XML documents 

976 for easier parsing. To use this feature you must have ``lxml`` module 

977 installed and specify 'lxml' as ``parser``. The ``xpath`` must 

978 reference nodes of transformed XML document generated after XSLT 

979 transformation and not the original XML document. Only XSLT 1.0 

980 scripts and not later versions is currently supported. 

981 

982 iterparse : dict, optional 

983 The nodes or attributes to retrieve in iterparsing of XML document 

984 as a dict with key being the name of repeating element and value being 

985 list of elements or attribute names that are descendants of the repeated 

986 element. Note: If this option is used, it will replace ``xpath`` parsing 

987 and unlike ``xpath``, descendants do not need to relate to each other but can 

988 exist any where in document under the repeating element. This memory- 

989 efficient method should be used for very large XML files (500MB, 1GB, or 5GB+). 

990 For example, :: 

991 

992 iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}} 

993 

994 .. versionadded:: 1.5.0 

995 

996 {decompression_options} 

997 

998 .. versionchanged:: 1.4.0 Zstandard support. 

999 

1000 {storage_options} 

1001 

1002 dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' 

1003 Back-end data type applied to the resultant :class:`DataFrame` 

1004 (still experimental). Behaviour is as follows: 

1005 

1006 * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` 

1007 (default). 

1008 * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` 

1009 DataFrame. 

1010 

1011 .. versionadded:: 2.0 

1012 

1013 Returns 

1014 ------- 

1015 df 

1016 A DataFrame. 

1017 

1018 See Also 

1019 -------- 

1020 read_json : Convert a JSON string to pandas object. 

1021 read_html : Read HTML tables into a list of DataFrame objects. 

1022 

1023 Notes 

1024 ----- 

1025 This method is best designed to import shallow XML documents in the 

1026 following format, which is an ideal fit for the two dimensions of a 

1027 ``DataFrame`` (row by column). :: 

1028 

1029 <root> 

1030 <row> 

1031 <column1>data</column1> 

1032 <column2>data</column2> 

1033 <column3>data</column3> 

1034 ... 

1035 </row> 

1036 <row> 

1037 ... 

1038 </row> 

1039 ... 

1040 </root> 

1041 

1042 As a file format, XML documents can be designed any way including 

1043 layout of elements and attributes as long as it conforms to W3C 

1044 specifications. Therefore, this method is a convenience handler for 

1045 a specific flatter design and not all possible XML structures. 

1046 

1047 However, for more complex XML documents, ``stylesheet`` allows you to 

1048 temporarily redesign original document with XSLT (a special purpose 

1049 language) for a flatter version for migration to a DataFrame. 

1050 

1051 This function will *always* return a single :class:`DataFrame` or raise 

1052 exceptions due to issues with XML document, ``xpath``, or other 

1053 parameters. 

1054 

1055 See the :ref:`read_xml documentation in the IO section of the docs 

1056 <io.read_xml>` for more information on using this method to parse XML 

1057 files to DataFrames. 

1058 

1059 Examples 

1060 -------- 

1061 >>> from io import StringIO 

1062 >>> xml = '''<?xml version='1.0' encoding='utf-8'?> 

1063 ... <data xmlns="http://example.com"> 

1064 ... <row> 

1065 ... <shape>square</shape> 

1066 ... <degrees>360</degrees> 

1067 ... <sides>4.0</sides> 

1068 ... </row> 

1069 ... <row> 

1070 ... <shape>circle</shape> 

1071 ... <degrees>360</degrees> 

1072 ... <sides/> 

1073 ... </row> 

1074 ... <row> 

1075 ... <shape>triangle</shape> 

1076 ... <degrees>180</degrees> 

1077 ... <sides>3.0</sides> 

1078 ... </row> 

1079 ... </data>''' 

1080 

1081 >>> df = pd.read_xml(StringIO(xml)) 

1082 >>> df 

1083 shape degrees sides 

1084 0 square 360 4.0 

1085 1 circle 360 NaN 

1086 2 triangle 180 3.0 

1087 

1088 >>> xml = '''<?xml version='1.0' encoding='utf-8'?> 

1089 ... <data> 

1090 ... <row shape="square" degrees="360" sides="4.0"/> 

1091 ... <row shape="circle" degrees="360"/> 

1092 ... <row shape="triangle" degrees="180" sides="3.0"/> 

1093 ... </data>''' 

1094 

1095 >>> df = pd.read_xml(StringIO(xml), xpath=".//row") 

1096 >>> df 

1097 shape degrees sides 

1098 0 square 360 4.0 

1099 1 circle 360 NaN 

1100 2 triangle 180 3.0 

1101 

1102 >>> xml = '''<?xml version='1.0' encoding='utf-8'?> 

1103 ... <doc:data xmlns:doc="https://example.com"> 

1104 ... <doc:row> 

1105 ... <doc:shape>square</doc:shape> 

1106 ... <doc:degrees>360</doc:degrees> 

1107 ... <doc:sides>4.0</doc:sides> 

1108 ... </doc:row> 

1109 ... <doc:row> 

1110 ... <doc:shape>circle</doc:shape> 

1111 ... <doc:degrees>360</doc:degrees> 

1112 ... <doc:sides/> 

1113 ... </doc:row> 

1114 ... <doc:row> 

1115 ... <doc:shape>triangle</doc:shape> 

1116 ... <doc:degrees>180</doc:degrees> 

1117 ... <doc:sides>3.0</doc:sides> 

1118 ... </doc:row> 

1119 ... </doc:data>''' 

1120 

1121 >>> df = pd.read_xml(StringIO(xml), 

1122 ... xpath="//doc:row", 

1123 ... namespaces={{"doc": "https://example.com"}}) 

1124 >>> df 

1125 shape degrees sides 

1126 0 square 360 4.0 

1127 1 circle 360 NaN 

1128 2 triangle 180 3.0 

1129 

1130 >>> xml_data = ''' 

1131 ... <data> 

1132 ... <row> 

1133 ... <index>0</index> 

1134 ... <a>1</a> 

1135 ... <b>2.5</b> 

1136 ... <c>True</c> 

1137 ... <d>a</d> 

1138 ... <e>2019-12-31 00:00:00</e> 

1139 ... </row> 

1140 ... <row> 

1141 ... <index>1</index> 

1142 ... <b>4.5</b> 

1143 ... <c>False</c> 

1144 ... <d>b</d> 

1145 ... <e>2019-12-31 00:00:00</e> 

1146 ... </row> 

1147 ... </data> 

1148 ... ''' 

1149 

1150 >>> df = pd.read_xml(StringIO(xml_data), 

1151 ... dtype_backend="numpy_nullable", 

1152 ... parse_dates=["e"]) 

1153 >>> df 

1154 index a b c d e 

1155 0 0 1 2.5 True a 2019-12-31 

1156 1 1 <NA> 4.5 False b 2019-12-31 

1157 """ 

1158 check_dtype_backend(dtype_backend) 

1159 

1160 return _parse( 

1161 path_or_buffer=path_or_buffer, 

1162 xpath=xpath, 

1163 namespaces=namespaces, 

1164 elems_only=elems_only, 

1165 attrs_only=attrs_only, 

1166 names=names, 

1167 dtype=dtype, 

1168 converters=converters, 

1169 parse_dates=parse_dates, 

1170 encoding=encoding, 

1171 parser=parser, 

1172 stylesheet=stylesheet, 

1173 iterparse=iterparse, 

1174 compression=compression, 

1175 storage_options=storage_options, 

1176 dtype_backend=dtype_backend, 

1177 )