Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/xml.py: 17%


1""" 

2:mod:`pandas.io.xml` is a module for reading XML. 

3""" 

4 

5from __future__ import annotations 

6 

7import io 

8from typing import ( 

9 Any, 

10 Callable, 

11 Sequence, 

12) 

13 

14from pandas._libs import lib 

15from pandas._typing import ( 

16 TYPE_CHECKING, 

17 CompressionOptions, 

18 ConvertersArg, 

19 DtypeArg, 

20 DtypeBackend, 

21 FilePath, 

22 ParseDatesArg, 

23 ReadBuffer, 

24 StorageOptions, 

25 XMLParsers, 

26) 

27from pandas.compat._optional import import_optional_dependency 

28from pandas.errors import ( 

29 AbstractMethodError, 

30 ParserError, 

31) 

32from pandas.util._decorators import doc 

33from pandas.util._validators import check_dtype_backend 

34 

35from pandas.core.dtypes.common import is_list_like 

36 

37from pandas.core.shared_docs import _shared_docs 

38 

39from pandas.io.common import ( 

40 file_exists, 

41 get_handle, 

42 infer_compression, 

43 is_fsspec_url, 

44 is_url, 

45 stringify_path, 

46) 

47from pandas.io.parsers import TextParser 

48 

49if TYPE_CHECKING: 

50 from xml.etree.ElementTree import Element 

51 

52 from lxml import etree 

53 

54 from pandas import DataFrame 

55 

56 

57@doc( 

58 storage_options=_shared_docs["storage_options"], 

59 decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", 

60) 

61class _XMLFrameParser: 

62 """ 

63 Internal subclass to parse XML into DataFrames. 

64 

65 Parameters 

66 ---------- 

67 path_or_buffer : a valid XML str, path object or file-like object 

68 Any valid string path is acceptable. The string could be a URL. Valid 

69 URL schemes include http, ftp, s3, and file. 

70 

71 xpath : str 

72 The XPath expression to parse the required set of nodes for 

73 migration to `DataFrame`. `etree` supports limited XPath. 

74 

75 namespaces : dict 

76 The namespaces defined in XML document (`xmlns:namespace='URI'`) 

77 as dicts with key being the namespace prefix and value the URI. 

78 

79 elems_only : bool 

80 Parse only the child elements at the specified `xpath`. 

81 

82 attrs_only : bool 

83 Parse only the attributes at the specified `xpath`. 

84 

85 names : list 

86 Column names for DataFrame of parsed XML data. 

87 

88 dtype : dict 

89 Data type for data or columns. E.g. {{'a': np.float64, 

90 'b': np.int32, 'c': 'Int64'}} 

91 

92 .. versionadded:: 1.5.0 

93 

94 converters : dict, optional 

95 Dict of functions for converting values in certain columns. Keys can 

96 either be integers or column labels. 

97 

98 .. versionadded:: 1.5.0 

99 

100 parse_dates : bool or list of int or names or list of lists or dict 

101 Converts either index or select columns to datetimes 

102 

103 .. versionadded:: 1.5.0 

104 

105 encoding : str 

106 Encoding of xml object or document. 

107 

108 stylesheet : str or file-like 

109 URL, file, file-like object, or a raw string containing XSLT. 

110 `etree` does not support XSLT but the argument is retained for consistency. 

111 

112 iterparse : dict, optional 

113 Dict with row element as key and list of descendant elements 

114 and/or attributes as value to be retrieved in iterparsing of 

115 XML document. 

116 

117 .. versionadded:: 1.5.0 

118 

119 {decompression_options} 

120 

121 .. versionchanged:: 1.4.0 Zstandard support. 

122 

123 {storage_options} 

124 

125 See also 

126 -------- 

127 pandas.io.xml._EtreeFrameParser 

128 pandas.io.xml._LxmlFrameParser 

129 

130 Notes 

131 ----- 

132 To subclass this class effectively you must override the following methods: 

133 * :func:`parse_data` 

134 * :func:`_parse_nodes` 

135 * :func:`_iterparse_nodes` 

136 * :func:`_parse_doc` 

137 * :func:`_validate_names` 

138 * :func:`_validate_path` 

139 

140 

141 See each method's respective documentation for details on their 

142 functionality. 

143 """ 

144 

145 def __init__( 

146 self, 

147 path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], 

148 xpath: str, 

149 namespaces: dict[str, str] | None, 

150 elems_only: bool, 

151 attrs_only: bool, 

152 names: Sequence[str] | None, 

153 dtype: DtypeArg | None, 

154 converters: ConvertersArg | None, 

155 parse_dates: ParseDatesArg | None, 

156 encoding: str | None, 

157 stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, 

158 iterparse: dict[str, list[str]] | None, 

159 compression: CompressionOptions, 

160 storage_options: StorageOptions, 

161 ) -> None: 

162 self.path_or_buffer = path_or_buffer 

163 self.xpath = xpath 

164 self.namespaces = namespaces 

165 self.elems_only = elems_only 

166 self.attrs_only = attrs_only 

167 self.names = names 

168 self.dtype = dtype 

169 self.converters = converters 

170 self.parse_dates = parse_dates 

171 self.encoding = encoding 

172 self.stylesheet = stylesheet 

173 self.iterparse = iterparse 

174 self.is_style = None 

175 self.compression = compression 

176 self.storage_options = storage_options 

177 

178 def parse_data(self) -> list[dict[str, str | None]]: 

179 """ 

180 Parse xml data. 

181 

182 This method will call the other internal methods to 

183 validate xpath, names, parse and return specific nodes. 

184 """ 

185 

186 raise AbstractMethodError(self) 

187 

188 def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]: 

189 """ 

190 Parse xml nodes. 

191 

192 This method will parse the children and attributes of elements 

193 in xpath, conditionally for only elements, only attributes 

194 or both while optionally renaming node names. 

195 

196 Raises 

197 ------ 

198 ValueError 

199 * If only elements and only attributes are specified. 

200 

201 Notes 

202 ----- 

203 Namespace URIs will be removed from returned node values. Also, 

204 elements with missing children or attributes compared to siblings 

205 will have optional keys filled with None values. 

206 """ 

207 

208 dicts: list[dict[str, str | None]] 

209 

210 if self.elems_only and self.attrs_only: 

211 raise ValueError("Either element or attributes can be parsed, not both.") 

212 if self.elems_only: 

213 if self.names: 

214 dicts = [ 

215 { 

216 **( 

217 {el.tag: el.text.strip()} 

218 if el.text and not el.text.isspace() 

219 else {} 

220 ), 

221 **{ 

222 nm: ch.text.strip() if ch.text else None 

223 for nm, ch in zip(self.names, el.findall("*")) 

224 }, 

225 } 

226 for el in elems 

227 ] 

228 else: 

229 dicts = [ 

230 { 

231 ch.tag: ch.text.strip() if ch.text else None 

232 for ch in el.findall("*") 

233 } 

234 for el in elems 

235 ] 

236 

237 elif self.attrs_only: 

238 dicts = [ 

239 {k: v.strip() if v else None for k, v in el.attrib.items()} 

240 for el in elems 

241 ] 

242 

243 else: 

244 if self.names: 

245 dicts = [ 

246 { 

247 **el.attrib, 

248 **( 

249 {el.tag: el.text.strip()} 

250 if el.text and not el.text.isspace() 

251 else {} 

252 ), 

253 **{ 

254 nm: ch.text.strip() if ch.text else None 

255 for nm, ch in zip(self.names, el.findall("*")) 

256 }, 

257 } 

258 for el in elems 

259 ] 

260 

261 else: 

262 dicts = [ 

263 { 

264 **el.attrib, 

265 **( 

266 {el.tag: el.text.strip()} 

267 if el.text and not el.text.isspace() 

268 else {} 

269 ), 

270 **{ 

271 ch.tag: ch.text.strip() if ch.text else None 

272 for ch in el.findall("*") 

273 }, 

274 } 

275 for el in elems 

276 ] 

277 

278 dicts = [ 

279 {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts 

280 ] 

281 

282 keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) 

283 dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] 

284 

285 if self.names: 

286 dicts = [dict(zip(self.names, d.values())) for d in dicts] 

287 

288 return dicts 
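# Illustrative sketch of the default (elements plus attributes) branch above, using
# hypothetical input rows:
#   <row id="1"><shape>square</shape><sides>4.0</sides></row>
#   <row id="2"><shape>circle</shape></row>
# These would yield string-valued dicts, with missing keys filled with None:
#   [{"id": "1", "shape": "square", "sides": "4.0"},
#    {"id": "2", "shape": "circle", "sides": None}]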

289 

290 def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: 

291 """ 

292 Iterparse xml nodes. 

293 

294 This method will read decompressed XML files on local disk for elements 

295 and underlying descendants using iterparse, a method to iterate through 

296 an XML tree without holding the entire XML tree in memory. 

297 

298 Raises 

299 ------ 

300 TypeError 

301 * If `iterparse` is not a dict or its dict value is not list-like. 

302 ParserError 

303 * If `path_or_buffer` is not a physical file on disk or file-like object. 

304 * If no data is returned from selected items in `iterparse`. 

305 

306 Notes 

307 ----- 

308 Namespace URIs will be removed from returned node values. Also, 

309 elements with missing children or attributes in submitted list 

310 will have optional keys filled with None values. 

311 """ 

312 

313 dicts: list[dict[str, str | None]] = [] 

314 row: dict[str, str | None] | None = None 

315 

316 if not isinstance(self.iterparse, dict): 

317 raise TypeError( 

318 f"{type(self.iterparse).__name__} is not a valid type for iterparse" 

319 ) 

320 

321 row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" 

322 if not is_list_like(self.iterparse[row_node]): 

323 raise TypeError( 

324 f"{type(self.iterparse[row_node])} is not a valid type " 

325 "for value in iterparse" 

326 ) 

327 

328 if (not hasattr(self.path_or_buffer, "read")) and ( 

329 not isinstance(self.path_or_buffer, str) 

330 or is_url(self.path_or_buffer) 

331 or is_fsspec_url(self.path_or_buffer) 

332 or self.path_or_buffer.startswith(("<?xml", "<")) 

333 or infer_compression(self.path_or_buffer, "infer") is not None 

334 ): 

335 raise ParserError( 

336 "iterparse is designed for large XML files that are fully extracted on " 

337 "local disk and not as compressed files or online sources." 

338 ) 

339 

340 iterparse_repeats = len(self.iterparse[row_node]) != len( 

341 set(self.iterparse[row_node]) 

342 ) 

343 

344 for event, elem in iterparse(self.path_or_buffer, events=("start", "end")): 

345 curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag 

346 

347 if event == "start": 

348 if curr_elem == row_node: 

349 row = {} 

350 

351 if row is not None: 

352 if self.names and iterparse_repeats: 

353 for col, nm in zip(self.iterparse[row_node], self.names): 

354 if curr_elem == col: 

355 elem_val = elem.text.strip() if elem.text else None 

356 if elem_val not in row.values() and nm not in row: 

357 row[nm] = elem_val 

358 

359 if col in elem.attrib: 

360 if elem.attrib[col] not in row.values() and nm not in row: 

361 row[nm] = elem.attrib[col] 

362 else: 

363 for col in self.iterparse[row_node]: 

364 if curr_elem == col: 

365 row[col] = elem.text.strip() if elem.text else None 

366 if col in elem.attrib: 

367 row[col] = elem.attrib[col] 

368 

369 if event == "end": 

370 if curr_elem == row_node and row is not None: 

371 dicts.append(row) 

372 row = None 

373 

374 elem.clear() 

375 if hasattr(elem, "getprevious"): 

376 while ( 

377 elem.getprevious() is not None and elem.getparent() is not None 

378 ): 

379 del elem.getparent()[0] 

380 

381 if dicts == []: 

382 raise ParserError("No result from selected items in iterparse.") 

383 

384 keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) 

385 dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] 

386 

387 if self.names: 

388 dicts = [dict(zip(self.names, d.values())) for d in dicts] 

389 

390 return dicts 
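# Illustrative sketch of the iterparse argument consumed above, with hypothetical
# element and attribute names: the key names the repeating row element and the value
# lists the descendant elements and/or attributes to collect for each row, e.g.
#   {"row": ["shape", "degrees", "sides", "id"]}
# The document is then streamed with xml.etree.ElementTree.iterparse or
# lxml.etree.iterparse instead of being loaded whole and queried with XPath.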

391 

392 def _validate_path(self) -> list[Any]: 

393 """ 

394 Validate xpath. 

395 

396 This method checks for syntax, evaluation, or empty nodes return. 

397 

398 Raises 

399 ------ 

400 SyntaxError 

401 * If xpath is not supported or there are issues with namespaces. 

402 

403 ValueError 

404 * If xpath does not return any nodes. 

405 """ 

406 

407 raise AbstractMethodError(self) 

408 

409 def _validate_names(self) -> None: 

410 """ 

411 Validate names. 

412 

413 This method will check if names is list-like and aligns 

414 with the length of parsed nodes. 

415 

416 Raises 

417 ------ 

418 ValueError 

419 * If names is list-like but its length is less than the number of child nodes. 

420 """ 

421 raise AbstractMethodError(self) 

422 

423 def _parse_doc( 

424 self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] 

425 ) -> Element | etree._Element: 

426 """ 

427 Build tree from path_or_buffer. 

428 

429 This method will parse XML object into tree 

430 either from string/bytes or file location. 

431 """ 

432 raise AbstractMethodError(self) 

433 

434 

435class _EtreeFrameParser(_XMLFrameParser): 

436 """ 

437 Internal class to parse XML into DataFrames with the Python 

438 standard library XML module: `xml.etree.ElementTree`. 

439 """ 

440 

441 def parse_data(self) -> list[dict[str, str | None]]: 

442 from xml.etree.ElementTree import iterparse 

443 

444 if self.stylesheet is not None: 

445 raise ValueError( 

446 "To use stylesheet, you need lxml installed and selected as parser." 

447 ) 

448 

449 if self.iterparse is None: 

450 self.xml_doc = self._parse_doc(self.path_or_buffer) 

451 elems = self._validate_path() 

452 

453 self._validate_names() 

454 

455 xml_dicts: list[dict[str, str | None]] = ( 

456 self._parse_nodes(elems) 

457 if self.iterparse is None 

458 else self._iterparse_nodes(iterparse) 

459 ) 

460 

461 return xml_dicts 

462 

463 def _validate_path(self) -> list[Any]: 

464 """ 

465 Notes 

466 ----- 

467 `etree` supports limited XPath. If the user attempts a more complex 

468 expression, a SyntaxError will be raised. 

469 """ 

470 

471 msg = ( 

472 "xpath does not return any nodes or attributes. " 

473 "Be sure to specify in `xpath` the parent nodes of " 

474 "children and attributes to parse. " 

475 "If document uses namespaces denoted with " 

476 "xmlns, be sure to define namespaces and " 

477 "use them in xpath." 

478 ) 

479 try: 

480 elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) 

481 children = [ch for el in elems for ch in el.findall("*")] 

482 attrs = {k: v for el in elems for k, v in el.attrib.items()} 

483 

484 if elems is None: 

485 raise ValueError(msg) 

486 

487 if elems is not None: 

488 if self.elems_only and children == []: 

489 raise ValueError(msg) 

490 if self.attrs_only and attrs == {}: 

491 raise ValueError(msg) 

492 if children == [] and attrs == {}: 

493 raise ValueError(msg) 

494 

495 except (KeyError, SyntaxError): 

496 raise SyntaxError( 

497 "You have used an incorrect or unsupported XPath " 

498 "expression for etree library or you used an " 

499 "undeclared namespace prefix." 

500 ) 

501 

502 return elems 

503 

504 def _validate_names(self) -> None: 

505 children: list[Any] 

506 

507 if self.names: 

508 if self.iterparse: 

509 children = self.iterparse[next(iter(self.iterparse))] 

510 else: 

511 parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) 

512 children = parent.findall("*") if parent else [] 

513 

514 if is_list_like(self.names): 

515 if len(self.names) < len(children): 

516 raise ValueError( 

517 "names does not match length of child elements in xpath." 

518 ) 

519 else: 

520 raise TypeError( 

521 f"{type(self.names).__name__} is not a valid type for names" 

522 ) 

523 

524 def _parse_doc( 

525 self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] 

526 ) -> Element: 

527 from xml.etree.ElementTree import ( 

528 XMLParser, 

529 parse, 

530 ) 

531 

532 handle_data = get_data_from_filepath( 

533 filepath_or_buffer=raw_doc, 

534 encoding=self.encoding, 

535 compression=self.compression, 

536 storage_options=self.storage_options, 

537 ) 

538 

539 with preprocess_data(handle_data) as xml_data: 

540 curr_parser = XMLParser(encoding=self.encoding) 

541 document = parse(xml_data, parser=curr_parser) 

542 

543 return document.getroot() 

544 

545 

546class _LxmlFrameParser(_XMLFrameParser): 

547 """ 

548 Internal class to parse XML into DataFrames with third-party 

549 full-featured XML library, `lxml`, that supports 

550 XPath 1.0 and XSLT 1.0. 

551 """ 

552 

553 def parse_data(self) -> list[dict[str, str | None]]: 

554 """ 

555 Parse xml data. 

556 

557 This method will call the other internal methods to 

558 validate xpath, names, optionally parse and run XSLT, 

559 and parse original or transformed XML and return specific nodes. 

560 """ 

561 from lxml.etree import iterparse 

562 

563 if self.iterparse is None: 

564 self.xml_doc = self._parse_doc(self.path_or_buffer) 

565 

566 if self.stylesheet: 

567 self.xsl_doc = self._parse_doc(self.stylesheet) 

568 self.xml_doc = self._transform_doc() 

569 

570 elems = self._validate_path() 

571 

572 self._validate_names() 

573 

574 xml_dicts: list[dict[str, str | None]] = ( 

575 self._parse_nodes(elems) 

576 if self.iterparse is None 

577 else self._iterparse_nodes(iterparse) 

578 ) 

579 

580 return xml_dicts 

581 

582 def _validate_path(self) -> list[Any]: 

583 msg = ( 

584 "xpath does not return any nodes or attributes. " 

585 "Be sure to specify in `xpath` the parent nodes of " 

586 "children and attributes to parse. " 

587 "If document uses namespaces denoted with " 

588 "xmlns, be sure to define namespaces and " 

589 "use them in xpath." 

590 ) 

591 

592 elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) 

593 children = [ch for el in elems for ch in el.xpath("*")] 

594 attrs = {k: v for el in elems for k, v in el.attrib.items()} 

595 

596 if elems == []: 

597 raise ValueError(msg) 

598 

599 if elems != []: 

600 if self.elems_only and children == []: 

601 raise ValueError(msg) 

602 if self.attrs_only and attrs == {}: 

603 raise ValueError(msg) 

604 if children == [] and attrs == {}: 

605 raise ValueError(msg) 

606 

607 return elems 

608 

609 def _validate_names(self) -> None: 

610 children: list[Any] 

611 

612 if self.names: 

613 if self.iterparse: 

614 children = self.iterparse[next(iter(self.iterparse))] 

615 else: 

616 children = self.xml_doc.xpath( 

617 self.xpath + "[1]/*", namespaces=self.namespaces 

618 ) 

619 

620 if is_list_like(self.names): 

621 if len(self.names) < len(children): 

622 raise ValueError( 

623 "names does not match length of child elements in xpath." 

624 ) 

625 else: 

626 raise TypeError( 

627 f"{type(self.names).__name__} is not a valid type for names" 

628 ) 

629 

630 def _parse_doc( 

631 self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] 

632 ) -> etree._Element: 

633 from lxml.etree import ( 

634 XMLParser, 

635 fromstring, 

636 parse, 

637 ) 

638 

639 handle_data = get_data_from_filepath( 

640 filepath_or_buffer=raw_doc, 

641 encoding=self.encoding, 

642 compression=self.compression, 

643 storage_options=self.storage_options, 

644 ) 

645 

646 with preprocess_data(handle_data) as xml_data: 

647 curr_parser = XMLParser(encoding=self.encoding) 

648 

649 if isinstance(xml_data, io.StringIO): 

650 if self.encoding is None: 

651 raise TypeError( 

652 "Can not pass encoding None when input is StringIO." 

653 ) 

654 

655 document = fromstring( 

656 xml_data.getvalue().encode(self.encoding), parser=curr_parser 

657 ) 

658 else: 

659 document = parse(xml_data, parser=curr_parser) 

660 

661 return document 

662 

663 def _transform_doc(self) -> etree._XSLTResultTree: 

664 """ 

665 Transform original tree using stylesheet. 

666 

667 This method will transform the original XML using the XSLT script into 

668 an ideally flatter XML document for easier parsing and migration 

669 to DataFrame. 

670 """ 

671 from lxml.etree import XSLT 

672 

673 transformer = XSLT(self.xsl_doc) 

674 new_doc = transformer(self.xml_doc) 

675 

676 return new_doc 
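# Hedged sketch of the kind of XSLT 1.0 stylesheet this method applies; the element
# names ("book", "title") are hypothetical:
#
#   <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
#     <xsl:output method="xml" indent="yes"/>
#     <xsl:template match="/">
#       <data>
#         <xsl:for-each select="//book">
#           <row>
#             <title><xsl:value-of select="title"/></title>
#           </row>
#         </xsl:for-each>
#       </data>
#     </xsl:template>
#   </xsl:stylesheet>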

677 

678 

679def get_data_from_filepath( 

680 filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str], 

681 encoding: str | None, 

682 compression: CompressionOptions, 

683 storage_options: StorageOptions, 

684) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: 

685 """ 

686 Extract raw XML data. 

687 

688 The method accepts three input types: 

689 1. filepath (string-like) 

690 2. file-like object (e.g. open file object, StringIO) 

691 3. XML string or bytes 

692 

693 This method turns (1) into (2) to simplify the rest of the processing. 

694 It returns input types (2) and (3) unchanged. 

695 """ 

696 if not isinstance(filepath_or_buffer, bytes): 

697 filepath_or_buffer = stringify_path(filepath_or_buffer) 

698 

699 if ( 

700 isinstance(filepath_or_buffer, str) 

701 and not filepath_or_buffer.startswith(("<?xml", "<")) 

702 ) and ( 

703 not isinstance(filepath_or_buffer, str) 

704 or is_url(filepath_or_buffer) 

705 or is_fsspec_url(filepath_or_buffer) 

706 or file_exists(filepath_or_buffer) 

707 ): 

708 with get_handle( 

709 filepath_or_buffer, 

710 "r", 

711 encoding=encoding, 

712 compression=compression, 

713 storage_options=storage_options, 

714 ) as handle_obj: 

715 filepath_or_buffer = ( 

716 handle_obj.handle.read() 

717 if hasattr(handle_obj.handle, "read") 

718 else handle_obj.handle 

719 ) 

720 

721 return filepath_or_buffer 
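# Illustrative sketch of the three accepted inputs (the file name is hypothetical):
#   get_data_from_filepath("data.xml", ...)           # path: opened via get_handle and read
#   get_data_from_filepath(io.StringIO("<a/>"), ...)  # file-like object: returned unchanged
#   get_data_from_filepath("<root><a/></root>", ...)  # raw XML string: returned unchanged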

722 

723 

724def preprocess_data(data) -> io.StringIO | io.BytesIO: 

725 """ 

726 Convert extracted raw data. 

727 

728 This method will return underlying data of extracted XML content. 

729 The data either has a `read` attribute (e.g. a file object or a 

730 StringIO/BytesIO) or is a string or bytes that is an XML document. 

731 """ 

732 

733 if isinstance(data, str): 

734 data = io.StringIO(data) 

735 

736 elif isinstance(data, bytes): 

737 data = io.BytesIO(data) 

738 

739 return data 
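# Sketch of the wrapping performed above (inputs are hypothetical):
#   preprocess_data("<root/>")    -> io.StringIO("<root/>")
#   preprocess_data(b"<root/>")   -> io.BytesIO(b"<root/>")
#   preprocess_data(file_object)  -> returned as-is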

740 

741 

742def _data_to_frame(data, **kwargs) -> DataFrame: 

743 """ 

744 Convert parsed data to DataFrame. 

745 

746 This method will bind XML dictionary data of keys and values 

747 into named columns of a DataFrame using the built-in TextParser 

748 class, which builds a DataFrame and infers specific dtypes. 

749 """ 

750 

751 tags = next(iter(data)) 

752 nodes = [list(d.values()) for d in data] 

753 

754 try: 

755 with TextParser(nodes, names=tags, **kwargs) as tp: 

756 return tp.read() 

757 except ParserError: 

758 raise ParserError( 

759 "XML document may be too complex for import. " 

760 "Try to flatten document and use distinct " 

761 "element and attribute names." 

762 ) 
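# Illustrative sketch of the conversion above with hypothetical parsed data:
#   data  = [{"shape": "square", "sides": "4.0"}, {"shape": "circle", "sides": None}]
#   tags  = next(iter(data))                   # first dict; its keys supply column names
#   nodes = [list(d.values()) for d in data]   # row values
# TextParser(nodes, names=tags).read() then builds the DataFrame and infers dtypes.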

763 

764 

765def _parse( 

766 path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], 

767 xpath: str, 

768 namespaces: dict[str, str] | None, 

769 elems_only: bool, 

770 attrs_only: bool, 

771 names: Sequence[str] | None, 

772 dtype: DtypeArg | None, 

773 converters: ConvertersArg | None, 

774 parse_dates: ParseDatesArg | None, 

775 encoding: str | None, 

776 parser: XMLParsers, 

777 stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, 

778 iterparse: dict[str, list[str]] | None, 

779 compression: CompressionOptions, 

780 storage_options: StorageOptions, 

781 dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, 

782 **kwargs, 

783) -> DataFrame: 

784 """ 

785 Call internal parsers. 

786 

787 This method will conditionally call internal parsers: 

788 _LxmlFrameParser or _EtreeFrameParser. 

789 

790 Raises 

791 ------ 

792 ImportError 

793 * If lxml is not installed when selected as parser. 

794 

795 ValueError 

796 * If parser is not lxml or etree. 

797 """ 

798 

799 p: _EtreeFrameParser | _LxmlFrameParser 

800 

801 if parser == "lxml": 

802 lxml = import_optional_dependency("lxml.etree", errors="ignore") 

803 

804 if lxml is not None: 

805 p = _LxmlFrameParser( 

806 path_or_buffer, 

807 xpath, 

808 namespaces, 

809 elems_only, 

810 attrs_only, 

811 names, 

812 dtype, 

813 converters, 

814 parse_dates, 

815 encoding, 

816 stylesheet, 

817 iterparse, 

818 compression, 

819 storage_options, 

820 ) 

821 else: 

822 raise ImportError("lxml not found, please install or use the etree parser.") 

823 

824 elif parser == "etree": 

825 p = _EtreeFrameParser( 

826 path_or_buffer, 

827 xpath, 

828 namespaces, 

829 elems_only, 

830 attrs_only, 

831 names, 

832 dtype, 

833 converters, 

834 parse_dates, 

835 encoding, 

836 stylesheet, 

837 iterparse, 

838 compression, 

839 storage_options, 

840 ) 

841 else: 

842 raise ValueError("Values for parser can only be lxml or etree.") 

843 

844 data_dicts = p.parse_data() 

845 

846 return _data_to_frame( 

847 data=data_dicts, 

848 dtype=dtype, 

849 converters=converters, 

850 parse_dates=parse_dates, 

851 dtype_backend=dtype_backend, 

852 **kwargs, 

853 ) 

854 

855 

856@doc( 

857 storage_options=_shared_docs["storage_options"], 

858 decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", 

859) 

860def read_xml( 

861 path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], 

862 *, 

863 xpath: str = "./*", 

864 namespaces: dict[str, str] | None = None, 

865 elems_only: bool = False, 

866 attrs_only: bool = False, 

867 names: Sequence[str] | None = None, 

868 dtype: DtypeArg | None = None, 

869 converters: ConvertersArg | None = None, 

870 parse_dates: ParseDatesArg | None = None, 

871 # encoding can not be None for lxml and StringIO input 

872 encoding: str | None = "utf-8", 

873 parser: XMLParsers = "lxml", 

874 stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, 

875 iterparse: dict[str, list[str]] | None = None, 

876 compression: CompressionOptions = "infer", 

877 storage_options: StorageOptions = None, 

878 dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, 

879) -> DataFrame: 

880 r""" 

881 Read XML document into a ``DataFrame`` object. 

882 

883 .. versionadded:: 1.3.0 

884 

885 Parameters 

886 ---------- 

887 path_or_buffer : str, path object, or file-like object 

888 String, path object (implementing ``os.PathLike[str]``), or file-like 

889 object implementing a ``read()`` function. The string can be any valid XML 

890 string or a path. The string can further be a URL. Valid URL schemes 

891 include http, ftp, s3, and file. 

892 

893 xpath : str, optional, default './\*' 

894 The XPath to parse required set of nodes for migration to DataFrame. 

895 XPath should return a collection of elements and not a single 

896 element. Note: The ``etree`` parser supports limited XPath 

897 expressions. For more complex XPath, use ``lxml`` which requires 

898 installation. 

899 

900 namespaces : dict, optional 

901 The namespaces defined in XML document as dicts with key being 

902 namespace prefix and value the URI. There is no need to include all 

903 namespaces in XML, only the ones used in ``xpath`` expression. 

904 Note: if XML document uses default namespace denoted as 

905 `xmlns='<URI>'` without a prefix, you must assign any temporary 

906 namespace prefix such as 'doc' to the URI in order to parse 

907 underlying nodes and/or attributes. For example, :: 

908 

909 namespaces = {{"doc": "https://example.com"}} 

910 

911 elems_only : bool, optional, default False 

912 Parse only the child elements at the specified ``xpath``. By default, 

913 all child elements and non-empty text nodes are returned. 

914 

915 attrs_only : bool, optional, default False 

916 Parse only the attributes at the specified ``xpath``. 

917 By default, all attributes are returned. 

918 

919 names : list-like, optional 

920 Column names for DataFrame of parsed XML data. Use this parameter to 

921 rename original element names and distinguish same named elements and 

922 attributes. 

923 

924 dtype : Type name or dict of column -> type, optional 

925 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, 

926 'c': 'Int64'}} 

927 Use `str` or `object` together with suitable `na_values` settings 

928 to preserve and not interpret dtype. 

929 If converters are specified, they will be applied INSTEAD 

930 of dtype conversion. 

931 

932 .. versionadded:: 1.5.0 

933 

934 converters : dict, optional 

935 Dict of functions for converting values in certain columns. Keys can either 

936 be integers or column labels. 

937 

938 .. versionadded:: 1.5.0 

939 

940 parse_dates : bool or list of int or names or list of lists or dict, default False 

941 Identifiers to parse index or columns to datetime. The behavior is as follows: 

942 

943 * boolean. If True -> try parsing the index. 

944 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 

945 each as a separate date column. 

946 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 

947 a single date column. 

948 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call 

949 result 'foo' 

950 

951 .. versionadded:: 1.5.0 

952 

953 encoding : str, optional, default 'utf-8' 

954 Encoding of XML document. 

955 

956 parser : {{'lxml','etree'}}, default 'lxml' 

957 Parser module to use for retrieval of data. Only 'lxml' and 

958 'etree' are supported. With 'lxml', more complex XPath searches 

959 and the ability to use XSLT stylesheets are supported. 

960 

961 stylesheet : str, path object or file-like object 

962 A URL, file-like object, or a raw string containing an XSLT script. 

963 This stylesheet should flatten complex, deeply nested XML documents 

964 for easier parsing. To use this feature you must have ``lxml`` module 

965 installed and specify 'lxml' as ``parser``. The ``xpath`` must 

966 reference nodes of transformed XML document generated after XSLT 

967 transformation and not the original XML document. Only XSLT 1.0 

968 scripts, and not later versions, are currently supported. 

969 

970 iterparse : dict, optional 

971 The nodes or attributes to retrieve in iterparsing of XML document 

972 as a dict with key being the name of repeating element and value being 

973 list of elements or attribute names that are descendants of the repeated 

974 element. Note: If this option is used, it will replace ``xpath`` parsing 

975 and unlike xpath, descendants do not need to relate to each other but can 

976 exist anywhere in the document under the repeating element. This memory- 

977 efficient method should be used for very large XML files (500MB, 1GB, or 5GB+). 

978 For example, :: 

979 

980 iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}} 

981 

982 .. versionadded:: 1.5.0 

983 

984 {decompression_options} 

985 

986 .. versionchanged:: 1.4.0 Zstandard support. 

987 

988 {storage_options} 

989 

990 dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames 

991 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy 

992 arrays. When "numpy_nullable" is set, nullable dtypes are used for all 

993 dtypes that have a nullable implementation; when "pyarrow" is set, 

994 pyarrow is used for all dtypes. 

995 

996 The dtype_backends are still experimental. 

997 

998 .. versionadded:: 2.0 

999 

1000 Returns 

1001 ------- 

1002 df 

1003 A DataFrame. 

1004 

1005 See Also 

1006 -------- 

1007 read_json : Convert a JSON string to pandas object. 

1008 read_html : Read HTML tables into a list of DataFrame objects. 

1009 

1010 Notes 

1011 ----- 

1012 This method is best designed to import shallow XML documents in the 

1013 following format, which is the ideal fit for the two dimensions of a 

1014 ``DataFrame`` (row by column). :: 

1015 

1016 <root> 

1017 <row> 

1018 <column1>data</column1> 

1019 <column2>data</column2> 

1020 <column3>data</column3> 

1021 ... 

1022 </row> 

1023 <row> 

1024 ... 

1025 </row> 

1026 ... 

1027 </root> 

1028 

1029 As a file format, XML documents can be designed any way including 

1030 layout of elements and attributes as long as they conform to W3C 

1031 specifications. Therefore, this method is a convenience handler for 

1032 a specific flatter design and not all possible XML structures. 

1033 

1034 However, for more complex XML documents, ``stylesheet`` allows you to 

1035 temporarily redesign the original document with XSLT (a special-purpose 

1036 language) into a flatter version for migration to a DataFrame. 

1037 

1038 This function will *always* return a single :class:`DataFrame` or raise 

1039 exceptions due to issues with XML document, ``xpath``, or other 

1040 parameters. 

1041 

1042 See the :ref:`read_xml documentation in the IO section of the docs 

1043 <io.read_xml>` for more information on using this method to parse XML 

1044 files to DataFrames. 

1045 

1046 Examples 

1047 -------- 

1048 >>> xml = '''<?xml version='1.0' encoding='utf-8'?> 

1049 ... <data xmlns="http://example.com"> 

1050 ... <row> 

1051 ... <shape>square</shape> 

1052 ... <degrees>360</degrees> 

1053 ... <sides>4.0</sides> 

1054 ... </row> 

1055 ... <row> 

1056 ... <shape>circle</shape> 

1057 ... <degrees>360</degrees> 

1058 ... <sides/> 

1059 ... </row> 

1060 ... <row> 

1061 ... <shape>triangle</shape> 

1062 ... <degrees>180</degrees> 

1063 ... <sides>3.0</sides> 

1064 ... </row> 

1065 ... </data>''' 

1066 

1067 >>> df = pd.read_xml(xml) 

1068 >>> df 

1069 shape degrees sides 

1070 0 square 360 4.0 

1071 1 circle 360 NaN 

1072 2 triangle 180 3.0 

1073 

1074 >>> xml = '''<?xml version='1.0' encoding='utf-8'?> 

1075 ... <data> 

1076 ... <row shape="square" degrees="360" sides="4.0"/> 

1077 ... <row shape="circle" degrees="360"/> 

1078 ... <row shape="triangle" degrees="180" sides="3.0"/> 

1079 ... </data>''' 

1080 

1081 >>> df = pd.read_xml(xml, xpath=".//row") 

1082 >>> df 

1083 shape degrees sides 

1084 0 square 360 4.0 

1085 1 circle 360 NaN 

1086 2 triangle 180 3.0 

1087 

1088 >>> xml = '''<?xml version='1.0' encoding='utf-8'?> 

1089 ... <doc:data xmlns:doc="https://example.com"> 

1090 ... <doc:row> 

1091 ... <doc:shape>square</doc:shape> 

1092 ... <doc:degrees>360</doc:degrees> 

1093 ... <doc:sides>4.0</doc:sides> 

1094 ... </doc:row> 

1095 ... <doc:row> 

1096 ... <doc:shape>circle</doc:shape> 

1097 ... <doc:degrees>360</doc:degrees> 

1098 ... <doc:sides/> 

1099 ... </doc:row> 

1100 ... <doc:row> 

1101 ... <doc:shape>triangle</doc:shape> 

1102 ... <doc:degrees>180</doc:degrees> 

1103 ... <doc:sides>3.0</doc:sides> 

1104 ... </doc:row> 

1105 ... </doc:data>''' 

1106 

1107 >>> df = pd.read_xml(xml, 

1108 ... xpath="//doc:row", 

1109 ... namespaces={{"doc": "https://example.com"}}) 

1110 >>> df 

1111 shape degrees sides 

1112 0 square 360 4.0 

1113 1 circle 360 NaN 

1114 2 triangle 180 3.0 

1115 """ 

1116 check_dtype_backend(dtype_backend) 

1117 

1118 return _parse( 

1119 path_or_buffer=path_or_buffer, 

1120 xpath=xpath, 

1121 namespaces=namespaces, 

1122 elems_only=elems_only, 

1123 attrs_only=attrs_only, 

1124 names=names, 

1125 dtype=dtype, 

1126 converters=converters, 

1127 parse_dates=parse_dates, 

1128 encoding=encoding, 

1129 parser=parser, 

1130 stylesheet=stylesheet, 

1131 iterparse=iterparse, 

1132 compression=compression, 

1133 storage_options=storage_options, 

1134 dtype_backend=dtype_backend, 

1135 )
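# Hedged usage sketch (the file name and element names are hypothetical): for very
# large documents, iterparse replaces xpath-based parsing, e.g.
#
#   df = pd.read_xml(
#       "very_large.xml",
#       iterparse={"row": ["shape", "degrees", "sides"]},
#       parser="lxml",
#   )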