Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/element.py: 25%

954 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1# Use of this source code is governed by the MIT license. 

2__license__ = "MIT" 

3 

4try: 

5 from collections.abc import Callable # Python 3.6 

6except ImportError as e: 

7 from collections import Callable 

8import re 

9import sys 

10import warnings 

11 

12from bs4.css import CSS 

13from bs4.formatter import ( 

14 Formatter, 

15 HTMLFormatter, 

16 XMLFormatter, 

17) 

18 

# Default encoding name. The code that consumes this constant is not
# in this part of the file.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches a run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")

26 

27def _alias(attr): 

28 """Alias one attribute name to another for backward compatibility""" 

29 @property 

30 def alias(self): 

31 return getattr(self, attr) 

32 

33 @alias.setter 

34 def alias(self): 

35 return setattr(self, attr) 

36 return alias 

37 

38 

39# These encodings are recognized by Python (so PageElement.encode 

40# could theoretically support them) but XML and HTML don't recognize 

41# them (so they should not show up in an XML or HTML document as that 

42# document's encoding). 

43# 

44# If an XML document is encoded in one of these encodings, no encoding 

45# will be mentioned in the XML declaration. If an HTML document is 

46# encoded in one of these encodings, and the HTML document has a 

47# <meta> tag that mentions an encoding, the encoding will be given as 

48# the empty string. 

49# 

50# Source: 

51# https://docs.python.org/3/library/codecs.html#python-specific-encodings 

# Encoding names that are checked before an encoding is written into a
# <meta> attribute value (see the encode() methods below).
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "raw-unicode-escape",
    "string_escape",
    "string-escape",
    "undefined",
    "unicode_escape",
    "unicode-escape",
}

66 

67 

class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the prefix ('xml'),
    the local name ('lang') and the namespace it was built from.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string value from `prefix` and `name`.

        :param prefix: The namespace prefix; may be empty or None.
        :param name: The local name. Any false value is stored as None
            (the default namespace's name "has no value" per
            https://www.w3.org/TR/xml-names/#defaulting).
        :param namespace: Stored unchanged on the new object.
        """
        # Normalise every "empty" name to None.
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # Not really namespaced.
            text = name

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj

90 

class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML."""
    # Adds no behavior to str here; CharsetMetaAttributeValue and
    # ContentMetaAttributeValue subclass it and define encode().

93 

class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        """Create the string and remember the value it was built from."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the text to use for 'charset' when the document is
        encoded as `encoding`.

        :param encoding: Name of the target encoding.
        :return: The empty string if `encoding` is in
            PYTHON_SPECIFIC_ENCODINGS, otherwise `encoding` itself.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding

113 

114 

class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset=";
    # group 3 is the charset value itself.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Return an instance of this class only if `original_value`
        contains a charset declaration; otherwise return a plain str.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced by
        `encoding`.

        :param encoding: Name of the target encoding.
        :return: The empty string if `encoding` is in
            PYTHON_SPECIFIC_ENCODINGS; otherwise the original value
            with each charset value replaced by `encoding`.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value
        )

142 

143 

144class PageElement(object): 

145 """Contains the navigational information for some part of the page: 

146 that is, its current location in the parse tree. 

147 

148 NavigableString, Tag, etc. are all subclasses of PageElement. 

149 """ 

150 

151 # In general, we can't tell just by looking at an element whether 

152 # it's contained in an XML document or an HTML document. But for 

153 # Tags (q.v.) we can store this information at parse time. 

154 known_xml = None 

155 

156 def setup(self, parent=None, previous_element=None, next_element=None, 

157 previous_sibling=None, next_sibling=None): 

158 """Sets up the initial relations between this element and 

159 other elements. 

160 

161 :param parent: The parent of this element. 

162 

163 :param previous_element: The element parsed immediately before 

164 this one. 

165 

166 :param next_element: The element parsed immediately before 

167 this one. 

168 

169 :param previous_sibling: The most recently encountered element 

170 on the same level of the parse tree as this one. 

171 

172 :param previous_sibling: The next element to be encountered 

173 on the same level of the parse tree as this one. 

174 """ 

175 self.parent = parent 

176 

177 self.previous_element = previous_element 

178 if previous_element is not None: 

179 self.previous_element.next_element = self 

180 

181 self.next_element = next_element 

182 if self.next_element is not None: 

183 self.next_element.previous_element = self 

184 

185 self.next_sibling = next_sibling 

186 if self.next_sibling is not None: 

187 self.next_sibling.previous_sibling = self 

188 

189 if (previous_sibling is None 

190 and self.parent is not None and self.parent.contents): 

191 previous_sibling = self.parent.contents[-1] 

192 

193 self.previous_sibling = previous_sibling 

194 if previous_sibling is not None: 

195 self.previous_sibling.next_sibling = self 

196 

197 def format_string(self, s, formatter): 

198 """Format the given string using the given formatter. 

199 

200 :param s: A string. 

201 :param formatter: A Formatter object, or a string naming one of the standard formatters. 

202 """ 

203 if formatter is None: 

204 return s 

205 if not isinstance(formatter, Formatter): 

206 formatter = self.formatter_for_name(formatter) 

207 output = formatter.substitute(s) 

208 return output 

209 

210 def formatter_for_name(self, formatter): 

211 """Look up or create a Formatter for the given identifier, 

212 if necessary. 

213 

214 :param formatter: Can be a Formatter object (used as-is), a 

215 function (used as the entity substitution hook for an 

216 XMLFormatter or HTMLFormatter), or a string (used to look 

217 up an XMLFormatter or HTMLFormatter in the appropriate 

218 registry. 

219 """ 

220 if isinstance(formatter, Formatter): 

221 return formatter 

222 if self._is_xml: 

223 c = XMLFormatter 

224 else: 

225 c = HTMLFormatter 

226 if isinstance(formatter, Callable): 

227 return c(entity_substitution=formatter) 

228 return c.REGISTRY[formatter] 

229 

230 @property 

231 def _is_xml(self): 

232 """Is this element part of an XML tree or an HTML tree? 

233 

234 This is used in formatter_for_name, when deciding whether an 

235 XMLFormatter or HTMLFormatter is more appropriate. It can be 

236 inefficient, but it should be called very rarely. 

237 """ 

238 if self.known_xml is not None: 

239 # Most of the time we will have determined this when the 

240 # document is parsed. 

241 return self.known_xml 

242 

243 # Otherwise, it's likely that this element was created by 

244 # direct invocation of the constructor from within the user's 

245 # Python code. 

246 if self.parent is None: 

247 # This is the top-level object. It should have .known_xml set 

248 # from tree creation. If not, take a guess--BS is usually 

249 # used on HTML markup. 

250 return getattr(self, 'is_xml', False) 

251 return self.parent._is_xml 

252 

253 nextSibling = _alias("next_sibling") # BS3 

254 previousSibling = _alias("previous_sibling") # BS3 

255 

256 default = object() 

257 def _all_strings(self, strip=False, types=default): 

258 """Yield all strings of certain classes, possibly stripping them. 

259 

260 This is implemented differently in Tag and NavigableString. 

261 """ 

262 raise NotImplementedError() 

263 

264 @property 

265 def stripped_strings(self): 

266 """Yield all strings in this PageElement, stripping them first. 

267 

268 :yield: A sequence of stripped strings. 

269 """ 

270 for string in self._all_strings(True): 

271 yield string 

272 

273 def get_text(self, separator="", strip=False, 

274 types=default): 

275 """Get all child strings of this PageElement, concatenated using the 

276 given separator. 

277 

278 :param separator: Strings will be concatenated using this separator. 

279 

280 :param strip: If True, strings will be stripped before being 

281 concatenated. 

282 

283 :param types: A tuple of NavigableString subclasses. Any 

284 strings of a subclass not found in this list will be 

285 ignored. Although there are exceptions, the default 

286 behavior in most cases is to consider only NavigableString 

287 and CData objects. That means no comments, processing 

288 instructions, etc. 

289 

290 :return: A string. 

291 """ 

292 return separator.join([s for s in self._all_strings( 

293 strip, types=types)]) 

294 getText = get_text 

295 text = property(get_text) 

296 

297 def replace_with(self, *args): 

298 """Replace this PageElement with one or more PageElements, keeping the 

299 rest of the tree the same. 

300 

301 :param args: One or more PageElements. 

302 :return: `self`, no longer part of the tree. 

303 """ 

304 if self.parent is None: 

305 raise ValueError( 

306 "Cannot replace one element with another when the " 

307 "element to be replaced is not part of a tree.") 

308 if len(args) == 1 and args[0] is self: 

309 return 

310 if any(x is self.parent for x in args): 

311 raise ValueError("Cannot replace a Tag with its parent.") 

312 old_parent = self.parent 

313 my_index = self.parent.index(self) 

314 self.extract(_self_index=my_index) 

315 for idx, replace_with in enumerate(args, start=my_index): 

316 old_parent.insert(idx, replace_with) 

317 return self 

318 replaceWith = replace_with # BS3 

319 

320 def unwrap(self): 

321 """Replace this PageElement with its contents. 

322 

323 :return: `self`, no longer part of the tree. 

324 """ 

325 my_parent = self.parent 

326 if self.parent is None: 

327 raise ValueError( 

328 "Cannot replace an element with its contents when that" 

329 "element is not part of a tree.") 

330 my_index = self.parent.index(self) 

331 self.extract(_self_index=my_index) 

332 for child in reversed(self.contents[:]): 

333 my_parent.insert(my_index, child) 

334 return self 

335 replace_with_children = unwrap 

336 replaceWithChildren = unwrap # BS3 

337 

338 def wrap(self, wrap_inside): 

339 """Wrap this PageElement inside another one. 

340 

341 :param wrap_inside: A PageElement. 

342 :return: `wrap_inside`, occupying the position in the tree that used 

343 to be occupied by `self`, and with `self` inside it. 

344 """ 

345 me = self.replace_with(wrap_inside) 

346 wrap_inside.append(me) 

347 return wrap_inside 

348 

349 def extract(self, _self_index=None): 

350 """Destructively rips this element out of the tree. 

351 

352 :param _self_index: The location of this element in its parent's 

353 .contents, if known. Passing this in allows for a performance 

354 optimization. 

355 

356 :return: `self`, no longer part of the tree. 

357 """ 

358 if self.parent is not None: 

359 if _self_index is None: 

360 _self_index = self.parent.index(self) 

361 del self.parent.contents[_self_index] 

362 

363 #Find the two elements that would be next to each other if 

364 #this element (and any children) hadn't been parsed. Connect 

365 #the two. 

366 last_child = self._last_descendant() 

367 next_element = last_child.next_element 

368 

369 if (self.previous_element is not None and 

370 self.previous_element is not next_element): 

371 self.previous_element.next_element = next_element 

372 if next_element is not None and next_element is not self.previous_element: 

373 next_element.previous_element = self.previous_element 

374 self.previous_element = None 

375 last_child.next_element = None 

376 

377 self.parent = None 

378 if (self.previous_sibling is not None 

379 and self.previous_sibling is not self.next_sibling): 

380 self.previous_sibling.next_sibling = self.next_sibling 

381 if (self.next_sibling is not None 

382 and self.next_sibling is not self.previous_sibling): 

383 self.next_sibling.previous_sibling = self.previous_sibling 

384 self.previous_sibling = self.next_sibling = None 

385 return self 

386 

387 def _last_descendant(self, is_initialized=True, accept_self=True): 

388 """Finds the last element beneath this object to be parsed. 

389 

390 :param is_initialized: Has `setup` been called on this PageElement 

391 yet? 

392 :param accept_self: Is `self` an acceptable answer to the question? 

393 """ 

394 if is_initialized and self.next_sibling is not None: 

395 last_child = self.next_sibling.previous_element 

396 else: 

397 last_child = self 

398 while isinstance(last_child, Tag) and last_child.contents: 

399 last_child = last_child.contents[-1] 

400 if not accept_self and last_child is self: 

401 last_child = None 

402 return last_child 

403 # BS3: Not part of the API! 

404 _lastRecursiveChild = _last_descendant 

405 

406 def insert(self, position, new_child): 

407 """Insert a new PageElement in the list of this PageElement's children. 

408 

409 This works the same way as `list.insert`. 

410 

411 :param position: The numeric position that should be occupied 

412 in `self.children` by the new PageElement. 

413 :param new_child: A PageElement. 

414 """ 

415 if new_child is None: 

416 raise ValueError("Cannot insert None into a tag.") 

417 if new_child is self: 

418 raise ValueError("Cannot insert a tag into itself.") 

419 if (isinstance(new_child, str) 

420 and not isinstance(new_child, NavigableString)): 

421 new_child = NavigableString(new_child) 

422 

423 from bs4 import BeautifulSoup 

424 if isinstance(new_child, BeautifulSoup): 

425 # We don't want to end up with a situation where one BeautifulSoup 

426 # object contains another. Insert the children one at a time. 

427 for subchild in list(new_child.contents): 

428 self.insert(position, subchild) 

429 position += 1 

430 return 

431 position = min(position, len(self.contents)) 

432 if hasattr(new_child, 'parent') and new_child.parent is not None: 

433 # We're 'inserting' an element that's already one 

434 # of this object's children. 

435 if new_child.parent is self: 

436 current_index = self.index(new_child) 

437 if current_index < position: 

438 # We're moving this element further down the list 

439 # of this object's children. That means that when 

440 # we extract this element, our target index will 

441 # jump down one. 

442 position -= 1 

443 new_child.extract() 

444 

445 new_child.parent = self 

446 previous_child = None 

447 if position == 0: 

448 new_child.previous_sibling = None 

449 new_child.previous_element = self 

450 else: 

451 previous_child = self.contents[position - 1] 

452 new_child.previous_sibling = previous_child 

453 new_child.previous_sibling.next_sibling = new_child 

454 new_child.previous_element = previous_child._last_descendant(False) 

455 if new_child.previous_element is not None: 

456 new_child.previous_element.next_element = new_child 

457 

458 new_childs_last_element = new_child._last_descendant(False) 

459 

460 if position >= len(self.contents): 

461 new_child.next_sibling = None 

462 

463 parent = self 

464 parents_next_sibling = None 

465 while parents_next_sibling is None and parent is not None: 

466 parents_next_sibling = parent.next_sibling 

467 parent = parent.parent 

468 if parents_next_sibling is not None: 

469 # We found the element that comes next in the document. 

470 break 

471 if parents_next_sibling is not None: 

472 new_childs_last_element.next_element = parents_next_sibling 

473 else: 

474 # The last element of this tag is the last element in 

475 # the document. 

476 new_childs_last_element.next_element = None 

477 else: 

478 next_child = self.contents[position] 

479 new_child.next_sibling = next_child 

480 if new_child.next_sibling is not None: 

481 new_child.next_sibling.previous_sibling = new_child 

482 new_childs_last_element.next_element = next_child 

483 

484 if new_childs_last_element.next_element is not None: 

485 new_childs_last_element.next_element.previous_element = new_childs_last_element 

486 self.contents.insert(position, new_child) 

487 

488 def append(self, tag): 

489 """Appends the given PageElement to the contents of this one. 

490 

491 :param tag: A PageElement. 

492 """ 

493 self.insert(len(self.contents), tag) 

494 

495 def extend(self, tags): 

496 """Appends the given PageElements to this one's contents. 

497 

498 :param tags: A list of PageElements. If a single Tag is 

499 provided instead, this PageElement's contents will be extended 

500 with that Tag's contents. 

501 """ 

502 if isinstance(tags, Tag): 

503 tags = tags.contents 

504 if isinstance(tags, list): 

505 # Moving items around the tree may change their position in 

506 # the original list. Make a list that won't change. 

507 tags = list(tags) 

508 for tag in tags: 

509 self.append(tag) 

510 

511 def insert_before(self, *args): 

512 """Makes the given element(s) the immediate predecessor of this one. 

513 

514 All the elements will have the same parent, and the given elements 

515 will be immediately before this one. 

516 

517 :param args: One or more PageElements. 

518 """ 

519 parent = self.parent 

520 if parent is None: 

521 raise ValueError( 

522 "Element has no parent, so 'before' has no meaning.") 

523 if any(x is self for x in args): 

524 raise ValueError("Can't insert an element before itself.") 

525 for predecessor in args: 

526 # Extract first so that the index won't be screwed up if they 

527 # are siblings. 

528 if isinstance(predecessor, PageElement): 

529 predecessor.extract() 

530 index = parent.index(self) 

531 parent.insert(index, predecessor) 

532 

533 def insert_after(self, *args): 

534 """Makes the given element(s) the immediate successor of this one. 

535 

536 The elements will have the same parent, and the given elements 

537 will be immediately after this one. 

538 

539 :param args: One or more PageElements. 

540 """ 

541 # Do all error checking before modifying the tree. 

542 parent = self.parent 

543 if parent is None: 

544 raise ValueError( 

545 "Element has no parent, so 'after' has no meaning.") 

546 if any(x is self for x in args): 

547 raise ValueError("Can't insert an element after itself.") 

548 

549 offset = 0 

550 for successor in args: 

551 # Extract first so that the index won't be screwed up if they 

552 # are siblings. 

553 if isinstance(successor, PageElement): 

554 successor.extract() 

555 index = parent.index(self) 

556 parent.insert(index+1+offset, successor) 

557 offset += 1 

558 

559 def find_next(self, name=None, attrs={}, string=None, **kwargs): 

560 """Find the first PageElement that matches the given criteria and 

561 appears later in the document than this PageElement. 

562 

563 All find_* methods take a common set of arguments. See the online 

564 documentation for detailed explanations. 

565 

566 :param name: A filter on tag name. 

567 :param attrs: A dictionary of filters on attribute values. 

568 :param string: A filter for a NavigableString with specific text. 

569 :kwargs: A dictionary of filters on attribute values. 

570 :return: A PageElement. 

571 :rtype: bs4.element.Tag | bs4.element.NavigableString 

572 """ 

573 return self._find_one(self.find_all_next, name, attrs, string, **kwargs) 

574 findNext = find_next # BS3 

575 

576 def find_all_next(self, name=None, attrs={}, string=None, limit=None, 

577 **kwargs): 

578 """Find all PageElements that match the given criteria and appear 

579 later in the document than this PageElement. 

580 

581 All find_* methods take a common set of arguments. See the online 

582 documentation for detailed explanations. 

583 

584 :param name: A filter on tag name. 

585 :param attrs: A dictionary of filters on attribute values. 

586 :param string: A filter for a NavigableString with specific text. 

587 :param limit: Stop looking after finding this many results. 

588 :kwargs: A dictionary of filters on attribute values. 

589 :return: A ResultSet containing PageElements. 

590 """ 

591 _stacklevel = kwargs.pop('_stacklevel', 2) 

592 return self._find_all(name, attrs, string, limit, self.next_elements, 

593 _stacklevel=_stacklevel+1, **kwargs) 

594 findAllNext = find_all_next # BS3 

595 

596 def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): 

597 """Find the closest sibling to this PageElement that matches the 

598 given criteria and appears later in the document. 

599 

600 All find_* methods take a common set of arguments. See the 

601 online documentation for detailed explanations. 

602 

603 :param name: A filter on tag name. 

604 :param attrs: A dictionary of filters on attribute values. 

605 :param string: A filter for a NavigableString with specific text. 

606 :kwargs: A dictionary of filters on attribute values. 

607 :return: A PageElement. 

608 :rtype: bs4.element.Tag | bs4.element.NavigableString 

609 """ 

610 return self._find_one(self.find_next_siblings, name, attrs, string, 

611 **kwargs) 

612 findNextSibling = find_next_sibling # BS3 

613 

614 def find_next_siblings(self, name=None, attrs={}, string=None, limit=None, 

615 **kwargs): 

616 """Find all siblings of this PageElement that match the given criteria 

617 and appear later in the document. 

618 

619 All find_* methods take a common set of arguments. See the online 

620 documentation for detailed explanations. 

621 

622 :param name: A filter on tag name. 

623 :param attrs: A dictionary of filters on attribute values. 

624 :param string: A filter for a NavigableString with specific text. 

625 :param limit: Stop looking after finding this many results. 

626 :kwargs: A dictionary of filters on attribute values. 

627 :return: A ResultSet of PageElements. 

628 :rtype: bs4.element.ResultSet 

629 """ 

630 _stacklevel = kwargs.pop('_stacklevel', 2) 

631 return self._find_all( 

632 name, attrs, string, limit, 

633 self.next_siblings, _stacklevel=_stacklevel+1, **kwargs 

634 ) 

635 findNextSiblings = find_next_siblings # BS3 

636 fetchNextSiblings = find_next_siblings # BS2 

637 

638 def find_previous(self, name=None, attrs={}, string=None, **kwargs): 

639 """Look backwards in the document from this PageElement and find the 

640 first PageElement that matches the given criteria. 

641 

642 All find_* methods take a common set of arguments. See the online 

643 documentation for detailed explanations. 

644 

645 :param name: A filter on tag name. 

646 :param attrs: A dictionary of filters on attribute values. 

647 :param string: A filter for a NavigableString with specific text. 

648 :kwargs: A dictionary of filters on attribute values. 

649 :return: A PageElement. 

650 :rtype: bs4.element.Tag | bs4.element.NavigableString 

651 """ 

652 return self._find_one( 

653 self.find_all_previous, name, attrs, string, **kwargs) 

654 findPrevious = find_previous # BS3 

655 

656 def find_all_previous(self, name=None, attrs={}, string=None, limit=None, 

657 **kwargs): 

658 """Look backwards in the document from this PageElement and find all 

659 PageElements that match the given criteria. 

660 

661 All find_* methods take a common set of arguments. See the online 

662 documentation for detailed explanations. 

663 

664 :param name: A filter on tag name. 

665 :param attrs: A dictionary of filters on attribute values. 

666 :param string: A filter for a NavigableString with specific text. 

667 :param limit: Stop looking after finding this many results. 

668 :kwargs: A dictionary of filters on attribute values. 

669 :return: A ResultSet of PageElements. 

670 :rtype: bs4.element.ResultSet 

671 """ 

672 _stacklevel = kwargs.pop('_stacklevel', 2) 

673 return self._find_all( 

674 name, attrs, string, limit, self.previous_elements, 

675 _stacklevel=_stacklevel+1, **kwargs 

676 ) 

677 findAllPrevious = find_all_previous # BS3 

678 fetchPrevious = find_all_previous # BS2 

679 

680 def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs): 

681 """Returns the closest sibling to this PageElement that matches the 

682 given criteria and appears earlier in the document. 

683 

684 All find_* methods take a common set of arguments. See the online 

685 documentation for detailed explanations. 

686 

687 :param name: A filter on tag name. 

688 :param attrs: A dictionary of filters on attribute values. 

689 :param string: A filter for a NavigableString with specific text. 

690 :kwargs: A dictionary of filters on attribute values. 

691 :return: A PageElement. 

692 :rtype: bs4.element.Tag | bs4.element.NavigableString 

693 """ 

694 return self._find_one(self.find_previous_siblings, name, attrs, string, 

695 **kwargs) 

696 findPreviousSibling = find_previous_sibling # BS3 

697 

698 def find_previous_siblings(self, name=None, attrs={}, string=None, 

699 limit=None, **kwargs): 

700 """Returns all siblings to this PageElement that match the 

701 given criteria and appear earlier in the document. 

702 

703 All find_* methods take a common set of arguments. See the online 

704 documentation for detailed explanations. 

705 

706 :param name: A filter on tag name. 

707 :param attrs: A dictionary of filters on attribute values. 

708 :param string: A filter for a NavigableString with specific text. 

709 :param limit: Stop looking after finding this many results. 

710 :kwargs: A dictionary of filters on attribute values. 

711 :return: A ResultSet of PageElements. 

712 :rtype: bs4.element.ResultSet 

713 """ 

714 _stacklevel = kwargs.pop('_stacklevel', 2) 

715 return self._find_all( 

716 name, attrs, string, limit, 

717 self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs 

718 ) 

719 findPreviousSiblings = find_previous_siblings # BS3 

720 fetchPreviousSiblings = find_previous_siblings # BS2 

721 

722 def find_parent(self, name=None, attrs={}, **kwargs): 

723 """Find the closest parent of this PageElement that matches the given 

724 criteria. 

725 

726 All find_* methods take a common set of arguments. See the online 

727 documentation for detailed explanations. 

728 

729 :param name: A filter on tag name. 

730 :param attrs: A dictionary of filters on attribute values. 

731 :kwargs: A dictionary of filters on attribute values. 

732 

733 :return: A PageElement. 

734 :rtype: bs4.element.Tag | bs4.element.NavigableString 

735 """ 

736 # NOTE: We can't use _find_one because findParents takes a different 

737 # set of arguments. 

738 r = None 

739 l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs) 

740 if l: 

741 r = l[0] 

742 return r 

743 findParent = find_parent # BS3 

744 

745 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): 

746 """Find all parents of this PageElement that match the given criteria. 

747 

748 All find_* methods take a common set of arguments. See the online 

749 documentation for detailed explanations. 

750 

751 :param name: A filter on tag name. 

752 :param attrs: A dictionary of filters on attribute values. 

753 :param limit: Stop looking after finding this many results. 

754 :kwargs: A dictionary of filters on attribute values. 

755 

756 :return: A PageElement. 

757 :rtype: bs4.element.Tag | bs4.element.NavigableString 

758 """ 

759 _stacklevel = kwargs.pop('_stacklevel', 2) 

760 return self._find_all(name, attrs, None, limit, self.parents, 

761 _stacklevel=_stacklevel+1, **kwargs) 

762 findParents = find_parents # BS3 

763 fetchParents = find_parents # BS2 

764 

765 @property 

766 def next(self): 

767 """The PageElement, if any, that was parsed just after this one. 

768 

769 :return: A PageElement. 

770 :rtype: bs4.element.Tag | bs4.element.NavigableString 

771 """ 

772 return self.next_element 

773 

774 @property 

775 def previous(self): 

776 """The PageElement, if any, that was parsed just before this one. 

777 

778 :return: A PageElement. 

779 :rtype: bs4.element.Tag | bs4.element.NavigableString 

780 """ 

781 return self.previous_element 

782 

783 #These methods do the real heavy lifting. 

784 

785 def _find_one(self, method, name, attrs, string, **kwargs): 

786 r = None 

787 l = method(name, attrs, string, 1, _stacklevel=4, **kwargs) 

788 if l: 

789 r = l[0] 

790 return r 

791 

792 def _find_all(self, name, attrs, string, limit, generator, **kwargs): 

793 "Iterates over a generator looking for things that match." 

794 _stacklevel = kwargs.pop('_stacklevel', 3) 

795 

796 if string is None and 'text' in kwargs: 

797 string = kwargs.pop('text') 

798 warnings.warn( 

799 "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.", 

800 DeprecationWarning, stacklevel=_stacklevel 

801 ) 

802 

803 if isinstance(name, SoupStrainer): 

804 strainer = name 

805 else: 

806 strainer = SoupStrainer(name, attrs, string, **kwargs) 

807 

808 if string is None and not limit and not attrs and not kwargs: 

809 if name is True or name is None: 

810 # Optimization to find all tags. 

811 result = (element for element in generator 

812 if isinstance(element, Tag)) 

813 return ResultSet(strainer, result) 

814 elif isinstance(name, str): 

815 # Optimization to find all tags with a given name. 

816 if name.count(':') == 1: 

817 # This is a name with a prefix. If this is a namespace-aware document, 

818 # we need to match the local name against tag.name. If not, 

819 # we need to match the fully-qualified name against tag.name. 

820 prefix, local_name = name.split(':', 1) 

821 else: 

822 prefix = None 

823 local_name = name 

824 result = (element for element in generator 

825 if isinstance(element, Tag) 

826 and ( 

827 element.name == name 

828 ) or ( 

829 element.name == local_name 

830 and (prefix is None or element.prefix == prefix) 

831 ) 

832 ) 

833 return ResultSet(strainer, result) 

834 results = ResultSet(strainer) 

835 while True: 

836 try: 

837 i = next(generator) 

838 except StopIteration: 

839 break 

840 if i: 

841 found = strainer.search(i) 

842 if found: 

843 results.append(found) 

844 if limit and len(results) >= limit: 

845 break 

846 return results 

847 

848 #These generators can be used to navigate starting from both 

849 #NavigableStrings and Tags. 

@property
def next_elements(self):
    """Iterate over every PageElement that was parsed after this one.

    Follows the ``next_element`` links, beginning with this element's
    successor, and stops at the first link that is None.

    :yield: A sequence of PageElements.
    """
    cursor = self.next_element
    while True:
        if cursor is None:
            return
        yield cursor
        cursor = cursor.next_element

860 

@property
def next_siblings(self):
    """Iterate over the siblings of this PageElement that were parsed
    after it.

    Follows the ``next_sibling`` links until one of them is None.

    :yield: A sequence of PageElements.
    """
    cursor = self.next_sibling
    while True:
        if cursor is None:
            return
        yield cursor
        cursor = cursor.next_sibling

872 

@property
def previous_elements(self):
    """Iterate over every PageElement that was parsed before this one.

    Follows the ``previous_element`` links until one of them is None.

    :yield: A sequence of PageElements.
    """
    cursor = self.previous_element
    while True:
        if cursor is None:
            return
        yield cursor
        cursor = cursor.previous_element

883 

@property
def previous_siblings(self):
    """Iterate over the siblings of this PageElement that were parsed
    before it.

    Follows the ``previous_sibling`` links until one of them is None.

    :yield: A sequence of PageElements.
    """
    cursor = self.previous_sibling
    while True:
        if cursor is None:
            return
        yield cursor
        cursor = cursor.previous_sibling

895 

@property
def parents(self):
    """Iterate over the ancestors of this PageElement, nearest first.

    Follows the ``parent`` links until one of them is None.

    :yield: A sequence of PageElements.
    """
    ancestor = self.parent
    while True:
        if ancestor is None:
            return
        yield ancestor
        ancestor = ancestor.parent

906 

@property
def decomposed(self):
    """Check whether this PageElement has been marked as decomposed.

    :rtype: bool
    """
    marker = getattr(self, '_decomposed', False)
    # Any falsy lookup result (not only a literal False) is reported
    # as a real False.
    if marker:
        return marker
    return False

914 

915 # Old non-property versions of the generators, for backwards 

916 # compatibility with BS3. 

def nextGenerator(self):
    """BS3-style method spelling of the ``next_elements`` property."""
    elements = self.next_elements
    return elements

919 

def nextSiblingGenerator(self):
    """BS3-style method spelling of the ``next_siblings`` property."""
    siblings = self.next_siblings
    return siblings

922 

def previousGenerator(self):
    """BS3-style method spelling of the ``previous_elements`` property."""
    elements = self.previous_elements
    return elements

925 

def previousSiblingGenerator(self):
    """BS3-style method spelling of the ``previous_siblings`` property."""
    siblings = self.previous_siblings
    return siblings

928 

def parentGenerator(self):
    """BS3-style method spelling of the ``parents`` property."""
    ancestors = self.parents
    return ancestors

931 

932 

class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    Parsing the markup <b>penguin</b> creates one NavigableString,
    holding the text "penguin".
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        A value that is not already a str (as happens when a
        NavigableString is unpickled) is decoded using
        DEFAULT_OUTPUT_ENCODING; without the encoding, str.__new__
        would not know how to handle non-ASCII characters.

        :param value: The text, either as a str or in a form that str
            can decode with DEFAULT_OUTPUT_ENCODING.
        """
        if not isinstance(value, str):
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        else:
            instance = str.__new__(cls, value)
        instance.setup()
        return instance

    def __deepcopy__(self, memo, recursive=False):
        """Return a new string with the same text and the same class,
        built from scratch rather than taken from the parse tree.

        :param memo: Not used by this method.
        :param recursive: Ignored; accepted only so that this method
            has the same signature as Tag.__deepcopy__.
        """
        same_class = type(self)
        return same_class(self)

    def __copy__(self):
        """Copying a NavigableString always means deep-copying it,
        because only one PageElement can occupy a given place in a
        parse tree.
        """
        fresh_memo = {}
        return self.__deepcopy__(fresh_memo)

    def __getnewargs__(self):
        """Return the plain-str form of this string as a 1-tuple."""
        plain = str(self)
        return (plain,)

    def __getattr__(self, attr):
        """Make text.string evaluate to text itself.

        This is for backwards compatibility for Navigable*String, but
        for CData* it lets you get the string without the CData
        wrapper.

        :raises AttributeError: For every attribute name other than
            'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter and wrap the
        result in this class's PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        with_prefix = self.PREFIX + formatted
        return with_prefix + self.SUFFIX

    @property
    def name(self):
        """A NavigableString is not a Tag, so its .name is always None.

        The property exists so that code like this doesn't crash when
        run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Refuse every attempt to set NavigableString.name.

        :raises AttributeError: Always.
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(self, strip=False, types=PageElement.default):
        """Yield this string if it is of a requested class, possibly
        stripping it first.

        This lets NavigableString offer the same text-extraction API
        (get_text() and friends) as the other PageElements.

        :param strip: If True, the string is stripped before being
            yielded.

        :param types: A NavigableString subclass or a tuple of them.
            If this string's exact class isn't one of them, nothing
            is yielded. By default, the classes considered are the
            ones in Tag.DEFAULT_INTERESTING_STRING_TYPES.

        :yield: A sequence that either contains this string, or is
            empty.
        """
        if types is self.default:
            # The default lives on Tag because it is full of subclasses
            # of this class, which aren't defined until later in the
            # file.
            types = Tag.DEFAULT_INTERESTING_STRING_TYPES

        # The exact class is compared instead of using isinstance(),
        # because all of the specialized string classes subclass
        # NavigableString. Someone asking for NavigableString probably
        # wants generic strings and not the other stuff.
        if types is not None:
            exact_class = type(self)
            if isinstance(types, type):
                # Looking for a single type.
                wanted = exact_class is types
            else:
                # Looking for one of a list of types.
                wanted = exact_class in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text
    strings = property(_all_strings)

1058 

class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by wrapping it in the
        subclass-specific prefix and suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. When given, the string is
            passed into the Formatter, but only to trigger any side
            effects: the return value is ignored.

        :return: The unformatted string, with any subclass-specific
            prefix and suffix added on.
        """
        if formatter is not None:
            # Called only for its side effects; the result is discarded.
            self.format_string(self, formatter)
        with_prefix = self.PREFIX + self
        return with_prefix + self.SUFFIX

1085 

class CData(PreformattedString):
    """A CDATA block, output between '<![CDATA[' and ']]>'."""

    PREFIX = "<![CDATA["
    SUFFIX = "]]>"

1090 

class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, output between '<?' and '>'."""

    PREFIX = "<?"
    SUFFIX = ">"

1096 

class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, output between '<?' and '?>'."""

    PREFIX = "<?"
    SUFFIX = "?>"

1101 

class Comment(PreformattedString):
    """An HTML or XML comment, output between '<!--' and '-->'."""

    PREFIX = "<!--"
    SUFFIX = "-->"

1106 

1107 

class Declaration(PreformattedString):
    """An XML declaration, output between '<?' and '?>'."""

    PREFIX = "<?"
    SUFFIX = "?>"

1112 

1113 

class Doctype(PreformattedString):
    """A document type declaration."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A falsy name is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: An instance of the class this method was called on
            (Doctype itself, or a subclass).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Build through cls rather than naming Doctype, so that calling
        # this alternate constructor on a subclass yields that subclass.
        return cls(value)

1141 

1142 

class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    """

1150 

1151 

class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    JavaScript).

    Used to distinguish executable code from textual content.
    """

1159 

1160 

class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    """

1168 

1169 

class RubyTextString(NavigableString):
    """A NavigableString representing the contents of the <rt> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

    Can be used to distinguish such strings from the strings they're
    annotating.
    """

1180 

1181 

class RubyParenthesisString(NavigableString):
    """A NavigableString representing the contents of the <rp> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
    """

1189 

1190 

1191class Tag(PageElement): 

1192 """Represents an HTML or XML tag that is part of a parse tree, along 

1193 with its attributes and contents. 

1194 

1195 When Beautiful Soup parses the markup <b>penguin</b>, it will 

1196 create a Tag object representing the <b> tag. 

1197 """ 

1198 

def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None,
             interesting_string_types=None,
             namespaces=None
):
    """Basic constructor.

    :param parser: A BeautifulSoup object. Only its class is kept.
    :param builder: A TreeBuilder. When given, it decides the values
        of can_be_empty_element, cdata_list_attributes,
        preserve_whitespace_tags and interesting_string_types, and
        the arguments of those names are ignored.
    :param name: The name of the tag.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved.
    :param interesting_string_types: This is a NavigableString
        subclass or a tuple of them. When iterating over this
        Tag's strings in methods like Tag.strings or Tag.get_text,
        these are the types of strings that are interesting enough
        to be considered. The default is to consider
        NavigableString and CData the only interesting string
        subtypes.
    :param namespaces: A dictionary mapping currently active
        namespace prefixes to URIs. This can be used later to
        construct CSS selectors.
    :raises ValueError: If `name` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix
    # The position attributes are only created when at least one of
    # them was supplied and the builder (if any) stores line numbers;
    # otherwise neither attribute is set on the instance at all.
    if ((not builder or builder.store_line_numbers)
        and (sourceline is not None or sourcepos is not None)):
        self.sourceline = sourceline
        self.sourcepos = sourcepos
    # Every branch below leaves `attrs` as a dictionary that is not
    # the caller's own object, except for whatever the builder's
    # replacement method chooses to return.
    if attrs is None:
        attrs = {}
    elif attrs:
        if builder is not None and builder.cdata_list_attributes:
            attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs)
        else:
            attrs = dict(attrs)
    else:
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings, so that subclass is
            # the string type considered interesting for this tag.
            self.interesting_string_types = builder.string_containers[self.name]
        else:
            self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES

1315 

# BS3 compatibility: the old camelCase name `parserClass` is kept as an
# alias for the `parser_class` attribute.
parserClass = _alias("parser_class")  # BS3

1317 

def __deepcopy__(self, memo, recursive=True):
    """A deepcopy of a Tag is a new Tag built by _clone().
    Its contents are a copy of the old Tag's contents.

    :param memo: Passed along to the __deepcopy__ of every descendant.
    :param recursive: If False, only this Tag is cloned and the clone
        is returned without any contents.
    """
    clone = self._clone()
    if not recursive:
        return clone

    # Clone this tag's descendants by walking a flat event stream, so
    # that no recursive function calls are needed. The last entry of
    # `open_tags` is the clone currently receiving children.
    open_tags = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # Stop appending incoming elements to the Tag that was
            # just closed.
            open_tags.pop()
            continue

        child_copy = element.__deepcopy__(memo, recursive=False)
        # Add to its parent's .contents
        open_tags[-1].append(child_copy)

        if event is Tag.START_ELEMENT_EVENT:
            # Put the new Tag on the stack so that its children will
            # be appended to it.
            open_tags.append(child_copy)
    return clone

1345 

def __copy__(self):
    """Copying a Tag always means deep-copying it, because a Tag's
    children can only have one parent at a time.
    """
    fresh_memo = {}
    return self.__deepcopy__(fresh_memo)

1351 

def _clone(self):
    """Create a new Tag just like this one, but with no
    contents and unattached to any parse tree.

    This is the first step in the deepcopy process.

    :return: A new object of the same class as this one, constructed
        without a parser, a parent or a previous element.
    """
    # `builder`, `sourceline` and `sourcepos` are not guaranteed to
    # exist as instance attributes (the constructor only sets the two
    # position attributes under certain conditions and never stores
    # the builder). Plain attribute access would then fall through to
    # Tag.__getattr__, which turns the name into a search for a child
    # tag, so a child called <builder> or <sourceline> would be
    # passed to the constructor by mistake. Reading the instance
    # dictionary avoids that fallback; a missing value becomes None.
    # NOTE(review): assumes none of these three names is defined at
    # class level elsewhere in the file -- confirm.
    own = self.__dict__
    clone = type(self)(
        None, own.get('builder'), self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=own.get('sourceline'),
        sourcepos=own.get('sourcepos'),
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types
    )
    # The constructor may have derived its own values for these, so
    # copy this Tag's values over explicitly.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    return clone

1370 

@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    For a tag without contents the answer is whatever was stored in
    `can_be_empty_element` when the tag was created. If a builder
    was used, it decided that value: a builder with a designated
    list of empty-element tags accepts only the names on that list,
    while a builder without such a list accepts any tag that has no
    contents.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3

1388 

@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    :return: If this element has a single string child, return
        value is that string. If this element has one child tag,
        return value is the 'string' attribute of the child tag,
        recursively. If this element has no children, or has more
        than one child, return value is None.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if not isinstance(only_child, NavigableString):
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    The existing children are removed with clear(), then a new
    object of `string`'s own class is built from it and appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)

1415 

# The string classes considered by default when a tag is asked for
# its strings.
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)

def _all_strings(self, strip=False, types=PageElement.default):
    """Yield all descendant strings of certain classes, possibly
    stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that end up empty are skipped.

    :param types: A NavigableString subclass or a tuple of them. Any
        string whose exact class is not among them is ignored. By
        default, the classes considered are the ones found in
        self.interesting_string_types. If `types` is None, every
        NavigableString is yielded, whatever its subclass.

    :yield: A sequence of strings.
    """
    if types is self.default:
        types = self.interesting_string_types

    single_type = isinstance(types, type)
    for node in self.descendants:
        if types is None:
            # No class filter: anything that is a string qualifies.
            if not isinstance(node, NavigableString):
                continue
        elif single_type:
            if type(node) is not types:
                # We're not interested in strings of this type.
                continue
        elif type(node) not in types:
            # We're not interested in strings of this type.
            continue

        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
strings = property(_all_strings)

1454 

def decompose(self):
    """Recursively destroys this PageElement and its children.

    The element is first removed from the tree with extract(). Then
    it, and every element reachable from it through the
    `next_element` links, has all of its instance attributes wiped
    out and is marked as decomposed.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    doomed = self
    while doomed is not None:
        # Remember the successor before the link to it is wiped out
        # along with every other instance attribute.
        successor = doomed.next_element
        doomed.__dict__.clear()
        doomed.contents = []
        doomed._decomposed = True
        doomed = successor

1474 

def clear(self, decompose=False):
    """Wipe out all children of this PageElement by calling extract()
    on them.

    :param decompose: If this is True, decompose() (a more
        destructive method) will be called instead of extract() on
        every child that is a Tag.
    """
    # Iterate over a copy, because removing a child changes
    # self.contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()

1491 

def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree. Child tags are smoothed
    recursively. PreformattedString objects are never merged.
    """
    def is_plain_string(node):
        # A string that may be merged with a neighbor.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every pair of children that need
    # to be consolidated. This is done rather than making a copy of
    # self.contents, since in most cases very few strings will be
    # affected.
    merge_points = []
    for position, child in enumerate(self.contents):
        if isinstance(child, Tag):
            # Recursively smooth children.
            child.smooth()
        if position + 1 == len(self.contents):
            # This is the last item in .contents, so it has no
            # following neighbor to be merged with.
            continue
        neighbor = self.contents[position + 1]
        if is_plain_string(child) and is_plain_string(neighbor):
            merge_points.append(position)

    # Go over the recorded positions in reverse order, so that
    # removing items from .contents won't affect the remaining
    # positions.
    for position in reversed(merge_points):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        first.replace_with(NavigableString(first + second))

1529 

def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :raises ValueError: If `element` is not one of the children.
    """
    position = 0
    for child in self.contents:
        if child is element:
            return position
        position += 1
    raise ValueError("Tag.index: element not in tag")

1542 

def get(self, key, default=None):
    """Look up the 'key' attribute of the tag.

    :return: The attribute's value, or `default` if the tag doesn't
        have that attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)

1548 

def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: A list of values, probably containing only a single
        value. A value that is not already a list is wrapped in a
        one-item list.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]

1562 

def has_attr(self, key):
    """Does this PageElement have an attribute with the given name?"""
    attributes = self.attrs
    return key in attributes

1566 

def __hash__(self):
    """Hash a Tag by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)

1569 

def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the Tag,
    and throws an exception if it's not there."""
    attributes = self.attrs
    return attributes[key]

1574 

def __iter__(self):
    "Iterating over a Tag iterates over its contents."
    children = self.contents
    return iter(children)

1578 

def __len__(self):
    "The length of a Tag is the length of its list of contents."
    children = self.contents
    return len(children)

1582 

def __contains__(self, x):
    "`x in tag` is true when x is found in the Tag's list of contents."
    children = self.contents
    return x in children

1585 

def __bool__(self):
    """A tag is always truthy, even if it has no contents.

    Without this method, truth testing would use __len__, and a tag
    with no contents would be falsy.
    """
    return True

1589 

def __setitem__(self, key, value):
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag."""
    attributes = self.attrs
    attributes[key] = value

1594 

def __delitem__(self, key):
    """Deleting tag[key] removes the 'key' attribute from the tag.

    Deleting an attribute the tag doesn't have is not an error.
    """
    attributes = self.attrs
    attributes.pop(key, None)

1598 

def __call__(self, *args, **kwargs):
    """Calling a Tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns whatever
    tag.find_all('a') returns."""
    search = self.find_all
    return search(*args, **kwargs)

1604 

def __getattr__(self, tag):
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    This is only reached for names that normal attribute lookup did
    not resolve.

    :param tag: The attribute name that was looked up.
    :return: The result of find() for that name. For the deprecated
        BS3 spelling ``.fooTag`` the search is for ``foo`` and a
        DeprecationWarning is issued.
    :raises AttributeError: If the name is "contents" or starts with
        two underscores (and does not use the ``...Tag`` spelling).
    """
    #print("Getattr %s.%s" % (self.__class__, tag))
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> "soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
                name=tag_name
            ),
            DeprecationWarning, stacklevel=2
        )
        return self.find(tag_name)
    # We special case contents to avoid recursion.
    elif not tag.startswith("__") and tag != "contents":
        return self.find(tag)
    # Report the class *name*, which is the conventional wording for
    # AttributeError; interpolating the class object itself would
    # produce "'<class ...>' object has no attribute ...".
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (
            self.__class__.__name__, tag))

1623 

def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`.

    :param other: Any object. One that lacks a `name`, `attrs` or
        `contents` attribute is never equal to a Tag.
    """
    if self is other:
        return True
    if not (hasattr(other, 'name')
            and hasattr(other, 'attrs')
            and hasattr(other, 'contents')):
        return False
    if self.name != other.name or self.attrs != other.attrs:
        return False
    # Compare the lengths of the two `contents` lists directly. The
    # checks above only establish that `other` has `contents`, not
    # that it supports len().
    if len(self.contents) != len(other.contents):
        return False
    for mine, theirs in zip(self.contents, other.contents):
        if mine != theirs:
            return False
    return True

1640 

def __ne__(self, other):
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    same = self == other
    return not same

1645 

def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
        TODO: This is now ignored and a warning should be issued
        if a value is provided.
    :return: A (Unicode) string.
    """
    # "The return value must be a string object", i.e. Unicode
    rendered = self.decode()
    return rendered

1656 

def __unicode__(self):
    """Renders this PageElement as a Unicode string."""
    rendered = self.decode()
    return rendered

# str() and repr() both use the rendering above; this assignment
# replaces any __repr__ defined earlier in the class body.
__str__ = __repr__ = __unicode__

1662 

def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.
    """
    # Render the data structure as Unicode first, then encode that
    # Unicode in one step.
    return self.decode(indent_level, encoding, formatter).encode(
        encoding, errors
    )

1688 

def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal",
           iterator=None):
    """Render this element and its contents as a single string.

    The rendering walks the events produced by _event_stream() and
    joins one formatted piece per event, so no recursive calls are
    made.

    :param indent_level: None turns pretty-printing off. Otherwise
        this is the starting indentation level; True is treated as
        level 0.
    :param eventual_encoding: Passed along to _format_tag() for
        every opening and closing tag.
    :param formatter: A Formatter object, or a value that
        formatter_for_name() can turn into one.
    :param iterator: Passed along to _event_stream().
    :return: The concatenation of all formatted pieces.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            piece = element._format_tag(
                eventual_encoding, formatter, opening=True
            )
        elif event is Tag.END_ELEMENT_EVENT:
            piece = element._format_tag(
                eventual_encoding, formatter, opening=False
            )
            # A closing tag is indented one level less than the
            # contents it closes.
            if indent_level is not None:
                indent_level -= 1
        else:
            # Any other event carries a string-like element, which
            # knows how to render itself.
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not element._should_pretty_print()):
                # We are about to enter string literal mode. Add
                # whitespace before this tag, but not after. We
                # will stay in string literal mode until this tag
                # is closed.
                indent_before = True
                indent_after = False
                string_literal_tag = element
        elif (event is Tag.END_ELEMENT_EVENT
              and element is string_literal_tag):
                # We are about to exit string literal mode by closing
                # the tag that sent us into that mode. Add whitespace
                # after this tag, but not before.
                indent_before = False
                indent_after = True
                string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if (indent_before or indent_after):
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                # A string that was only whitespace is now empty and
                # gets no indentation of its own.
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter,
                        indent_before, indent_after
                    )
            # Everything between an opening tag and its closing tag
            # is rendered one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)

1776 

# Names for the different events yielded by _event_stream.
# Each name is bound to its own freshly created object(), so the four
# events are distinct sentinels that can be told apart by identity.
START_ELEMENT_EVENT = object()  # A Tag that is not an empty element is being opened.
END_ELEMENT_EVENT = object()  # A previously opened Tag is being closed.
EMPTY_ELEMENT_EVENT = object()  # A Tag whose is_empty_element is true.
STRING_ELEMENT_EVENT = object()  # An element that is not a Tag.

1782 

def _event_stream(self, iterator=None):
    """Generate (event, element) pairs from which the DOM below this
    element can be rebuilt.

    Walking the tree this way lets callers (e.g. the code that formats
    an element as a string) follow the nested structure without making
    recursive method calls.

    The idea resembles the SAX API, but this is a smaller interface
    meant for internal use: the events are the Tag.*_EVENT sentinels
    and the payloads are Tags and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to self.self_and_descendants.
    :yield: 2-tuples of (event, element).
    """
    open_tags = []
    nodes = iterator or self.self_and_descendants

    for node in nodes:
        # Every tag on the stack that is not this node's parent was
        # closed before this node appeared, so emit its end event.
        while open_tags and node.parent != open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if not isinstance(node, Tag):
            yield Tag.STRING_ELEMENT_EVENT, node
        elif node.is_empty_element:
            yield Tag.EMPTY_ELEMENT_EVENT, node
        else:
            # Only tags that get a separate end event go on the stack.
            yield Tag.START_ELEMENT_EVENT, node
            open_tags.append(node)

    # The traversal is over; close whatever is still open, innermost
    # tag first.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()

1824 

def _indent_string(self, s, indent_level, formatter,
                   indent_before, indent_after):
    """Surround a string with pretty-printing whitespace.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; formatter.indent is
        repeated this many times in front of the string.
    :param formatter: The object whose `indent` attribute supplies
        one level of indentation.
    :param indent_before: Whether or not to add whitespace
        before the string.
    :param indent_after: Whether or not to add whitespace
        (a newline) after the string.
    :return: The string with the requested whitespace attached.
    """
    # A level of 0 (or None) means there is nothing to put in front.
    leading = (
        formatter.indent * indent_level
        if indent_before and indent_level
        else ''
    )
    trailing = "\n" if indent_after else ''
    return leading + s + trailing

1846 

def _format_tag(self, eventual_encoding, formatter, opening):
    """Build the markup for this tag's opening or closing tag.

    :param eventual_encoding: The encoding the document is destined
        for. When it is not None, attribute values that are
        AttributeValueWithCharsetSubstitution objects are replaced by
        the result of their encode() method.
    :param formatter: Supplies the attribute list, the attribute
        value conversion and quoting, and the void-element close prefix.
    :param opening: True for the opening tag (attributes included),
        False for the closing tag.
    :return: The tag markup, from '<' through '>'.
    """
    # After the leading '<' comes a '/' when this is a closing tag...
    slash = '' if opening else '/'

    # ...then the namespace prefix, when there is one.
    ns_prefix = self.prefix + ":" if self.prefix else ''

    # Attributes appear only in opening tags.
    rendered = []
    if opening:
        for attr_name, attr_value in formatter.attributes(self):
            if attr_value is None:
                # A valueless attribute is written as its bare name.
                rendered.append(attr_name)
                continue
            if isinstance(attr_value, (list, tuple)):
                attr_value = ' '.join(attr_value)
            elif not isinstance(attr_value, str):
                attr_value = str(attr_value)
            elif (isinstance(attr_value, AttributeValueWithCharsetSubstitution)
                  and eventual_encoding is not None):
                attr_value = attr_value.encode(eventual_encoding)

            converted = formatter.attribute_value(attr_value)
            rendered.append(
                str(attr_name) + '='
                + formatter.quoted_attribute_value(converted)
            )
    attr_text = ' ' + ' '.join(rendered) if rendered else ''

    # An empty element may get an extra slash just before the '>'
    # (for a void element in an XML document).
    void_slash = ''
    if self.is_empty_element:
        void_slash = formatter.void_element_close_prefix or ''

    return '<' + slash + ns_prefix + self.name + attr_text + void_slash + '>'

1895 

def _should_pretty_print(self, indent_level=1):
    """Decide whether this tag may be pretty-printed.

    Most tags may, but some (such as <pre> in HTML documents) may not.

    :param indent_level: The current indentation level; None means
        pretty-printing is off.
    :return: False when indent_level is None or when this tag's name
        is listed in self.preserve_whitespace_tags; True otherwise.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    if not preserved:
        # No whitespace-preserving tags are configured at all.
        return True
    return self.name not in preserved

1909 

def prettify(self, encoding=None, formatter="minimal"):
    """Return a pretty-printed rendering of this PageElement.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The result of self.decode() when encoding is None,
        otherwise the result of self.encode() -- a Unicode string or
        a bytestring respectively.
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)

1924 

def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Render what this tag contains as a Unicode string.

    This delegates to decode(), but traverses self.descendants, so
    the tag itself is not part of the iteration.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Passed straight through to decode().

    :param eventual_encoding: The encoding the output is destined
        for. decode_contents() does _not_ perform that encoding; the
        value is passed along so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: Whatever decode() returns for these arguments.
    """
    below_this_tag = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter,
        iterator=below_this_tag
    )

1949 

def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render what this PageElement contains as a bytestring.

    The contents are first rendered with decode_contents() and the
    resulting string is then encoded.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Passed straight through to decode_contents().

    :param encoding: The bytestring will be in this encoding. It is
        also handed to decode_contents() as the eventual encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    return self.decode_contents(
        indent_level, encoding, formatter
    ).encode(encoding)

1970 

1971 # Old method for BS3 compatibility 

def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Forwards to encode_contents(). The indentation level is only
    passed along when prettyPrint is true; otherwise None is used.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)

1979 

1980 #Soup methods 

1981 

def find(self, name=None, attrs={}, recursive=True, string=None,
         **kwargs):
    """Look in the children of this PageElement and return the first
    PageElement that matches the given criteria.

    This is find_all() with a limit of 1, reduced to a single result.
    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A string filter, passed through to find_all().
    :kwargs: A dictionary of filters on attribute values.
    :return: The first match, or None when find_all() finds nothing.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE(review): _stacklevel=3 presumably keeps any warning raised
    # further down pointed at the caller of find() -- confirm.
    matches = self.find_all(
        name, attrs, recursive, string, 1, _stacklevel=3, **kwargs
    )
    return matches[0] if matches else None
findChild = find #BS2

2007 

def find_all(self, name=None, attrs={}, recursive=True, string=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and collect every
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A string filter, passed through to _find_all().
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values. A
        '_stacklevel' entry, if present, is removed and forwarded
        (incremented by one) rather than treated as a filter.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # One more frame (this method) now sits between the caller and
    # whatever consumes the stack level.
    stack_depth = kwargs.pop('_stacklevel', 2) + 1
    candidates = self.descendants if recursive else self.children
    return self._find_all(
        name, attrs, string, limit, candidates,
        _stacklevel=stack_depth, **kwargs
    )
findAll = find_all       # BS3
findChildren = find_all  # BS2

2034 

2035 #Generator methods 

@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :return: A fresh iterator over self.contents.
    """
    # Hand back an iterator rather than the list itself, so it is
    # clear this property is for iteration.
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.

2044 

@property
def self_and_descendants(self):
    """Iterate over this PageElement followed by everything
    self.descendants yields.

    The element itself is left out when it is marked as hidden.

    :yield: A sequence of PageElements.
    """
    if not self.hidden:
        yield self
    yield from self.descendants

2056 

@property
def descendants(self):
    """Iterate over everything below this PageElement.

    Iteration starts at the first child and follows next_element
    links, stopping before the element that comes after this
    element's last descendant.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # Whatever follows the last descendant lies outside this
    # element, so reaching it ends the walk.
    sentinel = self._last_descendant().next_element
    node = self.contents[0]
    while node is not sentinel:
        yield node
        node = node.next_element

2071 

2072 # CSS selector code 

def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    The work is delegated to the select_one() method of the object
    returned by self.css.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.

    :return: A Tag.
    :rtype: bs4.element.Tag
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

2090 

def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library, by way of the select() method
    of the object returned by self.css.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

2112 

@property
def css(self):
    """Return an interface to the CSS selector API.

    A new CSS object wrapping this element is built on every access.
    """
    selector_api = CSS(self)
    return selector_api

2117 

2118 # Old names for backwards compatibility 

def childGenerator(self):
    """Deprecated generator; returns the value of self.children."""
    direct_children = self.children
    return direct_children

2122 

def recursiveChildGenerator(self):
    """Deprecated generator; returns the value of self.descendants."""
    everything_below = self.descendants
    return everything_below

2126 

def has_key(self, key):
    """Deprecated method. This was kind of misleading because has_key()
    (attributes) was different from __in__ (contents).

    has_key() is gone in Python 3, anyway.

    Emits a DeprecationWarning and then answers with has_attr(key).
    """
    message = 'has_key is deprecated. Use has_attr(key) instead.'
    # stacklevel=2 attributes the warning to the code calling has_key().
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return self.has_attr(key)

2138 

2139# Next, a couple classes to represent queries and their results. 

class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, string=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. A
            non-dict value is treated as a filter on the 'class'
            attribute. The dictionary passed in is never modified.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. 'class_'
            is accepted as a spelling of 'class', and the deprecated
            'text' is accepted as a fallback for `string`.
        """
        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
                DeprecationWarning, stacklevel=2
            )

        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Work on a copy so the caller's dictionary (or the
                # shared default) is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.string = self._normalize_search_value(string)

        # DEPRECATED but just in case someone is checking this.
        self.text = self.string

    def _normalize_search_value(self, value):
        """Convert a search criterion into a form _matches() can use.

        :param value: A criterion as supplied by the caller.
        :return: `value` itself if it is a string, a callable, an
            object with a `match` attribute (e.g. a compiled regular
            expression), a boolean or None; a decoded string for
            bytes; a list for other iterables; otherwise str(value).
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The string filter rendered as text when one is set,
            otherwise "<name>|<attrs>".
        """
        if self.string:
            # The string filter may be a regular expression, a list, a
            # callable or True rather than a str; __str__ is required
            # to return a real string, so convert explicitly.
            return str(self.string)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a
            Tag object (in which case the Tag also serves as the
            source of attribute values).
        :param markup_attrs: The attributes as found in some markup:
            either an object with a `get` method or an iterable of
            (key, value) pairs.

        :return: A true value if the prospective tag would match this
            SoupStrainer -- the Tag when one was passed in, otherwise
            `markup_name`. A false value (None or False) otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        # A callable name filter is handed the raw name and attributes
        # only when we were not given a Tag; with a Tag, the callable
        # is applied through _matches() instead.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # Build a lookup table from (key, value)
                            # pairs.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # A string filter, when present, must also match the string of
        # whatever was found.
        if found and self.string and not self._matches(found.string, self.string):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or a false value when nothing
            matches.
        :raise Exception: If `markup` is of a kind this method does
            not know how to match against.
        """
        # print('looking for %s in %s' % (self, markup))
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.string or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, str)):
            if not self.name and not self.attrs and self._matches(markup, self.string):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Check one piece of markup against one search criterion.

        :param markup: A Tag, a string, a list or tuple of strings
            (a multi-valued attribute), or None.
        :param match_against: The criterion: True, a callable, a
            string, an object with a `search` method (e.g. a compiled
            regular expression), an iterable of criteria, or a false
            value.
        :param already_tried: The set of criteria already tested
            while working through an iterable `match_against`.
        :return: A true value on a match and a false value otherwise.
            This is not always a bool: the result of a callable or of
            a regular expression search is returned as is.
        """
        # print(u"Matching %s against %s" % (markup, match_against))
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must be match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are remembered by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match

2412 

2413 

class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    that produced them."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Python only calls this for attributes that normal lookup
        fails to find, so every such lookup ends in this error.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)