Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/element.py: 56%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

917 statements  

1# Use of this source code is governed by the MIT license. 

2__license__ = "MIT" 

3 

4try: 

5 from collections.abc import Callable # Python 3.6 

6except ImportError as e: 

7 from collections import Callable 

8import re 

9import sys 

10import warnings 

11try: 

12 import soupsieve 

13except ImportError as e: 

14 soupsieve = None 

15 warnings.warn( 

16 'The soupsieve package is not installed. CSS selectors cannot be used.' 

17 ) 

18 

19from bs4.formatter import ( 

20 Formatter, 

21 HTMLFormatter, 

22 XMLFormatter, 

23) 

24 

# Encoding used to decode non-str values handed to NavigableString.__new__.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches a run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")

32 

33def _alias(attr): 

34 """Alias one attribute name to another for backward compatibility""" 

35 @property 

36 def alias(self): 

37 return getattr(self, attr) 

38 

39 @alias.setter 

40 def alias(self): 

41 return setattr(self, attr) 

42 return alias 

43 

44 

45# These encodings are recognized by Python (so PageElement.encode 

46# could theoretically support them) but XML and HTML don't recognize 

47# them (so they should not show up in an XML or HTML document as that 

48# document's encoding). 

49# 

50# If an XML document is encoded in one of these encodings, no encoding 

51# will be mentioned in the XML declaration. If an HTML document is 

52# encoded in one of these encodings, and the HTML document has a 

53# <meta> tag that mentions an encoding, the encoding will be given as 

54# the empty string. 

55# 

56# Source: 

57# https://docs.python.org/3/library/codecs.html#python-specific-encodings 

PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}

72 

73 

class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the prefix ('xml'),
    the local name ('lang') and the namespace it was built from.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the attribute string from its parts.

        :param prefix: The namespace prefix; used alone as the string
            value when there is no name.
        :param name: The local name. Any false value is stored as None.
        :param namespace: Stored unchanged on the `namespace` attribute.
        """
        # An empty name means the default namespace, whose name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # Not really namespaced.
            text = name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance

96 

class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This str subclass adds no behavior of its own; it is the common base
    of CharsetMetaAttributeValue and ContentMetaAttributeValue.
    """

99 

class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' with Beautiful Soup gives
    the 'charset' attribute a value of this type.
    """

    def __new__(cls, original_value):
        """Create the value and remember the string it was built from."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the name to advertise when the document is encoded.

        :param encoding: The encoding the document is being converted to.
        :return: The empty string for a Python-specific encoding,
            otherwise `encoding` itself.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding

119 

120 

class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing markup such as:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    gives the 'content' attribute a value of this type.
    """

    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the value, or a plain str if it names no charset."""
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced.

        :param encoding: The encoding the document is being converted to.
        :return: The empty string for a Python-specific encoding,
            otherwise the original value with `encoding` substituted for
            the charset it named.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)

148 

149 

150class PageElement(object): 

151 """Contains the navigational information for some part of the page: 

152 that is, its current location in the parse tree. 

153 

154 NavigableString, Tag, etc. are all subclasses of PageElement. 

155 """ 

156 

157 def setup(self, parent=None, previous_element=None, next_element=None, 

158 previous_sibling=None, next_sibling=None): 

159 """Sets up the initial relations between this element and 

160 other elements. 

161 

162 :param parent: The parent of this element. 

163 

164 :param previous_element: The element parsed immediately before 

165 this one. 

166  

167 :param next_element: The element parsed immediately before 

168 this one. 

169 

170 :param previous_sibling: The most recently encountered element 

171 on the same level of the parse tree as this one. 

172 

173 :param previous_sibling: The next element to be encountered 

174 on the same level of the parse tree as this one. 

175 """ 

176 self.parent = parent 

177 

178 self.previous_element = previous_element 

179 if previous_element is not None: 

180 self.previous_element.next_element = self 

181 

182 self.next_element = next_element 

183 if self.next_element is not None: 

184 self.next_element.previous_element = self 

185 

186 self.next_sibling = next_sibling 

187 if self.next_sibling is not None: 

188 self.next_sibling.previous_sibling = self 

189 

190 if (previous_sibling is None 

191 and self.parent is not None and self.parent.contents): 

192 previous_sibling = self.parent.contents[-1] 

193 

194 self.previous_sibling = previous_sibling 

195 if previous_sibling is not None: 

196 self.previous_sibling.next_sibling = self 

197 

198 def format_string(self, s, formatter): 

199 """Format the given string using the given formatter. 

200 

201 :param s: A string. 

202 :param formatter: A Formatter object, or a string naming one of the standard formatters. 

203 """ 

204 if formatter is None: 

205 return s 

206 if not isinstance(formatter, Formatter): 

207 formatter = self.formatter_for_name(formatter) 

208 output = formatter.substitute(s) 

209 return output 

210 

211 def formatter_for_name(self, formatter): 

212 """Look up or create a Formatter for the given identifier, 

213 if necessary. 

214 

215 :param formatter: Can be a Formatter object (used as-is), a 

216 function (used as the entity substitution hook for an 

217 XMLFormatter or HTMLFormatter), or a string (used to look 

218 up an XMLFormatter or HTMLFormatter in the appropriate 

219 registry. 

220 """ 

221 if isinstance(formatter, Formatter): 

222 return formatter 

223 if self._is_xml: 

224 c = XMLFormatter 

225 else: 

226 c = HTMLFormatter 

227 if isinstance(formatter, Callable): 

228 return c(entity_substitution=formatter) 

229 return c.REGISTRY[formatter] 

230 

231 @property 

232 def _is_xml(self): 

233 """Is this element part of an XML tree or an HTML tree? 

234 

235 This is used in formatter_for_name, when deciding whether an 

236 XMLFormatter or HTMLFormatter is more appropriate. It can be 

237 inefficient, but it should be called very rarely. 

238 """ 

239 if self.known_xml is not None: 

240 # Most of the time we will have determined this when the 

241 # document is parsed. 

242 return self.known_xml 

243 

244 # Otherwise, it's likely that this element was created by 

245 # direct invocation of the constructor from within the user's 

246 # Python code. 

247 if self.parent is None: 

248 # This is the top-level object. It should have .known_xml set 

249 # from tree creation. If not, take a guess--BS is usually 

250 # used on HTML markup. 

251 return getattr(self, 'is_xml', False) 

252 return self.parent._is_xml 

253 

254 nextSibling = _alias("next_sibling") # BS3 

255 previousSibling = _alias("previous_sibling") # BS3 

256 

257 default = object() 

    def _all_strings(self, strip=False, types=default):
        """Yield all strings of certain classes, possibly stripping them.

        This base version always raises; it is implemented differently
        in Tag and NavigableString.

        :param strip: Passed as True by stripped_strings to request
            stripped strings.
        :param types: The string classes to include; defaults to the
            class-level `default` sentinel.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

264 

265 @property 

266 def stripped_strings(self): 

267 """Yield all strings in this PageElement, stripping them first. 

268 

269 :yield: A sequence of stripped strings. 

270 """ 

271 for string in self._all_strings(True): 

272 yield string 

273 

274 def get_text(self, separator="", strip=False, 

275 types=default): 

276 """Get all child strings of this PageElement, concatenated using the 

277 given separator. 

278 

279 :param separator: Strings will be concatenated using this separator. 

280 

281 :param strip: If True, strings will be stripped before being 

282 concatenated. 

283 

284 :param types: A tuple of NavigableString subclasses. Any 

285 strings of a subclass not found in this list will be 

286 ignored. Although there are exceptions, the default 

287 behavior in most cases is to consider only NavigableString 

288 and CData objects. That means no comments, processing 

289 instructions, etc. 

290 

291 :return: A string. 

292 """ 

293 return separator.join([s for s in self._all_strings( 

294 strip, types=types)]) 

295 getText = get_text 

296 text = property(get_text) 

297 

298 def replace_with(self, *args): 

299 """Replace this PageElement with one or more PageElements, keeping the  

300 rest of the tree the same. 

301  

302 :param args: One or more PageElements. 

303 :return: `self`, no longer part of the tree. 

304 """ 

305 if self.parent is None: 

306 raise ValueError( 

307 "Cannot replace one element with another when the " 

308 "element to be replaced is not part of a tree.") 

309 if len(args) == 1 and args[0] is self: 

310 return 

311 if any(x is self.parent for x in args): 

312 raise ValueError("Cannot replace a Tag with its parent.") 

313 old_parent = self.parent 

314 my_index = self.parent.index(self) 

315 self.extract(_self_index=my_index) 

316 for idx, replace_with in enumerate(args, start=my_index): 

317 old_parent.insert(idx, replace_with) 

318 return self 

319 replaceWith = replace_with # BS3 

320 

321 def unwrap(self): 

322 """Replace this PageElement with its contents. 

323 

324 :return: `self`, no longer part of the tree. 

325 """ 

326 my_parent = self.parent 

327 if self.parent is None: 

328 raise ValueError( 

329 "Cannot replace an element with its contents when that" 

330 "element is not part of a tree.") 

331 my_index = self.parent.index(self) 

332 self.extract(_self_index=my_index) 

333 for child in reversed(self.contents[:]): 

334 my_parent.insert(my_index, child) 

335 return self 

336 replace_with_children = unwrap 

337 replaceWithChildren = unwrap # BS3 

338 

339 def wrap(self, wrap_inside): 

340 """Wrap this PageElement inside another one. 

341 

342 :param wrap_inside: A PageElement. 

343 :return: `wrap_inside`, occupying the position in the tree that used 

344 to be occupied by `self`, and with `self` inside it. 

345 """ 

346 me = self.replace_with(wrap_inside) 

347 wrap_inside.append(me) 

348 return wrap_inside 

349 

    def extract(self, _self_index=None):
        """Destructively rips this element out of the tree.

        The element is removed from its parent's contents, and its
        element and sibling links are reset so that its former neighbors
        point at each other instead.

        :param _self_index: The location of this element in its parent's
           .contents, if known. Passing this in allows for a performance
           optimization.

        :return: `self`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                # Not supplied by the caller, so look it up.
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Cut the extracted subtree's outward links in document order.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Join the former siblings to each other.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

387 

388 def _last_descendant(self, is_initialized=True, accept_self=True): 

389 """Finds the last element beneath this object to be parsed. 

390 

391 :param is_initialized: Has `setup` been called on this PageElement 

392 yet? 

393 :param accept_self: Is `self` an acceptable answer to the question? 

394 """ 

395 if is_initialized and self.next_sibling is not None: 

396 last_child = self.next_sibling.previous_element 

397 else: 

398 last_child = self 

399 while isinstance(last_child, Tag) and last_child.contents: 

400 last_child = last_child.contents[-1] 

401 if not accept_self and last_child is self: 

402 last_child = None 

403 return last_child 

404 # BS3: Not part of the API! 

405 _lastRecursiveChild = _last_descendant 

406 

    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
           in `self.children` by the new PageElement. Values past the end
           are clamped to the end of `self.contents`.
        :param new_child: A PageElement. A plain str is converted to a
           NavigableString first; the children of a BeautifulSoup object
           are inserted one at a time.
        :raises ValueError: If `new_child` is None or is `self`.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # The element is already in a tree, so it has to be removed
            # from there first. It may even already be one of this
            # object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: in document order it directly follows `self`.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the ancestors until one of them has a next sibling;
            # that sibling is what follows the new child's subtree.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)

488 

489 def append(self, tag): 

490 """Appends the given PageElement to the contents of this one. 

491 

492 :param tag: A PageElement. 

493 """ 

494 self.insert(len(self.contents), tag) 

495 

496 def extend(self, tags): 

497 """Appends the given PageElements to this one's contents. 

498 

499 :param tags: A list of PageElements. 

500 """ 

501 if isinstance(tags, Tag): 

502 # Calling self.append() on another tag's contents will change 

503 # the list we're iterating over. Make a list that won't 

504 # change. 

505 tags = list(tags.contents) 

506 for tag in tags: 

507 self.append(tag) 

508 

509 def insert_before(self, *args): 

510 """Makes the given element(s) the immediate predecessor of this one. 

511 

512 All the elements will have the same parent, and the given elements 

513 will be immediately before this one. 

514 

515 :param args: One or more PageElements. 

516 """ 

517 parent = self.parent 

518 if parent is None: 

519 raise ValueError( 

520 "Element has no parent, so 'before' has no meaning.") 

521 if any(x is self for x in args): 

522 raise ValueError("Can't insert an element before itself.") 

523 for predecessor in args: 

524 # Extract first so that the index won't be screwed up if they 

525 # are siblings. 

526 if isinstance(predecessor, PageElement): 

527 predecessor.extract() 

528 index = parent.index(self) 

529 parent.insert(index, predecessor) 

530 

531 def insert_after(self, *args): 

532 """Makes the given element(s) the immediate successor of this one. 

533 

534 The elements will have the same parent, and the given elements 

535 will be immediately after this one. 

536 

537 :param args: One or more PageElements. 

538 """ 

539 # Do all error checking before modifying the tree. 

540 parent = self.parent 

541 if parent is None: 

542 raise ValueError( 

543 "Element has no parent, so 'after' has no meaning.") 

544 if any(x is self for x in args): 

545 raise ValueError("Can't insert an element after itself.") 

546 

547 offset = 0 

548 for successor in args: 

549 # Extract first so that the index won't be screwed up if they 

550 # are siblings. 

551 if isinstance(successor, PageElement): 

552 successor.extract() 

553 index = parent.index(self) 

554 parent.insert(index+1+offset, successor) 

555 offset += 1 

556 

557 def find_next(self, name=None, attrs={}, string=None, **kwargs): 

558 """Find the first PageElement that matches the given criteria and 

559 appears later in the document than this PageElement. 

560 

561 All find_* methods take a common set of arguments. See the online 

562 documentation for detailed explanations. 

563 

564 :param name: A filter on tag name. 

565 :param attrs: A dictionary of filters on attribute values. 

566 :param string: A filter for a NavigableString with specific text. 

567 :kwargs: A dictionary of filters on attribute values. 

568 :return: A PageElement. 

569 :rtype: bs4.element.Tag | bs4.element.NavigableString 

570 """ 

571 return self._find_one(self.find_all_next, name, attrs, string, **kwargs) 

572 findNext = find_next # BS3 

573 

574 def find_all_next(self, name=None, attrs={}, string=None, limit=None, 

575 **kwargs): 

576 """Find all PageElements that match the given criteria and appear 

577 later in the document than this PageElement. 

578 

579 All find_* methods take a common set of arguments. See the online 

580 documentation for detailed explanations. 

581 

582 :param name: A filter on tag name. 

583 :param attrs: A dictionary of filters on attribute values. 

584 :param string: A filter for a NavigableString with specific text. 

585 :param limit: Stop looking after finding this many results. 

586 :kwargs: A dictionary of filters on attribute values. 

587 :return: A ResultSet containing PageElements. 

588 """ 

589 return self._find_all(name, attrs, string, limit, self.next_elements, 

590 **kwargs) 

591 findAllNext = find_all_next # BS3 

592 

593 def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): 

594 """Find the closest sibling to this PageElement that matches the 

595 given criteria and appears later in the document. 

596 

597 All find_* methods take a common set of arguments. See the 

598 online documentation for detailed explanations. 

599 

600 :param name: A filter on tag name. 

601 :param attrs: A dictionary of filters on attribute values. 

602 :param string: A filter for a NavigableString with specific text. 

603 :kwargs: A dictionary of filters on attribute values. 

604 :return: A PageElement. 

605 :rtype: bs4.element.Tag | bs4.element.NavigableString 

606 """ 

607 return self._find_one(self.find_next_siblings, name, attrs, string, 

608 **kwargs) 

609 findNextSibling = find_next_sibling # BS3 

610 

611 def find_next_siblings(self, name=None, attrs={}, string=None, limit=None, 

612 **kwargs): 

613 """Find all siblings of this PageElement that match the given criteria 

614 and appear later in the document. 

615 

616 All find_* methods take a common set of arguments. See the online 

617 documentation for detailed explanations. 

618 

619 :param name: A filter on tag name. 

620 :param attrs: A dictionary of filters on attribute values. 

621 :param string: A filter for a NavigableString with specific text. 

622 :param limit: Stop looking after finding this many results. 

623 :kwargs: A dictionary of filters on attribute values. 

624 :return: A ResultSet of PageElements. 

625 :rtype: bs4.element.ResultSet 

626 """ 

627 return self._find_all(name, attrs, string, limit, 

628 self.next_siblings, **kwargs) 

629 findNextSiblings = find_next_siblings # BS3 

630 fetchNextSiblings = find_next_siblings # BS2 

631 

632 def find_previous(self, name=None, attrs={}, string=None, **kwargs): 

633 """Look backwards in the document from this PageElement and find the 

634 first PageElement that matches the given criteria. 

635 

636 All find_* methods take a common set of arguments. See the online 

637 documentation for detailed explanations. 

638 

639 :param name: A filter on tag name. 

640 :param attrs: A dictionary of filters on attribute values. 

641 :param string: A filter for a NavigableString with specific text. 

642 :kwargs: A dictionary of filters on attribute values. 

643 :return: A PageElement. 

644 :rtype: bs4.element.Tag | bs4.element.NavigableString 

645 """ 

646 return self._find_one( 

647 self.find_all_previous, name, attrs, string, **kwargs) 

648 findPrevious = find_previous # BS3 

649 

650 def find_all_previous(self, name=None, attrs={}, string=None, limit=None, 

651 **kwargs): 

652 """Look backwards in the document from this PageElement and find all 

653 PageElements that match the given criteria. 

654 

655 All find_* methods take a common set of arguments. See the online 

656 documentation for detailed explanations. 

657 

658 :param name: A filter on tag name. 

659 :param attrs: A dictionary of filters on attribute values. 

660 :param string: A filter for a NavigableString with specific text. 

661 :param limit: Stop looking after finding this many results. 

662 :kwargs: A dictionary of filters on attribute values. 

663 :return: A ResultSet of PageElements. 

664 :rtype: bs4.element.ResultSet 

665 """ 

666 return self._find_all(name, attrs, string, limit, self.previous_elements, 

667 **kwargs) 

668 findAllPrevious = find_all_previous # BS3 

669 fetchPrevious = find_all_previous # BS2 

670 

671 def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs): 

672 """Returns the closest sibling to this PageElement that matches the 

673 given criteria and appears earlier in the document. 

674 

675 All find_* methods take a common set of arguments. See the online 

676 documentation for detailed explanations. 

677 

678 :param name: A filter on tag name. 

679 :param attrs: A dictionary of filters on attribute values. 

680 :param string: A filter for a NavigableString with specific text. 

681 :kwargs: A dictionary of filters on attribute values. 

682 :return: A PageElement. 

683 :rtype: bs4.element.Tag | bs4.element.NavigableString 

684 """ 

685 return self._find_one(self.find_previous_siblings, name, attrs, string, 

686 **kwargs) 

687 findPreviousSibling = find_previous_sibling # BS3 

688 

689 def find_previous_siblings(self, name=None, attrs={}, string=None, 

690 limit=None, **kwargs): 

691 """Returns all siblings to this PageElement that match the 

692 given criteria and appear earlier in the document. 

693 

694 All find_* methods take a common set of arguments. See the online 

695 documentation for detailed explanations. 

696 

697 :param name: A filter on tag name. 

698 :param attrs: A dictionary of filters on attribute values. 

699 :param string: A filter for a NavigableString with specific text. 

700 :param limit: Stop looking after finding this many results. 

701 :kwargs: A dictionary of filters on attribute values. 

702 :return: A ResultSet of PageElements. 

703 :rtype: bs4.element.ResultSet 

704 """ 

705 return self._find_all(name, attrs, string, limit, 

706 self.previous_siblings, **kwargs) 

707 findPreviousSiblings = find_previous_siblings # BS3 

708 fetchPreviousSiblings = find_previous_siblings # BS2 

709 

710 def find_parent(self, name=None, attrs={}, **kwargs): 

711 """Find the closest parent of this PageElement that matches the given 

712 criteria. 

713 

714 All find_* methods take a common set of arguments. See the online 

715 documentation for detailed explanations. 

716 

717 :param name: A filter on tag name. 

718 :param attrs: A dictionary of filters on attribute values. 

719 :kwargs: A dictionary of filters on attribute values. 

720 

721 :return: A PageElement. 

722 :rtype: bs4.element.Tag | bs4.element.NavigableString 

723 """ 

724 # NOTE: We can't use _find_one because findParents takes a different 

725 # set of arguments. 

726 r = None 

727 l = self.find_parents(name, attrs, 1, **kwargs) 

728 if l: 

729 r = l[0] 

730 return r 

731 findParent = find_parent # BS3 

732 

733 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): 

734 """Find all parents of this PageElement that match the given criteria. 

735 

736 All find_* methods take a common set of arguments. See the online 

737 documentation for detailed explanations. 

738 

739 :param name: A filter on tag name. 

740 :param attrs: A dictionary of filters on attribute values. 

741 :param limit: Stop looking after finding this many results. 

742 :kwargs: A dictionary of filters on attribute values. 

743 

744 :return: A PageElement. 

745 :rtype: bs4.element.Tag | bs4.element.NavigableString 

746 """ 

747 return self._find_all(name, attrs, None, limit, self.parents, 

748 **kwargs) 

749 findParents = find_parents # BS3 

750 fetchParents = find_parents # BS2 

751 

    @property
    def next(self):
        """The PageElement, if any, that was parsed just after this one.

        This is a read-only view of `next_element`.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.next_element

760 

    @property
    def previous(self):
        """The PageElement, if any, that was parsed just before this one.

        This is a read-only view of `previous_element`.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.previous_element

769 

770 #These methods do the real heavy lifting. 

771 

772 def _find_one(self, method, name, attrs, string, **kwargs): 

773 r = None 

774 l = method(name, attrs, string, 1, **kwargs) 

775 if l: 

776 r = l[0] 

777 return r 

778 

779 def _find_all(self, name, attrs, string, limit, generator, **kwargs): 

780 "Iterates over a generator looking for things that match." 

781 

782 if string is None and 'text' in kwargs: 

783 string = kwargs.pop('text') 

784 warnings.warn( 

785 "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.", 

786 DeprecationWarning 

787 ) 

788 

789 if isinstance(name, SoupStrainer): 

790 strainer = name 

791 else: 

792 strainer = SoupStrainer(name, attrs, string, **kwargs) 

793 

794 if string is None and not limit and not attrs and not kwargs: 

795 if name is True or name is None: 

796 # Optimization to find all tags. 

797 result = (element for element in generator 

798 if isinstance(element, Tag)) 

799 return ResultSet(strainer, result) 

800 elif isinstance(name, str): 

801 # Optimization to find all tags with a given name. 

802 if name.count(':') == 1: 

803 # This is a name with a prefix. If this is a namespace-aware document, 

804 # we need to match the local name against tag.name. If not, 

805 # we need to match the fully-qualified name against tag.name. 

806 prefix, local_name = name.split(':', 1) 

807 else: 

808 prefix = None 

809 local_name = name 

810 result = (element for element in generator 

811 if isinstance(element, Tag) 

812 and ( 

813 element.name == name 

814 ) or ( 

815 element.name == local_name 

816 and (prefix is None or element.prefix == prefix) 

817 ) 

818 ) 

819 return ResultSet(strainer, result) 

820 results = ResultSet(strainer) 

821 while True: 

822 try: 

823 i = next(generator) 

824 except StopIteration: 

825 break 

826 if i: 

827 found = strainer.search(i) 

828 if found: 

829 results.append(found) 

830 if limit and len(results) >= limit: 

831 break 

832 return results 

833 

834 #These generators can be used to navigate starting from both 

835 #NavigableStrings and Tags. 

836 @property 

837 def next_elements(self): 

838 """All PageElements that were parsed after this one. 

839 

840 :yield: A sequence of PageElements. 

841 """ 

842 i = self.next_element 

843 while i is not None: 

844 yield i 

845 i = i.next_element 

846 

847 @property 

848 def next_siblings(self): 

849 """All PageElements that are siblings of this one but were parsed 

850 later. 

851 

852 :yield: A sequence of PageElements. 

853 """ 

854 i = self.next_sibling 

855 while i is not None: 

856 yield i 

857 i = i.next_sibling 

858 

859 @property 

860 def previous_elements(self): 

861 """All PageElements that were parsed before this one. 

862 

863 :yield: A sequence of PageElements. 

864 """ 

865 i = self.previous_element 

866 while i is not None: 

867 yield i 

868 i = i.previous_element 

869 

@property
def previous_siblings(self):
    """Iterate over the siblings of this PageElement that were parsed
    earlier than it.

    The walk follows the chain of ``previous_sibling`` links and stops
    as soon as a link is ``None``.

    :yield: A sequence of PageElements.
    """
    sibling = self.previous_sibling
    while True:
        if sibling is None:
            return
        yield sibling
        sibling = sibling.previous_sibling

881 

@property
def parents(self):
    """Iterate over the ancestors of this PageElement, nearest first.

    The walk follows the chain of ``parent`` links and stops as soon as
    a link is ``None``.

    :yield: A sequence of PageElements.
    """
    ancestor = self.parent
    while True:
        if ancestor is None:
            return
        yield ancestor
        ancestor = ancestor.parent

892 

@property
def decomposed(self):
    """Report whether this PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; a missing or falsy value is
    normalized to ``False``.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    # Any falsy marker (None, 0, ...) collapses to a real False.
    return flag if flag else False

900 

901 # Old non-property versions of the generators, for backwards 

902 # compatibility with BS3. 

def nextGenerator(self):
    """BS3-era spelling of the ``next_elements`` property."""
    return self.next_elements

def nextSiblingGenerator(self):
    """BS3-era spelling of the ``next_siblings`` property."""
    return self.next_siblings

def previousGenerator(self):
    """BS3-era spelling of the ``previous_elements`` property."""
    return self.previous_elements

def previousSiblingGenerator(self):
    """BS3-era spelling of the ``previous_siblings`` property."""
    return self.previous_siblings

def parentGenerator(self):
    """BS3-era spelling of the ``parents`` property."""
    return self.parents

917 

918 

class NavigableString(str, PageElement):
    """A Python Unicode string that lives inside a parse tree.

    Parsing the markup <b>penguin</b> produces one NavigableString,
    holding the text "penguin".
    """

    PREFIX = ''
    SUFFIX = ''

    # A bare string carries no hint about whether its document is XML
    # or HTML, so the class-level answer is "unknown".
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        During unpickling this is called with the string encoded in
        DEFAULT_OUTPUT_ENCODING. For any non-str value that encoding is
        handed to ``str.__new__`` so non-ASCII characters can be decoded.

        :param value: The text, as a str or an encoded bytestring.
        """
        decode_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *decode_args)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a string of the same class and content that is not
        connected to the parse tree.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        """Return the constructor arguments used when copying/pickling:
        the plain-str form of this string."""
        return (str(self),)

    def __getattr__(self, attr):
        """Make ``text.string`` return the text itself.

        This exists for backwards compatibility for Navigable*String,
        and for CData* it gives access to the string without the CData
        wrapper. Every other unknown attribute raises AttributeError.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Format the string and wrap it in this class's PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None: a NavigableString is not a Tag and has no name.

        The property exists so that code iterating over a mixture of
        Tag and NavigableString objects can read ``.name`` safely, e.g.
         [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Refuse any attempt to assign NavigableString.name."""
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(self, strip=False, types=PageElement.default):
        """Yield this string if it is of an acceptable class.

        Lets NavigableString offer the same text-extraction API
        (get_text() and friends) as other PageElements.

        :param strip: If True, the string is stripped before being
            yielded.

        :param types: A NavigableString subclass, or a tuple of them.
            If this string's exact class is not among them, nothing is
            yielded. The default resolves to
            Tag.DEFAULT_INTERESTING_STRING_TYPES. Passing None disables
            the class check.

        :yield: Either this one string, or nothing at all.
        """
        if types is self.default:
            # The default tuple is stored on Tag because it names
            # subclasses of this class that are defined further down
            # the file.
            types = Tag.DEFAULT_INTERESTING_STRING_TYPES

        # Exact class comparison is used on purpose rather than
        # isinstance(): every special string subclasses NavigableString,
        # and a caller asking for NavigableString usually does not want
        # comments and the like.
        own_type = type(self)
        if types is not None:
            if isinstance(types, type):
                wanted = own_type is types
            else:
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text
    strings = property(_all_strings)

1039 

class PreformattedString(NavigableString):
    """A NavigableString that bypasses the normal formatting rules.

    Abstract base for special kinds of strings such as comments (the
    Comment class) and CDATA blocks (the CData class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Return the raw string wrapped in the subclass-specific prefix
        and suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. When given, the string is passed
            through it purely for its side effects; the formatted result
            is thrown away.

        :return: PREFIX + the unformatted string + SUFFIX.
        """
        if formatter is not None:
            # Called for side effects only; the result is not used.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX

1066 

class CData(PreformattedString):
    """A CDATA section; output is wrapped in ``<![CDATA[`` and ``]]>``."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'

1071 

class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction; output is wrapped in ``<?`` and ``>``."""

    PREFIX = '<?'
    SUFFIX = '>'

1077 

class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction; output is wrapped in ``<?`` and ``?>``."""

    PREFIX = '<?'
    SUFFIX = '?>'

1082 

class Comment(PreformattedString):
    """An HTML or XML comment; output is wrapped in ``<!--`` and ``-->``."""

    PREFIX = '<!--'
    SUFFIX = '-->'

1087 

1088 

class Declaration(PreformattedString):
    """An XML declaration; output is wrapped in ``<?`` and ``?>``."""

    PREFIX = '<?'
    SUFFIX = '?>'

1093 

1094 

class Doctype(PreformattedString):
    """A document type declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build the document type declaration for a root element name,
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A falsy name is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN', or None.
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
            or None.

        :return: An instance of the class this method was called on
            (Doctype, or a subclass of it).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # After a PUBLIC identifier the system identifier follows
            # directly, without its own keyword.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Instantiate through `cls` rather than the hard-coded Doctype so
        # that calling this on a subclass returns that subclass.
        return cls(value)

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

1122 

1123 

class Stylesheet(NavigableString):
    """A NavigableString holding an embedded stylesheet (probably CSS).

    Having a dedicated class lets embedded stylesheets be told apart
    from textual content.
    """

1131 

1132 

class Script(NavigableString):
    """A NavigableString holding an executable script (probably
    Javascript).

    Having a dedicated class lets executable code be told apart from
    textual content.
    """

1140 

1141 

class TemplateString(NavigableString):
    """A NavigableString found inside an HTML template that is embedded
    in a larger document.

    Having a dedicated class lets such strings be told apart from the
    main body of the document.
    """

1149 

1150 

class RubyTextString(NavigableString):
    """A NavigableString holding the contents of the <rt> HTML element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

    Having a dedicated class lets these annotation strings be told
    apart from the strings they annotate.
    """

1161 

1162 

class RubyParenthesisString(NavigableString):
    """A NavigableString holding the contents of the <rp> HTML element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
    """

1170 

1171 

1172class Tag(PageElement): 

1173 """Represents an HTML or XML tag that is part of a parse tree, along 

1174 with its attributes and contents. 

1175 

1176 When Beautiful Soup parses the markup <b>penguin</b>, it will 

1177 create a Tag object representing the <b> tag. 

1178 """ 

1179 

1180 def __init__(self, parser=None, builder=None, name=None, namespace=None, 

1181 prefix=None, attrs=None, parent=None, previous=None, 

1182 is_xml=None, sourceline=None, sourcepos=None, 

1183 can_be_empty_element=None, cdata_list_attributes=None, 

1184 preserve_whitespace_tags=None, 

1185 interesting_string_types=None, 

1186 namespaces=None 

1187 ): 

1188 """Basic constructor. 

1189 

1190 :param parser: A BeautifulSoup object. 

1191 :param builder: A TreeBuilder. 

1192 :param name: The name of the tag. 

1193 :param namespace: The URI of this Tag's XML namespace, if any. 

1194 :param prefix: The prefix for this Tag's XML namespace, if any. 

1195 :param attrs: A dictionary of this Tag's attribute values. 

1196 :param parent: The PageElement to use as this Tag's parent. 

1197 :param previous: The PageElement that was parsed immediately before 

1198 this tag. 

1199 :param is_xml: If True, this is an XML tag. Otherwise, this is an 

1200 HTML tag. 

1201 :param sourceline: The line number where this tag was found in its 

1202 source document. 

1203 :param sourcepos: The character position within `sourceline` where this 

1204 tag was found. 

1205 :param can_be_empty_element: If True, this tag should be 

1206 represented as <tag/>. If False, this tag should be represented 

1207 as <tag></tag>. 

1208 :param cdata_list_attributes: A list of attributes whose values should 

1209 be treated as CDATA if they ever show up on this tag. 

1210 :param preserve_whitespace_tags: A list of tag names whose contents 

1211 should have their whitespace preserved. 

1212 :param interesting_string_types: This is a NavigableString 

1213 subclass or a tuple of them. When iterating over this 

1214 Tag's strings in methods like Tag.strings or Tag.get_text, 

1215 these are the types of strings that are interesting enough 

1216 to be considered. The default is to consider 

1217 NavigableString and CData the only interesting string 

1218 subtypes. 

1219 :param namespaces: A dictionary mapping currently active 

1220 namespace prefixes to URIs. This can be used later to 

1221 construct CSS selectors. 

1222 """ 

1223 if parser is None: 

1224 self.parser_class = None 

1225 else: 

1226 # We don't actually store the parser object: that lets extracted 

1227 # chunks be garbage-collected. 

1228 self.parser_class = parser.__class__ 

1229 if name is None: 

1230 raise ValueError("No value provided for new tag's name.") 

1231 self.name = name 

1232 self.namespace = namespace 

1233 self._namespaces = namespaces or {} 

1234 self.prefix = prefix 

1235 if ((not builder or builder.store_line_numbers) 

1236 and (sourceline is not None or sourcepos is not None)): 

1237 self.sourceline = sourceline 

1238 self.sourcepos = sourcepos 

1239 if attrs is None: 

1240 attrs = {} 

1241 elif attrs: 

1242 if builder is not None and builder.cdata_list_attributes: 

1243 attrs = builder._replace_cdata_list_attribute_values( 

1244 self.name, attrs) 

1245 else: 

1246 attrs = dict(attrs) 

1247 else: 

1248 attrs = dict(attrs) 

1249 

1250 # If possible, determine ahead of time whether this tag is an 

1251 # XML tag. 

1252 if builder: 

1253 self.known_xml = builder.is_xml 

1254 else: 

1255 self.known_xml = is_xml 

1256 self.attrs = attrs 

1257 self.contents = [] 

1258 self.setup(parent, previous) 

1259 self.hidden = False 

1260 

1261 if builder is None: 

1262 # In the absence of a TreeBuilder, use whatever values were 

1263 # passed in here. They're probably None, unless this is a copy of some 

1264 # other tag. 

1265 self.can_be_empty_element = can_be_empty_element 

1266 self.cdata_list_attributes = cdata_list_attributes 

1267 self.preserve_whitespace_tags = preserve_whitespace_tags 

1268 self.interesting_string_types = interesting_string_types 

1269 else: 

1270 # Set up any substitutions for this tag, such as the charset in a META tag. 

1271 builder.set_up_substitutions(self) 

1272 

1273 # Ask the TreeBuilder whether this tag might be an empty-element tag. 

1274 self.can_be_empty_element = builder.can_be_empty_element(name) 

1275 

1276 # Keep track of the list of attributes of this tag that 

1277 # might need to be treated as a list. 

1278 # 

1279 # For performance reasons, we store the whole data structure 

1280 # rather than asking the question of every tag. Asking would 

1281 # require building a new data structure every time, and 

1282 # (unlike can_be_empty_element), we almost never need 

1283 # to check this. 

1284 self.cdata_list_attributes = builder.cdata_list_attributes 

1285 

1286 # Keep track of the names that might cause this tag to be treated as a 

1287 # whitespace-preserved tag. 

1288 self.preserve_whitespace_tags = builder.preserve_whitespace_tags 

1289 

1290 if self.name in builder.string_containers: 

1291 # This sort of tag uses a special string container 

1292 # subclass for most of its strings. When we ask the 

1293 self.interesting_string_types = builder.string_containers[self.name] 

1294 else: 

1295 self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES 

1296 

1297 parserClass = _alias("parser_class") # BS3 

1298 

def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :return: A new object of the same class as this one, with copies
        of this tag's children appended to it.
    """
    # Values that __init__ only sets conditionally are read straight
    # from the instance dict. Plain attribute access on a missing name
    # falls through to __getattr__, which searches for a child tag of
    # that name (e.g. a <sourceline> element).
    state = self.__dict__
    clone = type(self)(
        # No builder is passed: __init__ never stores one on the tag,
        # so `self.builder` would likewise resolve to a child <builder>
        # tag. The builder-derived settings are handed over explicitly.
        None, None, self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=state.get('sourceline'),
        sourcepos=state.get('sourcepos'),
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        # Without this the clone would be left with None and would
        # treat every string type (comments included) as interesting.
        interesting_string_types=self.interesting_string_types
    )
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone

1316 

@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with contents is never an empty-element tag.

    For a tag without contents the answer is whatever
    `can_be_empty_element` holds, which depends on the builder used to
    create the tag. If the builder has a designated list of
    empty-element tags, only tags named in that list qualify. If it
    has no such list, any tag with no contents is an empty-element tag.
    """
    if len(self.contents) > 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element # BS3

1334 

@property
def string(self):
    """Convenience property for the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: With exactly one child that is a string, that string.
        With exactly one child that is a tag, that child's own
        `.string`, recursively. With no children or more than one
        child, None.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    return only_child if isinstance(only_child, NavigableString) else only_child.string

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    The existing children are cleared, then a copy of `string` (made
    by calling its own class on it) is appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)

1361 

DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
def _all_strings(self, strip=False, types=PageElement.default):
    """Yield the strings beneath this tag that are of an acceptable
    class, possibly stripping them.

    :param strip: If True, each string is stripped before being
        yielded, and strings that end up empty are dropped.

    :param types: A NavigableString subclass, or a tuple of them.
        Strings whose exact class is not listed are ignored. The
        default is self.interesting_string_types. If that's not
        specified, only NavigableString and CData objects will be
        considered. That means no comments, processing
        instructions, etc. Passing None accepts every
        NavigableString regardless of subclass.

    :yield: A sequence of strings.
    """
    if types is self.default:
        types = self.interesting_string_types

    single_type = isinstance(types, type)
    for node in self.descendants:
        node_type = type(node)
        if types is None:
            # No class filter: keep strings, skip everything else.
            if not isinstance(node, NavigableString):
                continue
        elif single_type:
            if node_type is not types:
                # We're not interested in strings of this type.
                continue
        elif node_type not in types:
            # We're not interested in strings of this type.
            continue

        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
strings = property(_all_strings)

1400 

def decompose(self):
    """Recursively destroy this PageElement and its children.

    The element is extracted from the tree. Then it and every element
    reachable through the `next_element` chain have their instance
    state wiped, are given empty `contents`, and are flagged as
    decomposed.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Grab the link first: wiping __dict__ destroys it.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = following

1420 

def clear(self, decompose=False):
    """Remove every child of this PageElement.

    :param decompose: If this is True, child tags are removed with
        decompose() (a more destructive method) instead of extract().
        Children that are not tags are always removed with extract().
    """
    # Work on a snapshot, because removing a child changes the
    # underlying `contents` list.
    for child in list(self.contents):
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()

1437 

def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    Child tags are smoothed recursively. PreformattedString objects
    are never merged with their neighbors.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    def mergeable(node):
        # Ordinary strings only; preformatted ones keep their identity.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every adjacent pair that has to be
    # merged. This avoids copying self.contents, since in most cases
    # very few strings will be affected.
    pair_starts = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # Last child: there is no neighbor to merge with.
            continue
        neighbor = self.contents[position + 1]
        if mergeable(current) and mergeable(neighbor):
            pair_starts.append(position)

    # Merge from the end backwards, so that removing items from
    # .contents leaves the remaining recorded positions valid.
    for position in reversed(pair_starts):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        merged = NavigableString(first + second)
        first.replace_with(merged)

1475 

def index(self, element):
    """Find the index of a child by identity, not value.

    Unlike tag.contents.index(element), this never returns the
    position of a different child that merely compares equal.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` in `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        -1,
    )
    if position < 0:
        raise ValueError("Tag.index: element not in tag")
    return position

1488 

def get(self, key, default=None):
    """Look up an attribute of the tag.

    :param key: The attribute name.
    :param default: Returned when the tag has no such attribute.
    :return: The value stored in `self.attrs` for `key`, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)

1494 

def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value itself if it is already a list; otherwise a
        one-element list wrapping it.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]

1508 

def has_attr(self, key):
    """Report whether `key` is present in this PageElement's `attrs`.

    :param key: The attribute name to look for.
    """
    attributes = self.attrs
    return key in attributes

1512 

1513 def __hash__(self): 

1514 return str(self).__hash__() 

1515 

1516 def __getitem__(self, key): 

1517 """tag[key] returns the value of the 'key' attribute for the Tag, 

1518 and throws an exception if it's not there.""" 

1519 return self.attrs[key] 

1520 

1521 def __iter__(self): 

1522 "Iterating over a Tag iterates over its contents." 

1523 return iter(self.contents) 

1524 

1525 def __len__(self): 

1526 "The length of a Tag is the length of its list of contents." 

1527 return len(self.contents) 

1528 

1529 def __contains__(self, x): 

1530 return x in self.contents 

1531 

1532 def __bool__(self): 

1533 "A tag is non-None even if it has no contents." 

1534 return True 

1535 

1536 def __setitem__(self, key, value): 

1537 """Setting tag[key] sets the value of the 'key' attribute for the 

1538 tag.""" 

1539 self.attrs[key] = value 

1540 

1541 def __delitem__(self, key): 

1542 "Deleting tag[key] deletes all 'key' attributes for the tag." 

1543 self.attrs.pop(key, None) 

1544 

1545 def __call__(self, *args, **kwargs): 

1546 """Calling a Tag like a function is the same as calling its 

1547 find_all() method. Eg. tag('a') returns a list of all the A tags 

1548 found within this tag.""" 

1549 return self.find_all(*args, **kwargs) 

1550 

1551 def __getattr__(self, tag): 

1552 """Calling tag.subtag is the same as calling tag.find(name="subtag")""" 

1553 #print("Getattr %s.%s" % (self.__class__, tag)) 

1554 if len(tag) > 3 and tag.endswith('Tag'): 

1555 # BS3: soup.aTag -> "soup.find("a") 

1556 tag_name = tag[:-3] 

1557 warnings.warn( 

1558 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( 

1559 name=tag_name 

1560 ), 

1561 DeprecationWarning 

1562 ) 

1563 return self.find(tag_name) 

1564 # We special case contents to avoid recursion. 

1565 elif not tag.startswith("__") and not tag == "contents": 

1566 return self.find(tag) 

1567 raise AttributeError( 

1568 "'%s' object has no attribute '%s'" % (self.__class__, tag)) 

1569 

1570 def __eq__(self, other): 

1571 """Returns true iff this Tag has the same name, the same attributes, 

1572 and the same contents (recursively) as `other`.""" 

1573 if self is other: 

1574 return True 

1575 if (not hasattr(other, 'name') or 

1576 not hasattr(other, 'attrs') or 

1577 not hasattr(other, 'contents') or 

1578 self.name != other.name or 

1579 self.attrs != other.attrs or 

1580 len(self) != len(other)): 

1581 return False 

1582 for i, my_child in enumerate(self.contents): 

1583 if my_child != other.contents[i]: 

1584 return False 

1585 return True 

1586 

1587 def __ne__(self, other): 

1588 """Returns true iff this Tag is not identical to `other`, 

1589 as defined in __eq__.""" 

1590 return not self == other 

1591 

1592 def __repr__(self, encoding="unicode-escape"): 

1593 """Renders this PageElement as a string. 

1594 

1595 :param encoding: The encoding to use (Python 2 only).  

1596 TODO: This is now ignored and a warning should be issued 

1597 if a value is provided. 

1598 :return: A (Unicode) string. 

1599 """ 

1600 # "The return value must be a string object", i.e. Unicode 

1601 return self.decode() 

1602 

1603 def __unicode__(self): 

1604 """Renders this PageElement as a Unicode string.""" 

1605 return self.decode() 

1606 

1607 __str__ = __repr__ = __unicode__ 

1608 

def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.
    """
    # Two steps: render to Unicode first (telling decode() which
    # encoding the result is headed for), then encode that text.
    as_text = self.decode(indent_level, encoding, formatter)
    as_bytes = as_text.encode(encoding, errors)
    return as_bytes

1634 

def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The rendered markup as a string.
    """
    # Resolve a formatter name to a Formatter object once, up front,
    # so the lookup is not repeated for every nested call.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Render each attribute as either a bare key (value None) or
    # key="value".
    rendered_attrs = []
    for key, val in formatter.attributes(self):
        if val is None:
            rendered_attrs.append(key)
            continue
        if isinstance(val, (list, tuple)):
            val = ' '.join(val)
        elif not isinstance(val, str):
            val = str(val)
        elif (eventual_encoding is not None
              and isinstance(val, AttributeValueWithCharsetSubstitution)):
            val = val.encode(eventual_encoding)
        text = formatter.attribute_value(val)
        rendered_attrs.append(
            str(key) + '=' + formatter.quoted_attribute_value(text))

    prefix = self.prefix + ":" if self.prefix else ''

    # An empty-element tag gets a closing marker inside the start tag
    # and no end tag; any other tag gets a separate end tag.
    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
        closeTag = ''
    else:
        close = ''
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    if indent_level is None:
        indent_space = ''
    else:
        indent_space = formatter.indent * (indent_level - 1)
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        space = ''
        indent_contents = None

    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object: only its contents are
        # rendered.
        return contents

    attribute_string = ' ' + ' '.join(rendered_attrs) if rendered_attrs else ''
    pieces = []
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print:
        if contents and contents[-1] != "\n":
            pieces.append("\n")
        if closeTag:
            pieces.append(space)
    pieces.append(closeTag)
    if indent_level is not None and closeTag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)

1735 

1736 def _should_pretty_print(self, indent_level): 

1737 """Should this tag be pretty-printed? 

1738 

1739 Most of them should, but some (such as <pre> in HTML 

1740 documents) should not. 

1741 """ 

1742 return ( 

1743 indent_level is not None 

1744 and ( 

1745 not self.preserve_whitespace_tags 

1746 or self.name not in self.preserve_whitespace_tags 

1747 ) 

1748 ) 

1749 

def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    # True is passed as the indent level to switch pretty-printing on.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)

1764 

def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: The concatenated rendering of every child.
    """
    # Resolve a formatter name to a Formatter object once, up front,
    # so the lookup is not repeated for every child.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    pieces = []
    for child in self:
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
        else:
            # Tags render themselves; any other kind of child
            # contributes nothing.
            if isinstance(child, Tag):
                pieces.append(child.decode(indent_level, eventual_encoding,
                                           formatter))
            continue

        if not text:
            continue

        keep_whitespace = (
            self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
        )
        if indent_level and not keep_whitespace:
            text = text.strip()
            if not text:
                # Whitespace-only string: nothing left to output.
                continue

        # When pretty-printing outside a whitespace-preserving tag,
        # each string sits on its own indented line.
        on_own_line = pretty_print and not keep_whitespace
        if on_own_line:
            pieces.append(formatter.indent * (indent_level - 1))
        pieces.append(text)
        if on_own_line:
            pieces.append("\n")
    return ''.join(pieces)

1813 

def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param encoding: The bytestring will be in this encoding. It is
        also passed to decode_contents() as the eventual encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    # Build the Unicode rendering first, then encode it in one step.
    return self.decode_contents(
        indent_level, encoding, formatter).encode(encoding)

1834 

1835 # Old method for BS3 compatibility 

def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Delegates to encode_contents(). The indent level is only forwarded
    when prettyPrint is true; otherwise None is passed instead.

    :param encoding: The encoding of the returned bytestring.
    :param prettyPrint: Whether to forward indentLevel.
    :param indentLevel: The indent level used when prettyPrint is true.
    :return: Whatever encode_contents() returns.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)

1843 

1844 #Soup methods 

1845 

def find(self, name=None, attrs={}, recursive=True, string=None,
         **kwargs):
    """Look in the children of this PageElement and return the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on string content; forwarded to find_all().
    :kwargs: A dictionary of filters on attribute values.
    :return: The first matching PageElement, or None if find_all()
        returned nothing.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Ask find_all() for at most one result (limit=1).
    matches = self.find_all(name, attrs, recursive, string, 1, **kwargs)
    return matches[0] if matches else None
findChild = find  # BS2

1870 

def find_all(self, name=None, attrs={}, recursive=True, string=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() searches
        ``self.descendants``. Otherwise only ``self.children`` is
        searched.
    :param string: A filter on string content.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Pick the iteration source, then let _find_all() do the matching.
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, string, limit, candidates, **kwargs)
findAll = find_all  # BS3
findChildren = find_all  # BS2

1895 

1896 #Generator methods 

@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :return: An iterator over the items of ``self.contents``.
    """
    direct_children = self.contents
    # Hand back an iterator rather than the underlying list so the
    # purpose of the property stays clear.
    return iter(direct_children)  # XXX This seems to be untested.

1905 

@property
def descendants(self):
    """Iterate over every descendant of this PageElement.

    The walk starts at the first child and follows ``next_element``
    links until it reaches the element that comes after this
    element's last descendant. Nothing is yielded when there are no
    children.

    NOTE(review): an earlier docstring called this order
    "breadth-first"; the order is whatever the ``next_element`` chain
    defines -- presumably document order. Confirm.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # Whatever follows the last descendant marks the end of the walk.
    sentinel = self._last_descendant().next_element
    node = self.contents[0]
    while node is not sentinel:
        yield node
        node = node.next_element

1920 

1921 # CSS selector code 

def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element and
    return only the first result.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments forwarded to select(), which in
        turn passes them to soupsieve.select().

    :return: The first selected Tag, or None if nothing was selected.
    :rtype: bs4.element.Tag
    """
    # A limit of 1 is enough: only the first hit is returned.
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None

1942 

def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. When None, this
        element's ``_namespaces`` attribute is used.

    :param limit: After finding this number of results, stop looking.
        None is translated to 0 before calling soupsieve.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :raise NotImplementedError: If the soupsieve package could not be
        imported.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    """
    effective_namespaces = (
        self._namespaces if namespaces is None else namespaces)
    effective_limit = 0 if limit is None else limit

    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    # Wrapping the results in a ResultSet is more consistent with the
    # find_* methods, and ResultSet.__getattr__ has a helpful error
    # message.
    return ResultSet(
        None,
        soupsieve.select(
            selector, self, effective_namespaces, effective_limit, **kwargs),
    )

1978 

1979 # Old names for backwards compatibility 

def childGenerator(self):
    """Deprecated generator; returns the ``children`` property."""
    direct_children = self.children
    return direct_children

1983 

def recursiveChildGenerator(self):
    """Deprecated generator; returns the ``descendants`` property."""
    all_descendants = self.descendants
    return all_descendants

1987 

def has_key(self, key):
    """Deprecated method. This was kind of misleading because has_key()
    (attributes) was different from __in__ (contents).

    has_key() is gone in Python 3, anyway.

    Emits a DeprecationWarning and then delegates to has_attr().

    :param key: The attribute name to look for.
    :return: Whatever ``self.has_attr(key)`` returns.
    """
    warnings.warn(
        'has_key is deprecated. Use has_attr(key) instead.',
        DeprecationWarning,
        # Attribute the warning to the code that called has_key(),
        # not to this library module, so users can find the call site.
        stacklevel=2
    )
    return self.has_attr(key)

1999 

2000# Next, a couple classes to represent queries and their results. 

class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, string=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. A
            non-dict value is treated as a filter on the 'class'
            attribute. The dictionary passed in is never modified; it
            is copied before keyword filters are merged into it.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            deprecated 'text' keyword is accepted as an alias for
            `string` when `string` itself is None.
        """
        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
                DeprecationWarning,
                # Point the warning at the code constructing the
                # SoupStrainer rather than at this module.
                stacklevel=2
            )

        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy first so the caller's dictionary (or the shared
                # default) is never mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.string = self._normalize_search_value(string)

        # DEPRECATED but just in case someone is checking this.
        self.text = self.string

    def _normalize_search_value(self, value):
        """Convert a search value into a canonical form.

        :param value: A filter value as supplied by the caller.
        :return: `value` unchanged if it is a Unicode string, a
            callable, an object with a `match` attribute (such as a
            compiled regular expression), a boolean, or None; a
            decoded string for a bytestring; a list of normalized
            items for any other iterable; otherwise `str(value)`.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (value is None
                or isinstance(value, (str, bool, Callable))
                or hasattr(value, 'match')):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                        and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The string filter rendered as text when one is set,
            otherwise "<name>|<attrs>".
        """
        if self.string:
            # self.string may be a regular expression, a callable, a
            # list or True; __str__ must always return a str, so
            # convert explicitly.
            return str(self.string)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a
            Tag object (in which case the Tag also supplies the
            attributes).
        :param markup_attrs: The attributes as found in some markup:
            either an object with a `get` method or an iterable of
            (key, value) pairs.

        :return: A truthy value (the Tag, or the tag name) if the
            prospective tag would match this SoupStrainer; None or
            False otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        # Build the attribute lookup lazily, the first
                        # time an attribute filter needs it.
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # NOTE(review): when `found` is a plain tag name rather than a
        # Tag, `found.string` would raise AttributeError; presumably
        # callers never combine a string filter with a prospective
        # (name-only) search -- confirm.
        if found and self.string and not self._matches(found.string, self.string):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None (or another falsy
            value) when nothing matches.
        :raise Exception: If `markup` is of a type this method does
            not know how to match against.
        """
        # print('looking for %s in %s' % (self, markup))
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.string or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.string):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Check one piece of markup against one filter value.

        :param markup: A Tag, a string, a list/tuple of strings (the
            values of a multi-valued attribute), or None.
        :param match_against: The filter: True, a callable, a string,
            an object with a `search` method (such as a compiled
            regular expression), or an iterable of any of these.
        :param already_tried: Keys of the filter items already tested,
            used to avoid infinite recursion on nested iterables.
        :return: A truthy value if `markup` matches, a falsy value
            otherwise. The value is not always a bool: a callable's
            own return value or a regular expression match object may
            be returned.
        """
        # print(u"Matching %s against %s" % (markup, match_against))
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must be match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                else:
                    already_tried.add(key)
                    if self._matches(original_markup, item, already_tried):
                        return True
            else:
                # The loop finished without any item matching.
                return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match

2273 

2274 

class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Only called for attributes that normal lookup did not find.

        :param key: The name of the missing attribute.
        :raise AttributeError: Always.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)

2291 )