Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/element.py: 25%

1# Use of this source code is governed by the MIT license.

2__license__ = "MIT"

4try:

5 from collections.abc import Callable # Python 3.6

6except ImportError as e:

7 from collections import Callable

8import re

9import sys

10import warnings

12from bs4.css import CSS

13from bs4.formatter import (

14 Formatter,

15 HTMLFormatter,

16 XMLFormatter,

17)

# Encoding used when none is specified for output.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: Unused as of 4.7.0. Kept around for a while in case external
# code imported it.
whitespace_re = re.compile(r"\s+")

def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    :param attr: Name of the attribute the alias forwards to.
    :return: A property whose getter reads `attr` and whose setter
        writes `attr` on the instance it is accessed through.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is called with the assigned value; pass it
        # through to the aliased attribute.
        return setattr(self, attr, value)
    return alias

39# These encodings are recognized by Python (so PageElement.encode

40# could theoretically support them) but XML and HTML don't recognize

41# them (so they should not show up in an XML or HTML document as that

42# document's encoding).

43#

44# If an XML document is encoded in one of these encodings, no encoding

45# will be mentioned in the XML declaration. If an HTML document is

46# encoded in one of these encodings, and the HTML document has a

47# <meta> tag that mentions an encoding, the encoding will be given as

48# the empty string.

49#

50# Source:

51# https://docs.python.org/3/library/codecs.html#python-specific-encodings

PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}

class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the prefix ('xml'),
    the name ('lang') and the namespace it was created with.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        # An empty or missing name denotes the default namespace, whose
        # name "has no value" per
        # https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif not prefix:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance

class AttributeValueWithCharsetSubstitution(str):
    """Base string type standing in for a character encoding that was
    specified in HTML.
    """

class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        instance = str.__new__(cls, original_value)
        # Remember the value exactly as it was given.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset value to use when the document is being
        encoded to `encoding`.

        :param encoding: Name of the target encoding.
        :return: The empty string if `encoding` is one of
            PYTHON_SPECIFIC_ENCODINGS; otherwise `encoding` itself.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding

113

114

class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing markup such as:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    yields one of these objects as the value of the 'content' attribute.
    """

    # Group 1 captures everything up to and including "charset=";
    # group 3 captures the charset name itself.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        if cls.CHARSET_RE.search(original_value) is None:
            # No charset present, so no substitution is ever needed;
            # hand back a plain string.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced by
        `encoding`.

        :param encoding: Name of the target encoding.
        :return: The empty string if `encoding` is one of
            PYTHON_SPECIFIC_ENCODINGS; otherwise the rewritten value.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)

142

143

144class PageElement(object):

145 """Contains the navigational information for some part of the page:

146 that is, its current location in the parse tree.

147

148 NavigableString, Tag, etc. are all subclasses of PageElement.

149 """

150

151 # In general, we can't tell just by looking at an element whether

152 # it's contained in an XML document or an HTML document. But for

153 # Tags (q.v.) we can store this information at parse time.

154 known_xml = None

155

156 def setup(self, parent=None, previous_element=None, next_element=None,

157 previous_sibling=None, next_sibling=None):

158 """Sets up the initial relations between this element and

159 other elements.

160

161 :param parent: The parent of this element.

162

163 :param previous_element: The element parsed immediately before

164 this one.

165

166 :param next_element: The element parsed immediately before

167 this one.

168

169 :param previous_sibling: The most recently encountered element

170 on the same level of the parse tree as this one.

171

172 :param previous_sibling: The next element to be encountered

173 on the same level of the parse tree as this one.

174 """

175 self.parent = parent

176

177 self.previous_element = previous_element

178 if previous_element is not None:

179 self.previous_element.next_element = self

180

181 self.next_element = next_element

182 if self.next_element is not None:

183 self.next_element.previous_element = self

184

185 self.next_sibling = next_sibling

186 if self.next_sibling is not None:

187 self.next_sibling.previous_sibling = self

188

189 if (previous_sibling is None

190 and self.parent is not None and self.parent.contents):

191 previous_sibling = self.parent.contents[-1]

192

193 self.previous_sibling = previous_sibling

194 if previous_sibling is not None:

195 self.previous_sibling.next_sibling = self

196

def format_string(self, s, formatter):
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of the
        standard formatters. If None, `s` is returned unchanged.
    :return: The result of the formatter's `substitute` call on `s`.
    """
    if formatter is None:
        return s
    if not isinstance(formatter, Formatter):
        # Resolve a name (or other identifier) to a Formatter object.
        formatter = self.formatter_for_name(formatter)
    return formatter.substitute(s)

209

def formatter_for_name(self, formatter):
    """Resolve an identifier to a Formatter, creating one if needed.

    :param formatter: Can be a Formatter object (returned as-is), a
        callable (used as the entity substitution hook of a new
        XMLFormatter or HTMLFormatter), or a key (looked up in the
        REGISTRY of XMLFormatter or HTMLFormatter).
    :return: A Formatter object.
    """
    if isinstance(formatter, Formatter):
        return formatter
    # XML trees use XMLFormatter; everything else uses HTMLFormatter.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]

229

@property
def _is_xml(self):
    """Is this element part of an XML tree or an HTML tree?

    formatter_for_name consults this to choose between XMLFormatter
    and HTMLFormatter. The lookup may walk up the chain of parents.
    """
    known = self.known_xml
    if known is not None:
        # Usually settled when the document was parsed.
        return known

    # Otherwise the element was probably built by calling a
    # constructor directly; defer to the parent when there is one.
    parent = self.parent
    if parent is not None:
        return parent._is_xml

    # Top-level object without .known_xml: fall back to .is_xml, and
    # guess HTML if that is missing too.
    return getattr(self, 'is_xml', False)

252

253 nextSibling = _alias("next_sibling") # BS3

254 previousSibling = _alias("previous_sibling") # BS3

255

# Unique placeholder used as the default value of the `types` argument.
default = object()
def _all_strings(self, strip=False, types=default):
    """Yield all strings of certain classes, possibly stripping them.

    This base version always raises NotImplementedError; Tag and
    NavigableString each supply their own implementation.
    """
    raise NotImplementedError()

263

@property
def stripped_strings(self):
    """Yield every string in this PageElement, stripped.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)

272

def get_text(self, separator="", strip=False,
             types=default):
    """Join all child strings of this PageElement with a separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
getText = get_text
text = property(get_text)

296

def replace_with(self, *args):
    """Replace this PageElement with one or more PageElements, keeping the
    rest of the tree the same.

    :param args: One or more PageElements.
    :return: `self`. Unless the only replacement is `self` (a no-op),
        `self` is no longer part of the tree.
    :raise ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    if self.parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself changes nothing; still
        # return `self` so the return value matches the documented one.
        return self
    if any(x is self.parent for x in args):
        raise ValueError("Cannot replace a Tag with its parent.")
    old_parent = self.parent
    my_index = old_parent.index(self)
    self.extract(_self_index=my_index)
    # Insert the replacements, in order, starting at the vacated slot.
    for idx, replacement in enumerate(args, start=my_index):
        old_parent.insert(idx, replacement)
    return self
replaceWith = replace_with # BS3

319

def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        # The first literal ends with a space so the two pieces do not
        # run together as "thatelement".
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Iterate over a copy of the contents, last child first, inserting
    # each at the same index so the children keep their original order.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self
replace_with_children = unwrap
replaceWithChildren = unwrap # BS3

337

def wrap(self, wrap_inside):
    """Put this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, occupying the position in the tree that used
        to be occupied by `self`, and with `self` inside it.
    """
    # replace_with hands back the displaced element, which is then
    # appended to the wrapper.
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside

348

def extract(self, _self_index=None):
    """Destructively rips this element out of the tree.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: `self`, no longer part of the tree.
    """
    parent = self.parent
    if parent is not None:
        if _self_index is None:
            _self_index = parent.index(self)
        del parent.contents[_self_index]

    # Locate the two elements that would be adjacent had this element
    # (and any children) never been parsed, and join them.
    last_child = self._last_descendant()
    following = last_child.next_element
    preceding = self.previous_element

    if preceding is not None and preceding is not following:
        preceding.next_element = following
    if following is not None and following is not preceding:
        following.previous_element = preceding
    self.previous_element = None
    last_child.next_element = None

    self.parent = None

    # Join the siblings on either side, then drop our own sibling links.
    earlier = self.previous_sibling
    later = self.next_sibling
    if earlier is not None and earlier is not later:
        earlier.next_sibling = later
    if later is not None and later is not earlier:
        later.previous_sibling = earlier
    self.previous_sibling = self.next_sibling = None
    return self

386

def _last_descendant(self, is_initialized=True, accept_self=True):
    """Finds the last element beneath this object to be parsed.

    :param is_initialized: Has `setup` been called on this PageElement
        yet?
    :param accept_self: Is `self` an acceptable answer to the question?
    :return: The element found, or None when the answer would be `self`
        and `accept_self` is false.
    """
    # .next_sibling is only consulted when the element is initialized.
    sibling = self.next_sibling if is_initialized else None
    if sibling is not None:
        # Use whatever was parsed just before the next sibling.
        candidate = sibling.previous_element
    else:
        # Descend through the last child of each Tag.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]
    if candidate is self and not accept_self:
        return None
    return candidate
# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant

405

def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
       in `self.children` by the new PageElement.
    :param new_child: A PageElement.
    :raise ValueError: If `new_child` is None or is this element itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
        # Promote a plain string to a NavigableString.
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # One BeautifulSoup object must never contain another, so its
        # children are inserted one at a time instead.
        for subchild in list(new_child.contents):
            self.insert(position, subchild)
            position += 1
        return

    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        if new_child.parent is self:
            # The element is already one of our children. If it is being
            # moved further down the list, extracting it shifts the
            # target index down by one.
            if self.index(new_child) < position:
                position -= 1
        new_child.extract()

    new_child.parent = self

    # Link the new child to whatever precedes it.
    if position == 0:
        new_child.previous_sibling = None
        preceding = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        previous_child.next_sibling = new_child
        preceding = previous_child._last_descendant(False)
    new_child.previous_element = preceding
    if preceding is not None:
        preceding.next_element = new_child

    tail = new_child._last_descendant(False)

    # Link the new child (and its last descendant) to whatever follows.
    if position >= len(self.contents):
        new_child.next_sibling = None

        # Climb the ancestors until one has a next sibling; that sibling
        # comes next in the document. If none is found, the new child's
        # last element is the last element in the document.
        successor = None
        ancestor = self
        while successor is None and ancestor is not None:
            successor = ancestor.next_sibling
            ancestor = ancestor.parent
    else:
        successor = self.contents[position]
        new_child.next_sibling = successor
        if successor is not None:
            successor.previous_sibling = new_child
    tail.next_element = successor

    if successor is not None:
        successor.previous_element = tail
    self.contents.insert(position, new_child)

487

def append(self, tag):
    """Add the given PageElement at the end of this one's contents.

    :param tag: A PageElement.
    """
    end = len(self.contents)
    self.insert(end, tag)

494

def extend(self, tags):
    """Appends the given PageElements to this one's contents.

    :param tags: A list of PageElements. If a single Tag is
        provided instead, this PageElement's contents will be extended
        with that Tag's contents.
    """
    source = tags.contents if isinstance(tags, Tag) else tags
    if isinstance(source, list):
        # Appending can move items out of the original list, so iterate
        # over a snapshot that will not change.
        source = list(source)
    for item in source:
        self.append(item)

510

def insert_before(self, *args):
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same parent, and the given elements
    will be immediately before this one.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is
        itself one of `args`.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")
    for predecessor in args:
        # Extract first, so a sibling does not throw off the index
        # computed next.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        parent.insert(parent.index(self), predecessor)

532

def insert_after(self, *args):
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same parent, and the given elements
    will be immediately after this one.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is
        itself one of `args`.
    """
    # All validation happens before the tree is touched.
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    # `offset` counts how many successors have already been placed, so
    # each new one lands after the previous one.
    for offset, successor in enumerate(args):
        # Extract first, so a sibling does not throw off the index
        # computed next.
        if isinstance(successor, PageElement):
            successor.extract()
        parent.insert(parent.index(self) + 1 + offset, successor)

558

def find_next(self, name=None, attrs={}, string=None, **kwargs):
    """Return the first PageElement matching the given criteria that
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, string, **kwargs)
findNext = find_next # BS3

575

def find_all_next(self, name=None, attrs={}, string=None, limit=None,
                  **kwargs):
    """Return every PageElement matching the given criteria that
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    # One more stack frame sits between the caller and any warning.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(name, attrs, string, limit, self.next_elements,
                          _stacklevel=depth, **kwargs)
findAllNext = find_all_next # BS3

595

def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
    """Return the closest sibling of this PageElement that matches the
    given criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)
findNextSibling = find_next_sibling # BS3

613

def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
                       **kwargs):
    """Return every sibling of this PageElement that matches the given
    criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # One more stack frame sits between the caller and any warning.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(name, attrs, string, limit, self.next_siblings,
                          _stacklevel=depth, **kwargs)
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2

637

def find_previous(self, name=None, attrs={}, string=None, **kwargs):
    """Search backwards through the document from this PageElement and
    return the first PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, string, **kwargs)
findPrevious = find_previous # BS3

655

def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
                      **kwargs):
    """Search backwards through the document from this PageElement and
    return all PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # One more stack frame sits between the caller and any warning.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(name, attrs, string, limit, self.previous_elements,
                          _stacklevel=depth, **kwargs)
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2

679

def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
    """Return the closest sibling of this PageElement that matches the
    given criteria and appears earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)
findPreviousSibling = find_previous_sibling # BS3

697

def find_previous_siblings(self, name=None, attrs={}, string=None,
                           limit=None, **kwargs):
    """Return every sibling of this PageElement that matches the
    given criteria and appears earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # One more stack frame sits between the caller and any warning.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(name, attrs, string, limit, self.previous_siblings,
                          _stacklevel=depth, **kwargs)
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2

721

def find_parent(self, name=None, attrs={}, **kwargs):
    """Return the closest parent of this PageElement that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.

    :return: A PageElement, or None if find_parents found nothing.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE: _find_one can't be used here because find_parents takes a
    # different set of arguments.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None
findParent = find_parent # BS3

744

def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return all parents of this PageElement that match the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.

    :return: Whatever `_find_all` returns for the `parents` generator
        (a collection of matches, not a single element).
    """
    # One more stack frame sits between the caller and any warning.
    depth = kwargs.pop('_stacklevel', 2) + 1
    # Parents are never matched by string, hence the explicit None.
    return self._find_all(name, attrs, None, limit, self.parents,
                          _stacklevel=depth, **kwargs)
findParents = find_parents # BS3
fetchParents = find_parents # BS2

764

@property
def next(self):
    """The PageElement, if any, that was parsed just after this one.

    Same value as `.next_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    following = self.next_element
    return following

773

@property
def previous(self):
    """The PageElement, if any, that was parsed just before this one.

    Same value as `.previous_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    preceding = self.previous_element
    return preceding

782

783 #These methods do the real heavy lifting.

784

def _find_one(self, method, name, attrs, string, **kwargs):
    """Call a find-all style `method` with a limit of 1 and return its
    first result, or None if it returned nothing.
    """
    matches = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    return matches[0] if matches else None

791

def _find_all(self, name, attrs, string, limit, generator, **kwargs):
    """Iterates over a generator looking for things that match.

    :param name: A filter on tag name, or a SoupStrainer to use as-is.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The iterable of elements to search through.
    :kwargs: Further filters on attribute values. The deprecated
        'text' keyword is accepted as a synonym for `string`, and
        '_stacklevel' sets the stack level of the resulting warning.
    :return: A ResultSet of the matches.
    """
    _stacklevel = kwargs.pop('_stacklevel', 3)

    if string is None and 'text' in kwargs:
        string = kwargs.pop('text')
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning, stacklevel=_stacklevel
        )

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, string, **kwargs)

    if string is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a namespace-aware
                # document, we need to match the local name against
                # tag.name. If not, we need to match the fully-qualified
                # name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The isinstance() guard must cover BOTH name alternatives.
            # Since `and` binds tighter than `or`, the alternatives are
            # grouped explicitly so that non-Tag elements are never
            # asked for .name or .prefix.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (
                          element.name == name
                          or (
                              element.name == local_name
                              and (prefix is None or element.prefix == prefix)
                          )
                      )
            )
            return ResultSet(strainer, result)

    # General case: run each truthy element through the strainer.
    results = ResultSet(strainer)
    for element in generator:
        if element:
            found = strainer.search(element)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results

847

848 #These generators can be used to navigate starting from both

849 #NavigableStrings and Tags.

@property
def next_elements(self):
    """Every PageElement parsed after this one, nearest first.

    :yield: A sequence of PageElements.
    """
    element = self.next_element
    while element is not None:
        yield element
        element = element.next_element

860

@property
def next_siblings(self):
    """Every sibling of this PageElement that was parsed later,
    nearest first.

    :yield: A sequence of PageElements.
    """
    sibling = self.next_sibling
    while sibling is not None:
        yield sibling
        sibling = sibling.next_sibling

872

@property
def previous_elements(self):
    """Every PageElement parsed before this one, nearest first.

    :yield: A sequence of PageElements.
    """
    element = self.previous_element
    while element is not None:
        yield element
        element = element.previous_element

883

@property
def previous_siblings(self):
    """Every sibling of this PageElement that was parsed earlier,
    nearest first.

    :yield: A sequence of PageElements.
    """
    sibling = self.previous_sibling
    while sibling is not None:
        yield sibling
        sibling = sibling.previous_sibling

895

@property
def parents(self):
    """Every ancestor of this PageElement, starting with its
    immediate parent.

    :yield: A sequence of PageElements.
    """
    ancestor = self.parent
    while ancestor is not None:
        yield ancestor
        ancestor = ancestor.parent

906

@property
def decomposed(self):
    """Tell whether this PageElement has been decomposed.

    The answer is read from the `_decomposed` attribute; an element
    that has no such attribute is reported as not decomposed.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    # Normalize any falsy marker (e.g. None) to a real False.
    return flag if flag else False

914

915 # Old non-property versions of the generators, for backwards

916 # compatibility with BS3.

def nextGenerator(self):
    """BS3-compatible method spelling of the `next_elements` property."""
    return self.next_elements

919

def nextSiblingGenerator(self):
    """BS3-compatible method spelling of the `next_siblings` property."""
    return self.next_siblings

922

def previousGenerator(self):
    """BS3-compatible method spelling of the `previous_elements` property."""
    return self.previous_elements

925

def previousSiblingGenerator(self):
    """BS3-compatible method spelling of the `previous_siblings` property."""
    return self.previous_siblings

928

def parentGenerator(self):
    """BS3-compatible method spelling of the `parents` property."""
    return self.parents

931

932

class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    When Beautiful Soup parses the markup <b>penguin</b>, it will
    create a NavigableString for the string "penguin".
    """

    # Text placed before and after the string when it is rendered by
    # output_ready(). Plain strings have neither.
    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text of the string; anything that is not
            already a `str` is decoded using DEFAULT_OUTPUT_ENCODING.
        """
        # Only non-str input gets the extra encoding argument.
        encoding_args = () if isinstance(value, str) else (
            DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *encoding_args)
        instance.setup()
        return instance

    def __deepcopy__(self, memo, recursive=False):
        """Build a copy with the same contents and class as this string.

        The copy is constructed from this string's text alone, so it is
        not connected to the parse tree.

        :param memo: Unused by this implementation.
        :param recursive: This parameter is ignored; it's only defined
            so that NavigableString.__deepcopy__ implements the same
            signature as Tag.__deepcopy__.
        """
        same_class = type(self)
        return same_class(self)

    def __copy__(self):
        """Copy this string; the result is always a deep copy, because
        only one PageElement can occupy a given place in a parse tree.
        """
        return self.__deepcopy__({})

    def __getnewargs__(self):
        """Return the constructor arguments used to recreate this string:
        a one-element tuple holding its plain `str` value."""
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        :raise AttributeError: For any attribute other than 'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter and wrap the
        result in this class's PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None: a NavigableString is not a Tag and has no .name.

        The property exists so that code like this doesn't crash when
        run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Refuse every attempt to set NavigableString.name."""
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(self, strip=False, types=PageElement.default):
        """Yield this string if it is of an acceptable class, possibly
        stripping it first.

        This gives NavigableString the same text-extraction interface
        as other PageElements, so methods like get_text() work on it.

        :param strip: If True, the string is stripped before being
            yielded, and nothing is yielded if that leaves it empty.

        :param types: A NavigableString subclass, or a tuple of them.
            If this string's exact class isn't one of them, the
            sequence is empty. The default is
            Tag.DEFAULT_INTERESTING_STRING_TYPES. None accepts every
            class.

        :yield: A sequence that either contains this string, or is empty.
        """
        wanted = types
        if wanted is self.default:
            # The default lives on Tag because it names subclasses of
            # this class, which aren't defined until later in the file.
            wanted = Tag.DEFAULT_INTERESTING_STRING_TYPES

        if wanted is not None:
            # Compare exact classes rather than using isinstance():
            # every string class derives from NavigableString, and a
            # caller asking for NavigableString probably doesn't want
            # comments and the like.
            own_type = type(self)
            if isinstance(wanted, type):
                acceptable = own_type is wanted
            else:
                acceptable = own_type in wanted
            if not acceptable:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text
    strings = property(_all_strings)

1058

class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """

    # Subclasses override these with the markup that surrounds the
    # string in rendered output.
    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into the
            Formatter, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called for its side effects only; the unformatted text
            # is what gets rendered.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX

1085

class CData(PreformattedString):
    """A CDATA block, rendered between '<![CDATA[' and ']]>'."""

    PREFIX = "<![CDATA["
    SUFFIX = "]]>"

1090

class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, rendered between '<?' and '>'."""

    PREFIX = "<?"
    SUFFIX = ">"

1096

class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, rendered between '<?' and '?>'."""

    PREFIX = "<?"
    SUFFIX = "?>"

1101

class Comment(PreformattedString):
    """An HTML or XML comment, rendered between '<!--' and '-->'."""

    PREFIX = "<!--"
    SUFFIX = "-->"

1106

1107

class Declaration(PreformattedString):
    """An XML declaration, rendered between '<?' and '?>'."""

    PREFIX = "<?"
    SUFFIX = "?>"

1112

1113

class Doctype(PreformattedString):
    """A document type declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A falsy name is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN', or None if there is none.
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
            or None if there is none.

        :return: An instance of the class this method was called on
            (a Doctype, or the subclass used for the call).
        """
        parts = [name or '']
        if pub_id is not None:
            parts.append(' PUBLIC "%s"' % pub_id)
            # After a public ID, the system ID follows without a keyword.
            if system_id is not None:
                parts.append(' "%s"' % system_id)
        elif system_id is not None:
            parts.append(' SYSTEM "%s"' % system_id)

        # Build through `cls` rather than the hard-coded Doctype so
        # that calling this alternate constructor on a subclass
        # returns an instance of that subclass.
        return cls(''.join(parts))

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

1141

1142

class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably
    CSS).

    Used to distinguish embedded stylesheets from textual content.
    """

1150

1151

class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    Used to distinguish executable code from textual content.
    """

1159

1160

class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    """

1168

1169

class RubyTextString(NavigableString):
    """A NavigableString representing the contents of the <rt> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

    Can be used to distinguish such strings from the strings they're
    annotating.
    """

1180

1181

class RubyParenthesisString(NavigableString):
    """A NavigableString representing the contents of the <rp> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
    """

1189

1190

1191class Tag(PageElement):

1192 """Represents an HTML or XML tag that is part of a parse tree, along

1193 with its attributes and contents.

1194

1195 When Beautiful Soup parses the markup penguin, it will

1196 create a Tag object representing the tag.

1197 """

1198

1199 def __init__(self, parser=None, builder=None, name=None, namespace=None,

1200 prefix=None, attrs=None, parent=None, previous=None,

1201 is_xml=None, sourceline=None, sourcepos=None,

1202 can_be_empty_element=None, cdata_list_attributes=None,

1203 preserve_whitespace_tags=None,

1204 interesting_string_types=None,

1205 namespaces=None

1206 ):

1207 """Basic constructor.

1208

1209 :param parser: A BeautifulSoup object.

1210 :param builder: A TreeBuilder.

1211 :param name: The name of the tag.

1212 :param namespace: The URI of this Tag's XML namespace, if any.

1213 :param prefix: The prefix for this Tag's XML namespace, if any.

1214 :param attrs: A dictionary of this Tag's attribute values.

1215 :param parent: The PageElement to use as this Tag's parent.

1216 :param previous: The PageElement that was parsed immediately before

1217 this tag.

1218 :param is_xml: If True, this is an XML tag. Otherwise, this is an

1219 HTML tag.

1220 :param sourceline: The line number where this tag was found in its

1221 source document.

1222 :param sourcepos: The character position within `sourceline` where this

1223 tag was found.

1224 :param can_be_empty_element: If True, this tag should be

1225 represented as <tag/>. If False, this tag should be represented

1226 as <tag></tag>.

1227 :param cdata_list_attributes: A list of attributes whose values should

1228 be treated as CDATA if they ever show up on this tag.

1229 :param preserve_whitespace_tags: A list of tag names whose contents

1230 should have their whitespace preserved.

1231 :param interesting_string_types: This is a NavigableString

1232 subclass or a tuple of them. When iterating over this

1233 Tag's strings in methods like Tag.strings or Tag.get_text,

1234 these are the types of strings that are interesting enough

1235 to be considered. The default is to consider

1236 NavigableString and CData the only interesting string

1237 subtypes.

1238 :param namespaces: A dictionary mapping currently active

1239 namespace prefixes to URIs. This can be used later to

1240 construct CSS selectors.

1241 """

1242 if parser is None:

1243 self.parser_class = None

1244 else:

1245 # We don't actually store the parser object: that lets extracted

1246 # chunks be garbage-collected.

1247 self.parser_class = parser.__class__

1248 if name is None:

1249 raise ValueError("No value provided for new tag's name.")

1250 self.name = name

1251 self.namespace = namespace

1252 self._namespaces = namespaces or {}

1253 self.prefix = prefix

1254 if ((not builder or builder.store_line_numbers)

1255 and (sourceline is not None or sourcepos is not None)):

1256 self.sourceline = sourceline

1257 self.sourcepos = sourcepos

1258 if attrs is None:

1259 attrs = {}

1260 elif attrs:

1261 if builder is not None and builder.cdata_list_attributes:

1262 attrs = builder._replace_cdata_list_attribute_values(

1263 self.name, attrs)

1264 else:

1265 attrs = dict(attrs)

1266 else:

1267 attrs = dict(attrs)

1268

1269 # If possible, determine ahead of time whether this tag is an

1270 # XML tag.

1271 if builder:

1272 self.known_xml = builder.is_xml

1273 else:

1274 self.known_xml = is_xml

1275 self.attrs = attrs

1276 self.contents = []

1277 self.setup(parent, previous)

1278 self.hidden = False

1279

1280 if builder is None:

1281 # In the absence of a TreeBuilder, use whatever values were

1282 # passed in here. They're probably None, unless this is a copy of some

1283 # other tag.

1284 self.can_be_empty_element = can_be_empty_element

1285 self.cdata_list_attributes = cdata_list_attributes

1286 self.preserve_whitespace_tags = preserve_whitespace_tags

1287 self.interesting_string_types = interesting_string_types

1288 else:

1289 # Set up any substitutions for this tag, such as the charset in a META tag.

1290 builder.set_up_substitutions(self)

1291

1292 # Ask the TreeBuilder whether this tag might be an empty-element tag.

1293 self.can_be_empty_element = builder.can_be_empty_element(name)

1294

1295 # Keep track of the list of attributes of this tag that

1296 # might need to be treated as a list.

1297 #

1298 # For performance reasons, we store the whole data structure

1299 # rather than asking the question of every tag. Asking would

1300 # require building a new data structure every time, and

1301 # (unlike can_be_empty_element), we almost never need

1302 # to check this.

1303 self.cdata_list_attributes = builder.cdata_list_attributes

1304

1305 # Keep track of the names that might cause this tag to be treated as a

1306 # whitespace-preserved tag.

1307 self.preserve_whitespace_tags = builder.preserve_whitespace_tags

1308

1309 if self.name in builder.string_containers:

1310 # This sort of tag uses a special string container

1311 # subclass for most of its strings. When we ask the

1312 self.interesting_string_types = builder.string_containers[self.name]

1313 else:

1314 self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES

1315

1316 parserClass = _alias("parser_class") # BS3

1317

def __deepcopy__(self, memo, recursive=True):
    """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :param memo: Passed through unchanged to the __deepcopy__ method of
        each descendant.
    :param recursive: If False, only this Tag is cloned and the clone
        has no contents.
    """
    clone = self._clone()
    if not recursive:
        return clone

    # Clone the descendants without making recursive function calls:
    # the top of this stack is always the clone that incoming copies
    # should be appended to.
    open_clones = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # The Tag that was just closed takes no more children.
            open_clones.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Add to its parent's .contents
        open_clones[-1].append(copied)

        if event is Tag.START_ELEMENT_EVENT:
            # The Tag's own children come next, so they must be
            # appended to its clone.
            open_clones.append(copied)
    return clone

1345

1346 def __copy__(self):

1347 """A copy of a Tag must always be a deep copy, because a Tag's

1348 children can only have one parent at a time.

1349 """

1350 return self.__deepcopy__({})

1351

1352 def _clone(self):

1353 """Create a new Tag just like this one, but with no

1354 contents and unattached to any parse tree.

1355

1356 This is the first step in the deepcopy process.

1357 """

1358 clone = type(self)(

1359 None, self.builder, self.name, self.namespace,

1360 self.prefix, self.attrs, is_xml=self._is_xml,

1361 sourceline=self.sourceline, sourcepos=self.sourcepos,

1362 can_be_empty_element=self.can_be_empty_element,

1363 cdata_list_attributes=self.cdata_list_attributes,

1364 preserve_whitespace_tags=self.preserve_whitespace_tags,

1365 interesting_string_types=self.interesting_string_types

1366 )

1367 for attr in ('can_be_empty_element', 'hidden'):

1368 setattr(clone, attr, getattr(self, attr))

1369 return clone

1370

@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the builder used to create the tag. If the
    builder has a designated list of empty-element tags, then only
    a tag whose name shows up in that list is considered an
    empty-element tag.

    If the builder has no designated list of empty-element tags,
    then any tag with no contents is an empty-element tag.

    :return: False if the tag has contents; otherwise the value of
        `can_be_empty_element`.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3

1388

@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: If this element has a single string child, return
        value is that string. If this element has one child tag,
        return value is the 'string' attribute of the child tag,
        recursively. If this element has no children, or has more
        than one child, return value is None.
    """
    if len(self.contents) != 1:
        return None
    only_child = self.contents[0]
    if not isinstance(only_child, NavigableString):
        # Defer to the single child, whatever it reports.
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    The existing children are removed first; a new object of the same
    class as `string` is then appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)

1415

DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
def _all_strings(self, strip=False, types=PageElement.default):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings left empty by that are skipped.

    :param types: A NavigableString subclass, or a tuple of them. Any
        string whose exact class is not among them will be ignored.
        By default, the classes considered are the ones found in
        self.interesting_string_types. None means every
        NavigableString, whatever its class.

    :yield: A sequence of strings.
    """
    wanted = self.interesting_string_types if types is self.default else types
    single_type = isinstance(wanted, type)

    for node in self.descendants:
        if wanted is None:
            # No restriction on class, but Tags are never strings.
            if not isinstance(node, NavigableString):
                continue
        elif single_type:
            if type(node) is not wanted:
                # We're not interested in strings of this type.
                continue
        elif type(node) not in wanted:
            # We're not interested in strings of this type.
            continue

        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
strings = property(_all_strings)

1454

1455 def decompose(self):

1456 """Recursively destroys this PageElement and its children.

1457

1458 This element will be removed from the tree and wiped out; so

1459 will everything beneath it.

1460

1461 The behavior of a decomposed PageElement is undefined and you

1462 should never use one for anything, but if you need to _check_

1463 whether an element has been decomposed, you can use the

1464 `decomposed` property.

1465 """

1466 self.extract()

1467 i = self

1468 while i is not None:

1469 n = i.next_element

1470 i.__dict__.clear()

1471 i.contents = []

1472 i._decomposed = True

1473 i = n

1474

1475 def clear(self, decompose=False):

1476 """Wipe out all children of this PageElement by calling extract()

1477 on them.

1478

1479 :param decompose: If this is True, decompose() (a more

1480 destructive method) will be called instead of extract().

1481 """

1482 if decompose:

1483 for element in self.contents[:]:

1484 if isinstance(element, Tag):

1485 element.decompose()

1486 else:

1487 element.extract()

1488 else:

1489 for element in self.contents[:]:

1490 element.extract()

1491

def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    def plain(element):
        # A string that may be merged: any NavigableString that is
        # not a PreformattedString.
        return (isinstance(element, NavigableString)
                and not isinstance(element, PreformattedString))

    # Record the first position of every pair of adjacent children
    # that needs consolidating. This avoids copying self.contents,
    # since in most cases very few strings are affected.
    merge_at = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbor to merge with.
            continue
        if plain(current) and plain(self.contents[position + 1]):
            merge_at.append(position)

    # Work from the end, so that removing items from .contents
    # doesn't shift the positions still to be processed.
    for position in reversed(merge_at):
        left = self.contents[position]
        right = self.contents[position + 1]
        right.extract()
        left.replace_with(NavigableString(left + right))

1529

def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of the first child that is `element`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position

1542

def get(self, key, default=None):
    """Look up the value of one of this tag's attributes.

    :param key: The attribute to look for.
    :param default: Returned if the tag doesn't have that attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)

1548

def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: A list of values, probably containing only a single
        value. A value that is already a list is returned as is.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]

1562

def has_attr(self, key):
    """Tell whether this PageElement has an attribute with the given
    name.

    :param key: The attribute name to look for in `self.attrs`.
    """
    attributes = self.attrs
    return key in attributes

1566

1567 def __hash__(self):

1568 return str(self).__hash__()

1569

1570 def __getitem__(self, key):

1571 """tag[key] returns the value of the 'key' attribute for the Tag,

1572 and throws an exception if it's not there."""

1573 return self.attrs[key]

1574

1575 def __iter__(self):

1576 "Iterating over a Tag iterates over its contents."

1577 return iter(self.contents)

1578

1579 def __len__(self):

1580 "The length of a Tag is the length of its list of contents."

1581 return len(self.contents)

1582

1583 def __contains__(self, x):

1584 return x in self.contents

1585

1586 def __bool__(self):

1587 "A tag is non-None even if it has no contents."

1588 return True

1589

1590 def __setitem__(self, key, value):

1591 """Setting tag[key] sets the value of the 'key' attribute for the

1592 tag."""

1593 self.attrs[key] = value

1594

1595 def __delitem__(self, key):

1596 "Deleting tag[key] deletes all 'key' attributes for the tag."

1597 self.attrs.pop(key, None)

1598

1599 def __call__(self, *args, **kwargs):

1600 """Calling a Tag like a function is the same as calling its

1601 find_all() method. Eg. tag('a') returns a list of all the A tags

1602 found within this tag."""

1603 return self.find_all(*args, **kwargs)

1604

1605 def __getattr__(self, tag):

1606 """Calling tag.subtag is the same as calling tag.find(name="subtag")"""

1607 #print("Getattr %s.%s" % (self.__class__, tag))

1608 if len(tag) > 3 and tag.endswith('Tag'):

1609 # BS3: soup.aTag -> "soup.find("a")

1610 tag_name = tag[:-3]

1611 warnings.warn(

1612 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(

1613 name=tag_name

1614 ),

1615 DeprecationWarning, stacklevel=2

1616 )

1617 return self.find(tag_name)

1618 # We special case contents to avoid recursion.

1619 elif not tag.startswith("__") and not tag == "contents":

1620 return self.find(tag)

1621 raise AttributeError(

1622 "'%s' object has no attribute '%s'" % (self.__class__, tag))

1623

1624 def __eq__(self, other):

1625 """Returns true iff this Tag has the same name, the same attributes,

1626 and the same contents (recursively) as `other`."""

1627 if self is other:

1628 return True

1629 if (not hasattr(other, 'name') or

1630 not hasattr(other, 'attrs') or

1631 not hasattr(other, 'contents') or

1632 self.name != other.name or

1633 self.attrs != other.attrs or

1634 len(self) != len(other)):

1635 return False

1636 for i, my_child in enumerate(self.contents):

1637 if my_child != other.contents[i]:

1638 return False

1639 return True

1640

1641 def __ne__(self, other):

1642 """Returns true iff this Tag is not identical to `other`,

1643 as defined in __eq__."""

1644 return not self == other

1645

1646 def __repr__(self, encoding="unicode-escape"):

1647 """Renders this PageElement as a string.

1648

1649 :param encoding: The encoding to use (Python 2 only).

1650 TODO: This is now ignored and a warning should be issued

1651 if a value is provided.

1652 :return: A (Unicode) string.

1653 """

1654 # "The return value must be a string object", i.e. Unicode

1655 return self.decode()

1656

1657 def __unicode__(self):

1658 """Renders this PageElement as a Unicode string."""

1659 return self.decode()

1660

1661 __str__ = __repr__ = __unicode__

1662

def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.
    """
    # Render to Unicode first (telling decode() which encoding the
    # text is headed for), then encode the Unicode.
    return self.decode(indent_level, encoding, formatter).encode(
        encoding, errors)

1688

def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal",
           iterator=None):
    """Render this element and its contents as a Unicode string.

    :param indent_level: If not None, the output is pretty-printed,
        starting at this many levels of indentation. True is
        treated as level 0.
    :param eventual_encoding: Passed along to `_format_tag` for every
        tag that is rendered.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param iterator: Passed to `_event_stream` as an alternate
        iterator to use when traversing the tree.
    :return: A (Unicode) string.
    """
    # Resolve a formatter name into a Formatter object once, up
    # front, so the lookup doesn't happen again for every element.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    if indent_level is True:
        indent_level = 0
    # Whether indent_level is None never changes below, so decide
    # once whether this is a pretty-printing run.
    pretty = indent_level is not None

    # The tag that put us into string literal mode, if any. Until
    # that tag is closed, children are treated as string literals
    # and not pretty-printed. The mode is turned on immediately
    # after the tag begins and turned off immediately before it's
    # closed, so there is whitespace before and after the tag
    # itself.
    literal_owner = None

    rendered = []
    for event, element in self._event_stream(iterator):
        starting = event is Tag.START_ELEMENT_EVENT
        ending = event is Tag.END_ELEMENT_EVENT

        if starting or event is Tag.EMPTY_ELEMENT_EVENT:
            text = element._format_tag(
                eventual_encoding, formatter, opening=True
            )
        elif ending:
            text = element._format_tag(
                eventual_encoding, formatter, opening=False
            )
            if pretty:
                indent_level -= 1
        else:
            text = element.output_ready(formatter)

        # Decide on the 'prettiness' -- extra whitespace before
        # and/or after this piece. Certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace
        # would change the meaning of the content.
        if (starting
            and not literal_owner
            and not element._should_pretty_print()):
            # Entering string literal mode: whitespace before this
            # tag, but not after. The mode lasts until this tag is
            # closed.
            pad_before, pad_after = True, False
            literal_owner = element
        elif ending and element is literal_owner:
            # Leaving string literal mode by closing the tag that
            # started it: whitespace after this tag, but not before.
            pad_before, pad_after = False, True
            literal_owner = None
        else:
            # Default: whitespace on both sides outside string
            # literal mode, none at all inside it.
            pad_before = pad_after = not literal_owner

        if pretty:
            if pad_before or pad_after:
                if isinstance(element, NavigableString):
                    text = text.strip()
                if text:
                    text = self._indent_string(
                        text, indent_level, formatter,
                        pad_before, pad_after
                    )
            if starting:
                indent_level += 1
        rendered.append(text)
    return "".join(rendered)

1776

# Unique sentinel objects, one per kind of event that _event_stream
# can yield.
START_ELEMENT_EVENT = object()
END_ELEMENT_EVENT = object()
EMPTY_ELEMENT_EVENT = object()
STRING_ELEMENT_EVENT = object()

def _event_stream(self, iterator=None):
    """Flatten the tree below this element into (event, element) pairs.

    Walking the pairs in order is enough to rebuild the nesting of
    the elements (for example when rendering them as a string), so
    callers do not need recursive method calls.

    The idea resembles the SAX API, but this is a simpler interface
    meant for internal use: the events are the *_ELEMENT_EVENT
    sentinels defined on Tag, and the payload of each event is a
    Tag or other Beautiful Soup object.

    :param iterator: An alternate iterator to use when traversing
        the tree. When omitted (or falsy), self.self_and_descendants
        is used.
    :yield: 2-tuples of (event sentinel, element).
    """
    # Tags that have been opened but not yet closed, innermost last.
    open_tags = []

    nodes = iterator or self.self_and_descendants

    for node in nodes:
        # Any open tag that is not this node's parent must have
        # ended before this node appeared, so close it first.
        while open_tags and node.parent != open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if not isinstance(node, Tag):
            yield Tag.STRING_ELEMENT_EVENT, node
        elif node.is_empty_element:
            yield Tag.EMPTY_ELEMENT_EVENT, node
        else:
            yield Tag.START_ELEMENT_EVENT, node
            open_tags.append(node)

    # The iterator is exhausted; close whatever is still open.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()

1824

1825 def _indent_string(self, s, indent_level, formatter,

1826 indent_before, indent_after):

1827 """Add indentation whitespace before and/or after a string.

1828

1829 :param s: The string to amend with whitespace.

1830 :param indent_level: The indentation level; affects how much

1831 whitespace goes before the string.

1832 :param indent_before: Whether or not to add whitespace

1833 before the string.

1834 :param indent_after: Whether or not to add whitespace

1835 (a newline) after the string.

1836 """

1837 space_before = ''

1838 if indent_before and indent_level:

1839 space_before = (formatter.indent * indent_level)

1840

1841 space_after = ''

1842 if indent_after:

1843 space_after = "\n"

1844

1845 return space_before + s + space_after

1846

1847 def _format_tag(self, eventual_encoding, formatter, opening):

1848 # A tag starts with the < character (see below).

1849

1850 # Then the / character, if this is a closing tag.

1851 closing_slash = ''

1852 if not opening:

1853 closing_slash = '/'

1854

1855 # Then an optional namespace prefix.

1856 prefix = ''

1857 if self.prefix:

1858 prefix = self.prefix + ":"

1859

1860 # Then a list of attribute values, if this is an opening tag.

1861 attribute_string = ''

1862 if opening:

1863 attributes = formatter.attributes(self)

1864 attrs = []

1865 for key, val in attributes:

1866 if val is None:

1867 decoded = key

1868 else:

1869 if isinstance(val, list) or isinstance(val, tuple):

1870 val = ' '.join(val)

1871 elif not isinstance(val, str):

1872 val = str(val)

1873 elif (

1874 isinstance(val, AttributeValueWithCharsetSubstitution)

1875 and eventual_encoding is not None

1876 ):

1877 val = val.encode(eventual_encoding)

1878

1879 text = formatter.attribute_value(val)

1880 decoded = (

1881 str(key) + '='

1882 + formatter.quoted_attribute_value(text))

1883 attrs.append(decoded)

1884 if attrs:

1885 attribute_string = ' ' + ' '.join(attrs)

1886

1887 # Then an optional closing slash (for a void element in an

1888 # XML document).

1889 void_element_closing_slash = ''

1890 if self.is_empty_element:

1891 void_element_closing_slash = formatter.void_element_close_prefix or ''

1892

1893 # Put it all together.

1894 return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'

1895

1896 def _should_pretty_print(self, indent_level=1):

1897 """Should this tag be pretty-printed?

1898

1899 Most of them should, but some (such as <pre> in HTML

1900 documents) should not.

1901 """

1902 return (

1903 indent_level is not None

1904 and (

1905 not self.preserve_whitespace_tags

1906 or self.name not in self.preserve_whitespace_tags

1907 )

1908 )

1909

def prettify(self, encoding=None, formatter="minimal"):
    """Render this PageElement in pretty-printed form.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The result of self.decode() when encoding is None (a
        Unicode string), otherwise the result of self.encode() (a
        bytestring).
    """
    # The literal True is handed over as the first rendering argument
    # after any encoding, which switches pretty-printing on.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)

1924

def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Render what is inside this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: Whatever self.decode() returns when it is driven by
        this tag's descendants.
    """
    # Hand decode() the descendants iterator so that the traversal
    # covers the elements beneath this tag.
    below = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=below
    )

1949

def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render what is inside this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param encoding: The bytestring will be in this encoding. The
        same value is passed to decode_contents() as the eventual
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    # Render to text first, then encode the text in one step.
    return self.decode_contents(
        indent_level, encoding, formatter
    ).encode(encoding)

1970

1971 # Old method for BS3 compatibility

def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Forwards to encode_contents(). `indentLevel` is only honoured
    when `prettyPrint` is true; otherwise None is passed as the
    indent level.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)

1979

1980 #Soup methods

1981

def find(self, name=None, attrs={}, recursive=True, string=None,
         **kwargs):
    """Return the first PageElement below this one that matches the
    given criteria, or None if nothing matches.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter passed through to find_all() unchanged.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Ask find_all() for at most one result. The _stacklevel value
    # is forwarded as-is alongside the caller's keyword filters.
    matches = self.find_all(
        name, attrs, recursive, string, 1, _stacklevel=3, **kwargs
    )
    return matches[0] if matches else None
findChild = find  # BS2

2007

def find_all(self, name=None, attrs={}, recursive=True, string=None,
             limit=None, **kwargs):
    """Collect every PageElement below this one that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter passed through to _find_all() unchanged.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values. A
        '_stacklevel' entry, if present, is removed and forwarded
        (incremented by one) instead of being treated as a filter.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # One more stack frame now sits between the original caller and
    # whatever uses the stack level, hence the increment below.
    depth = kwargs.pop('_stacklevel', 2)
    candidates = self.descendants if recursive else self.children
    return self._find_all(
        name, attrs, string, limit, candidates,
        _stacklevel=depth + 1, **kwargs
    )
findAll = find_all  # BS3
findChildren = find_all  # BS2

2034

2035 #Generator methods

@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :return: An iterator over self.contents.
    """
    # Wrapping the contents in iter() makes it clear that callers
    # get an iterator rather than the underlying container.
    direct_children = self.contents
    return iter(direct_children)

2044

@property
def self_and_descendants(self):
    """Iterate over this PageElement and then over everything that
    `descendants` yields, in that order.

    This element itself is left out when its `hidden` attribute is
    true.

    :yield: A sequence of PageElements.
    """
    if not self.hidden:
        yield self
    yield from self.descendants

2056

@property
def descendants(self):
    """Iterate over every element beneath this PageElement.

    The walk starts at the first entry of self.contents and follows
    `next_element` links until it reaches the element that comes
    right after this element's last descendant.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # Whatever follows our last descendant is outside this subtree,
    # so reaching it means the walk is finished.
    boundary = self._last_descendant().next_element
    node = self.contents[0]
    while node is not boundary:
        yield node
        node = node.next_element

2071

2072 # CSS selector code

def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This is a shortcut for self.css.select_one().

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
        soupsieve.select() method.

    :return: A Tag.
    :rtype: bs4.element.Tag
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

2090

def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library, and is a shortcut for
    self.css.select().

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

2112

@property
def css(self):
    """Return an interface to the CSS selector API.

    A new CSS object wrapping this element is built on every access.
    """
    selector_api = CSS(self)
    return selector_api

2117

2118 # Old names for backwards compatibility

def childGenerator(self):
    """Deprecated generator.

    Old name kept for backwards compatibility; returns the same
    thing as the `children` property.
    """
    direct_children = self.children
    return direct_children

2122

def recursiveChildGenerator(self):
    """Deprecated generator.

    Old name kept for backwards compatibility; returns the same
    thing as the `descendants` property.
    """
    everything_below = self.descendants
    return everything_below

2126

def has_key(self, key):
    """Deprecated method. This was kind of misleading because has_key()
    (attributes) was different from __contains__ (contents).

    has_key() is gone in Python 3, anyway.

    Issues a DeprecationWarning and then answers exactly as
    has_attr(key) does.

    :param key: The attribute name to look for.
    """
    message = 'has_key is deprecated. Use has_attr(key) instead.'
    # stacklevel=2 attributes the warning to the code calling has_key().
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return self.has_attr(key)

2138

2139# Next, a couple classes to represent queries and their results.

class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, string=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. A
            non-dict value is treated as a filter on the 'class'
            attribute. The dictionary passed in is never modified.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            deprecated 'text' keyword is accepted as an alias for
            `string`, and 'class_' as an alias for 'class'.
        """
        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
                DeprecationWarning, stacklevel=2
            )

        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs['class_']
            del kwargs['class_']

        if kwargs:
            if attrs:
                # Copy before merging so the caller's dictionary (or
                # the shared default) is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = {}
        for key, value in list(attrs.items()):
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.string = self._normalize_search_value(string)

        # DEPRECATED but just in case someone is checking this.
        self.text = self.string

    def _normalize_search_value(self, value):
        """Convert a search criterion into a form _matches() can use.

        :param value: A criterion as supplied by the user.
        :return: `value` itself if it is a string, a callable, an
            object with a `match` attribute (such as a compiled
            regular expression), a boolean or None; a decoded string
            for a bytestring; a list for any other iterable; and
            str(value) for everything else.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The string filter rendered as text if one is set,
            otherwise "<name>|<attrs>".
        """
        if self.string:
            # The filter may be a regular expression, a list, a
            # callable or True rather than a string, and __str__ must
            # always return a str, so convert it explicitly.
            return str(self.string)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or an
            existing Tag (in which case the Tag also serves as the
            source of attributes).
        :param markup_attrs: A dictionary of attributes as found in
            some markup, or an iterable of (key, value) pairs.

        :return: A true value if the prospective tag would match this
            SoupStrainer (the Tag, or the tag name, that was passed
            in); None or False otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        # A callable name filter is invoked with the raw name and
        # attributes only when we were not handed a real Tag.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # Build a lookup table from (key, value) pairs.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.string and not self._matches(found.string, self.string):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched.
        :raise Exception: If `markup` is neither iterable, a Tag, a
            NavigableString nor a string.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                        and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.string or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.string):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Check one piece of markup against one search criterion.

        :param markup: A Tag, a string, None, or a list/tuple of
            values (as for a multi-valued attribute like 'class').
        :param match_against: The criterion: True, a callable, a
            string, an object with a `search` method (such as a
            compiled regular expression), or an iterable of these.
        :param already_tried: The set of criteria already examined,
            used to avoid infinite recursion through nested iterables.
        :return: A true value if the markup satisfies the criterion,
            a false value otherwise.
        """
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                else:
                    already_tried.add(key)
                    if self._matches(original_markup, item, already_tried):
                        return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match

2412

2413

class ResultSet(list):
    """A plain list of results that also remembers the SoupStrainer
    which produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        This is only consulted for attributes that normal lookup
        could not find, so every call ends in an AttributeError.
        """
        explanation = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(explanation)