Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/soupsieve/css

1"""CSS matcher."""

2from __future__ import annotations

3from datetime import datetime

4from . import util

5import re

6from . import css_types as ct

7import unicodedata

8import bs4

9from typing import Iterator, Iterable, Any, Callable, Sequence, Any, cast # noqa: F401, F811

11# Empty tag pattern (whitespace okay)

12RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

14RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

16# Relationships

17REL_PARENT = ' '

18REL_CLOSE_PARENT = '>'

19REL_SIBLING = '~'

20REL_CLOSE_SIBLING = '+'

22# Relationships for :has() (forward looking)

23REL_HAS_PARENT = ': '

24REL_HAS_CLOSE_PARENT = ':>'

25REL_HAS_SIBLING = ':~'

26REL_HAS_CLOSE_SIBLING = ':+'

28NS_XHTML = 'http://www.w3.org/1999/xhtml'

29NS_XML = 'http://www.w3.org/XML/1998/namespace'

31DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL

32RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

34DIR_MAP = {

35 'ltr': ct.SEL_DIR_LTR,

36 'rtl': ct.SEL_DIR_RTL,

37 'auto': 0

38}

40RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")

41RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')

42RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')

43RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')

44RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')

45RE_DATETIME = re.compile(

46 r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'

47)

48RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

50MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November

51FEB = 2

52SHORT_MONTH = 30

53LONG_MONTH = 31

54FEB_MONTH = 28

55FEB_LEAP_MONTH = 29

56DAYS_IN_WEEK = 7

59class _FakeParent:

60 """

61 Fake parent class.

63 When we have a fragment with no `BeautifulSoup` document object,

64 we can't evaluate `nth` selectors properly. Create a temporary

65 fake parent so we can traverse the root element as a child.

66 """

68 def __init__(self, element: bs4.Tag) -> None:

69 """Initialize."""

71 self.contents = [element]

73 def __len__(self) -> int:

74 """Length."""

76 return len(self.contents)

79class _DocumentNav:

80 """Navigate a Beautiful Soup document."""

82 @classmethod

83 def assert_valid_input(cls, tag: Any) -> None:

84 """Check if valid input tag or document."""

86 # Fail on unexpected types.

87 if not cls.is_tag(tag):

88 raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}")

90 @staticmethod

91 def is_doc(obj: bs4.element.PageElement | None) -> bool:

92 """Is `BeautifulSoup` object."""

93 return isinstance(obj, bs4.BeautifulSoup)

95 @staticmethod

96 def is_tag(obj: bs4.element.PageElement | None) -> bool:

97 """Is tag."""

98 return isinstance(obj, bs4.Tag)

100 @staticmethod

101 def is_declaration(obj: bs4.element.PageElement | None) -> bool: # pragma: no cover

102 """Is declaration."""

103 return isinstance(obj, bs4.Declaration)

104

105 @staticmethod

106 def is_cdata(obj: bs4.element.PageElement | None) -> bool:

107 """Is CDATA."""

108 return isinstance(obj, bs4.CData)

109

110 @staticmethod

111 def is_processing_instruction(obj: bs4.element.PageElement | None) -> bool: # pragma: no cover

112 """Is processing instruction."""

113 return isinstance(obj, bs4.ProcessingInstruction)

114

115 @staticmethod

116 def is_navigable_string(obj: bs4.element.PageElement | None) -> bool:

117 """Is navigable string."""

118 return isinstance(obj, bs4.element.NavigableString)

119

120 @staticmethod

121 def is_special_string(obj: bs4.element.PageElement | None) -> bool:

122 """Is special string."""

123 return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

124

125 @classmethod

126 def is_content_string(cls, obj: bs4.element.PageElement | None) -> bool:

127 """Check if node is content string."""

128

129 return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

130

131 @staticmethod

132 def create_fake_parent(el: bs4.Tag) -> _FakeParent:

133 """Create fake parent for a given element."""

134

135 return _FakeParent(el)

136

137 @staticmethod

138 def is_xml_tree(el: bs4.Tag | None) -> bool:

139 """Check if element (or document) is from a XML tree."""

140

141 return el is not None and bool(el._is_xml)

142

143 def is_iframe(self, el: bs4.Tag | None) -> bool:

144 """Check if element is an `iframe`."""

145

146 if el is None: # pragma: no cover

147 return False

148

149 return bool(

150 ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and

151 self.is_html_tag(el) # type: ignore[attr-defined]

152 )

153

154 def is_root(self, el: bs4.Tag) -> bool:

155 """

156 Return whether element is a root element.

157

158 We check that the element is the root of the tree (which we have already pre-calculated),

159 and we check if it is the root element under an `iframe`.

160 """

161

162 root = self.root and self.root is el # type: ignore[attr-defined]

163 if not root:

164 parent = self.get_parent(el)

165 root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]

166 return root

167

168 def get_contents(self, el: bs4.Tag | None, no_iframe: bool = False) -> Iterator[bs4.element.PageElement]:

169 """Get contents or contents in reverse."""

170

171 if el is not None:

172 if not no_iframe or not self.is_iframe(el):

173 yield from el.contents

174

175 def get_tag_children(

176 self,

177 el: bs4.Tag | None,

178 start: int | None = None,

179 reverse: bool = False,

180 no_iframe: bool = False

181 ) -> Iterator[bs4.Tag]:

182 """Get tag children."""

183

184 return self.get_children(el, start, reverse, True, no_iframe) # type: ignore[return-value]

185

186 def get_children(

187 self,

188 el: bs4.Tag | None,

189 start: int | None = None,

190 reverse: bool = False,

191 tags: bool = False,

192 no_iframe: bool = False

193 ) -> Iterator[bs4.element.PageElement]:

194 """Get children."""

195

196 if el is not None and (not no_iframe or not self.is_iframe(el)):

197 last = len(el.contents) - 1

198 if start is None:

199 index = last if reverse else 0

200 else:

201 index = start

202 end = -1 if reverse else last + 1

203 incr = -1 if reverse else 1

204

205 if 0 <= index <= last:

206 while index != end:

207 node = el.contents[index]

208 index += incr

209 if not tags or self.is_tag(node):

210 yield node

211

212 def get_tag_descendants(

213 self,

214 el: bs4.Tag | None,

215 no_iframe: bool = False

216 ) -> Iterator[bs4.Tag]:

217 """Specifically get tag descendants."""

218

219 yield from self.get_descendants(el, tags=True, no_iframe=no_iframe) # type: ignore[misc]

220

221 def get_descendants(

222 self,

223 el: bs4.Tag | None,

224 tags: bool = False,

225 no_iframe: bool = False

226 ) -> Iterator[bs4.element.PageElement]:

227 """Get descendants."""

228

229 if el is not None and (not no_iframe or not self.is_iframe(el)):

230 next_good = None

231 for child in el.descendants:

232

233 if next_good is not None:

234 if child is not next_good:

235 continue

236 next_good = None

237

238 if isinstance(child, bs4.Tag):

239 if no_iframe and self.is_iframe(child):

240 if child.next_sibling is not None:

241 next_good = child.next_sibling

242 else:

243 last_child = child # type: bs4.element.PageElement

244 while isinstance(last_child, bs4.Tag) and last_child.contents:

245 last_child = last_child.contents[-1]

246 next_good = last_child.next_element

247 yield child

248 if next_good is None:

249 break

250 # Coverage isn't seeing this even though it's executed

251 continue # pragma: no cover

252 yield child

253

254 elif not tags:

255 yield child

256

257 def get_parent(self, el: bs4.Tag | None, no_iframe: bool = False) -> bs4.Tag | None:

258 """Get parent."""

259

260 parent = el.parent if el is not None else None

261 if no_iframe and parent is not None and self.is_iframe(parent):

262 parent = None

263 return parent

264

265 @staticmethod

266 def get_tag_name(el: bs4.Tag | None) -> str | None:

267 """Get tag."""

268

269 return el.name if el is not None else None

270

271 @staticmethod

272 def get_prefix_name(el: bs4.Tag) -> str | None:

273 """Get prefix."""

274

275 return el.prefix

276

277 @staticmethod

278 def get_uri(el: bs4.Tag | None) -> str | None:

279 """Get namespace `URI`."""

280

281 return el.namespace if el is not None else None

282

283 @classmethod

284 def get_next_tag(cls, el: bs4.Tag) -> bs4.Tag | None:

285 """Get next sibling tag."""

286

287 return cls.get_next(el, tags=True) # type: ignore[return-value]

288

289 @classmethod

290 def get_next(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:

291 """Get next sibling tag."""

292

293 sibling = el.next_sibling

294 while tags and not isinstance(sibling, bs4.Tag) and sibling is not None:

295 sibling = sibling.next_sibling

296

297 if tags and not isinstance(sibling, bs4.Tag):

298 sibling = None

299

300 return sibling

301

302 @classmethod

303 def get_previous_tag(cls, el: bs4.Tag, tags: bool = True) -> bs4.Tag | None:

304 """Get previous sibling tag."""

305

306 return cls.get_previous(el, True) # type: ignore[return-value]

307

308 @classmethod

309 def get_previous(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:

310 """Get previous sibling tag."""

311

312 sibling = el.previous_sibling

313 while tags and not isinstance(sibling, bs4.Tag) and sibling is not None:

314 sibling = sibling.previous_sibling

315

316 if tags and not isinstance(sibling, bs4.Tag):

317 sibling = None

318

319 return sibling

320

321 @staticmethod

322 def has_html_ns(el: bs4.Tag | None) -> bool:

323 """

324 Check if element has an HTML namespace.

325

326 This is a bit different than whether a element is treated as having an HTML namespace,

327 like we do in the case of `is_html_tag`.

328 """

329

330 ns = getattr(el, 'namespace') if el is not None else None # noqa: B009

331 return bool(ns and ns == NS_XHTML)

332

333 @staticmethod

334 def split_namespace(el: bs4.Tag | None, attr_name: str) -> tuple[str | None, str | None]:

335 """Return namespace and attribute name without the prefix."""

336

337 if el is None: # pragma: no cover

338 return None, None

339

340 return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

341

342 @classmethod

343 def normalize_value(cls, value: Any) -> str | Sequence[str]:

344 """Normalize the value to be a string or list of strings."""

345

346 # Treat `None` as empty string.

347 if value is None:

348 return ''

349

350 # Pass through strings

351 if (isinstance(value, str)):

352 return value

353

354 # If it's a byte string, convert it to Unicode, treating it as UTF-8.

355 if isinstance(value, bytes):

356 return value.decode("utf8")

357

358 # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.

359 if isinstance(value, Sequence):

360 new_value = []

361 for v in value:

362 if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):

363 # This is most certainly a user error and will crash and burn later.

364 # To keep things working, we'll do what we do with all objects,

365 # And convert them to strings.

366 new_value.append(str(v))

367 else:

368 # Convert the child to a string

369 new_value.append(cast(str, cls.normalize_value(v)))

370 return new_value

371

372 # Try and make anything else a string

373 return str(value)

374

375 @classmethod

376 def get_attribute_by_name(

377 cls,

378 el: bs4.Tag,

379 name: str,

380 default: str | Sequence[str] | None = None

381 ) -> str | Sequence[str] | None:

382 """Get attribute by name."""

383

384 value = default

385 if el._is_xml:

386 try:

387 value = cls.normalize_value(el.attrs[name])

388 except KeyError:

389 pass

390 else:

391 for k, v in el.attrs.items():

392 if util.lower(k) == name:

393 value = cls.normalize_value(v)

394 break

395 return value

396

397 @classmethod

398 def iter_attributes(cls, el: bs4.Tag | None) -> Iterator[tuple[str, str | Sequence[str] | None]]:

399 """Iterate attributes."""

400

401 if el is not None:

402 for k, v in el.attrs.items():

403 yield k, cls.normalize_value(v)

404

405 @classmethod

406 def get_classes(cls, el: bs4.Tag) -> Sequence[str]:

407 """Get classes."""

408

409 classes = cls.get_attribute_by_name(el, 'class', [])

410 if isinstance(classes, str):

411 classes = RE_NOT_WS.findall(classes)

412 return cast(Sequence[str], classes)

413

414 def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:

415 """Get text."""

416

417 return ''.join(

418 [

419 node for node in self.get_descendants(el, no_iframe=no_iframe) # type: ignore[misc]

420 if self.is_content_string(node)

421 ]

422 )

423

424 def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:

425 """Get Own Text."""

426

427 return [

428 node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node) # type: ignore[misc]

429 ]

430

431

432class Inputs:

433 """Class for parsing and validating input items."""

434

435 @staticmethod

436 def validate_day(year: int, month: int, day: int) -> bool:

437 """Validate day."""

438

439 max_days = LONG_MONTH

440 if month == FEB:

441 max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH

442 elif month in MONTHS_30:

443 max_days = SHORT_MONTH

444 return 1 <= day <= max_days

445

446 @staticmethod

447 def validate_week(year: int, week: int) -> bool:

448 """Validate week."""

449

450 max_week = datetime.strptime(f"{12}-{31}-{year}", "%m-%d-%Y").isocalendar()[1]

451 if max_week == 1:

452 max_week = 53

453 return 1 <= week <= max_week

454

455 @staticmethod

456 def validate_month(month: int) -> bool:

457 """Validate month."""

458

459 return 1 <= month <= 12

460

461 @staticmethod

462 def validate_year(year: int) -> bool:

463 """Validate year."""

464

465 return 1 <= year

466

467 @staticmethod

468 def validate_hour(hour: int) -> bool:

469 """Validate hour."""

470

471 return 0 <= hour <= 23

472

473 @staticmethod

474 def validate_minutes(minutes: int) -> bool:

475 """Validate minutes."""

476

477 return 0 <= minutes <= 59

478

479 @classmethod

480 def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:

481 """Parse the input value."""

482

483 parsed = None # type: tuple[float, ...] | None

484 if value is None:

485 return value

486 if itype == "date":

487 m = RE_DATE.match(value)

488 if m:

489 year = int(m.group('year'), 10)

490 month = int(m.group('month'), 10)

491 day = int(m.group('day'), 10)

492 if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):

493 parsed = (year, month, day)

494 elif itype == "month":

495 m = RE_MONTH.match(value)

496 if m:

497 year = int(m.group('year'), 10)

498 month = int(m.group('month'), 10)

499 if cls.validate_year(year) and cls.validate_month(month):

500 parsed = (year, month)

501 elif itype == "week":

502 m = RE_WEEK.match(value)

503 if m:

504 year = int(m.group('year'), 10)

505 week = int(m.group('week'), 10)

506 if cls.validate_year(year) and cls.validate_week(year, week):

507 parsed = (year, week)

508 elif itype == "time":

509 m = RE_TIME.match(value)

510 if m:

511 hour = int(m.group('hour'), 10)

512 minutes = int(m.group('minutes'), 10)

513 if cls.validate_hour(hour) and cls.validate_minutes(minutes):

514 parsed = (hour, minutes)

515 elif itype == "datetime-local":

516 m = RE_DATETIME.match(value)

517 if m:

518 year = int(m.group('year'), 10)

519 month = int(m.group('month'), 10)

520 day = int(m.group('day'), 10)

521 hour = int(m.group('hour'), 10)

522 minutes = int(m.group('minutes'), 10)

523 if (

524 cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and

525 cls.validate_hour(hour) and cls.validate_minutes(minutes)

526 ):

527 parsed = (year, month, day, hour, minutes)

528 elif itype in ("number", "range"):

529 m = RE_NUM.match(value)

530 if m:

531 parsed = (float(m.group('value')),)

532 return parsed

533

534

535class CSSMatch(_DocumentNav):

536 """Perform CSS matching."""

537

538 def __init__(

539 self,

540 selectors: ct.SelectorList,

541 scope: bs4.Tag | None,

542 namespaces: ct.Namespaces | None,

543 flags: int

544 ) -> None:

545 """Initialize."""

546

547 self.assert_valid_input(scope)

548 self.tag = scope

549 self.cached_meta_lang = [] # type: list[tuple[str, str]]

550 self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]

551 self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]

552 self.selectors = selectors

553 self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]

554 self.flags = flags

555 self.iframe_restrict = False

556

557 # Find the root element for the whole tree

558 doc = scope

559 parent = self.get_parent(doc)

560 while parent:

561 doc = parent

562 parent = self.get_parent(doc)

563 root = None # type: bs4.Tag | None

564 if not self.is_doc(doc):

565 root = doc

566 else:

567 for child in self.get_tag_children(doc):

568 root = child

569 break

570

571 self.root = root

572 self.scope = scope if scope is not doc else root

573 self.has_html_namespace = self.has_html_ns(root)

574

575 # A document can be both XML and HTML (XHTML)

576 self.is_xml = self.is_xml_tree(doc)

577 self.is_html = not self.is_xml or self.has_html_namespace

578

579 def supports_namespaces(self) -> bool:

580 """Check if namespaces are supported in the HTML type."""

581

582 return self.is_xml or self.has_html_namespace

583

584 def get_tag_ns(self, el: bs4.Tag | None) -> str:

585 """Get tag namespace."""

586

587 namespace = ''

588 if el is None: # pragma: no cover

589 return namespace

590

591 if self.supports_namespaces():

592 ns = self.get_uri(el)

593 if ns:

594 namespace = ns

595 else:

596 namespace = NS_XHTML

597 return namespace

598

599 def is_html_tag(self, el: bs4.Tag | None) -> bool:

600 """Check if tag is in HTML namespace."""

601

602 return self.get_tag_ns(el) == NS_XHTML

603

604 def get_tag(self, el: bs4.Tag | None) -> str | None:

605 """Get tag."""

606

607 name = self.get_tag_name(el)

608 return util.lower(name) if name is not None and not self.is_xml else name

609

610 def get_prefix(self, el: bs4.Tag) -> str | None:

611 """Get prefix."""

612

613 prefix = self.get_prefix_name(el)

614 return util.lower(prefix) if prefix is not None and not self.is_xml else prefix

615

616 def find_bidi(self, el: bs4.Tag) -> int | None:

617 """Get directionality from element text."""

618

619 for node in self.get_children(el):

620

621 # Analyze child text nodes

622 if self.is_tag(node):

623

624 # Avoid analyzing certain elements specified in the specification.

625 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) # type: ignore[arg-type]

626 name = self.get_tag(node) # type: ignore[arg-type]

627 if (

628 (name and name in ('bdi', 'script', 'style', 'textarea', 'iframe')) or

629 not self.is_html_tag(node) or # type: ignore[arg-type]

630 direction is not None

631 ):

632 continue # pragma: no cover

633

634 # Check directionality of this node's text

635 value = self.find_bidi(node) # type: ignore[arg-type]

636 if value is not None:

637 return value

638

639 # Direction could not be determined

640 continue # pragma: no cover

641

642 # Skip `doctype` comments, etc.

643 if self.is_special_string(node):

644 continue

645

646 # Analyze text nodes for directionality.

647 for c in node: # type: ignore[attr-defined]

648 bidi = unicodedata.bidirectional(c)

649 if bidi in ('AL', 'R', 'L'):

650 return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL

651 return None

652

653 def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:

654 """Filter the language tags."""

655

656 match = True

657 lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()

658 ranges = lang_range.split('-')

659 subtags = lang_tag.lower().split('-')

660 length = len(ranges)

661 slength = len(subtags)

662 rindex = 0

663 sindex = 0

664 r = ranges[rindex]

665 s = subtags[sindex]

666

667 # Empty specified language should match unspecified language attributes

668 if length == 1 and slength == 1 and not r and r == s:

669 return True

670

671 # Primary tag needs to match

672 if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):

673 match = False

674

675 rindex += 1

676 sindex += 1

677

678 # Match until we run out of ranges

679 while match and rindex < length:

680 r = ranges[rindex]

681 try:

682 s = subtags[sindex]

683 except IndexError:

684 # Ran out of subtags,

685 # but we still have ranges

686 match = False

687 continue

688

689 # Empty range

690 if not r:

691 match = False

692 continue

693

694 # Matched range

695 elif s == r:

696 rindex += 1

697

698 # Implicit wildcard cannot match

699 # singletons

700 elif len(s) == 1:

701 match = False

702 continue

703

704 # Implicitly matched, so grab next subtag

705 sindex += 1

706

707 return match

708

709 def match_attribute_name(

710 self,

711 el: bs4.Tag,

712 attr: str,

713 prefix: str | None

714 ) -> str | Sequence[str] | None:

715 """Match attribute name and return value if it exists."""

716

717 value = None

718 if self.supports_namespaces():

719 value = None

720 # If we have not defined namespaces, we can't very well find them, so don't bother trying.

721 if prefix:

722 ns = self.namespaces.get(prefix)

723 if ns is None and prefix != '*':

724 return None

725 else:

726 ns = None

727

728 for k, v in self.iter_attributes(el):

729

730 # Get attribute parts

731 namespace, name = self.split_namespace(el, k)

732

733 # Can't match a prefix attribute as we haven't specified one to match

734 # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.

735 if ns is None:

736 if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):

737 value = v

738 break

739 # Coverage is not finding this even though it is executed.

740 # Adding a print statement before this (and erasing coverage) causes coverage to find the line.

741 # Ignore the false positive message.

742 continue # pragma: no cover

743

744 # We can't match our desired prefix attribute as the attribute doesn't have a prefix

745 if namespace is None or (ns != namespace and prefix != '*'):

746 continue

747

748 # The attribute doesn't match.

749 if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):

750 continue

751

752 value = v

753 break

754 else:

755 for k, v in self.iter_attributes(el):

756 if util.lower(attr) != util.lower(k):

757 continue

758 value = v

759 break

760 return value

761

762 def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:

763 """Match the namespace of the element."""

764

765 match = True

766 namespace = self.get_tag_ns(el)

767 default_namespace = self.namespaces.get('')

768 tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)

769 # We must match the default namespace if one is not provided

770 if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):

771 match = False

772 # If we specified `|tag`, we must not have a namespace.

773 elif (tag.prefix is not None and tag.prefix == '' and namespace):

774 match = False

775 # Verify prefix matches

776 elif (

777 tag.prefix and

778 tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)

779 ):

780 match = False

781 return match

782

783 def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:

784 """Match attributes."""

785

786 match = True

787 if attributes:

788 for a in attributes:

789 temp = self.match_attribute_name(el, a.attribute, a.prefix)

790 pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern

791 if temp is None:

792 match = False

793 break

794 value = temp if isinstance(temp, str) else ' '.join(temp)

795 if pattern is None:

796 continue

797 elif pattern.match(value) is None:

798 match = False

799 break

800 return match

801

802 def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:

803 """Match tag name."""

804

805 name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)

806 return not (

807 name is not None and

808 name not in (self.get_tag(el), '*')

809 )

810

811 def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:

812 """Match the tag."""

813

814 match = True

815 if tag is not None:

816 # Verify namespace

817 if not self.match_namespace(el, tag):

818 match = False

819 if not self.match_tagname(el, tag):

820 match = False

821 return match

822

823 def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:

824 """Match past relationship."""

825

826 found = False

827 # I don't think this can ever happen, but it makes `mypy` happy

828 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover

829 return found

830

831 if relation[0].rel_type == REL_PARENT:

832 parent = self.get_parent(el, no_iframe=self.iframe_restrict)

833 while not found and parent:

834 found = self.match_selectors(parent, relation)

835 parent = self.get_parent(parent, no_iframe=self.iframe_restrict)

836 elif relation[0].rel_type == REL_CLOSE_PARENT:

837 parent = self.get_parent(el, no_iframe=self.iframe_restrict)

838 if parent:

839 found = self.match_selectors(parent, relation)

840 elif relation[0].rel_type == REL_SIBLING:

841 sibling = self.get_previous_tag(el)

842 while not found and sibling:

843 found = self.match_selectors(sibling, relation)

844 sibling = self.get_previous_tag(sibling)

845 elif relation[0].rel_type == REL_CLOSE_SIBLING:

846 sibling = self.get_previous_tag(el)

847 if sibling and self.is_tag(sibling):

848 found = self.match_selectors(sibling, relation)

849 return found

850

851 def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:

852 """Match future child."""

853

854 match = False

855 if recursive:

856 children = self.get_tag_descendants # type: Callable[..., Iterator[bs4.Tag]]

857 else:

858 children = self.get_tag_children

859 for child in children(parent, no_iframe=self.iframe_restrict):

860 match = self.match_selectors(child, relation)

861 if match:

862 break

863 return match

864

865 def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:

866 """Match future relationship."""

867

868 found = False

869 # I don't think this can ever happen, but it makes `mypy` happy

870 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover

871 return found

872

873 if relation[0].rel_type == REL_HAS_PARENT:

874 found = self.match_future_child(el, relation, True)

875 elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:

876 found = self.match_future_child(el, relation)

877 elif relation[0].rel_type == REL_HAS_SIBLING:

878 sibling = self.get_next_tag(el)

879 while not found and sibling:

880 found = self.match_selectors(sibling, relation)

881 sibling = self.get_next_tag(sibling)

882 elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:

883 sibling = self.get_next_tag(el)

884 if sibling and self.is_tag(sibling):

885 found = self.match_selectors(sibling, relation)

886 return found

887

888 def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:

889 """Match relationship to other elements."""

890

891 found = False

892

893 if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:

894 return found

895

896 if relation[0].rel_type.startswith(':'):

897 found = self.match_future_relations(el, relation)

898 else:

899 found = self.match_past_relations(el, relation)

900

901 return found

902

903 def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:

904 """Match element's ID."""

905

906 found = True

907 for i in ids:

908 if i != self.get_attribute_by_name(el, 'id', ''):

909 found = False

910 break

911 return found

912

913 def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:

914 """Match element's classes."""

915

916 current_classes = self.get_classes(el)

917 found = True

918 for c in classes:

919 if c not in current_classes:

920 found = False

921 break

922 return found

923

924 def match_root(self, el: bs4.Tag) -> bool:

925 """Match element as root."""

926

927 is_root = self.is_root(el)

928 if is_root:

929 sibling = self.get_previous(el) # type: Any

930 while is_root and sibling is not None:

931 if (

932 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or

933 self.is_cdata(sibling)

934 ):

935 is_root = False

936 else:

937 sibling = self.get_previous(sibling)

938 if is_root:

939 sibling = self.get_next(el)

940 while is_root and sibling is not None:

941 if (

942 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or

943 self.is_cdata(sibling)

944 ):

945 is_root = False

946 else:

947 sibling = self.get_next(sibling)

948 return is_root

949

950 def match_scope(self, el: bs4.Tag) -> bool:

951 """Match element as scope."""

952

953 return self.scope is el

954

955 def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:

956 """Match tag type for `nth` matches."""

957

958 return (

959 (self.get_tag(child) == self.get_tag(el)) and

960 (self.get_tag_ns(child) == self.get_tag_ns(el))

961 )

962

963 def match_nth(self, el: bs4.Tag, nth: tuple[ct.SelectorNth, ...]) -> bool:

964 """Match `nth` elements."""

965

966 matched = True

967

968 for n in nth:

969 matched = False

970 if n.selectors and not self.match_selectors(el, n.selectors):

971 break

972 parent = self.get_parent(el) # type: bs4.Tag | None

973 if parent is None:

974 parent = cast('bs4.Tag', self.create_fake_parent(el))

975 last = n.last

976 last_index = len(parent) - 1

977 index = last_index if last else 0

978 relative_index = 0

979 a = n.a

980 b = n.b

981 var = n.n

982 count = 0

983 count_incr = 1

984 factor = -1 if last else 1

985 idx = last_idx = a * count + b if var else a

986

987 # We can only adjust bounds within a variable index

988 if var:

989 # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.

990 # Otherwise, increment to try to get in bounds.

991 adjust = None

992 while idx < 1 or idx > last_index:

993 if idx < 0:

994 diff_low = 0 - idx

995 if adjust is not None and adjust == 1:

996 break

997 adjust = -1

998 count += count_incr

999 idx = last_idx = a * count + b if var else a

1000 diff = 0 - idx

1001 if diff >= diff_low:

1002 break

1003 else:

1004 diff_high = idx - last_index

1005 if adjust is not None and adjust == -1:

1006 break

1007 adjust = 1

1008 count += count_incr

1009 idx = last_idx = a * count + b if var else a

1010 diff = idx - last_index

1011 if diff >= diff_high:

1012 break

1013 diff_high = diff

1014

1015 # If a < 0, our count is working backwards, so floor the index by increasing the count.

1016 # Find the count that yields the lowest, in bound value and use that.

1017 # Lastly reverse count increment so that we'll increase our index.

1018 lowest = count

1019 if a < 0:

1020 while idx >= 1:

1021 lowest = count

1022 count += count_incr

1023 idx = last_idx = a * count + b if var else a

1024 count_incr = -1

1025 count = lowest

1026 idx = last_idx = a * count + b if var else a

1027

1028 # Evaluate elements while our calculated nth index is still in range

1029 while 1 <= idx <= last_index + 1:

1030 child = None # type: bs4.element.PageElement | None

1031 # Evaluate while our child index is still in range.

1032 for child in self.get_children(parent, start=index, reverse=factor < 0):

1033 index += factor

1034 if not isinstance(child, bs4.Tag):

1035 continue

1036 # Handle `of S` in `nth-child`

1037 if n.selectors and not self.match_selectors(child, n.selectors):

1038 continue

1039 # Handle `of-type`

1040 if n.of_type and not self.match_nth_tag_type(el, child):

1041 continue

1042 relative_index += 1

1043 if relative_index == idx:

1044 if child is el:

1045 matched = True

1046 else:

1047 break

1048 if child is el:

1049 break

1050 if child is el:

1051 break

1052 last_idx = idx

1053 count += count_incr

1054 if count < 0:

1055 # Count is counting down and has now ventured into invalid territory.

1056 break

1057 idx = a * count + b if var else a

1058 if last_idx == idx:

1059 break

1060 if not matched:

1061 break

1062 return matched

1063

1064 def match_empty(self, el: bs4.Tag) -> bool:

1065 """Check if element is empty (if requested)."""

1066

1067 is_empty = True

1068 for child in self.get_children(el):

1069 if self.is_tag(child):

1070 is_empty = False

1071 break

1072 elif self.is_content_string(child) and RE_NOT_EMPTY.search(child): # type: ignore[call-overload]

1073 is_empty = False

1074 break

1075 return is_empty

1076

1077 def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:

1078 """Match selectors."""

1079

1080 match = True

1081 for sel in selectors:

1082 if not self.match_selectors(el, sel):

1083 match = False

1084 return match

1085

1086 def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:

1087 """Match element if it contains text."""

1088

1089 match = True

1090 content = None # type: str | Sequence[str] | None

1091 for contain_list in contains:

1092 if content is None:

1093 if contain_list.own:

1094 content = self.get_own_text(el, no_iframe=self.is_html)

1095 else:

1096 content = self.get_text(el, no_iframe=self.is_html)

1097 found = False

1098 for text in contain_list.text:

1099 if contain_list.own:

1100 for c in content:

1101 if text in c:

1102 found = True

1103 break

1104 if found:

1105 break

1106 else:

1107 if text in content:

1108 found = True

1109 break

1110 if not found:

1111 match = False

1112 return match

1113

1114 def match_default(self, el: bs4.Tag) -> bool:

1115 """Match default."""

1116

1117 match = False

1118

1119 # Find this input's form

1120 form = None # type: bs4.Tag | None

1121 parent = self.get_parent(el, no_iframe=True)

1122 while parent and form is None:

1123 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):

1124 form = parent

1125 else:

1126 parent = self.get_parent(parent, no_iframe=True)

1127

1128 if form is not None:

1129 # Look in form cache to see if we've already located its default button

1130 found_form = False

1131 for f, t in self.cached_default_forms:

1132 if f is form:

1133 found_form = True

1134 if t is el:

1135 match = True

1136 break

1137

1138 # We didn't have the form cached, so look for its default button

1139 if not found_form:

1140 for child in self.get_tag_descendants(form, no_iframe=True):

1141 name = self.get_tag(child)

1142 # Can't do nested forms (haven't figured out why we never hit this)

1143 if name == 'form': # pragma: no cover

1144 break

1145 if name in ('input', 'button'):

1146 v = self.get_attribute_by_name(child, 'type', '')

1147 if v and util.lower(v) == 'submit':

1148 self.cached_default_forms.append((form, child))

1149 if el is child:

1150 match = True

1151 break

1152 return match

1153

1154 def match_indeterminate(self, el: bs4.Tag) -> bool:

1155 """Match default."""

1156

1157 match = False

1158 name = cast(str, self.get_attribute_by_name(el, 'name'))

1159

1160 def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:

1161 """Find this input's form."""

1162 form = None

1163 parent = self.get_parent(el, no_iframe=True)

1164 while form is None:

1165 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):

1166 form = parent

1167 break

1168 last_parent = parent

1169 parent = self.get_parent(parent, no_iframe=True)

1170 if parent is None:

1171 form = last_parent

1172 break

1173 return form

1174

1175 form = get_parent_form(el)

1176

1177 # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate

1178 if form is not None:

1179 found_form = False

1180 for f, n, i in self.cached_indeterminate_forms:

1181 if f is form and n == name:

1182 found_form = True

1183 if i is True:

1184 match = True

1185 break

1186

1187 # We didn't have the form cached, so validate that the radio button is indeterminate

1188 if not found_form:

1189 checked = False

1190 for child in self.get_tag_descendants(form, no_iframe=True):

1191 if child is el:

1192 continue

1193 tag_name = self.get_tag(child)

1194 if tag_name == 'input':

1195 is_radio = False

1196 check = False

1197 has_name = False

1198 for k, v in self.iter_attributes(child):

1199 if util.lower(k) == 'type' and util.lower(v) == 'radio':

1200 is_radio = True

1201 elif util.lower(k) == 'name' and v == name:

1202 has_name = True

1203 elif util.lower(k) == 'checked':

1204 check = True

1205 if is_radio and check and has_name and get_parent_form(child) is form:

1206 checked = True

1207 break

1208 if checked:

1209 break

1210 if not checked:

1211 match = True

1212 self.cached_indeterminate_forms.append((form, name, match))

1213

1214 return match

1215

1216 def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:

1217 """Match languages."""

1218

1219 match = False

1220 has_ns = self.supports_namespaces()

1221 root = self.root

1222 has_html_namespace = self.has_html_namespace

1223

1224 # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.

1225 parent = el # type: bs4.Tag | None

1226 found_lang = None

1227 last = None

1228 while not found_lang:

1229 has_html_ns = self.has_html_ns(parent)

1230 for k, v in self.iter_attributes(parent):

1231 attr_ns, attr = self.split_namespace(parent, k)

1232 if (

1233 ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or

1234 (

1235 has_ns and not has_html_ns and attr_ns == NS_XML and

1236 (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'

1237 )

1238 ):

1239 found_lang = v

1240 break

1241 last = parent

1242 parent = self.get_parent(parent, no_iframe=self.is_html)

1243

1244 if parent is None:

1245 root = last

1246 has_html_namespace = self.has_html_ns(root)

1247 parent = last

1248 break

1249

1250 # Use cached meta language.

1251 if found_lang is None and self.cached_meta_lang:

1252 for cache in self.cached_meta_lang:

1253 if root is cache[0]:

1254 found_lang = cache[1]

1255

1256 # If we couldn't find a language, and the document is HTML, look to meta to determine language.

1257 if found_lang is None and (not self.is_xml or (has_html_namespace and root and root.name == 'html')):

1258 # Find head

1259 found = False

1260 for tag in ('html', 'head'):

1261 found = False

1262 for child in self.get_tag_children(parent, no_iframe=self.is_html):

1263 if self.get_tag(child) == tag and self.is_html_tag(child):

1264 found = True

1265 parent = child

1266 break

1267 if not found: # pragma: no cover

1268 break

1269

1270 # Search meta tags

1271 if found and parent is not None:

1272 for child2 in parent:

1273 if isinstance(child2, bs4.Tag) and self.get_tag(child2) == 'meta' and self.is_html_tag(parent):

1274 c_lang = False

1275 content = None

1276 for k, v in self.iter_attributes(child2):

1277 if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':

1278 c_lang = True

1279 if util.lower(k) == 'content':

1280 content = v

1281 if c_lang and content:

1282 found_lang = content

1283 self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))

1284 break

1285 if found_lang is not None:

1286 break

1287 if found_lang is None:

1288 self.cached_meta_lang.append((cast(str, root), ''))

1289

1290 # If we determined a language, compare.

1291 if found_lang is not None:

1292 for patterns in langs:

1293 match = False

1294 for pattern in patterns:

1295 if self.extended_language_filter(pattern, cast(str, found_lang)):

1296 match = True

1297 if not match:

1298 break

1299

1300 return match

1301

1302 def match_dir(self, el: bs4.Tag | None, directionality: int) -> bool:

1303 """Check directionality."""

1304

1305 # If we have to match both left and right, we can't match either.

1306 if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:

1307 return False

1308

1309 if el is None or not self.is_html_tag(el):

1310 return False

1311

1312 # Element has defined direction of left to right or right to left

1313 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)

1314 if direction not in (None, 0):

1315 return direction == directionality

1316

1317 # Element is the document element (the root) and no direction assigned, assume left to right.

1318 is_root = self.is_root(el)

1319 if is_root and direction is None:

1320 return ct.SEL_DIR_LTR == directionality

1321

1322 # If `input[type=telephone]` and no direction is assigned, assume left to right.

1323 name = self.get_tag(el)

1324 is_input = name == 'input'

1325 is_textarea = name == 'textarea'

1326 is_bdi = name == 'bdi'

1327 itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''

1328 if is_input and itype == 'tel' and direction is None:

1329 return ct.SEL_DIR_LTR == directionality

1330

1331 # Auto handling for text inputs

1332 if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:

1333 if is_textarea:

1334 value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)) # type: ignore[misc]

1335 else:

1336 value = cast(str, self.get_attribute_by_name(el, 'value', ''))

1337 if value:

1338 for c in value:

1339 bidi = unicodedata.bidirectional(c)

1340 if bidi in ('AL', 'R', 'L'):

1341 direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL

1342 return direction == directionality

1343 # Assume left to right

1344 return ct.SEL_DIR_LTR == directionality

1345 elif is_root:

1346 return ct.SEL_DIR_LTR == directionality

1347 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

1348

1349 # Auto handling for `bdi` and other non text inputs.

1350 if (is_bdi and direction is None) or direction == 0:

1351 direction = self.find_bidi(el)

1352 if direction is not None:

1353 return direction == directionality

1354 elif is_root:

1355 return ct.SEL_DIR_LTR == directionality

1356 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

1357

1358 # Match parents direction

1359 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

1360

1361 def match_range(self, el: bs4.Tag, condition: int) -> bool:

1362 """

1363 Match range.

1364

1365 Behavior is modeled after what we see in browsers. Browsers seem to evaluate

1366 if the value is out of range, and if not, it is in range. So a missing value

1367 will not evaluate out of range; therefore, value is in range. Personally, I

1368 feel like this should evaluate as neither in or out of range.

1369 """

1370

1371 out_of_range = False

1372

1373 itype = util.lower(self.get_attribute_by_name(el, 'type'))

1374 mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))

1375 mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))

1376

1377 # There is no valid min or max, so we cannot evaluate a range

1378 if mn is None and mx is None:

1379 return False

1380

1381 value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))

1382 if value is not None:

1383 if itype in ("date", "datetime-local", "month", "week", "number", "range"):

1384 if mn is not None and value < mn:

1385 out_of_range = True

1386 if not out_of_range and mx is not None and value > mx:

1387 out_of_range = True

1388 elif itype == "time":

1389 if mn is not None and mx is not None and mn > mx:

1390 # Time is periodic, so this is a reversed/discontinuous range

1391 if value < mn and value > mx:

1392 out_of_range = True

1393 else:

1394 if mn is not None and value < mn:

1395 out_of_range = True

1396 if not out_of_range and mx is not None and value > mx:

1397 out_of_range = True

1398

1399 return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range

1400

1401 def match_defined(self, el: bs4.Tag) -> bool:

1402 """

1403 Match defined.

1404

1405 `:defined` is related to custom elements in a browser.

1406

1407 - If the document is XML (not XHTML), all tags will match.

1408 - Tags that are not custom (don't have a hyphen) are marked defined.

1409 - If the tag has a prefix (without or without a namespace), it will not match.

1410

1411 This is of course requires the parser to provide us with the proper prefix and namespace info,

1412 if it doesn't, there is nothing we can do.

1413 """

1414

1415 name = self.get_tag(el)

1416 return (

1417 name is not None and (

1418 name.find('-') == -1 or

1419 name.find(':') != -1 or

1420 self.get_prefix(el) is not None

1421 )

1422 )

1423

1424 def match_placeholder_shown(self, el: bs4.Tag) -> bool:

1425 """

1426 Match placeholder shown according to HTML spec.

1427

1428 - text area should be checked if they have content. A single newline does not count as content.

1429

1430 """

1431

1432 match = False

1433 content = self.get_text(el)

1434 if content in ('', '\n'):

1435 match = True

1436

1437 return match

1438

1439 def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:

1440 """Check if element matches one of the selectors."""

1441

1442 match = False

1443 is_not = selectors.is_not

1444 is_html = selectors.is_html

1445

1446 # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.

1447 if is_html:

1448 namespaces = self.namespaces

1449 iframe_restrict = self.iframe_restrict

1450 self.namespaces = {'html': NS_XHTML}

1451 self.iframe_restrict = True

1452

1453 if not is_html or self.is_html:

1454 for selector in selectors:

1455 match = is_not

1456 # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)

1457 if isinstance(selector, ct.SelectorNull):

1458 continue

1459 # Verify tag matches

1460 if not self.match_tag(el, selector.tag):

1461 continue

1462 # Verify tag is defined

1463 if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):

1464 continue

1465 # Verify element is root

1466 if selector.flags & ct.SEL_ROOT and not self.match_root(el):

1467 continue

1468 # Verify element is scope

1469 if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):

1470 continue

1471 # Verify element has placeholder shown

1472 if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):

1473 continue

1474 # Verify `nth` matches

1475 if not self.match_nth(el, selector.nth):

1476 continue

1477 if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):

1478 continue

1479 # Verify id matches

1480 if selector.ids and not self.match_id(el, selector.ids):

1481 continue

1482 # Verify classes match

1483 if selector.classes and not self.match_classes(el, selector.classes):

1484 continue

1485 # Verify attribute(s) match

1486 if not self.match_attributes(el, selector.attributes):

1487 continue

1488 # Verify ranges

1489 if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):

1490 continue

1491 # Verify language patterns

1492 if selector.lang and not self.match_lang(el, selector.lang):

1493 continue

1494 # Verify pseudo selector patterns

1495 if selector.selectors and not self.match_subselectors(el, selector.selectors):

1496 continue

1497 # Verify relationship selectors

1498 if selector.relation and not self.match_relations(el, selector.relation):

1499 continue

1500 # Validate that the current default selector match corresponds to the first submit button in the form

1501 if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):

1502 continue

1503 # Validate that the unset radio button is among radio buttons with the same name in a form that are

1504 # also not set.

1505 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):

1506 continue

1507 # Validate element directionality

1508 if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):

1509 continue

1510 # Validate that the tag contains the specified text.

1511 if selector.contains and not self.match_contains(el, selector.contains):

1512 continue

1513 match = not is_not

1514 break

1515

1516 # Restore actual namespaces being used for external selector lists

1517 if is_html:

1518 self.namespaces = namespaces

1519 self.iframe_restrict = iframe_restrict

1520

1521 return match

1522

1523 def select(self, limit: int = 0) -> Iterator[bs4.Tag]:

1524 """Match all tags under the targeted tag."""

1525

1526 lim = None if limit < 1 else limit

1527

1528 for child in self.get_tag_descendants(self.tag):

1529 if self.match(child):

1530 yield child

1531 if lim is not None:

1532 lim -= 1

1533 if lim < 1:

1534 break

1535

1536 def closest(self) -> bs4.Tag | None:

1537 """Match closest ancestor."""

1538

1539 current = self.tag # type: bs4.Tag | None

1540 closest = None

1541 while closest is None and current is not None:

1542 if self.match(current):

1543 closest = current

1544 else:

1545 current = self.get_parent(current)

1546 return closest

1547

1548 def filter(self) -> list[bs4.Tag]: # noqa A001

1549 """Filter tag's children."""

1550

1551 return [

1552 tag for tag in self.get_contents(self.tag)

1553 if isinstance(tag, bs4.Tag) and self.match(tag)

1554 ]

1555

1556 def match(self, el: bs4.Tag) -> bool:

1557 """Match."""

1558

1559 return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)

1560

1561

1562class SoupSieve(ct.Immutable):

1563 """Compiled Soup Sieve selector matching object."""

1564

1565 pattern: str

1566 selectors: ct.SelectorList

1567 namespaces: ct.Namespaces | None

1568 custom: dict[str, str]

1569 flags: int

1570

1571 __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

1572

1573 def __init__(

1574 self,

1575 pattern: str,

1576 selectors: ct.SelectorList,

1577 namespaces: ct.Namespaces | None,

1578 custom: ct.CustomSelectors | None,

1579 flags: int

1580 ):

1581 """Initialize."""

1582

1583 super().__init__(

1584 pattern=pattern,

1585 selectors=selectors,

1586 namespaces=namespaces,

1587 custom=custom,

1588 flags=flags

1589 )

1590

1591 def match(self, tag: bs4.Tag) -> bool:

1592 """Match."""

1593

1594 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)

1595

1596 def closest(self, tag: bs4.Tag) -> bs4.Tag | None:

1597 """Match closest ancestor."""

1598

1599 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()

1600

1601 def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001

1602 """

1603 Filter.

1604

1605 `CSSMatch` can cache certain searches for tags of the same document,

1606 so if we are given a tag, all tags are from the same document,

1607 and we can take advantage of the optimization.

1608

1609 Any other kind of iterable could have tags from different documents or detached tags,

1610 so for those, we use a new `CSSMatch` for each item in the iterable.

1611 """

1612

1613 if isinstance(iterable, bs4.Tag):

1614 return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()

1615 else:

1616 return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]

1617

1618 def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:

1619 """Select a single tag."""

1620

1621 tags = self.select(tag, limit=1)

1622 return tags[0] if tags else None

1623

1624 def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:

1625 """Select the specified tags."""

1626

1627 return list(self.iselect(tag, limit))

1628

1629 def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:

1630 """Iterate the specified tags."""

1631

1632 yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)

1633

1634 def __repr__(self) -> str: # pragma: no cover

1635 """Representation."""

1636

1637 return (

1638 f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "

1639 f"custom={self.custom!r}, flags={self.flags!r})"

1640 )

1641

1642 __str__ = __repr__

1643

1644

1645ct.pickle_register(SoupSieve)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/soupsieve/css_match.py: 21%

974 statements