Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/soupsieve/css_match.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

953 statements  

1"""CSS matcher.""" 

2from __future__ import annotations 

3from datetime import datetime 

4from . import util 

5import re 

6from . import css_types as ct 

7import unicodedata 

8import bs4 # type: ignore[import-untyped] 

9from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401 

10 

11# Empty tag pattern (whitespace okay) 

12RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') 

13 

14RE_NOT_WS = re.compile('[^ \t\r\n\f]+') 

15 

16# Relationships 

17REL_PARENT = ' ' 

18REL_CLOSE_PARENT = '>' 

19REL_SIBLING = '~' 

20REL_CLOSE_SIBLING = '+' 

21 

22# Relationships for :has() (forward looking) 

23REL_HAS_PARENT = ': ' 

24REL_HAS_CLOSE_PARENT = ':>' 

25REL_HAS_SIBLING = ':~' 

26REL_HAS_CLOSE_SIBLING = ':+' 

27 

28NS_XHTML = 'http://www.w3.org/1999/xhtml' 

29NS_XML = 'http://www.w3.org/XML/1998/namespace' 

30 

31DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL 

32RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE 

33 

34DIR_MAP = { 

35 'ltr': ct.SEL_DIR_LTR, 

36 'rtl': ct.SEL_DIR_RTL, 

37 'auto': 0 

38} 

39 

40RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$") 

41RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$') 

42RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$') 

43RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$') 

44RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$') 

45RE_DATETIME = re.compile( 

46 r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' 

47) 

48RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)') 

49 

50MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November 

51FEB = 2 

52SHORT_MONTH = 30 

53LONG_MONTH = 31 

54FEB_MONTH = 28 

55FEB_LEAP_MONTH = 29 

56DAYS_IN_WEEK = 7 

57 

58 

59class _FakeParent: 

60 """ 

61 Fake parent class. 

62 

63 When we have a fragment with no `BeautifulSoup` document object, 

64 we can't evaluate `nth` selectors properly. Create a temporary 

65 fake parent so we can traverse the root element as a child. 

66 """ 

67 

68 def __init__(self, element: bs4.Tag) -> None: 

69 """Initialize.""" 

70 

71 self.contents = [element] 

72 

73 def __len__(self) -> bs4.PageElement: 

74 """Length.""" 

75 

76 return len(self.contents) 

77 

78 

79class _DocumentNav: 

80 """Navigate a Beautiful Soup document.""" 

81 

82 @classmethod 

83 def assert_valid_input(cls, tag: Any) -> None: 

84 """Check if valid input tag or document.""" 

85 

86 # Fail on unexpected types. 

87 if not cls.is_tag(tag): 

88 raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}") 

89 

90 @staticmethod 

91 def is_doc(obj: bs4.Tag) -> bool: 

92 """Is `BeautifulSoup` object.""" 

93 return isinstance(obj, bs4.BeautifulSoup) 

94 

95 @staticmethod 

96 def is_tag(obj: bs4.PageElement) -> bool: 

97 """Is tag.""" 

98 return isinstance(obj, bs4.Tag) 

99 

100 @staticmethod 

101 def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover 

102 """Is declaration.""" 

103 return isinstance(obj, bs4.Declaration) 

104 

105 @staticmethod 

106 def is_cdata(obj: bs4.PageElement) -> bool: 

107 """Is CDATA.""" 

108 return isinstance(obj, bs4.CData) 

109 

110 @staticmethod 

111 def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover 

112 """Is processing instruction.""" 

113 return isinstance(obj, bs4.ProcessingInstruction) 

114 

115 @staticmethod 

116 def is_navigable_string(obj: bs4.PageElement) -> bool: 

117 """Is navigable string.""" 

118 return isinstance(obj, bs4.NavigableString) 

119 

120 @staticmethod 

121 def is_special_string(obj: bs4.PageElement) -> bool: 

122 """Is special string.""" 

123 return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 

124 

125 @classmethod 

126 def is_content_string(cls, obj: bs4.PageElement) -> bool: 

127 """Check if node is content string.""" 

128 

129 return cls.is_navigable_string(obj) and not cls.is_special_string(obj) 

130 

131 @staticmethod 

132 def create_fake_parent(el: bs4.Tag) -> _FakeParent: 

133 """Create fake parent for a given element.""" 

134 

135 return _FakeParent(el) 

136 

137 @staticmethod 

138 def is_xml_tree(el: bs4.Tag) -> bool: 

139 """Check if element (or document) is from a XML tree.""" 

140 

141 return bool(el._is_xml) 

142 

143 def is_iframe(self, el: bs4.Tag) -> bool: 

144 """Check if element is an `iframe`.""" 

145 

146 return bool( 

147 ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and 

148 self.is_html_tag(el) # type: ignore[attr-defined] 

149 ) 

150 

151 def is_root(self, el: bs4.Tag) -> bool: 

152 """ 

153 Return whether element is a root element. 

154 

155 We check that the element is the root of the tree (which we have already pre-calculated), 

156 and we check if it is the root element under an `iframe`. 

157 """ 

158 

159 root = self.root and self.root is el # type: ignore[attr-defined] 

160 if not root: 

161 parent = self.get_parent(el) 

162 root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined] 

163 return root 

164 

165 def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]: 

166 """Get contents or contents in reverse.""" 

167 if not no_iframe or not self.is_iframe(el): 

168 yield from el.contents 

169 

170 def get_children( 

171 self, 

172 el: bs4.Tag, 

173 start: int | None = None, 

174 reverse: bool = False, 

175 tags: bool = True, 

176 no_iframe: bool = False 

177 ) -> Iterator[bs4.PageElement]: 

178 """Get children.""" 

179 

180 if not no_iframe or not self.is_iframe(el): 

181 last = len(el.contents) - 1 

182 if start is None: 

183 index = last if reverse else 0 

184 else: 

185 index = start 

186 end = -1 if reverse else last + 1 

187 incr = -1 if reverse else 1 

188 

189 if 0 <= index <= last: 

190 while index != end: 

191 node = el.contents[index] 

192 index += incr 

193 if not tags or self.is_tag(node): 

194 yield node 

195 

196 def get_descendants( 

197 self, 

198 el: bs4.Tag, 

199 tags: bool = True, 

200 no_iframe: bool = False 

201 ) -> Iterator[bs4.PageElement]: 

202 """Get descendants.""" 

203 

204 if not no_iframe or not self.is_iframe(el): 

205 next_good = None 

206 for child in el.descendants: 

207 

208 if next_good is not None: 

209 if child is not next_good: 

210 continue 

211 next_good = None 

212 

213 is_tag = self.is_tag(child) 

214 

215 if no_iframe and is_tag and self.is_iframe(child): 

216 if child.next_sibling is not None: 

217 next_good = child.next_sibling 

218 else: 

219 last_child = child 

220 while self.is_tag(last_child) and last_child.contents: 

221 last_child = last_child.contents[-1] 

222 next_good = last_child.next_element 

223 yield child 

224 if next_good is None: 

225 break 

226 # Coverage isn't seeing this even though it's executed 

227 continue # pragma: no cover 

228 

229 if not tags or is_tag: 

230 yield child 

231 

232 def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag: 

233 """Get parent.""" 

234 

235 parent = el.parent 

236 if no_iframe and parent is not None and self.is_iframe(parent): 

237 parent = None 

238 return parent 

239 

240 @staticmethod 

241 def get_tag_name(el: bs4.Tag) -> str | None: 

242 """Get tag.""" 

243 

244 return cast('str | None', el.name) 

245 

246 @staticmethod 

247 def get_prefix_name(el: bs4.Tag) -> str | None: 

248 """Get prefix.""" 

249 

250 return cast('str | None', el.prefix) 

251 

252 @staticmethod 

253 def get_uri(el: bs4.Tag) -> str | None: 

254 """Get namespace `URI`.""" 

255 

256 return cast('str | None', el.namespace) 

257 

258 @classmethod 

259 def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: 

260 """Get next sibling tag.""" 

261 

262 sibling = el.next_sibling 

263 while tags and not cls.is_tag(sibling) and sibling is not None: 

264 sibling = sibling.next_sibling 

265 return sibling 

266 

267 @classmethod 

268 def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: 

269 """Get previous sibling tag.""" 

270 

271 sibling = el.previous_sibling 

272 while tags and not cls.is_tag(sibling) and sibling is not None: 

273 sibling = sibling.previous_sibling 

274 return sibling 

275 

276 @staticmethod 

277 def has_html_ns(el: bs4.Tag) -> bool: 

278 """ 

279 Check if element has an HTML namespace. 

280 

281 This is a bit different than whether a element is treated as having an HTML namespace, 

282 like we do in the case of `is_html_tag`. 

283 """ 

284 

285 ns = getattr(el, 'namespace') if el else None # noqa: B009 

286 return bool(ns and ns == NS_XHTML) 

287 

288 @staticmethod 

289 def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]: 

290 """Return namespace and attribute name without the prefix.""" 

291 

292 return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) 

293 

294 @classmethod 

295 def normalize_value(cls, value: Any) -> str | Sequence[str]: 

296 """Normalize the value to be a string or list of strings.""" 

297 

298 # Treat `None` as empty string. 

299 if value is None: 

300 return '' 

301 

302 # Pass through strings 

303 if (isinstance(value, str)): 

304 return value 

305 

306 # If it's a byte string, convert it to Unicode, treating it as UTF-8. 

307 if isinstance(value, bytes): 

308 return value.decode("utf8") 

309 

310 # BeautifulSoup supports sequences of attribute values, so make sure the children are strings. 

311 if isinstance(value, Sequence): 

312 new_value = [] 

313 for v in value: 

314 if not isinstance(v, (str, bytes)) and isinstance(v, Sequence): 

315 # This is most certainly a user error and will crash and burn later. 

316 # To keep things working, we'll do what we do with all objects, 

317 # And convert them to strings. 

318 new_value.append(str(v)) 

319 else: 

320 # Convert the child to a string 

321 new_value.append(cast(str, cls.normalize_value(v))) 

322 return new_value 

323 

324 # Try and make anything else a string 

325 return str(value) 

326 

327 @classmethod 

328 def get_attribute_by_name( 

329 cls, 

330 el: bs4.Tag, 

331 name: str, 

332 default: str | Sequence[str] | None = None 

333 ) -> str | Sequence[str] | None: 

334 """Get attribute by name.""" 

335 

336 value = default 

337 if el._is_xml: 

338 try: 

339 value = cls.normalize_value(el.attrs[name]) 

340 except KeyError: 

341 pass 

342 else: 

343 for k, v in el.attrs.items(): 

344 if util.lower(k) == name: 

345 value = cls.normalize_value(v) 

346 break 

347 return value 

348 

349 @classmethod 

350 def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]: 

351 """Iterate attributes.""" 

352 

353 for k, v in el.attrs.items(): 

354 yield k, cls.normalize_value(v) 

355 

356 @classmethod 

357 def get_classes(cls, el: bs4.Tag) -> Sequence[str]: 

358 """Get classes.""" 

359 

360 classes = cls.get_attribute_by_name(el, 'class', []) 

361 if isinstance(classes, str): 

362 classes = RE_NOT_WS.findall(classes) 

363 return cast(Sequence[str], classes) 

364 

365 def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str: 

366 """Get text.""" 

367 

368 return ''.join( 

369 [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)] 

370 ) 

371 

372 def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]: 

373 """Get Own Text.""" 

374 

375 return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)] 

376 

377 

378class Inputs: 

379 """Class for parsing and validating input items.""" 

380 

381 @staticmethod 

382 def validate_day(year: int, month: int, day: int) -> bool: 

383 """Validate day.""" 

384 

385 max_days = LONG_MONTH 

386 if month == FEB: 

387 max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH 

388 elif month in MONTHS_30: 

389 max_days = SHORT_MONTH 

390 return 1 <= day <= max_days 

391 

392 @staticmethod 

393 def validate_week(year: int, week: int) -> bool: 

394 """Validate week.""" 

395 

396 max_week = datetime.strptime(f"{12}-{31}-{year}", "%m-%d-%Y").isocalendar()[1] 

397 if max_week == 1: 

398 max_week = 53 

399 return 1 <= week <= max_week 

400 

401 @staticmethod 

402 def validate_month(month: int) -> bool: 

403 """Validate month.""" 

404 

405 return 1 <= month <= 12 

406 

407 @staticmethod 

408 def validate_year(year: int) -> bool: 

409 """Validate year.""" 

410 

411 return 1 <= year 

412 

413 @staticmethod 

414 def validate_hour(hour: int) -> bool: 

415 """Validate hour.""" 

416 

417 return 0 <= hour <= 23 

418 

419 @staticmethod 

420 def validate_minutes(minutes: int) -> bool: 

421 """Validate minutes.""" 

422 

423 return 0 <= minutes <= 59 

424 

425 @classmethod 

426 def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None: 

427 """Parse the input value.""" 

428 

429 parsed = None # type: tuple[float, ...] | None 

430 if value is None: 

431 return value 

432 if itype == "date": 

433 m = RE_DATE.match(value) 

434 if m: 

435 year = int(m.group('year'), 10) 

436 month = int(m.group('month'), 10) 

437 day = int(m.group('day'), 10) 

438 if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day): 

439 parsed = (year, month, day) 

440 elif itype == "month": 

441 m = RE_MONTH.match(value) 

442 if m: 

443 year = int(m.group('year'), 10) 

444 month = int(m.group('month'), 10) 

445 if cls.validate_year(year) and cls.validate_month(month): 

446 parsed = (year, month) 

447 elif itype == "week": 

448 m = RE_WEEK.match(value) 

449 if m: 

450 year = int(m.group('year'), 10) 

451 week = int(m.group('week'), 10) 

452 if cls.validate_year(year) and cls.validate_week(year, week): 

453 parsed = (year, week) 

454 elif itype == "time": 

455 m = RE_TIME.match(value) 

456 if m: 

457 hour = int(m.group('hour'), 10) 

458 minutes = int(m.group('minutes'), 10) 

459 if cls.validate_hour(hour) and cls.validate_minutes(minutes): 

460 parsed = (hour, minutes) 

461 elif itype == "datetime-local": 

462 m = RE_DATETIME.match(value) 

463 if m: 

464 year = int(m.group('year'), 10) 

465 month = int(m.group('month'), 10) 

466 day = int(m.group('day'), 10) 

467 hour = int(m.group('hour'), 10) 

468 minutes = int(m.group('minutes'), 10) 

469 if ( 

470 cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and 

471 cls.validate_hour(hour) and cls.validate_minutes(minutes) 

472 ): 

473 parsed = (year, month, day, hour, minutes) 

474 elif itype in ("number", "range"): 

475 m = RE_NUM.match(value) 

476 if m: 

477 parsed = (float(m.group('value')),) 

478 return parsed 

479 

480 

481class CSSMatch(_DocumentNav): 

482 """Perform CSS matching.""" 

483 

484 def __init__( 

485 self, 

486 selectors: ct.SelectorList, 

487 scope: bs4.Tag, 

488 namespaces: ct.Namespaces | None, 

489 flags: int 

490 ) -> None: 

491 """Initialize.""" 

492 

493 self.assert_valid_input(scope) 

494 self.tag = scope 

495 self.cached_meta_lang = [] # type: list[tuple[str, str]] 

496 self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]] 

497 self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]] 

498 self.selectors = selectors 

499 self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str] 

500 self.flags = flags 

501 self.iframe_restrict = False 

502 

503 # Find the root element for the whole tree 

504 doc = scope 

505 parent = self.get_parent(doc) 

506 while parent: 

507 doc = parent 

508 parent = self.get_parent(doc) 

509 root = None 

510 if not self.is_doc(doc): 

511 root = doc 

512 else: 

513 for child in self.get_children(doc): 

514 root = child 

515 break 

516 

517 self.root = root 

518 self.scope = scope if scope is not doc else root 

519 self.has_html_namespace = self.has_html_ns(root) 

520 

521 # A document can be both XML and HTML (XHTML) 

522 self.is_xml = self.is_xml_tree(doc) 

523 self.is_html = not self.is_xml or self.has_html_namespace 

524 

525 def supports_namespaces(self) -> bool: 

526 """Check if namespaces are supported in the HTML type.""" 

527 

528 return self.is_xml or self.has_html_namespace 

529 

530 def get_tag_ns(self, el: bs4.Tag) -> str: 

531 """Get tag namespace.""" 

532 

533 if self.supports_namespaces(): 

534 namespace = '' 

535 ns = self.get_uri(el) 

536 if ns: 

537 namespace = ns 

538 else: 

539 namespace = NS_XHTML 

540 return namespace 

541 

542 def is_html_tag(self, el: bs4.Tag) -> bool: 

543 """Check if tag is in HTML namespace.""" 

544 

545 return self.get_tag_ns(el) == NS_XHTML 

546 

547 def get_tag(self, el: bs4.Tag) -> str | None: 

548 """Get tag.""" 

549 

550 name = self.get_tag_name(el) 

551 return util.lower(name) if name is not None and not self.is_xml else name 

552 

553 def get_prefix(self, el: bs4.Tag) -> str | None: 

554 """Get prefix.""" 

555 

556 prefix = self.get_prefix_name(el) 

557 return util.lower(prefix) if prefix is not None and not self.is_xml else prefix 

558 

559 def find_bidi(self, el: bs4.Tag) -> int | None: 

560 """Get directionality from element text.""" 

561 

562 for node in self.get_children(el, tags=False): 

563 

564 # Analyze child text nodes 

565 if self.is_tag(node): 

566 

567 # Avoid analyzing certain elements specified in the specification. 

568 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) 

569 if ( 

570 self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or 

571 not self.is_html_tag(node) or 

572 direction is not None 

573 ): 

574 continue # pragma: no cover 

575 

576 # Check directionality of this node's text 

577 value = self.find_bidi(node) 

578 if value is not None: 

579 return value 

580 

581 # Direction could not be determined 

582 continue # pragma: no cover 

583 

584 # Skip `doctype` comments, etc. 

585 if self.is_special_string(node): 

586 continue 

587 

588 # Analyze text nodes for directionality. 

589 for c in node: 

590 bidi = unicodedata.bidirectional(c) 

591 if bidi in ('AL', 'R', 'L'): 

592 return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL 

593 return None 

594 

595 def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool: 

596 """Filter the language tags.""" 

597 

598 match = True 

599 lang_range = RE_WILD_STRIP.sub('-', lang_range).lower() 

600 ranges = lang_range.split('-') 

601 subtags = lang_tag.lower().split('-') 

602 length = len(ranges) 

603 slength = len(subtags) 

604 rindex = 0 

605 sindex = 0 

606 r = ranges[rindex] 

607 s = subtags[sindex] 

608 

609 # Empty specified language should match unspecified language attributes 

610 if length == 1 and slength == 1 and not r and r == s: 

611 return True 

612 

613 # Primary tag needs to match 

614 if (r != '*' and r != s) or (r == '*' and slength == 1 and not s): 

615 match = False 

616 

617 rindex += 1 

618 sindex += 1 

619 

620 # Match until we run out of ranges 

621 while match and rindex < length: 

622 r = ranges[rindex] 

623 try: 

624 s = subtags[sindex] 

625 except IndexError: 

626 # Ran out of subtags, 

627 # but we still have ranges 

628 match = False 

629 continue 

630 

631 # Empty range 

632 if not r: 

633 match = False 

634 continue 

635 

636 # Matched range 

637 elif s == r: 

638 rindex += 1 

639 

640 # Implicit wildcard cannot match 

641 # singletons 

642 elif len(s) == 1: 

643 match = False 

644 continue 

645 

646 # Implicitly matched, so grab next subtag 

647 sindex += 1 

648 

649 return match 

650 

651 def match_attribute_name( 

652 self, 

653 el: bs4.Tag, 

654 attr: str, 

655 prefix: str | None 

656 ) -> str | Sequence[str] | None: 

657 """Match attribute name and return value if it exists.""" 

658 

659 value = None 

660 if self.supports_namespaces(): 

661 value = None 

662 # If we have not defined namespaces, we can't very well find them, so don't bother trying. 

663 if prefix: 

664 ns = self.namespaces.get(prefix) 

665 if ns is None and prefix != '*': 

666 return None 

667 else: 

668 ns = None 

669 

670 for k, v in self.iter_attributes(el): 

671 

672 # Get attribute parts 

673 namespace, name = self.split_namespace(el, k) 

674 

675 # Can't match a prefix attribute as we haven't specified one to match 

676 # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`. 

677 if ns is None: 

678 if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)): 

679 value = v 

680 break 

681 # Coverage is not finding this even though it is executed. 

682 # Adding a print statement before this (and erasing coverage) causes coverage to find the line. 

683 # Ignore the false positive message. 

684 continue # pragma: no cover 

685 

686 # We can't match our desired prefix attribute as the attribute doesn't have a prefix 

687 if namespace is None or ns != namespace and prefix != '*': 

688 continue 

689 

690 # The attribute doesn't match. 

691 if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name): 

692 continue 

693 

694 value = v 

695 break 

696 else: 

697 for k, v in self.iter_attributes(el): 

698 if util.lower(attr) != util.lower(k): 

699 continue 

700 value = v 

701 break 

702 return value 

703 

704 def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: 

705 """Match the namespace of the element.""" 

706 

707 match = True 

708 namespace = self.get_tag_ns(el) 

709 default_namespace = self.namespaces.get('') 

710 tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix) 

711 # We must match the default namespace if one is not provided 

712 if tag.prefix is None and (default_namespace is not None and namespace != default_namespace): 

713 match = False 

714 # If we specified `|tag`, we must not have a namespace. 

715 elif (tag.prefix is not None and tag.prefix == '' and namespace): 

716 match = False 

717 # Verify prefix matches 

718 elif ( 

719 tag.prefix and 

720 tag.prefix != '*' and (tag_ns is None or namespace != tag_ns) 

721 ): 

722 match = False 

723 return match 

724 

725 def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool: 

726 """Match attributes.""" 

727 

728 match = True 

729 if attributes: 

730 for a in attributes: 

731 temp = self.match_attribute_name(el, a.attribute, a.prefix) 

732 pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern 

733 if temp is None: 

734 match = False 

735 break 

736 value = temp if isinstance(temp, str) else ' '.join(temp) 

737 if pattern is None: 

738 continue 

739 elif pattern.match(value) is None: 

740 match = False 

741 break 

742 return match 

743 

744 def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: 

745 """Match tag name.""" 

746 

747 name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name) 

748 return not ( 

749 name is not None and 

750 name not in (self.get_tag(el), '*') 

751 ) 

752 

753 def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool: 

754 """Match the tag.""" 

755 

756 match = True 

757 if tag is not None: 

758 # Verify namespace 

759 if not self.match_namespace(el, tag): 

760 match = False 

761 if not self.match_tagname(el, tag): 

762 match = False 

763 return match 

764 

765 def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

766 """Match past relationship.""" 

767 

768 found = False 

769 # I don't think this can ever happen, but it makes `mypy` happy 

770 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover 

771 return found 

772 

773 if relation[0].rel_type == REL_PARENT: 

774 parent = self.get_parent(el, no_iframe=self.iframe_restrict) 

775 while not found and parent: 

776 found = self.match_selectors(parent, relation) 

777 parent = self.get_parent(parent, no_iframe=self.iframe_restrict) 

778 elif relation[0].rel_type == REL_CLOSE_PARENT: 

779 parent = self.get_parent(el, no_iframe=self.iframe_restrict) 

780 if parent: 

781 found = self.match_selectors(parent, relation) 

782 elif relation[0].rel_type == REL_SIBLING: 

783 sibling = self.get_previous(el) 

784 while not found and sibling: 

785 found = self.match_selectors(sibling, relation) 

786 sibling = self.get_previous(sibling) 

787 elif relation[0].rel_type == REL_CLOSE_SIBLING: 

788 sibling = self.get_previous(el) 

789 if sibling and self.is_tag(sibling): 

790 found = self.match_selectors(sibling, relation) 

791 return found 

792 

793 def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool: 

794 """Match future child.""" 

795 

796 match = False 

797 if recursive: 

798 children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]] 

799 else: 

800 children = self.get_children 

801 for child in children(parent, no_iframe=self.iframe_restrict): 

802 match = self.match_selectors(child, relation) 

803 if match: 

804 break 

805 return match 

806 

807 def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

808 """Match future relationship.""" 

809 

810 found = False 

811 # I don't think this can ever happen, but it makes `mypy` happy 

812 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover 

813 return found 

814 

815 if relation[0].rel_type == REL_HAS_PARENT: 

816 found = self.match_future_child(el, relation, True) 

817 elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: 

818 found = self.match_future_child(el, relation) 

819 elif relation[0].rel_type == REL_HAS_SIBLING: 

820 sibling = self.get_next(el) 

821 while not found and sibling: 

822 found = self.match_selectors(sibling, relation) 

823 sibling = self.get_next(sibling) 

824 elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: 

825 sibling = self.get_next(el) 

826 if sibling and self.is_tag(sibling): 

827 found = self.match_selectors(sibling, relation) 

828 return found 

829 

830 def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

831 """Match relationship to other elements.""" 

832 

833 found = False 

834 

835 if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None: 

836 return found 

837 

838 if relation[0].rel_type.startswith(':'): 

839 found = self.match_future_relations(el, relation) 

840 else: 

841 found = self.match_past_relations(el, relation) 

842 

843 return found 

844 

845 def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool: 

846 """Match element's ID.""" 

847 

848 found = True 

849 for i in ids: 

850 if i != self.get_attribute_by_name(el, 'id', ''): 

851 found = False 

852 break 

853 return found 

854 

855 def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool: 

856 """Match element's classes.""" 

857 

858 current_classes = self.get_classes(el) 

859 found = True 

860 for c in classes: 

861 if c not in current_classes: 

862 found = False 

863 break 

864 return found 

865 

866 def match_root(self, el: bs4.Tag) -> bool: 

867 """Match element as root.""" 

868 

869 is_root = self.is_root(el) 

870 if is_root: 

871 sibling = self.get_previous(el, tags=False) 

872 while is_root and sibling is not None: 

873 if ( 

874 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or 

875 self.is_cdata(sibling) 

876 ): 

877 is_root = False 

878 else: 

879 sibling = self.get_previous(sibling, tags=False) 

880 if is_root: 

881 sibling = self.get_next(el, tags=False) 

882 while is_root and sibling is not None: 

883 if ( 

884 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or 

885 self.is_cdata(sibling) 

886 ): 

887 is_root = False 

888 else: 

889 sibling = self.get_next(sibling, tags=False) 

890 return is_root 

891 

892 def match_scope(self, el: bs4.Tag) -> bool: 

893 """Match element as scope.""" 

894 

895 return self.scope is el 

896 

897 def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool: 

898 """Match tag type for `nth` matches.""" 

899 

900 return ( 

901 (self.get_tag(child) == self.get_tag(el)) and 

902 (self.get_tag_ns(child) == self.get_tag_ns(el)) 

903 ) 

904 

905 def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool: 

906 """Match `nth` elements.""" 

907 

908 matched = True 

909 

910 for n in nth: 

911 matched = False 

912 if n.selectors and not self.match_selectors(el, n.selectors): 

913 break 

914 parent = self.get_parent(el) 

915 if parent is None: 

916 parent = self.create_fake_parent(el) 

917 last = n.last 

918 last_index = len(parent) - 1 

919 index = last_index if last else 0 

920 relative_index = 0 

921 a = n.a 

922 b = n.b 

923 var = n.n 

924 count = 0 

925 count_incr = 1 

926 factor = -1 if last else 1 

927 idx = last_idx = a * count + b if var else a 

928 

929 # We can only adjust bounds within a variable index 

930 if var: 

931 # Abort if our nth index is out of bounds and only getting further out of bounds as we increment. 

932 # Otherwise, increment to try to get in bounds. 

933 adjust = None 

934 while idx < 1 or idx > last_index: 

935 if idx < 0: 

936 diff_low = 0 - idx 

937 if adjust is not None and adjust == 1: 

938 break 

939 adjust = -1 

940 count += count_incr 

941 idx = last_idx = a * count + b if var else a 

942 diff = 0 - idx 

943 if diff >= diff_low: 

944 break 

945 else: 

946 diff_high = idx - last_index 

947 if adjust is not None and adjust == -1: 

948 break 

949 adjust = 1 

950 count += count_incr 

951 idx = last_idx = a * count + b if var else a 

952 diff = idx - last_index 

953 if diff >= diff_high: 

954 break 

955 diff_high = diff 

956 

957 # If a < 0, our count is working backwards, so floor the index by increasing the count. 

958 # Find the count that yields the lowest, in bound value and use that. 

959 # Lastly reverse count increment so that we'll increase our index. 

960 lowest = count 

961 if a < 0: 

962 while idx >= 1: 

963 lowest = count 

964 count += count_incr 

965 idx = last_idx = a * count + b if var else a 

966 count_incr = -1 

967 count = lowest 

968 idx = last_idx = a * count + b if var else a 

969 

970 # Evaluate elements while our calculated nth index is still in range 

971 while 1 <= idx <= last_index + 1: 

972 child = None 

973 # Evaluate while our child index is still in range. 

974 for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False): 

975 index += factor 

976 if not self.is_tag(child): 

977 continue 

978 # Handle `of S` in `nth-child` 

979 if n.selectors and not self.match_selectors(child, n.selectors): 

980 continue 

981 # Handle `of-type` 

982 if n.of_type and not self.match_nth_tag_type(el, child): 

983 continue 

984 relative_index += 1 

985 if relative_index == idx: 

986 if child is el: 

987 matched = True 

988 else: 

989 break 

990 if child is el: 

991 break 

992 if child is el: 

993 break 

994 last_idx = idx 

995 count += count_incr 

996 if count < 0: 

997 # Count is counting down and has now ventured into invalid territory. 

998 break 

999 idx = a * count + b if var else a 

1000 if last_idx == idx: 

1001 break 

1002 if not matched: 

1003 break 

1004 return matched 

1005 

1006 def match_empty(self, el: bs4.Tag) -> bool: 

1007 """Check if element is empty (if requested).""" 

1008 

1009 is_empty = True 

1010 for child in self.get_children(el, tags=False): 

1011 if self.is_tag(child): 

1012 is_empty = False 

1013 break 

1014 elif self.is_content_string(child) and RE_NOT_EMPTY.search(child): 

1015 is_empty = False 

1016 break 

1017 return is_empty 

1018 

1019 def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool: 

1020 """Match selectors.""" 

1021 

1022 match = True 

1023 for sel in selectors: 

1024 if not self.match_selectors(el, sel): 

1025 match = False 

1026 return match 

1027 

1028 def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool: 

1029 """Match element if it contains text.""" 

1030 

1031 match = True 

1032 content = None # type: str | Sequence[str] | None 

1033 for contain_list in contains: 

1034 if content is None: 

1035 if contain_list.own: 

1036 content = self.get_own_text(el, no_iframe=self.is_html) 

1037 else: 

1038 content = self.get_text(el, no_iframe=self.is_html) 

1039 found = False 

1040 for text in contain_list.text: 

1041 if contain_list.own: 

1042 for c in content: 

1043 if text in c: 

1044 found = True 

1045 break 

1046 if found: 

1047 break 

1048 else: 

1049 if text in content: 

1050 found = True 

1051 break 

1052 if not found: 

1053 match = False 

1054 return match 

1055 

1056 def match_default(self, el: bs4.Tag) -> bool: 

1057 """Match default.""" 

1058 

1059 match = False 

1060 

1061 # Find this input's form 

1062 form = None 

1063 parent = self.get_parent(el, no_iframe=True) 

1064 while parent and form is None: 

1065 if self.get_tag(parent) == 'form' and self.is_html_tag(parent): 

1066 form = parent 

1067 else: 

1068 parent = self.get_parent(parent, no_iframe=True) 

1069 

1070 # Look in form cache to see if we've already located its default button 

1071 found_form = False 

1072 for f, t in self.cached_default_forms: 

1073 if f is form: 

1074 found_form = True 

1075 if t is el: 

1076 match = True 

1077 break 

1078 

1079 # We didn't have the form cached, so look for its default button 

1080 if not found_form: 

1081 for child in self.get_descendants(form, no_iframe=True): 

1082 name = self.get_tag(child) 

1083 # Can't do nested forms (haven't figured out why we never hit this) 

1084 if name == 'form': # pragma: no cover 

1085 break 

1086 if name in ('input', 'button'): 

1087 v = self.get_attribute_by_name(child, 'type', '') 

1088 if v and util.lower(v) == 'submit': 

1089 self.cached_default_forms.append((form, child)) 

1090 if el is child: 

1091 match = True 

1092 break 

1093 return match 

1094 

1095 def match_indeterminate(self, el: bs4.Tag) -> bool: 

1096 """Match default.""" 

1097 

1098 match = False 

1099 name = cast(str, self.get_attribute_by_name(el, 'name')) 

1100 

1101 def get_parent_form(el: bs4.Tag) -> bs4.Tag | None: 

1102 """Find this input's form.""" 

1103 form = None 

1104 parent = self.get_parent(el, no_iframe=True) 

1105 while form is None: 

1106 if self.get_tag(parent) == 'form' and self.is_html_tag(parent): 

1107 form = parent 

1108 break 

1109 last_parent = parent 

1110 parent = self.get_parent(parent, no_iframe=True) 

1111 if parent is None: 

1112 form = last_parent 

1113 break 

1114 return form 

1115 

1116 form = get_parent_form(el) 

1117 

1118 # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate 

1119 found_form = False 

1120 for f, n, i in self.cached_indeterminate_forms: 

1121 if f is form and n == name: 

1122 found_form = True 

1123 if i is True: 

1124 match = True 

1125 break 

1126 

1127 # We didn't have the form cached, so validate that the radio button is indeterminate 

1128 if not found_form: 

1129 checked = False 

1130 for child in self.get_descendants(form, no_iframe=True): 

1131 if child is el: 

1132 continue 

1133 tag_name = self.get_tag(child) 

1134 if tag_name == 'input': 

1135 is_radio = False 

1136 check = False 

1137 has_name = False 

1138 for k, v in self.iter_attributes(child): 

1139 if util.lower(k) == 'type' and util.lower(v) == 'radio': 

1140 is_radio = True 

1141 elif util.lower(k) == 'name' and v == name: 

1142 has_name = True 

1143 elif util.lower(k) == 'checked': 

1144 check = True 

1145 if is_radio and check and has_name and get_parent_form(child) is form: 

1146 checked = True 

1147 break 

1148 if checked: 

1149 break 

1150 if not checked: 

1151 match = True 

1152 self.cached_indeterminate_forms.append((form, name, match)) 

1153 

1154 return match 

1155 

1156 def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool: 

1157 """Match languages.""" 

1158 

1159 match = False 

1160 has_ns = self.supports_namespaces() 

1161 root = self.root 

1162 has_html_namespace = self.has_html_namespace 

1163 

1164 # Walk parents looking for `lang` (HTML) or `xml:lang` XML property. 

1165 parent = el 

1166 found_lang = None 

1167 last = None 

1168 while not found_lang: 

1169 has_html_ns = self.has_html_ns(parent) 

1170 for k, v in self.iter_attributes(parent): 

1171 attr_ns, attr = self.split_namespace(parent, k) 

1172 if ( 

1173 ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or 

1174 ( 

1175 has_ns and not has_html_ns and attr_ns == NS_XML and 

1176 (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang' 

1177 ) 

1178 ): 

1179 found_lang = v 

1180 break 

1181 last = parent 

1182 parent = self.get_parent(parent, no_iframe=self.is_html) 

1183 

1184 if parent is None: 

1185 root = last 

1186 has_html_namespace = self.has_html_ns(root) 

1187 parent = last 

1188 break 

1189 

1190 # Use cached meta language. 

1191 if found_lang is None and self.cached_meta_lang: 

1192 for cache in self.cached_meta_lang: 

1193 if root is cache[0]: 

1194 found_lang = cache[1] 

1195 

1196 # If we couldn't find a language, and the document is HTML, look to meta to determine language. 

1197 if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')): 

1198 # Find head 

1199 found = False 

1200 for tag in ('html', 'head'): 

1201 found = False 

1202 for child in self.get_children(parent, no_iframe=self.is_html): 

1203 if self.get_tag(child) == tag and self.is_html_tag(child): 

1204 found = True 

1205 parent = child 

1206 break 

1207 if not found: # pragma: no cover 

1208 break 

1209 

1210 # Search meta tags 

1211 if found: 

1212 for child in parent: 

1213 if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent): 

1214 c_lang = False 

1215 content = None 

1216 for k, v in self.iter_attributes(child): 

1217 if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language': 

1218 c_lang = True 

1219 if util.lower(k) == 'content': 

1220 content = v 

1221 if c_lang and content: 

1222 found_lang = content 

1223 self.cached_meta_lang.append((cast(str, root), cast(str, found_lang))) 

1224 break 

1225 if found_lang is not None: 

1226 break 

1227 if found_lang is None: 

1228 self.cached_meta_lang.append((cast(str, root), '')) 

1229 

1230 # If we determined a language, compare. 

1231 if found_lang is not None: 

1232 for patterns in langs: 

1233 match = False 

1234 for pattern in patterns: 

1235 if self.extended_language_filter(pattern, cast(str, found_lang)): 

1236 match = True 

1237 if not match: 

1238 break 

1239 

1240 return match 

1241 

1242 def match_dir(self, el: bs4.Tag, directionality: int) -> bool: 

1243 """Check directionality.""" 

1244 

1245 # If we have to match both left and right, we can't match either. 

1246 if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL: 

1247 return False 

1248 

1249 if el is None or not self.is_html_tag(el): 

1250 return False 

1251 

1252 # Element has defined direction of left to right or right to left 

1253 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None) 

1254 if direction not in (None, 0): 

1255 return direction == directionality 

1256 

1257 # Element is the document element (the root) and no direction assigned, assume left to right. 

1258 is_root = self.is_root(el) 

1259 if is_root and direction is None: 

1260 return ct.SEL_DIR_LTR == directionality 

1261 

1262 # If `input[type=telephone]` and no direction is assigned, assume left to right. 

1263 name = self.get_tag(el) 

1264 is_input = name == 'input' 

1265 is_textarea = name == 'textarea' 

1266 is_bdi = name == 'bdi' 

1267 itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else '' 

1268 if is_input and itype == 'tel' and direction is None: 

1269 return ct.SEL_DIR_LTR == directionality 

1270 

1271 # Auto handling for text inputs 

1272 if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0: 

1273 if is_textarea: 

1274 value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)) 

1275 else: 

1276 value = cast(str, self.get_attribute_by_name(el, 'value', '')) 

1277 if value: 

1278 for c in value: 

1279 bidi = unicodedata.bidirectional(c) 

1280 if bidi in ('AL', 'R', 'L'): 

1281 direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL 

1282 return direction == directionality 

1283 # Assume left to right 

1284 return ct.SEL_DIR_LTR == directionality 

1285 elif is_root: 

1286 return ct.SEL_DIR_LTR == directionality 

1287 return self.match_dir(self.get_parent(el, no_iframe=True), directionality) 

1288 

1289 # Auto handling for `bdi` and other non text inputs. 

1290 if (is_bdi and direction is None) or direction == 0: 

1291 direction = self.find_bidi(el) 

1292 if direction is not None: 

1293 return direction == directionality 

1294 elif is_root: 

1295 return ct.SEL_DIR_LTR == directionality 

1296 return self.match_dir(self.get_parent(el, no_iframe=True), directionality) 

1297 

1298 # Match parents direction 

1299 return self.match_dir(self.get_parent(el, no_iframe=True), directionality) 

1300 

1301 def match_range(self, el: bs4.Tag, condition: int) -> bool: 

1302 """ 

1303 Match range. 

1304 

1305 Behavior is modeled after what we see in browsers. Browsers seem to evaluate 

1306 if the value is out of range, and if not, it is in range. So a missing value 

1307 will not evaluate out of range; therefore, value is in range. Personally, I 

1308 feel like this should evaluate as neither in or out of range. 

1309 """ 

1310 

1311 out_of_range = False 

1312 

1313 itype = util.lower(self.get_attribute_by_name(el, 'type')) 

1314 mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None))) 

1315 mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None))) 

1316 

1317 # There is no valid min or max, so we cannot evaluate a range 

1318 if mn is None and mx is None: 

1319 return False 

1320 

1321 value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None))) 

1322 if value is not None: 

1323 if itype in ("date", "datetime-local", "month", "week", "number", "range"): 

1324 if mn is not None and value < mn: 

1325 out_of_range = True 

1326 if not out_of_range and mx is not None and value > mx: 

1327 out_of_range = True 

1328 elif itype == "time": 

1329 if mn is not None and mx is not None and mn > mx: 

1330 # Time is periodic, so this is a reversed/discontinuous range 

1331 if value < mn and value > mx: 

1332 out_of_range = True 

1333 else: 

1334 if mn is not None and value < mn: 

1335 out_of_range = True 

1336 if not out_of_range and mx is not None and value > mx: 

1337 out_of_range = True 

1338 

1339 return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range 

1340 

1341 def match_defined(self, el: bs4.Tag) -> bool: 

1342 """ 

1343 Match defined. 

1344 

1345 `:defined` is related to custom elements in a browser. 

1346 

1347 - If the document is XML (not XHTML), all tags will match. 

1348 - Tags that are not custom (don't have a hyphen) are marked defined. 

1349 - If the tag has a prefix (without or without a namespace), it will not match. 

1350 

1351 This is of course requires the parser to provide us with the proper prefix and namespace info, 

1352 if it doesn't, there is nothing we can do. 

1353 """ 

1354 

1355 name = self.get_tag(el) 

1356 return ( 

1357 name is not None and ( 

1358 name.find('-') == -1 or 

1359 name.find(':') != -1 or 

1360 self.get_prefix(el) is not None 

1361 ) 

1362 ) 

1363 

1364 def match_placeholder_shown(self, el: bs4.Tag) -> bool: 

1365 """ 

1366 Match placeholder shown according to HTML spec. 

1367 

1368 - text area should be checked if they have content. A single newline does not count as content. 

1369 

1370 """ 

1371 

1372 match = False 

1373 content = self.get_text(el) 

1374 if content in ('', '\n'): 

1375 match = True 

1376 

1377 return match 

1378 

1379 def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool: 

1380 """Check if element matches one of the selectors.""" 

1381 

1382 match = False 

1383 is_not = selectors.is_not 

1384 is_html = selectors.is_html 

1385 

1386 # Internal selector lists that use the HTML flag, will automatically get the `html` namespace. 

1387 if is_html: 

1388 namespaces = self.namespaces 

1389 iframe_restrict = self.iframe_restrict 

1390 self.namespaces = {'html': NS_XHTML} 

1391 self.iframe_restrict = True 

1392 

1393 if not is_html or self.is_html: 

1394 for selector in selectors: 

1395 match = is_not 

1396 # We have a un-matchable situation (like `:focus` as you can focus an element in this environment) 

1397 if isinstance(selector, ct.SelectorNull): 

1398 continue 

1399 # Verify tag matches 

1400 if not self.match_tag(el, selector.tag): 

1401 continue 

1402 # Verify tag is defined 

1403 if selector.flags & ct.SEL_DEFINED and not self.match_defined(el): 

1404 continue 

1405 # Verify element is root 

1406 if selector.flags & ct.SEL_ROOT and not self.match_root(el): 

1407 continue 

1408 # Verify element is scope 

1409 if selector.flags & ct.SEL_SCOPE and not self.match_scope(el): 

1410 continue 

1411 # Verify element has placeholder shown 

1412 if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el): 

1413 continue 

1414 # Verify `nth` matches 

1415 if not self.match_nth(el, selector.nth): 

1416 continue 

1417 if selector.flags & ct.SEL_EMPTY and not self.match_empty(el): 

1418 continue 

1419 # Verify id matches 

1420 if selector.ids and not self.match_id(el, selector.ids): 

1421 continue 

1422 # Verify classes match 

1423 if selector.classes and not self.match_classes(el, selector.classes): 

1424 continue 

1425 # Verify attribute(s) match 

1426 if not self.match_attributes(el, selector.attributes): 

1427 continue 

1428 # Verify ranges 

1429 if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES): 

1430 continue 

1431 # Verify language patterns 

1432 if selector.lang and not self.match_lang(el, selector.lang): 

1433 continue 

1434 # Verify pseudo selector patterns 

1435 if selector.selectors and not self.match_subselectors(el, selector.selectors): 

1436 continue 

1437 # Verify relationship selectors 

1438 if selector.relation and not self.match_relations(el, selector.relation): 

1439 continue 

1440 # Validate that the current default selector match corresponds to the first submit button in the form 

1441 if selector.flags & ct.SEL_DEFAULT and not self.match_default(el): 

1442 continue 

1443 # Validate that the unset radio button is among radio buttons with the same name in a form that are 

1444 # also not set. 

1445 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el): 

1446 continue 

1447 # Validate element directionality 

1448 if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS): 

1449 continue 

1450 # Validate that the tag contains the specified text. 

1451 if selector.contains and not self.match_contains(el, selector.contains): 

1452 continue 

1453 match = not is_not 

1454 break 

1455 

1456 # Restore actual namespaces being used for external selector lists 

1457 if is_html: 

1458 self.namespaces = namespaces 

1459 self.iframe_restrict = iframe_restrict 

1460 

1461 return match 

1462 

1463 def select(self, limit: int = 0) -> Iterator[bs4.Tag]: 

1464 """Match all tags under the targeted tag.""" 

1465 

1466 lim = None if limit < 1 else limit 

1467 

1468 for child in self.get_descendants(self.tag): 

1469 if self.match(child): 

1470 yield child 

1471 if lim is not None: 

1472 lim -= 1 

1473 if lim < 1: 

1474 break 

1475 

1476 def closest(self) -> bs4.Tag | None: 

1477 """Match closest ancestor.""" 

1478 

1479 current = self.tag 

1480 closest = None 

1481 while closest is None and current is not None: 

1482 if self.match(current): 

1483 closest = current 

1484 else: 

1485 current = self.get_parent(current) 

1486 return closest 

1487 

1488 def filter(self) -> list[bs4.Tag]: # noqa A001 

1489 """Filter tag's children.""" 

1490 

1491 return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)] 

1492 

1493 def match(self, el: bs4.Tag) -> bool: 

1494 """Match.""" 

1495 

1496 return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) 

1497 

1498 

1499class SoupSieve(ct.Immutable): 

1500 """Compiled Soup Sieve selector matching object.""" 

1501 

1502 pattern: str 

1503 selectors: ct.SelectorList 

1504 namespaces: ct.Namespaces | None 

1505 custom: dict[str, str] 

1506 flags: int 

1507 

1508 __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash") 

1509 

1510 def __init__( 

1511 self, 

1512 pattern: str, 

1513 selectors: ct.SelectorList, 

1514 namespaces: ct.Namespaces | None, 

1515 custom: ct.CustomSelectors | None, 

1516 flags: int 

1517 ): 

1518 """Initialize.""" 

1519 

1520 super().__init__( 

1521 pattern=pattern, 

1522 selectors=selectors, 

1523 namespaces=namespaces, 

1524 custom=custom, 

1525 flags=flags 

1526 ) 

1527 

1528 def match(self, tag: bs4.Tag) -> bool: 

1529 """Match.""" 

1530 

1531 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag) 

1532 

1533 def closest(self, tag: bs4.Tag) -> bs4.Tag: 

1534 """Match closest ancestor.""" 

1535 

1536 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest() 

1537 

1538 def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001 

1539 """ 

1540 Filter. 

1541 

1542 `CSSMatch` can cache certain searches for tags of the same document, 

1543 so if we are given a tag, all tags are from the same document, 

1544 and we can take advantage of the optimization. 

1545 

1546 Any other kind of iterable could have tags from different documents or detached tags, 

1547 so for those, we use a new `CSSMatch` for each item in the iterable. 

1548 """ 

1549 

1550 if CSSMatch.is_tag(iterable): 

1551 return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter() 

1552 else: 

1553 return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] 

1554 

1555 def select_one(self, tag: bs4.Tag) -> bs4.Tag: 

1556 """Select a single tag.""" 

1557 

1558 tags = self.select(tag, limit=1) 

1559 return tags[0] if tags else None 

1560 

1561 def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]: 

1562 """Select the specified tags.""" 

1563 

1564 return list(self.iselect(tag, limit)) 

1565 

1566 def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]: 

1567 """Iterate the specified tags.""" 

1568 

1569 yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit) 

1570 

1571 def __repr__(self) -> str: # pragma: no cover 

1572 """Representation.""" 

1573 

1574 return ( 

1575 f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, " 

1576 f"custom={self.custom!r}, flags={self.flags!r})" 

1577 ) 

1578 

1579 __str__ = __repr__ 

1580 

1581 

1582ct.pickle_register(SoupSieve)