Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/soupsieve/css_match.py: 17%

959 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1"""CSS matcher.""" 

2from __future__ import annotations 

3from datetime import datetime 

4from . import util 

5import re 

6from . import css_types as ct 

7import unicodedata 

8import bs4 # type: ignore[import] 

9from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401 

10 

# Patterns over whitespace (space, tab, CR, LF, form feed).
# `RE_NOT_EMPTY` finds a single non-whitespace character, which is all that is
# needed to show that a text node is not blank.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# `RE_NOT_WS` captures whole runs of non-whitespace (whitespace separated tokens).
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Combinators that relate an element to something earlier in the tree.
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Forward looking combinators used by `:has()`: the same symbols with a ':'
# prefix, which is what distinguishes them from the backward looking ones.
REL_HAS_PARENT = ':' + REL_PARENT
REL_HAS_CLOSE_PARENT = ':' + REL_CLOSE_PARENT
REL_HAS_SIBLING = ':' + REL_SIBLING
REL_HAS_CLOSE_SIBLING = ':' + REL_CLOSE_SIBLING

# Well known namespace URIs.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

30 

# Bit mask covering both explicit direction flags.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL

# Bit mask covering both range state flags.
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Translate a lower-cased `dir` attribute value into its direction flag.
# `auto` carries no explicit flag.
DIR_MAP = dict(
    ltr=ct.SEL_DIR_LTR,
    rtl=ct.SEL_DIR_RTL,
    auto=0,
)

39 

# Patterns for the textual forms accepted by `Inputs.parse_value`.
# Every pattern is anchored and exposes its components as named groups.

# Optional sign, then either digits with an optional fraction or a bare fraction.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
# `HH:MM`
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
# `YYYY-MM` (four or more year digits)
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
# `YYYY-Www`
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
# `YYYY-MM-DD`
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
# `YYYY-MM-DDTHH:MM`
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Finds wildcard subtags that are not in the leading position of a language
# range; `extended_language_filter` replaces each match with a single '-'.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar facts used when validating days and weeks.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7

57 

58 

class _FakeParent:
    """
    Fake parent class.

    When we have a fragment with no `BeautifulSoup` document object,
    we can't evaluate `nth` selectors properly. Create a temporary
    fake parent so we can traverse the root element as a child.

    Only the pieces of the parent interface that sibling traversal relies on
    are provided: a `contents` list and `len()`.
    """

    def __init__(self, element: bs4.Tag) -> None:
        """Wrap `element` as the sole child of this stand-in parent."""

        self.contents = [element]

    def __len__(self) -> int:
        """Return the number of wrapped children."""

        return len(self.contents)

77 

78 

class _DocumentNav:
    """
    Navigate a Beautiful Soup document.

    Collects the tree-walking and attribute helpers used for matching.
    `is_iframe` and `is_root` read attributes (`root`, `is_html`) and call a
    method (`is_html_tag`) that this class does not define; a subclass is
    expected to provide them.
    """

    @classmethod
    def assert_valid_input(cls, tag: Any) -> None:
        """
        Check if valid input tag or document.

        Raises `TypeError` when `tag` is not a Beautiful Soup `Tag`.
        """

        # Fail on unexpected types.
        if not cls.is_tag(tag):
            raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))

    @staticmethod
    def is_doc(obj: bs4.Tag) -> bool:
        """Is `BeautifulSoup` object."""
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj: bs4.PageElement) -> bool:
        """Is tag."""
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj: bs4.PageElement) -> bool:  # pragma: no cover
        """Is declaration."""
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj: bs4.PageElement) -> bool:
        """Is CDATA."""
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj: bs4.PageElement) -> bool:  # pragma: no cover
        """Is processing instruction."""
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj: bs4.PageElement) -> bool:
        """Is navigable string."""
        return isinstance(obj, bs4.NavigableString)

    @staticmethod
    def is_special_string(obj: bs4.PageElement) -> bool:
        """Is special string (comment, declaration, CDATA, processing instruction or doctype)."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj: bs4.PageElement) -> bool:
        """Check if node is content string (a navigable string that is not a special string)."""

        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el: bs4.Tag) -> _FakeParent:
        """Create fake parent for a given element."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el: bs4.Tag) -> bool:
        """Check if element (or document) is from a XML tree."""

        return bool(el._is_xml)

    def is_iframe(self, el: bs4.Tag) -> bool:
        """
        Check if element is an `iframe`.

        The name is compared case-sensitively for XML trees and lower-cased
        otherwise; the element must also be an HTML tag.
        """

        name = el.name if self.is_xml_tree(el) else util.lower(el.name)
        return bool(
            name == 'iframe' and
            self.is_html_tag(el)  # type: ignore[attr-defined]
        )

    def is_root(self, el: bs4.Tag) -> bool:
        """
        Return whether element is a root element.

        We check that the element is the root of the tree (which we have already pre-calculated),
        and we check if it is the root element under an `iframe`.
        """

        root = self.root and self.root is el  # type: ignore[attr-defined]
        if not root:
            parent = self.get_parent(el)
            root = parent is not None and self.is_html and self.is_iframe(parent)  # type: ignore[attr-defined]
        return root

    def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]:
        """
        Get contents.

        Yields the direct contents of `el` in document order. Nothing is
        yielded when `no_iframe` is set and `el` is an `iframe`.
        """

        if not no_iframe or not self.is_iframe(el):
            yield from el.contents

    def get_children(
        self,
        el: bs4.Tag,
        start: int | None = None,
        reverse: bool = False,
        tags: bool = True,
        no_iframe: bool = False
    ) -> Iterator[bs4.PageElement]:
        """
        Get children.

        Yields direct children of `el`, beginning at index `start` (the first
        child, or the last one when `reverse` is set, if `start` is `None`) and
        walking forwards or backwards. With `tags` set only tags are yielded.
        A `start` outside the valid index range yields nothing, as does an
        `iframe` when `no_iframe` is set.
        """

        if no_iframe and self.is_iframe(el):
            return

        last = len(el.contents) - 1
        if start is None:
            index = last if reverse else 0
        else:
            index = start
        end = -1 if reverse else last + 1
        incr = -1 if reverse else 1

        if 0 <= index <= last:
            while index != end:
                node = el.contents[index]
                index += incr
                if not tags or self.is_tag(node):
                    yield node

    def get_descendants(
        self,
        el: bs4.Tag,
        tags: bool = True,
        no_iframe: bool = False
    ) -> Iterator[bs4.PageElement]:
        """
        Get descendants.

        Yields descendants of `el` in document order. With `tags` set only tags
        are yielded. With `no_iframe` set, an `iframe` descendant is itself
        yielded but everything inside it is skipped, and nothing at all is
        yielded when `el` is an `iframe`.
        """

        if no_iframe and self.is_iframe(el):
            return

        # When set, every node is skipped until this exact node is reached.
        next_good = None
        for child in el.descendants:

            if next_good is not None:
                if child is not next_good:
                    continue
                next_good = None

            is_tag = self.is_tag(child)

            if no_iframe and is_tag and self.is_iframe(child):
                if child.next_sibling is not None:
                    next_good = child.next_sibling
                else:
                    # No sibling to resume at: find the deepest last node of the
                    # `iframe` and resume at whatever follows it.
                    last_child = child
                    while self.is_tag(last_child) and last_child.contents:
                        last_child = last_child.contents[-1]
                    next_good = last_child.next_element
                yield child
                if next_good is None:
                    break
                # Coverage isn't seeing this even though it's executed
                continue  # pragma: no cover

            if not tags or is_tag:
                yield child

    def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag | None:
        """
        Get parent.

        Returns `None` when there is no parent, or when `no_iframe` is set and
        the parent is an `iframe`.
        """

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            parent = None
        return parent

    @staticmethod
    def get_tag_name(el: bs4.Tag) -> str | None:
        """Get tag."""

        return cast('str | None', el.name)

    @staticmethod
    def get_prefix_name(el: bs4.Tag) -> str | None:
        """Get prefix."""

        return cast('str | None', el.prefix)

    @staticmethod
    def get_uri(el: bs4.Tag) -> str | None:
        """Get namespace `URI`."""

        return cast('str | None', el.namespace)

    @classmethod
    def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement | None:
        """
        Get next sibling tag.

        With `tags` unset the immediate next sibling of any kind is returned.
        Returns `None` when the siblings are exhausted.
        """

        sibling = el.next_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement | None:
        """
        Get previous sibling tag.

        With `tags` unset the immediate previous sibling of any kind is returned.
        Returns `None` when the siblings are exhausted.
        """

        sibling = el.previous_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el: bs4.Tag | None) -> bool:
        """
        Check if element has an HTML namespace.

        This is a bit different than whether a element is treated as having an HTML namespace,
        like we do in the case of `is_html_tag`.
        """

        ns = el.namespace if el else None
        return bool(ns and ns == NS_XHTML)

    @staticmethod
    def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
        """
        Return namespace and attribute name without the prefix.

        Both parts are read from attributes of `attr_name` itself and default
        to `None` when it does not carry them. `el` is not consulted.
        """

        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

    @classmethod
    def normalize_value(cls, value: Any) -> str | Sequence[str]:
        """
        Normalize the value to be a string or list of strings.

        `None` becomes an empty string, bytes are decoded as UTF-8, sequences
        become a list of normalized strings and anything else is passed
        through `str`.
        """

        # Treat `None` as empty string.
        if value is None:
            return ''

        # Pass through strings
        if isinstance(value, str):
            return value

        # If it's a byte string, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
        if isinstance(value, Sequence):
            new_value = []
            for v in value:
                if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
                    # This is most certainly a user error and will crash and burn later.
                    # To keep things working, we'll do what we do with all objects,
                    # And convert them to strings.
                    new_value.append(str(v))
                else:
                    # Convert the child to a string
                    new_value.append(cast(str, cls.normalize_value(v)))
            return new_value

        # Try and make anything else a string
        return str(value)

    @classmethod
    def get_attribute_by_name(
        cls,
        el: bs4.Tag,
        name: str,
        default: str | Sequence[str] | None = None
    ) -> str | Sequence[str] | None:
        """
        Get attribute by name.

        XML elements are looked up by exact name. Otherwise each attribute
        name is lower-cased before comparison, so `name` is expected to be
        lower case already. Returns `default` when nothing matches.
        """

        value = default
        if el._is_xml:
            try:
                value = cls.normalize_value(el.attrs[name])
            except KeyError:
                pass
        else:
            for k, v in el.attrs.items():
                if util.lower(k) == name:
                    value = cls.normalize_value(v)
                    break
        return value

    @classmethod
    def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
        """Iterate attributes as `(name, normalized value)` pairs."""

        for k, v in el.attrs.items():
            yield k, cls.normalize_value(v)

    @classmethod
    def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
        """
        Get classes.

        A string `class` value is split on whitespace; a sequence value is
        returned as is. A missing attribute gives an empty list.
        """

        classes = cls.get_attribute_by_name(el, 'class', [])
        if isinstance(classes, str):
            classes = RE_NOT_WS.findall(classes)
        return cast(Sequence[str], classes)

    def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
        """Get text: the content strings of all descendants joined together."""

        return ''.join(
            [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
        )

    def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
        """Get own text: the content strings that are direct children, as a list."""

        return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]

377 

378 

class Inputs:
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year: int, month: int, day: int) -> bool:
        """Validate day against the length of the given month, honoring leap years."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year: int, week: int) -> bool:
        """
        Validate week.

        The upper bound is the ISO week number of December 31 of `year`. When
        that day already belongs to week 1 of the following year, 53 is used
        as the bound.
        """

        # The Gregorian calendar repeats exactly every 400 years (146097 days,
        # a whole number of weeks), so an equivalent year inside the range
        # `datetime` supports gives the same answer. This keeps years below
        # 1000 or above 9999, which the week pattern accepts, from raising.
        equivalent_year = 2000 + year % 400
        max_week = datetime(equivalent_year, 12, 31).isocalendar()[1]
        if max_week == 1:
            max_week = 53
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month: int) -> bool:
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year: int) -> bool:
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour: int) -> bool:
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes: int) -> bool:
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
        """
        Parse the input value.

        `itype` selects the format: `date`, `month`, `week`, `time`,
        `datetime-local`, `number` or `range`. Returns a tuple of the numeric
        components, ordered from most to least significant, or `None` when
        `value` is `None`, is malformed, fails validation, or `itype` is not
        one of the handled types.
        """

        parsed = None  # type: tuple[float, ...] | None
        if value is None:
            return None

        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = (float(m.group('value')),)
        return parsed

480 

481 

482class CSSMatch(_DocumentNav): 

483 """Perform CSS matching.""" 

484 

485 def __init__( 

486 self, 

487 selectors: ct.SelectorList, 

488 scope: bs4.Tag, 

489 namespaces: ct.Namespaces | None, 

490 flags: int 

491 ) -> None: 

492 """Initialize.""" 

493 

494 self.assert_valid_input(scope) 

495 self.tag = scope 

496 self.cached_meta_lang = [] # type: list[tuple[str, str]] 

497 self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]] 

498 self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]] 

499 self.selectors = selectors 

500 self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str] 

501 self.flags = flags 

502 self.iframe_restrict = False 

503 

504 # Find the root element for the whole tree 

505 doc = scope 

506 parent = self.get_parent(doc) 

507 while parent: 

508 doc = parent 

509 parent = self.get_parent(doc) 

510 root = None 

511 if not self.is_doc(doc): 

512 root = doc 

513 else: 

514 for child in self.get_children(doc): 

515 root = child 

516 break 

517 

518 self.root = root 

519 self.scope = scope if scope is not doc else root 

520 self.has_html_namespace = self.has_html_ns(root) 

521 

522 # A document can be both XML and HTML (XHTML) 

523 self.is_xml = self.is_xml_tree(doc) 

524 self.is_html = not self.is_xml or self.has_html_namespace 

525 

526 def supports_namespaces(self) -> bool: 

527 """Check if namespaces are supported in the HTML type.""" 

528 

529 return self.is_xml or self.has_html_namespace 

530 

531 def get_tag_ns(self, el: bs4.Tag) -> str: 

532 """Get tag namespace.""" 

533 

534 if self.supports_namespaces(): 

535 namespace = '' 

536 ns = self.get_uri(el) 

537 if ns: 

538 namespace = ns 

539 else: 

540 namespace = NS_XHTML 

541 return namespace 

542 

543 def is_html_tag(self, el: bs4.Tag) -> bool: 

544 """Check if tag is in HTML namespace.""" 

545 

546 return self.get_tag_ns(el) == NS_XHTML 

547 

548 def get_tag(self, el: bs4.Tag) -> str | None: 

549 """Get tag.""" 

550 

551 name = self.get_tag_name(el) 

552 return util.lower(name) if name is not None and not self.is_xml else name 

553 

554 def get_prefix(self, el: bs4.Tag) -> str | None: 

555 """Get prefix.""" 

556 

557 prefix = self.get_prefix_name(el) 

558 return util.lower(prefix) if prefix is not None and not self.is_xml else prefix 

559 

560 def find_bidi(self, el: bs4.Tag) -> int | None: 

561 """Get directionality from element text.""" 

562 

563 for node in self.get_children(el, tags=False): 

564 

565 # Analyze child text nodes 

566 if self.is_tag(node): 

567 

568 # Avoid analyzing certain elements specified in the specification. 

569 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) 

570 if ( 

571 self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or 

572 not self.is_html_tag(node) or 

573 direction is not None 

574 ): 

575 continue # pragma: no cover 

576 

577 # Check directionality of this node's text 

578 value = self.find_bidi(node) 

579 if value is not None: 

580 return value 

581 

582 # Direction could not be determined 

583 continue # pragma: no cover 

584 

585 # Skip `doctype` comments, etc. 

586 if self.is_special_string(node): 

587 continue 

588 

589 # Analyze text nodes for directionality. 

590 for c in node: 

591 bidi = unicodedata.bidirectional(c) 

592 if bidi in ('AL', 'R', 'L'): 

593 return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL 

594 return None 

595 

596 def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool: 

597 """Filter the language tags.""" 

598 

599 match = True 

600 lang_range = RE_WILD_STRIP.sub('-', lang_range).lower() 

601 ranges = lang_range.split('-') 

602 subtags = lang_tag.lower().split('-') 

603 length = len(ranges) 

604 slength = len(subtags) 

605 rindex = 0 

606 sindex = 0 

607 r = ranges[rindex] 

608 s = subtags[sindex] 

609 

610 # Empty specified language should match unspecified language attributes 

611 if length == 1 and slength == 1 and not r and r == s: 

612 return True 

613 

614 # Primary tag needs to match 

615 if (r != '*' and r != s) or (r == '*' and slength == 1 and not s): 

616 match = False 

617 

618 rindex += 1 

619 sindex += 1 

620 

621 # Match until we run out of ranges 

622 while match and rindex < length: 

623 r = ranges[rindex] 

624 try: 

625 s = subtags[sindex] 

626 except IndexError: 

627 # Ran out of subtags, 

628 # but we still have ranges 

629 match = False 

630 continue 

631 

632 # Empty range 

633 if not r: 

634 match = False 

635 continue 

636 

637 # Matched range 

638 elif s == r: 

639 rindex += 1 

640 

641 # Implicit wildcard cannot match 

642 # singletons 

643 elif len(s) == 1: 

644 match = False 

645 continue 

646 

647 # Implicitly matched, so grab next subtag 

648 sindex += 1 

649 

650 return match 

651 

652 def match_attribute_name( 

653 self, 

654 el: bs4.Tag, 

655 attr: str, 

656 prefix: str | None 

657 ) -> str | Sequence[str] | None: 

658 """Match attribute name and return value if it exists.""" 

659 

660 value = None 

661 if self.supports_namespaces(): 

662 value = None 

663 # If we have not defined namespaces, we can't very well find them, so don't bother trying. 

664 if prefix: 

665 ns = self.namespaces.get(prefix) 

666 if ns is None and prefix != '*': 

667 return None 

668 else: 

669 ns = None 

670 

671 for k, v in self.iter_attributes(el): 

672 

673 # Get attribute parts 

674 namespace, name = self.split_namespace(el, k) 

675 

676 # Can't match a prefix attribute as we haven't specified one to match 

677 # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`. 

678 if ns is None: 

679 if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)): 

680 value = v 

681 break 

682 # Coverage is not finding this even though it is executed. 

683 # Adding a print statement before this (and erasing coverage) causes coverage to find the line. 

684 # Ignore the false positive message. 

685 continue # pragma: no cover 

686 

687 # We can't match our desired prefix attribute as the attribute doesn't have a prefix 

688 if namespace is None or ns != namespace and prefix != '*': 

689 continue 

690 

691 # The attribute doesn't match. 

692 if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name): 

693 continue 

694 

695 value = v 

696 break 

697 else: 

698 for k, v in self.iter_attributes(el): 

699 if util.lower(attr) != util.lower(k): 

700 continue 

701 value = v 

702 break 

703 return value 

704 

705 def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: 

706 """Match the namespace of the element.""" 

707 

708 match = True 

709 namespace = self.get_tag_ns(el) 

710 default_namespace = self.namespaces.get('') 

711 tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix) 

712 # We must match the default namespace if one is not provided 

713 if tag.prefix is None and (default_namespace is not None and namespace != default_namespace): 

714 match = False 

715 # If we specified `|tag`, we must not have a namespace. 

716 elif (tag.prefix is not None and tag.prefix == '' and namespace): 

717 match = False 

718 # Verify prefix matches 

719 elif ( 

720 tag.prefix and 

721 tag.prefix != '*' and (tag_ns is None or namespace != tag_ns) 

722 ): 

723 match = False 

724 return match 

725 

726 def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool: 

727 """Match attributes.""" 

728 

729 match = True 

730 if attributes: 

731 for a in attributes: 

732 temp = self.match_attribute_name(el, a.attribute, a.prefix) 

733 pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern 

734 if temp is None: 

735 match = False 

736 break 

737 value = temp if isinstance(temp, str) else ' '.join(temp) 

738 if pattern is None: 

739 continue 

740 elif pattern.match(value) is None: 

741 match = False 

742 break 

743 return match 

744 

745 def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: 

746 """Match tag name.""" 

747 

748 name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name) 

749 return not ( 

750 name is not None and 

751 name not in (self.get_tag(el), '*') 

752 ) 

753 

754 def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool: 

755 """Match the tag.""" 

756 

757 match = True 

758 if tag is not None: 

759 # Verify namespace 

760 if not self.match_namespace(el, tag): 

761 match = False 

762 if not self.match_tagname(el, tag): 

763 match = False 

764 return match 

765 

766 def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

767 """Match past relationship.""" 

768 

769 found = False 

770 # I don't think this can ever happen, but it makes `mypy` happy 

771 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover 

772 return found 

773 

774 if relation[0].rel_type == REL_PARENT: 

775 parent = self.get_parent(el, no_iframe=self.iframe_restrict) 

776 while not found and parent: 

777 found = self.match_selectors(parent, relation) 

778 parent = self.get_parent(parent, no_iframe=self.iframe_restrict) 

779 elif relation[0].rel_type == REL_CLOSE_PARENT: 

780 parent = self.get_parent(el, no_iframe=self.iframe_restrict) 

781 if parent: 

782 found = self.match_selectors(parent, relation) 

783 elif relation[0].rel_type == REL_SIBLING: 

784 sibling = self.get_previous(el) 

785 while not found and sibling: 

786 found = self.match_selectors(sibling, relation) 

787 sibling = self.get_previous(sibling) 

788 elif relation[0].rel_type == REL_CLOSE_SIBLING: 

789 sibling = self.get_previous(el) 

790 if sibling and self.is_tag(sibling): 

791 found = self.match_selectors(sibling, relation) 

792 return found 

793 

794 def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool: 

795 """Match future child.""" 

796 

797 match = False 

798 if recursive: 

799 children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]] 

800 else: 

801 children = self.get_children 

802 for child in children(parent, no_iframe=self.iframe_restrict): 

803 match = self.match_selectors(child, relation) 

804 if match: 

805 break 

806 return match 

807 

808 def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

809 """Match future relationship.""" 

810 

811 found = False 

812 # I don't think this can ever happen, but it makes `mypy` happy 

813 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover 

814 return found 

815 

816 if relation[0].rel_type == REL_HAS_PARENT: 

817 found = self.match_future_child(el, relation, True) 

818 elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: 

819 found = self.match_future_child(el, relation) 

820 elif relation[0].rel_type == REL_HAS_SIBLING: 

821 sibling = self.get_next(el) 

822 while not found and sibling: 

823 found = self.match_selectors(sibling, relation) 

824 sibling = self.get_next(sibling) 

825 elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: 

826 sibling = self.get_next(el) 

827 if sibling and self.is_tag(sibling): 

828 found = self.match_selectors(sibling, relation) 

829 return found 

830 

831 def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

832 """Match relationship to other elements.""" 

833 

834 found = False 

835 

836 if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None: 

837 return found 

838 

839 if relation[0].rel_type.startswith(':'): 

840 found = self.match_future_relations(el, relation) 

841 else: 

842 found = self.match_past_relations(el, relation) 

843 

844 return found 

845 

846 def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool: 

847 """Match element's ID.""" 

848 

849 found = True 

850 for i in ids: 

851 if i != self.get_attribute_by_name(el, 'id', ''): 

852 found = False 

853 break 

854 return found 

855 

856 def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool: 

857 """Match element's classes.""" 

858 

859 current_classes = self.get_classes(el) 

860 found = True 

861 for c in classes: 

862 if c not in current_classes: 

863 found = False 

864 break 

865 return found 

866 

867 def match_root(self, el: bs4.Tag) -> bool: 

868 """Match element as root.""" 

869 

870 is_root = self.is_root(el) 

871 if is_root: 

872 sibling = self.get_previous(el, tags=False) 

873 while is_root and sibling is not None: 

874 if ( 

875 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or 

876 self.is_cdata(sibling) 

877 ): 

878 is_root = False 

879 else: 

880 sibling = self.get_previous(sibling, tags=False) 

881 if is_root: 

882 sibling = self.get_next(el, tags=False) 

883 while is_root and sibling is not None: 

884 if ( 

885 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or 

886 self.is_cdata(sibling) 

887 ): 

888 is_root = False 

889 else: 

890 sibling = self.get_next(sibling, tags=False) 

891 return is_root 

892 

893 def match_scope(self, el: bs4.Tag) -> bool: 

894 """Match element as scope.""" 

895 

896 return self.scope is el 

897 

898 def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool: 

899 """Match tag type for `nth` matches.""" 

900 

901 return ( 

902 (self.get_tag(child) == self.get_tag(el)) and 

903 (self.get_tag_ns(child) == self.get_tag_ns(el)) 

904 ) 

905 

    def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
        """
        Match `nth` elements.

        Every entry of `nth` must accept `el` for the result to be true; an
        empty `nth` matches. For each entry the candidate positions are
        generated as `a * count + b` (or just `a` when the entry is not
        variable) and the children of the parent are walked, from the end when
        the entry's `last` is set, counting only those that qualify.
        """

        # NOTE(review): `nth` is iterated and each item is read for `selectors`, `last`,
        # `a`, `b`, `n` and `of_type`, so the `bs4.Tag` annotation looks wrong — confirm
        # the intended selector type.

        matched = True

        for n in nth:
            matched = False
            # `of S`: the element itself must satisfy the extra selectors.
            if n.selectors and not self.match_selectors(el, n.selectors):
                break
            parent = self.get_parent(el)
            if parent is None:
                # Parentless fragment: wrap the element so it can be walked as a child.
                parent = self.create_fake_parent(el)
            last = n.last
            last_index = len(parent) - 1
            # Child index to start walking from, and the walking direction (`factor`).
            index = last_index if last else 0
            relative_index = 0
            a = n.a
            b = n.b
            var = n.n
            count = 0
            count_incr = 1
            factor = -1 if last else 1
            idx = last_idx = a * count + b if var else a

            # We can only adjust bounds within a variable index
            if var:
                # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
                # Otherwise, increment to try to get in bounds.
                adjust = None
                while idx < 1 or idx > last_index:
                    if idx < 0:
                        diff_low = 0 - idx
                        # Previously too high and now too low: stepping cannot land in bounds.
                        if adjust is not None and adjust == 1:
                            break
                        adjust = -1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = 0 - idx
                        # Not getting any closer to the lower bound.
                        if diff >= diff_low:
                            break
                    else:
                        diff_high = idx - last_index
                        # Previously too low and now too high: stepping cannot land in bounds.
                        if adjust is not None and adjust == -1:
                            break
                        adjust = 1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = idx - last_index
                        # Not getting any closer to the upper bound.
                        if diff >= diff_high:
                            break
                        diff_high = diff

                # If a < 0, our count is working backwards, so floor the index by increasing the count.
                # Find the count that yields the lowest, in bound value and use that.
                # Lastly reverse count increment so that we'll increase our index.
                lowest = count
                if a < 0:
                    while idx >= 1:
                        lowest = count
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                    count_incr = -1
                count = lowest
                idx = last_idx = a * count + b if var else a

            # Evaluate elements while our calculated nth index is still in range
            while 1 <= idx <= last_index + 1:
                child = None
                # Evaluate while our child index is still in range.
                for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                    # Remember where to resume on the next pass of the outer loop.
                    index += factor
                    if not self.is_tag(child):
                        continue
                    # Handle `of S` in `nth-child`
                    if n.selectors and not self.match_selectors(child, n.selectors):
                        continue
                    # Handle `of-type`
                    if n.of_type and not self.match_nth_tag_type(el, child):
                        continue
                    relative_index += 1
                    if relative_index == idx:
                        if child is el:
                            matched = True
                        else:
                            # Position reached by another element; go compute the next index.
                            break
                    if child is el:
                        break
                # Once the element itself has been visited the answer is final.
                if child is el:
                    break
                last_idx = idx
                count += count_incr
                if count < 0:
                    # Count is counting down and has now ventured into invalid territory.
                    break
                idx = a * count + b if var else a
                # The index stopped changing, so there are no further positions to try.
                if last_idx == idx:
                    break
            if not matched:
                break
        return matched

1006 

def match_empty(self, el: bs4.Tag) -> bool:
    """
    Report whether the element counts as empty.

    The element is empty when it has no child tags and none of its content
    strings contain a character other than space, tab, CR, LF or form feed.
    """

    for node in self.get_children(el, tags=False):
        # Any child tag disqualifies the element right away.
        if self.is_tag(node):
            return False
        # A content string only disqualifies it when it holds a non-whitespace character.
        if self.is_content_string(node) and RE_NOT_EMPTY.search(node):
            return False
    return True

1019 

def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
    """
    Check that the element matches every selector list in `selectors`.

    Each entry is handed to `match_selectors`. Evaluation stops at the first
    entry that fails, because the overall result can no longer be `True`.
    An empty `selectors` tuple matches.
    """

    return all(self.match_selectors(el, sel) for sel in selectors)

1028 

def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
    """
    Match element if it contains the requested text.

    Every entry in `contains` provides `text` (candidate substrings) and `own`
    (whether only the element's own text should be searched). An entry is
    satisfied when at least one candidate occurs in the relevant text, and all
    entries must be satisfied for the element to match.

    The own text (iterated as a sequence of strings) and the full text (used as
    a single string) are fetched lazily and cached separately. Reusing one
    cached value for both kinds of entry would compare against the wrong type
    of content when `own` and non-`own` entries are combined.
    """

    own_text = None  # type: Sequence[str] | None
    full_text = None  # type: str | None

    for contain_list in contains:
        if contain_list.own:
            if own_text is None:
                own_text = self.get_own_text(el, no_iframe=self.is_html)
            # A candidate must appear inside one individual own-text string.
            found = any(text in chunk for text in contain_list.text for chunk in own_text)
        else:
            if full_text is None:
                full_text = self.get_text(el, no_iframe=self.is_html)
            found = any(text in full_text for text in contain_list.text)

        # One unsatisfied entry settles the result.
        if not found:
            return False
    return True

1056 

def match_default(self, el: bs4.Tag) -> bool:
    """
    Check whether the element is its form's default submit button.

    The nearest HTML `form` ancestor is located first. Its default button is
    the first `input` or `button` descendant whose `type` is `submit`. The
    form/button pair is remembered in `self.cached_default_forms` so later
    checks against the same form can be answered without another scan.
    """

    # Climb the ancestors until an HTML `form` is found (or we run out of parents).
    form = None
    ancestor = self.get_parent(el, no_iframe=True)
    while ancestor:
        if self.get_tag(ancestor) == 'form' and self.is_html_tag(ancestor):
            form = ancestor
            break
        ancestor = self.get_parent(ancestor, no_iframe=True)

    # A cached entry for this form answers the question directly.
    for cached_form, default_button in self.cached_default_forms:
        if cached_form is form:
            return default_button is el

    # Not cached: scan the form for its first submit control.
    for node in self.get_descendants(form, no_iframe=True):
        tag_name = self.get_tag(node)
        # Can't do nested forms (haven't figured out why we never hit this)
        if tag_name == 'form':  # pragma: no cover
            break
        if tag_name not in ('input', 'button'):
            continue
        input_type = self.get_attribute_by_name(node, 'type', '')
        if input_type and util.lower(input_type) == 'submit':
            self.cached_default_forms.append((form, node))
            return el is node
    return False

1095 

def match_indeterminate(self, el: bs4.Tag) -> bool:
    """
    Match an element whose same-named radio group has no checked member.

    The element's form (or, when there is no form, the topmost ancestor reached)
    is searched for another `input` with `type="radio"`, the same `name` and a
    `checked` attribute. The element matches only when no such input exists.
    The outcome is stored in `self.cached_indeterminate_forms` per
    (form, name) pair and reused on later calls.
    """

    match = False
    name = cast(str, self.get_attribute_by_name(el, 'name'))

    def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
        """Return the nearest HTML `form` ancestor, else the last ancestor reached before running out of parents."""
        form = None
        parent = self.get_parent(el, no_iframe=True)
        while form is None:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                form = parent
                break
            last_parent = parent
            parent = self.get_parent(parent, no_iframe=True)
            if parent is None:
                # No form found: fall back to the topmost ancestor as the search scope.
                form = last_parent
                break
        return form

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    found_form = False
    for f, n, i in self.cached_indeterminate_forms:
        if f is form and n == name:
            found_form = True
            if i is True:
                match = True
            break

    # We didn't have the form cached, so validate that the radio button is indeterminate
    if not found_form:
        checked = False
        for child in self.get_descendants(form, no_iframe=True):
            # The element itself is not a "fellow" radio button.
            if child is el:
                continue
            tag_name = self.get_tag(child)
            if tag_name == 'input':
                is_radio = False
                check = False
                has_name = False
                for k, v in self.iter_attributes(child):
                    if util.lower(k) == 'type' and util.lower(v) == 'radio':
                        is_radio = True
                    elif util.lower(k) == 'name' and v == name:
                        has_name = True
                    elif util.lower(k) == 'checked':
                        check = True
                    # Only a checked radio of the same name that resolves to the same form counts.
                    if is_radio and check and has_name and get_parent_form(child) is form:
                        checked = True
                        break
            if checked:
                break
        if not checked:
            match = True
        # Remember the verdict for this (form, name) group.
        self.cached_indeterminate_forms.append((form, name, match))

    return match

1156 

def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
    """
    Match the element's language against the requested language patterns.

    The language is taken from the closest `lang` attribute (or namespaced
    `lang` in the XML namespace) found on the element or its ancestors. When
    none is found, a cached or freshly located
    `<meta http-equiv="content-language" content="...">` inside `html > head`
    is used for HTML documents. Every entry of `langs` must have at least one
    pattern accepted by `extended_language_filter` for the element to match.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el
    found_lang = None
    last = None
    # NOTE: the loop tests truthiness, so an empty attribute value does not stop the walk.
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        if parent is None:
            # Ran out of ancestors: treat the last node visited as the root for the steps below.
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    if found_lang is None and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        found = False
        # Descend one level per tag name: first to `html`, then to `head`.
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            for child in parent:
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
                            break
                if found_lang is not None:
                    break
            if found_lang is None:
                # Cache an empty language so this root is not searched again.
                self.cached_meta_lang.append((cast(str, root), ''))

    # If we determined a language, compare.
    if found_lang is not None:
        for patterns in langs:
            # Each group of patterns needs at least one hit; the first group without one ends the check.
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, cast(str, found_lang)):
                    match = True
            if not match:
                break

    return match

1242 

def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
    """
    Check whether the element's text direction equals `directionality`.

    `directionality` is a combination of `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL`
    flags. The direction comes from the element's `dir` attribute when it is
    `ltr` or `rtl`; for `dir="auto"` (and for `bdi` without `dir`) it is derived
    from the text; otherwise the parent's direction is used, with the root
    defaulting to left to right. Returns `False` for `None` or non-HTML elements.
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    # Also terminates the parent recursion below once `get_parent` returns `None`.
    if el is None or not self.is_html_tag(el):
        return False

    # Element has defined direction of left to right or right to left
    # (`direction` is `None` for a missing/unknown `dir` value and `0` for `auto`).
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=tel]` and no direction is assigned, assume left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            # A textarea's value is the concatenation of its content strings.
            temp = []
            for node in self.get_contents(el, no_iframe=True):
                if self.is_content_string(node):
                    temp.append(node)
            value = ''.join(temp)
        else:
            value = cast(str, self.get_attribute_by_name(el, 'value', ''))
        if value:
            # The first character with a strong bidirectional class decides the direction.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.find_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Match parents direction
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

1305 

def match_range(self, el: bs4.Tag, condition: int) -> bool:
    """
    Match `:in-range` / `:out-of-range`.

    The behavior follows what browsers appear to do: they decide whether the
    value is out of range, and anything that is not out of range is in range.
    A missing value is therefore never out of range and counts as in range,
    even though "neither" would arguably be the more natural answer.

    Returns `False` when neither `min` nor `max` parses to a usable value.
    Otherwise returns the in-range verdict when `condition` has
    `ct.SEL_IN_RANGE` set, and the out-of-range verdict when it does not.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))
    mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
    mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))

    # There is no valid min or max, so we cannot evaluate a range
    if mn is None and mx is None:
        return False

    value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))

    linear_types = ("date", "datetime-local", "month", "week", "number", "range")
    if value is None:
        out_of_range = False
    elif itype == "time" and mn is not None and mx is not None and mn > mx:
        # Time is periodic, so this is a reversed/discontinuous range
        out_of_range = value < mn and value > mx
    elif itype == "time" or itype in linear_types:
        # Ordinary range: below the minimum or above the maximum.
        out_of_range = (mn is not None and value < mn) or (mx is not None and value > mx)
    else:
        out_of_range = False

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range

1345 

def match_defined(self, el: bs4.Tag) -> bool:
    """
    Match defined.

    `:defined` is related to custom elements in a browser. This check treats
    a tag as defined when any of the following holds:

    - The tag name has no hyphen, i.e. it is not a custom element name.
    - The tag name contains a colon.
    - The element has a prefix (with or without a namespace).

    A tag without a name is never defined. This of course requires the parser
    to provide us with the proper prefix and namespace info; if it doesn't,
    there is nothing we can do.

    NOTE(review): the rule that every tag in a pure XML (not XHTML) document
    matches is not implemented in this method; presumably it is handled by the
    caller -- confirm.
    """

    name = self.get_tag(el)
    if name is None:
        return False
    return '-' not in name or ':' in name or self.get_prefix(el) is not None

1368 

def match_placeholder_shown(self, el: bs4.Tag) -> bool:
    """
    Match placeholder shown according to HTML spec.

    The placeholder counts as shown when the element's text is empty. A single
    newline is not considered content, so it is treated the same as no text.
    """

    return self.get_text(el) in ('', '\n')

1383 

def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
    """
    Check if element matches one of the selectors.

    Each selector in the list is tried in turn and the first one whose checks
    all pass decides the result. For a negated list (`selectors.is_not`) the
    sense is inverted: a selector that passes yields `False`.

    Selector lists flagged `is_html` are only evaluated for HTML documents.
    While they are evaluated, `self.namespaces` and `self.iframe_restrict` are
    temporarily overridden. The original values are restored in a `finally`
    block so an exception raised by a matcher cannot leave the override behind.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    try:
        if not is_html or self.is_html:
            for selector in selectors:
                match = is_not
                # We have an un-matchable situation (like `:focus`, as you can't focus an element in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if selector.contains and not self.match_contains(el, selector.contains):
                    continue
                # Every check passed: this selector decides the result.
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match

1467 

def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
    """
    Yield the matching descendants of the targeted tag.

    Descendants of `self.tag` are yielded in the order `get_descendants`
    produces them. A `limit` below 1 means no limit; otherwise iteration stops
    once `limit` matches have been yielded.
    """

    remaining = limit if limit >= 1 else None

    for node in self.get_descendants(self.tag):
        if not self.match(node):
            continue
        yield node
        if remaining is not None:
            remaining -= 1
            if remaining < 1:
                return

1480 

def closest(self) -> bs4.Tag | None:
    """
    Return the closest matching ancestor.

    The search starts at `self.tag` itself and moves up through `get_parent`.
    The first node that matches is returned, or `None` when the chain of
    parents ends without a match.
    """

    node = self.tag
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None

1492 

def filter(self) -> list[bs4.Tag]:  # noqa A001
    """
    Return the targeted tag's direct contents that match.

    Navigable strings are skipped before matching is attempted; the remaining
    nodes are kept, in order, when `match` accepts them.
    """

    kept = []
    for node in self.get_contents(self.tag):
        if self.is_navigable_string(node):
            continue
        if self.match(node):
            kept.append(node)
    return kept

1497 

def match(self, el: bs4.Tag) -> bool:
    """
    Match the element against this matcher's selectors.

    Document objects and non-tag nodes never match; everything else is
    decided by `match_selectors` using `self.selectors`.
    """

    if self.is_doc(el):
        return False
    return self.is_tag(el) and self.match_selectors(el, self.selectors)

1502 

1503 

class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Holds a compiled selector list together with the pattern, namespaces,
    custom selectors and flags it was built from. Each operation creates a
    `CSSMatch` for the given tag and delegates to it.
    """

    pattern: str
    selectors: ct.SelectorList
    namespaces: ct.Namespaces | None
    custom: dict[str, str]
    flags: int

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(
        self,
        pattern: str,
        selectors: ct.SelectorList,
        namespaces: ct.Namespaces | None,
        custom: ct.CustomSelectors | None,
        flags: int
    ):
        """Initialize by handing every field to the immutable base class."""

        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def match(self, tag: bs4.Tag) -> bool:
        """Return whether `tag` matches the compiled selectors."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)

    def closest(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Return the closest matching ancestor of `tag` (or `tag` itself), or `None` if nothing matches."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()

    def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]:  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        if CSSMatch.is_tag(iterable):
            return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
        else:
            return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]

    def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Return the first matching descendant of `tag`, or `None` when there is none."""

        tags = self.select(tag, limit=1)
        return tags[0] if tags else None

    def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
        """Return the matching descendants of `tag` as a list (`limit` below 1 means no limit)."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
        """Iterate the matching descendants of `tag` lazily (`limit` below 1 means no limit)."""

        yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)

    def __repr__(self) -> str:  # pragma: no cover
        """Representation."""

        return (
            f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
            f"custom={self.custom!r}, flags={self.flags!r})"
        )

    __str__ = __repr__

1588 

1589 

# Pass the `SoupSieve` class to `ct.pickle_register` (presumably so compiled selectors can be pickled -- confirm in `css_types`).
ct.pickle_register(SoupSieve)