Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/soupsieve/css_match.py: 21%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

974 statements  

1"""CSS matcher.""" 

2from __future__ import annotations 

3from datetime import datetime 

4from . import util 

5import re 

6from . import css_types as ct 

7import unicodedata 

8import bs4 

9from typing import Iterator, Iterable, Any, Callable, Sequence, Any, cast # noqa: F401, F811 

10 

# Empty tag pattern (whitespace okay): matches any single character that is not
# a space, tab, carriage return, newline, or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Matches a run of one or more non-whitespace characters (same whitespace set as above).
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking); the same symbols as above prefixed with `:`.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs for XHTML and XML.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined flag masks built from the `css_types` selector flags.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercase `dir` attribute value to its direction flag (`auto` maps to 0).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns for input values; each must match the entire string.
# Number: optional leading `-`, then digits with an optional fraction, or a bare fraction such as `.5`.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
# Time: two-digit hour and two-digit minutes separated by `:`.
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
# Month: year of four or more digits, `-`, two-digit month.
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
# Week: year of four or more digits, `-W`, two-digit week.
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
# Date: year of four or more digits, two-digit month, two-digit day.
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
# Date and time: the date form, a literal `T`, then the time form.
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Matches `-*-` (plus any directly following `*` subtags) or a trailing `-*`.
# NOTE(review): used below to collapse non-leading wildcards in a language range.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar limits used when validating date parts.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7

57 

58 

59class _FakeParent: 

60 """ 

61 Fake parent class. 

62 

63 When we have a fragment with no `BeautifulSoup` document object, 

64 we can't evaluate `nth` selectors properly. Create a temporary 

65 fake parent so we can traverse the root element as a child. 

66 """ 

67 

68 def __init__(self, element: bs4.Tag) -> None: 

69 """Initialize.""" 

70 

71 self.contents = [element] 

72 

73 def __len__(self) -> int: 

74 """Length.""" 

75 

76 return len(self.contents) 

77 

78 

79class _DocumentNav: 

80 """Navigate a Beautiful Soup document.""" 

81 

82 @classmethod 

83 def assert_valid_input(cls, tag: Any) -> None: 

84 """Check if valid input tag or document.""" 

85 

86 # Fail on unexpected types. 

87 if not cls.is_tag(tag): 

88 raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}") 

89 

90 @staticmethod 

91 def is_doc(obj: bs4.element.PageElement | None) -> bool: 

92 """Is `BeautifulSoup` object.""" 

93 return isinstance(obj, bs4.BeautifulSoup) 

94 

95 @staticmethod 

96 def is_tag(obj: bs4.element.PageElement | None) -> bool: 

97 """Is tag.""" 

98 return isinstance(obj, bs4.Tag) 

99 

100 @staticmethod 

101 def is_declaration(obj: bs4.element.PageElement | None) -> bool: # pragma: no cover 

102 """Is declaration.""" 

103 return isinstance(obj, bs4.Declaration) 

104 

105 @staticmethod 

106 def is_cdata(obj: bs4.element.PageElement | None) -> bool: 

107 """Is CDATA.""" 

108 return isinstance(obj, bs4.CData) 

109 

110 @staticmethod 

111 def is_processing_instruction(obj: bs4.element.PageElement | None) -> bool: # pragma: no cover 

112 """Is processing instruction.""" 

113 return isinstance(obj, bs4.ProcessingInstruction) 

114 

115 @staticmethod 

116 def is_navigable_string(obj: bs4.element.PageElement | None) -> bool: 

117 """Is navigable string.""" 

118 return isinstance(obj, bs4.element.NavigableString) 

119 

120 @staticmethod 

121 def is_special_string(obj: bs4.element.PageElement | None) -> bool: 

122 """Is special string.""" 

123 return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 

124 

125 @classmethod 

126 def is_content_string(cls, obj: bs4.element.PageElement | None) -> bool: 

127 """Check if node is content string.""" 

128 

129 return cls.is_navigable_string(obj) and not cls.is_special_string(obj) 

130 

131 @staticmethod 

132 def create_fake_parent(el: bs4.Tag) -> _FakeParent: 

133 """Create fake parent for a given element.""" 

134 

135 return _FakeParent(el) 

136 

137 @staticmethod 

138 def is_xml_tree(el: bs4.Tag | None) -> bool: 

139 """Check if element (or document) is from a XML tree.""" 

140 

141 return el is not None and bool(el._is_xml) 

142 

143 def is_iframe(self, el: bs4.Tag | None) -> bool: 

144 """Check if element is an `iframe`.""" 

145 

146 if el is None: # pragma: no cover 

147 return False 

148 

149 return bool( 

150 ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and 

151 self.is_html_tag(el) # type: ignore[attr-defined] 

152 ) 

153 

154 def is_root(self, el: bs4.Tag) -> bool: 

155 """ 

156 Return whether element is a root element. 

157 

158 We check that the element is the root of the tree (which we have already pre-calculated), 

159 and we check if it is the root element under an `iframe`. 

160 """ 

161 

162 root = self.root and self.root is el # type: ignore[attr-defined] 

163 if not root: 

164 parent = self.get_parent(el) 

165 root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined] 

166 return root 

167 

168 def get_contents(self, el: bs4.Tag | None, no_iframe: bool = False) -> Iterator[bs4.element.PageElement]: 

169 """Get contents or contents in reverse.""" 

170 

171 if el is not None: 

172 if not no_iframe or not self.is_iframe(el): 

173 yield from el.contents 

174 

175 def get_tag_children( 

176 self, 

177 el: bs4.Tag | None, 

178 start: int | None = None, 

179 reverse: bool = False, 

180 no_iframe: bool = False 

181 ) -> Iterator[bs4.Tag]: 

182 """Get tag children.""" 

183 

184 return self.get_children(el, start, reverse, True, no_iframe) # type: ignore[return-value] 

185 

186 def get_children( 

187 self, 

188 el: bs4.Tag | None, 

189 start: int | None = None, 

190 reverse: bool = False, 

191 tags: bool = False, 

192 no_iframe: bool = False 

193 ) -> Iterator[bs4.element.PageElement]: 

194 """Get children.""" 

195 

196 if el is not None and (not no_iframe or not self.is_iframe(el)): 

197 last = len(el.contents) - 1 

198 if start is None: 

199 index = last if reverse else 0 

200 else: 

201 index = start 

202 end = -1 if reverse else last + 1 

203 incr = -1 if reverse else 1 

204 

205 if 0 <= index <= last: 

206 while index != end: 

207 node = el.contents[index] 

208 index += incr 

209 if not tags or self.is_tag(node): 

210 yield node 

211 

212 def get_tag_descendants( 

213 self, 

214 el: bs4.Tag | None, 

215 no_iframe: bool = False 

216 ) -> Iterator[bs4.Tag]: 

217 """Specifically get tag descendants.""" 

218 

219 yield from self.get_descendants(el, tags=True, no_iframe=no_iframe) # type: ignore[misc] 

220 

221 def get_descendants( 

222 self, 

223 el: bs4.Tag | None, 

224 tags: bool = False, 

225 no_iframe: bool = False 

226 ) -> Iterator[bs4.element.PageElement]: 

227 """Get descendants.""" 

228 

229 if el is not None and (not no_iframe or not self.is_iframe(el)): 

230 next_good = None 

231 for child in el.descendants: 

232 

233 if next_good is not None: 

234 if child is not next_good: 

235 continue 

236 next_good = None 

237 

238 if isinstance(child, bs4.Tag): 

239 if no_iframe and self.is_iframe(child): 

240 if child.next_sibling is not None: 

241 next_good = child.next_sibling 

242 else: 

243 last_child = child # type: bs4.element.PageElement 

244 while isinstance(last_child, bs4.Tag) and last_child.contents: 

245 last_child = last_child.contents[-1] 

246 next_good = last_child.next_element 

247 yield child 

248 if next_good is None: 

249 break 

250 # Coverage isn't seeing this even though it's executed 

251 continue # pragma: no cover 

252 yield child 

253 

254 elif not tags: 

255 yield child 

256 

257 def get_parent(self, el: bs4.Tag | None, no_iframe: bool = False) -> bs4.Tag | None: 

258 """Get parent.""" 

259 

260 parent = el.parent if el is not None else None 

261 if no_iframe and parent is not None and self.is_iframe(parent): 

262 parent = None 

263 return parent 

264 

265 @staticmethod 

266 def get_tag_name(el: bs4.Tag | None) -> str | None: 

267 """Get tag.""" 

268 

269 return el.name if el is not None else None 

270 

271 @staticmethod 

272 def get_prefix_name(el: bs4.Tag) -> str | None: 

273 """Get prefix.""" 

274 

275 return el.prefix 

276 

277 @staticmethod 

278 def get_uri(el: bs4.Tag | None) -> str | None: 

279 """Get namespace `URI`.""" 

280 

281 return el.namespace if el is not None else None 

282 

283 @classmethod 

284 def get_next_tag(cls, el: bs4.Tag) -> bs4.Tag | None: 

285 """Get next sibling tag.""" 

286 

287 return cls.get_next(el, tags=True) # type: ignore[return-value] 

288 

289 @classmethod 

290 def get_next(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None: 

291 """Get next sibling tag.""" 

292 

293 sibling = el.next_sibling 

294 while tags and not isinstance(sibling, bs4.Tag) and sibling is not None: 

295 sibling = sibling.next_sibling 

296 

297 if tags and not isinstance(sibling, bs4.Tag): 

298 sibling = None 

299 

300 return sibling 

301 

302 @classmethod 

303 def get_previous_tag(cls, el: bs4.Tag, tags: bool = True) -> bs4.Tag | None: 

304 """Get previous sibling tag.""" 

305 

306 return cls.get_previous(el, True) # type: ignore[return-value] 

307 

308 @classmethod 

309 def get_previous(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None: 

310 """Get previous sibling tag.""" 

311 

312 sibling = el.previous_sibling 

313 while tags and not isinstance(sibling, bs4.Tag) and sibling is not None: 

314 sibling = sibling.previous_sibling 

315 

316 if tags and not isinstance(sibling, bs4.Tag): 

317 sibling = None 

318 

319 return sibling 

320 

321 @staticmethod 

322 def has_html_ns(el: bs4.Tag | None) -> bool: 

323 """ 

324 Check if element has an HTML namespace. 

325 

326 This is a bit different than whether a element is treated as having an HTML namespace, 

327 like we do in the case of `is_html_tag`. 

328 """ 

329 

330 ns = getattr(el, 'namespace') if el is not None else None # noqa: B009 

331 return bool(ns and ns == NS_XHTML) 

332 

333 @staticmethod 

334 def split_namespace(el: bs4.Tag | None, attr_name: str) -> tuple[str | None, str | None]: 

335 """Return namespace and attribute name without the prefix.""" 

336 

337 if el is None: # pragma: no cover 

338 return None, None 

339 

340 return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) 

341 

342 @classmethod 

343 def normalize_value(cls, value: Any) -> str | Sequence[str]: 

344 """Normalize the value to be a string or list of strings.""" 

345 

346 # Treat `None` as empty string. 

347 if value is None: 

348 return '' 

349 

350 # Pass through strings 

351 if (isinstance(value, str)): 

352 return value 

353 

354 # If it's a byte string, convert it to Unicode, treating it as UTF-8. 

355 if isinstance(value, bytes): 

356 return value.decode("utf8") 

357 

358 # BeautifulSoup supports sequences of attribute values, so make sure the children are strings. 

359 if isinstance(value, Sequence): 

360 new_value = [] 

361 for v in value: 

362 if not isinstance(v, (str, bytes)) and isinstance(v, Sequence): 

363 # This is most certainly a user error and will crash and burn later. 

364 # To keep things working, we'll do what we do with all objects, 

365 # And convert them to strings. 

366 new_value.append(str(v)) 

367 else: 

368 # Convert the child to a string 

369 new_value.append(cast(str, cls.normalize_value(v))) 

370 return new_value 

371 

372 # Try and make anything else a string 

373 return str(value) 

374 

375 @classmethod 

376 def get_attribute_by_name( 

377 cls, 

378 el: bs4.Tag, 

379 name: str, 

380 default: str | Sequence[str] | None = None 

381 ) -> str | Sequence[str] | None: 

382 """Get attribute by name.""" 

383 

384 value = default 

385 if el._is_xml: 

386 try: 

387 value = cls.normalize_value(el.attrs[name]) 

388 except KeyError: 

389 pass 

390 else: 

391 for k, v in el.attrs.items(): 

392 if util.lower(k) == name: 

393 value = cls.normalize_value(v) 

394 break 

395 return value 

396 

397 @classmethod 

398 def iter_attributes(cls, el: bs4.Tag | None) -> Iterator[tuple[str, str | Sequence[str] | None]]: 

399 """Iterate attributes.""" 

400 

401 if el is not None: 

402 for k, v in el.attrs.items(): 

403 yield k, cls.normalize_value(v) 

404 

405 @classmethod 

406 def get_classes(cls, el: bs4.Tag) -> Sequence[str]: 

407 """Get classes.""" 

408 

409 classes = cls.get_attribute_by_name(el, 'class', []) 

410 if isinstance(classes, str): 

411 classes = RE_NOT_WS.findall(classes) 

412 return cast(Sequence[str], classes) 

413 

414 def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str: 

415 """Get text.""" 

416 

417 return ''.join( 

418 [ 

419 node for node in self.get_descendants(el, no_iframe=no_iframe) # type: ignore[misc] 

420 if self.is_content_string(node) 

421 ] 

422 ) 

423 

424 def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]: 

425 """Get Own Text.""" 

426 

427 return [ 

428 node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node) # type: ignore[misc] 

429 ] 

430 

431 

class Inputs:
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year: int, month: int, day: int) -> bool:
        """Return whether `day` exists in `month` of `year`, accounting for leap years."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year: int, week: int) -> bool:
        """
        Return whether `week` is a valid ISO week number for `year`.

        A year has either 52 or 53 ISO weeks. Years below 1 are rejected.
        Any positive year is accepted, including years with fewer than four
        significant digits (e.g. `0999`) and years beyond what `datetime`
        can represent.
        """

        if year < datetime.min.year:
            return False

        # The Gregorian calendar repeats every 400 years, and that cycle is a whole
        # number of weeks, so an out-of-range year can be mapped to an equivalent one.
        if year > datetime.max.year:
            year = 2000 + year % 400

        # December 28th always falls within the last ISO week of its own year, whereas
        # December 31st may already belong to week 1 of the following year.
        max_week = datetime(year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month: int) -> bool:
        """Return whether `month` is between 1 and 12 inclusive."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year: int) -> bool:
        """Return whether `year` is at least 1."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour: int) -> bool:
        """Return whether `hour` is between 0 and 23 inclusive."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes: int) -> bool:
        """Return whether `minutes` is between 0 and 59 inclusive."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
        """
        Parse an input value into a tuple of numbers.

        `itype` selects the format: `date`, `month`, `week`, `time`,
        `datetime-local`, `number`, or `range`. The result holds the parsed
        parts in order of significance (for example `(year, month, day)`).
        `None` is returned when `value` is `None`, the type is not one of
        the above, the text does not match the format, or a part is invalid.
        """

        parsed = None  # type: tuple[float, ...] | None
        if value is None:
            return value
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = (float(m.group('value')),)
        return parsed

533 

534 

535class CSSMatch(_DocumentNav): 

536 """Perform CSS matching.""" 

537 

538 def __init__( 

539 self, 

540 selectors: ct.SelectorList, 

541 scope: bs4.Tag | None, 

542 namespaces: ct.Namespaces | None, 

543 flags: int 

544 ) -> None: 

545 """Initialize.""" 

546 

547 self.assert_valid_input(scope) 

548 self.tag = scope 

549 self.cached_meta_lang = [] # type: list[tuple[str, str]] 

550 self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]] 

551 self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]] 

552 self.selectors = selectors 

553 self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str] 

554 self.flags = flags 

555 self.iframe_restrict = False 

556 

557 # Find the root element for the whole tree 

558 doc = scope 

559 parent = self.get_parent(doc) 

560 while parent: 

561 doc = parent 

562 parent = self.get_parent(doc) 

563 root = None # type: bs4.Tag | None 

564 if not self.is_doc(doc): 

565 root = doc 

566 else: 

567 for child in self.get_tag_children(doc): 

568 root = child 

569 break 

570 

571 self.root = root 

572 self.scope = scope if scope is not doc else root 

573 self.has_html_namespace = self.has_html_ns(root) 

574 

575 # A document can be both XML and HTML (XHTML) 

576 self.is_xml = self.is_xml_tree(doc) 

577 self.is_html = not self.is_xml or self.has_html_namespace 

578 

579 def supports_namespaces(self) -> bool: 

580 """Check if namespaces are supported in the HTML type.""" 

581 

582 return self.is_xml or self.has_html_namespace 

583 

584 def get_tag_ns(self, el: bs4.Tag | None) -> str: 

585 """Get tag namespace.""" 

586 

587 namespace = '' 

588 if el is None: # pragma: no cover 

589 return namespace 

590 

591 if self.supports_namespaces(): 

592 ns = self.get_uri(el) 

593 if ns: 

594 namespace = ns 

595 else: 

596 namespace = NS_XHTML 

597 return namespace 

598 

599 def is_html_tag(self, el: bs4.Tag | None) -> bool: 

600 """Check if tag is in HTML namespace.""" 

601 

602 return self.get_tag_ns(el) == NS_XHTML 

603 

604 def get_tag(self, el: bs4.Tag | None) -> str | None: 

605 """Get tag.""" 

606 

607 name = self.get_tag_name(el) 

608 return util.lower(name) if name is not None and not self.is_xml else name 

609 

610 def get_prefix(self, el: bs4.Tag) -> str | None: 

611 """Get prefix.""" 

612 

613 prefix = self.get_prefix_name(el) 

614 return util.lower(prefix) if prefix is not None and not self.is_xml else prefix 

615 

616 def find_bidi(self, el: bs4.Tag) -> int | None: 

617 """Get directionality from element text.""" 

618 

619 for node in self.get_children(el): 

620 

621 # Analyze child text nodes 

622 if self.is_tag(node): 

623 

624 # Avoid analyzing certain elements specified in the specification. 

625 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) # type: ignore[arg-type] 

626 name = self.get_tag(node) # type: ignore[arg-type] 

627 if ( 

628 (name and name in ('bdi', 'script', 'style', 'textarea', 'iframe')) or 

629 not self.is_html_tag(node) or # type: ignore[arg-type] 

630 direction is not None 

631 ): 

632 continue # pragma: no cover 

633 

634 # Check directionality of this node's text 

635 value = self.find_bidi(node) # type: ignore[arg-type] 

636 if value is not None: 

637 return value 

638 

639 # Direction could not be determined 

640 continue # pragma: no cover 

641 

642 # Skip `doctype` comments, etc. 

643 if self.is_special_string(node): 

644 continue 

645 

646 # Analyze text nodes for directionality. 

647 for c in node: # type: ignore[attr-defined] 

648 bidi = unicodedata.bidirectional(c) 

649 if bidi in ('AL', 'R', 'L'): 

650 return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL 

651 return None 

652 

653 def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool: 

654 """Filter the language tags.""" 

655 

656 match = True 

657 lang_range = RE_WILD_STRIP.sub('-', lang_range).lower() 

658 ranges = lang_range.split('-') 

659 subtags = lang_tag.lower().split('-') 

660 length = len(ranges) 

661 slength = len(subtags) 

662 rindex = 0 

663 sindex = 0 

664 r = ranges[rindex] 

665 s = subtags[sindex] 

666 

667 # Empty specified language should match unspecified language attributes 

668 if length == 1 and slength == 1 and not r and r == s: 

669 return True 

670 

671 # Primary tag needs to match 

672 if (r != '*' and r != s) or (r == '*' and slength == 1 and not s): 

673 match = False 

674 

675 rindex += 1 

676 sindex += 1 

677 

678 # Match until we run out of ranges 

679 while match and rindex < length: 

680 r = ranges[rindex] 

681 try: 

682 s = subtags[sindex] 

683 except IndexError: 

684 # Ran out of subtags, 

685 # but we still have ranges 

686 match = False 

687 continue 

688 

689 # Empty range 

690 if not r: 

691 match = False 

692 continue 

693 

694 # Matched range 

695 elif s == r: 

696 rindex += 1 

697 

698 # Implicit wildcard cannot match 

699 # singletons 

700 elif len(s) == 1: 

701 match = False 

702 continue 

703 

704 # Implicitly matched, so grab next subtag 

705 sindex += 1 

706 

707 return match 

708 

709 def match_attribute_name( 

710 self, 

711 el: bs4.Tag, 

712 attr: str, 

713 prefix: str | None 

714 ) -> str | Sequence[str] | None: 

715 """Match attribute name and return value if it exists.""" 

716 

717 value = None 

718 if self.supports_namespaces(): 

719 value = None 

720 # If we have not defined namespaces, we can't very well find them, so don't bother trying. 

721 if prefix: 

722 ns = self.namespaces.get(prefix) 

723 if ns is None and prefix != '*': 

724 return None 

725 else: 

726 ns = None 

727 

728 for k, v in self.iter_attributes(el): 

729 

730 # Get attribute parts 

731 namespace, name = self.split_namespace(el, k) 

732 

733 # Can't match a prefix attribute as we haven't specified one to match 

734 # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`. 

735 if ns is None: 

736 if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)): 

737 value = v 

738 break 

739 # Coverage is not finding this even though it is executed. 

740 # Adding a print statement before this (and erasing coverage) causes coverage to find the line. 

741 # Ignore the false positive message. 

742 continue # pragma: no cover 

743 

744 # We can't match our desired prefix attribute as the attribute doesn't have a prefix 

745 if namespace is None or (ns != namespace and prefix != '*'): 

746 continue 

747 

748 # The attribute doesn't match. 

749 if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name): 

750 continue 

751 

752 value = v 

753 break 

754 else: 

755 for k, v in self.iter_attributes(el): 

756 if util.lower(attr) != util.lower(k): 

757 continue 

758 value = v 

759 break 

760 return value 

761 

762 def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: 

763 """Match the namespace of the element.""" 

764 

765 match = True 

766 namespace = self.get_tag_ns(el) 

767 default_namespace = self.namespaces.get('') 

768 tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix) 

769 # We must match the default namespace if one is not provided 

770 if tag.prefix is None and (default_namespace is not None and namespace != default_namespace): 

771 match = False 

772 # If we specified `|tag`, we must not have a namespace. 

773 elif (tag.prefix is not None and tag.prefix == '' and namespace): 

774 match = False 

775 # Verify prefix matches 

776 elif ( 

777 tag.prefix and 

778 tag.prefix != '*' and (tag_ns is None or namespace != tag_ns) 

779 ): 

780 match = False 

781 return match 

782 

783 def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool: 

784 """Match attributes.""" 

785 

786 match = True 

787 if attributes: 

788 for a in attributes: 

789 temp = self.match_attribute_name(el, a.attribute, a.prefix) 

790 pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern 

791 if temp is None: 

792 match = False 

793 break 

794 value = temp if isinstance(temp, str) else ' '.join(temp) 

795 if pattern is None: 

796 continue 

797 elif pattern.match(value) is None: 

798 match = False 

799 break 

800 return match 

801 

802 def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: 

803 """Match tag name.""" 

804 

805 name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name) 

806 return not ( 

807 name is not None and 

808 name not in (self.get_tag(el), '*') 

809 ) 

810 

811 def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool: 

812 """Match the tag.""" 

813 

814 match = True 

815 if tag is not None: 

816 # Verify namespace 

817 if not self.match_namespace(el, tag): 

818 match = False 

819 if not self.match_tagname(el, tag): 

820 match = False 

821 return match 

822 

823 def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

824 """Match past relationship.""" 

825 

826 found = False 

827 # I don't think this can ever happen, but it makes `mypy` happy 

828 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover 

829 return found 

830 

831 if relation[0].rel_type == REL_PARENT: 

832 parent = self.get_parent(el, no_iframe=self.iframe_restrict) 

833 while not found and parent: 

834 found = self.match_selectors(parent, relation) 

835 parent = self.get_parent(parent, no_iframe=self.iframe_restrict) 

836 elif relation[0].rel_type == REL_CLOSE_PARENT: 

837 parent = self.get_parent(el, no_iframe=self.iframe_restrict) 

838 if parent: 

839 found = self.match_selectors(parent, relation) 

840 elif relation[0].rel_type == REL_SIBLING: 

841 sibling = self.get_previous_tag(el) 

842 while not found and sibling: 

843 found = self.match_selectors(sibling, relation) 

844 sibling = self.get_previous_tag(sibling) 

845 elif relation[0].rel_type == REL_CLOSE_SIBLING: 

846 sibling = self.get_previous_tag(el) 

847 if sibling and self.is_tag(sibling): 

848 found = self.match_selectors(sibling, relation) 

849 return found 

850 

851 def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool: 

852 """Match future child.""" 

853 

854 match = False 

855 if recursive: 

856 children = self.get_tag_descendants # type: Callable[..., Iterator[bs4.Tag]] 

857 else: 

858 children = self.get_tag_children 

859 for child in children(parent, no_iframe=self.iframe_restrict): 

860 match = self.match_selectors(child, relation) 

861 if match: 

862 break 

863 return match 

864 

865 def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

866 """Match future relationship.""" 

867 

868 found = False 

869 # I don't think this can ever happen, but it makes `mypy` happy 

870 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover 

871 return found 

872 

873 if relation[0].rel_type == REL_HAS_PARENT: 

874 found = self.match_future_child(el, relation, True) 

875 elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: 

876 found = self.match_future_child(el, relation) 

877 elif relation[0].rel_type == REL_HAS_SIBLING: 

878 sibling = self.get_next_tag(el) 

879 while not found and sibling: 

880 found = self.match_selectors(sibling, relation) 

881 sibling = self.get_next_tag(sibling) 

882 elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: 

883 sibling = self.get_next_tag(el) 

884 if sibling and self.is_tag(sibling): 

885 found = self.match_selectors(sibling, relation) 

886 return found 

887 

888 def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: 

889 """Match relationship to other elements.""" 

890 

891 found = False 

892 

893 if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None: 

894 return found 

895 

896 if relation[0].rel_type.startswith(':'): 

897 found = self.match_future_relations(el, relation) 

898 else: 

899 found = self.match_past_relations(el, relation) 

900 

901 return found 

902 

903 def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool: 

904 """Match element's ID.""" 

905 

906 found = True 

907 for i in ids: 

908 if i != self.get_attribute_by_name(el, 'id', ''): 

909 found = False 

910 break 

911 return found 

912 

913 def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool: 

914 """Match element's classes.""" 

915 

916 current_classes = self.get_classes(el) 

917 found = True 

918 for c in classes: 

919 if c not in current_classes: 

920 found = False 

921 break 

922 return found 

923 

924 def match_root(self, el: bs4.Tag) -> bool: 

925 """Match element as root.""" 

926 

927 is_root = self.is_root(el) 

928 if is_root: 

929 sibling = self.get_previous(el) # type: Any 

930 while is_root and sibling is not None: 

931 if ( 

932 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or 

933 self.is_cdata(sibling) 

934 ): 

935 is_root = False 

936 else: 

937 sibling = self.get_previous(sibling) 

938 if is_root: 

939 sibling = self.get_next(el) 

940 while is_root and sibling is not None: 

941 if ( 

942 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or 

943 self.is_cdata(sibling) 

944 ): 

945 is_root = False 

946 else: 

947 sibling = self.get_next(sibling) 

948 return is_root 

949 

950 def match_scope(self, el: bs4.Tag) -> bool: 

951 """Match element as scope.""" 

952 

953 return self.scope is el 

954 

955 def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool: 

956 """Match tag type for `nth` matches.""" 

957 

958 return ( 

959 (self.get_tag(child) == self.get_tag(el)) and 

960 (self.get_tag_ns(child) == self.get_tag_ns(el)) 

961 ) 

962 

def match_nth(self, el: bs4.Tag, nth: tuple[ct.SelectorNth, ...]) -> bool:
    """
    Match `nth` elements.

    Every entry of `nth` has to accept `el`; evaluation stops at the first entry that
    does not. An empty `nth` tuple yields `True`.

    For each entry the target position is `a * count + b` when `n.n` is truthy and the
    constant `a` otherwise. The parent's children are walked (in reverse when `n.last`
    is set) and only tags that pass the entry's `selectors` / `of_type` filters are
    counted; `el` matches when it is the tag sitting at a target position.
    """

    matched = True

    for n in nth:
        # Each entry starts out as "not matched" and has to prove itself.
        matched = False
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)  # type: bs4.Tag | None
        if parent is None:
            # No parent available: substitute whatever `create_fake_parent` builds for `el`.
            parent = cast('bs4.Tag', self.create_fake_parent(el))
        last = n.last
        last_index = len(parent) - 1
        # Child index to start walking from: the end when counting from the last child.
        index = last_index if last else 0
        # Number of counted (filter-passing) tags seen so far; compared against `idx`.
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        # Direction of the walk through the children.
        factor = -1 if last else 1
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    # Previously we were too high and now we are too low: stop oscillating.
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    # The distance below the range did not shrink, so give up.
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    # Previously we were too low and now we are too high: stop oscillating.
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    # The distance above the range did not shrink, so give up.
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None  # type: bs4.element.PageElement | None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0):
                # Remember where to resume if this inner walk is left early.
                index += factor
                if not isinstance(child, bs4.Tag):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    if child is el:
                        matched = True
                    else:
                        # Target position reached by another tag: move on to the next `idx`.
                        break
                if child is el:
                    # `el` has been passed; nothing further along can change the outcome.
                    break
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # A constant index never advances, so a repeated value means we are done.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched

1063 

def match_empty(self, el: bs4.Tag) -> bool:
    """Report whether `el` has no child tags and no content string holding non-whitespace."""

    for node in self.get_children(el):
        # Any child tag makes the element non-empty.
        if self.is_tag(node):
            return False
        # A content string only counts when `RE_NOT_EMPTY` finds something in it.
        if self.is_content_string(node) and RE_NOT_EMPTY.search(node):  # type: ignore[call-overload]
            return False
    return True

1076 

def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
    """
    Match selectors.

    Return `True` only when `el` satisfies every selector list in `selectors`
    (an empty tuple therefore matches). Evaluation stops at the first selector
    list that fails, since the result is already decided at that point.
    """

    return all(self.match_selectors(el, sel) for sel in selectors)

1085 

def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
    """
    Match element if it contains text.

    Every entry in `contains` must be satisfied. An entry is satisfied when at
    least one of its `text` values is found:

    - `own` entries look for the value inside any one of the strings returned by
      `get_own_text`.
    - other entries look for the value inside the single string returned by
      `get_text`.

    The two kinds of text are fetched lazily and cached separately, so a mix of
    `own` and non-`own` entries is always compared against the right kind of
    content. Returns `False` as soon as one entry fails.
    """

    own_text = None  # type: Sequence[str] | None
    full_text = None  # type: str | None

    for contain_list in contains:
        if contain_list.own:
            if own_text is None:
                own_text = self.get_own_text(el, no_iframe=self.is_html)
            found = any(text in chunk for text in contain_list.text for chunk in own_text)
        else:
            if full_text is None:
                full_text = self.get_text(el, no_iframe=self.is_html)
            found = any(text in full_text for text in contain_list.text)
        if not found:
            return False
    return True

1113 

def match_default(self, el: bs4.Tag) -> bool:
    """
    Match default.

    `el` matches when it is the first `input`/`button` descendant with `type="submit"`
    of its nearest enclosing HTML `form`. Located buttons are remembered in
    `self.cached_default_forms` as `(form, button)` pairs.
    """

    # Climb the ancestors until an HTML `form` shows up (or we run out of parents).
    form = None  # type: bs4.Tag | None
    ancestor = self.get_parent(el, no_iframe=True)
    while ancestor:
        if self.get_tag(ancestor) == 'form' and self.is_html_tag(ancestor):
            form = ancestor
            break
        ancestor = self.get_parent(ancestor, no_iframe=True)

    if form is None:
        return False

    # A cached entry for this form settles the question immediately.
    for cached_form, default_button in self.cached_default_forms:
        if cached_form is form:
            return default_button is el

    # Nothing cached yet: scan the form for its first submit control.
    for candidate in self.get_tag_descendants(form, no_iframe=True):
        tag_name = self.get_tag(candidate)
        # Can't do nested forms (haven't figured out why we never hit this)
        if tag_name == 'form':  # pragma: no cover
            break
        if tag_name not in ('input', 'button'):
            continue
        control_type = self.get_attribute_by_name(candidate, 'type', '')
        if control_type and util.lower(control_type) == 'submit':
            self.cached_default_forms.append((form, candidate))
            return el is candidate
    return False

1153 

def match_indeterminate(self, el: bs4.Tag) -> bool:
    """
    Match indeterminate.

    `el` matches when no *other* `input` with `type="radio"`, the same `name` value and a
    `checked` attribute exists under the same form (or, when there is no form, under the
    topmost ancestor). Results are remembered per `(form, name)` in
    `self.cached_indeterminate_forms`.
    """

    match = False
    name = cast(str, self.get_attribute_by_name(el, 'name'))

    def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
        """Find this input's form, falling back to the last ancestor reached when no form exists."""
        form = None
        parent = self.get_parent(el, no_iframe=True)
        # NOTE(review): if `el` has no parent at all, `parent` is `None` on the first pass and is
        # handed to `get_tag` / `is_html_tag` unguarded - confirm those helpers tolerate `None`.
        while form is None:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                form = parent
                break
            last_parent = parent
            parent = self.get_parent(parent, no_iframe=True)
            if parent is None:
                # Ran out of ancestors: use the topmost one in place of a form.
                form = last_parent
                break
        return form

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    if form is not None:
        found_form = False
        for f, n, i in self.cached_indeterminate_forms:
            if f is form and n == name:
                found_form = True
                if i is True:
                    match = True
                break

        # We didn't have the form cached, so validate that the radio button is indeterminate
        if not found_form:
            checked = False
            for child in self.get_tag_descendants(form, no_iframe=True):
                if child is el:
                    continue
                tag_name = self.get_tag(child)
                if tag_name == 'input':
                    is_radio = False
                    check = False
                    has_name = False
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'type' and util.lower(v) == 'radio':
                            is_radio = True
                        elif util.lower(k) == 'name' and v == name:
                            has_name = True
                        elif util.lower(k) == 'checked':
                            check = True
                        # Only count the sibling when it resolves to the very same form as `el`.
                        if is_radio and check and has_name and get_parent_form(child) is form:
                            checked = True
                            break
                if checked:
                    break
            if not checked:
                match = True
            # Remember the verdict for this form/name pair, whichever way it went.
            self.cached_indeterminate_forms.append((form, name, match))

    return match

1215 

def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
    """
    Match languages.

    The language is taken from the closest `lang` / `xml:lang` attribute found on `el` or
    its ancestors. When none is found, a cached or freshly located
    `<meta http-equiv="content-language" content="...">` value is used instead.
    Every group in `langs` must then have at least one pattern accepted by
    `extended_language_filter`; without a determined language the result is `False`.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el  # type: bs4.Tag | None
    found_lang = None
    last = None
    # A falsy value (including an empty attribute) keeps the walk going upwards.
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        if parent is None:
            # Ran out of ancestors: the last node visited is treated as the root from here on.
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    if found_lang is None and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root and root.name == 'html')):
        # Find head
        found = False
        for tag in ('html', 'head'):
            found = False
            for child in self.get_tag_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found and parent is not None:
            for child2 in parent:
                # NOTE(review): the HTML check below is applied to `parent` (the container found
                # above), not to `child2` - confirm that this is intended.
                if isinstance(child2, bs4.Tag) and self.get_tag(child2) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child2):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
                            break
                if found_lang is not None:
                    break
        if found_lang is None:
            # Cache an empty language so the meta search is not repeated for this root.
            self.cached_meta_lang.append((cast(str, root), ''))

    # If we determined a language, compare.
    if found_lang is not None:
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, cast(str, found_lang)):
                    match = True
            if not match:
                break

    return match

1301 

def match_dir(self, el: bs4.Tag | None, directionality: int) -> bool:
    """
    Check directionality.

    `directionality` is a combination of `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL`. The element's
    own `dir` attribute is consulted first (`ltr`, `rtl` or `auto` via `DIR_MAP`). For
    `auto` the direction is derived from the text, and when nothing decides the
    direction the check recurses into the parent. `None` or non-HTML elements never match.
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    if el is None or not self.is_html_tag(el):
        return False

    # Element has defined direction of left to right or right to left
    # (`None` means no recognized `dir` value, `0` means `dir="auto"`).
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=tel]` and no direction is assigned, assume left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node))  # type: ignore[misc]
        else:
            value = cast(str, self.get_attribute_by_name(el, 'value', ''))
        if value:
            # The first character with a strong bidi class (`L`, `R` or `AL`) decides.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.find_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Match parents direction
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

1360 

def match_range(self, el: bs4.Tag, condition: int) -> bool:
    """
    Match range.

    The element counts as out of range only when it has a parsable `value` that violates
    a parsable `min` and/or `max`; anything else (including a missing value) is treated
    as in range, mirroring what browsers appear to do. When neither `min` nor `max`
    parses, no range can be evaluated and the result is `False` for either condition.

    `condition` carries `ct.SEL_IN_RANGE` when the in-range answer is wanted; otherwise
    the out-of-range answer is returned.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))
    lower = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
    upper = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))

    # There is no valid min or max, so we cannot evaluate a range
    if lower is None and upper is None:
        return False

    current = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))

    out_of_range = False
    if current is not None and itype in ("date", "datetime-local", "month", "week", "number", "range", "time"):
        too_low = lower is not None and current < lower
        too_high = upper is not None and current > upper
        # Time is periodic, so `min > max` describes a reversed/discontinuous range:
        # only values strictly between `max` and `min` fall outside of it.
        wraps_around = itype == "time" and lower is not None and upper is not None and lower > upper
        if wraps_around:
            out_of_range = too_low and too_high
        else:
            out_of_range = too_low or too_high

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range

1400 

def match_defined(self, el: bs4.Tag) -> bool:
    """
    Match defined.

    `:defined` relates to custom elements in a browser. Here an element is reported as
    defined when it has a tag name and at least one of the following holds:

    - the name contains no hyphen (so it is not a custom element name),
    - the name contains a colon,
    - `get_prefix` reports a prefix for the element.

    All of this relies on the parser handing over usable name and prefix information.
    """

    tag_name = self.get_tag(el)
    if tag_name is None:
        return False
    if '-' not in tag_name or ':' in tag_name:
        return True
    return self.get_prefix(el) is not None

1423 

def match_placeholder_shown(self, el: bs4.Tag) -> bool:
    """
    Match placeholder shown according to HTML spec.

    The element's text must be empty; a lone newline does not count as content.
    """

    return self.get_text(el) in ('', '\n')

1438 

def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
    """
    Check if element matches one of the selectors.

    Each selector in the list is tried in turn; the first one whose every requested check
    passes decides the result. For a negated list (`selectors.is_not`) the outcome is
    inverted: the element matches only if no selector in the list accepts it.

    Selector lists flagged `is_html` are evaluated with the `html` namespace and iframe
    restriction forced on, and are skipped entirely when the document is not HTML. The
    matcher's own settings are put back afterwards, even if a check raises.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    try:
        if not is_html or self.is_html:
            for selector in selectors:
                match = is_not
                # We have an un-matchable situation (like `:focus`, which presumably can never apply here)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if selector.contains and not self.match_contains(el, selector.contains):
                    continue
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists. Doing this in
        # `finally` keeps the matcher consistent when one of the checks above raises.
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match

1522 

def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
    """
    Yield every matching tag found beneath the targeted tag.

    A `limit` below 1 means "no limit"; otherwise iteration stops once `limit`
    matches have been produced.
    """

    remaining = limit if limit >= 1 else None

    for candidate in self.get_tag_descendants(self.tag):
        if not self.match(candidate):
            continue
        yield candidate
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            return

1535 

def closest(self) -> bs4.Tag | None:
    """Return the targeted tag itself or its nearest ancestor that matches, or `None` if none does."""

    node = self.tag  # type: bs4.Tag | None
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None

1547 

def filter(self) -> list[bs4.Tag]:  # noqa A001
    """Collect the targeted tag's direct contents that are tags and match the selectors."""

    kept = []
    for node in self.get_contents(self.tag):
        # Non-tag nodes are dropped before the selector is even consulted.
        if isinstance(node, bs4.Tag) and self.match(node):
            kept.append(node)
    return kept

1555 

def match(self, el: bs4.Tag) -> bool:
    """Match `el` against this matcher's selectors; document objects and non-tags never match."""

    if self.is_doc(el):
        return False
    return self.is_tag(el) and self.match_selectors(el, self.selectors)

1560 

1561 

class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Bundles a selector pattern with its compiled selector list, namespaces, custom
    selectors and flags, and runs them against tags through short-lived `CSSMatch` objects.
    """

    pattern: str
    selectors: ct.SelectorList
    namespaces: ct.Namespaces | None
    custom: dict[str, str]
    flags: int

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(
        self,
        pattern: str,
        selectors: ct.SelectorList,
        namespaces: ct.Namespaces | None,
        custom: ct.CustomSelectors | None,
        flags: int
    ):
        """Store the compiled selector data on the immutable base."""

        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _matcher(self, tag: bs4.Tag) -> CSSMatch:
        """Build a fresh `CSSMatch` for `tag` using this object's selectors, namespaces and flags."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags)

    def match(self, tag: bs4.Tag) -> bool:
        """Check whether `tag` matches the compiled selectors."""

        return self._matcher(tag).match(tag)

    def closest(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Find the closest matching ancestor of `tag`, starting with `tag` itself."""

        return self._matcher(tag).closest()

    def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]:  # noqa A001
        """
        Filter.

        When handed a single tag, its contents all belong to one document, so one
        `CSSMatch` (and whatever it caches) is shared for the whole pass.

        Any other iterable may mix tags from different documents or detached tags,
        so each item is matched with its own fresh `CSSMatch`; navigable strings are skipped.
        """

        if isinstance(iterable, bs4.Tag):
            return self._matcher(iterable).filter()

        kept = []
        for node in iterable:
            if not CSSMatch.is_navigable_string(node) and self.match(node):
                kept.append(node)
        return kept

    def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Return the first tag selected under `tag`, or `None` when nothing is selected."""

        found = self.select(tag, limit=1)
        if not found:
            return None
        return found[0]

    def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
        """Return the selected tags under `tag` as a list, honoring `limit`."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
        """Lazily yield the selected tags under `tag`, honoring `limit`."""

        yield from self._matcher(tag).select(limit)

    def __repr__(self) -> str:  # pragma: no cover
        """Representation."""

        return (
            f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
            f"custom={self.custom!r}, flags={self.flags!r})"
        )

    __str__ = __repr__

1643 

1644 

# Hand `SoupSieve` to the `pickle_register` helper from `css_types`
# (presumably so compiled selector objects can be pickled - confirm in `css_types`).
ct.pickle_register(SoupSieve)