Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/soupsieve/css_match.py: 58%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

970 statements  

1"""CSS matcher.""" 

2from __future__ import annotations 

3from datetime import datetime 

4from . import util 

5import re 

6from . import css_types as ct 

7import unicodedata 

8import bs4 

9from typing import Iterator, Iterable, Any, Callable, Sequence, Any, cast # noqa: F401, F811 

10 

# Empty tag pattern (whitespace okay)
# Matches a single character that is NOT one of: space, tab, CR, LF, form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Matches a run of one or more non-whitespace characters (same whitespace set as above).
# `get_classes` uses it to split a `class` attribute string into individual names.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
# Combinator markers compared against `rel_type` when matching backwards
# (parents and previous siblings).
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
# Same combinators prefixed with ':'; `match_relations` dispatches on that prefix.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs: XHTML is what `is_html_tag` compares against.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined flag masks built from the `css_types` selector flags.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercased `dir` attribute value to a direction flag ('auto' maps to 0).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Input value patterns used by `Inputs.parse_value`; all are anchored at both ends.
# Optional leading '-', then digits with optional fraction, or a bare fraction (e.g. `.5`).
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
# `HH:MM`
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
# `YYYY-MM` (year is four or more digits)
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
# `YYYY-Www` (year is four or more digits)
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
# `YYYY-MM-DD`
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
# `YYYY-MM-DDTHH:MM`
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Matches `-*-` (plus any directly following `*-` / trailing `*` groups) or a trailing `-*`.
# `extended_language_filter` replaces each match with a single `-` before splitting the range.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar limits used by `Inputs.validate_day`.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7

57 

58 

class _FakeParent:
    """
    Stand-in parent for a detached root element.

    A fragment that has no `BeautifulSoup` document object above it gives
    `nth` selectors nothing to count children in. This wrapper exposes the
    root as the only entry of `contents` so it can be traversed as a child.
    """

    def __init__(self, element: bs4.Tag) -> None:
        """Store `element` as the single child of this fake parent."""

        # A one-item list mirrors the `contents` attribute of a real tag.
        self.contents = [element]

    def __len__(self) -> int:
        """Return the number of children held in `contents`."""

        return len(self.contents)

77 

78 

class _DocumentNav:
    """Helpers for walking and inspecting a Beautiful Soup tree."""

    @classmethod
    def assert_valid_input(cls, tag: Any) -> None:
        """Raise `TypeError` unless `tag` is a Beautiful Soup `Tag`."""

        if cls.is_tag(tag):
            return
        raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}")

    @staticmethod
    def is_doc(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a `BeautifulSoup` document object."""
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a `Tag`."""
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj: bs4.element.PageElement | None) -> bool:  # pragma: no cover
        """Return whether `obj` is a `Declaration` node."""
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a `CData` node."""
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj: bs4.element.PageElement | None) -> bool:  # pragma: no cover
        """Return whether `obj` is a `ProcessingInstruction` node."""
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is any kind of `NavigableString`."""
        return isinstance(obj, bs4.element.NavigableString)

    @staticmethod
    def is_special_string(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a comment, declaration, CDATA, processing instruction or doctype."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a navigable string that is not one of the special string types."""

        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el: bs4.Tag) -> _FakeParent:
        """Wrap `el` in a `_FakeParent` so it can be treated as a child."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el: bs4.Tag | None) -> bool:
        """Return whether the element (or document) reports itself as XML via `_is_xml`."""

        return el is not None and bool(el._is_xml)

    def is_iframe(self, el: bs4.Tag | None) -> bool:
        """Return whether `el` is an HTML `iframe` element."""

        if el is None:  # pragma: no cover
            return False

        # XML names are compared as-is; otherwise the name is lowercased first.
        name = el.name if self.is_xml_tree(el) else util.lower(el.name)
        return bool(
            (name == 'iframe') and
            self.is_html_tag(el)  # type: ignore[attr-defined]
        )

    def is_root(self, el: bs4.Tag) -> bool:
        """
        Return whether `el` is a root element.

        An element counts as root when it is the pre-calculated tree root, or,
        for HTML documents, when its parent is an `iframe`.
        """

        if self.root and self.root is el:  # type: ignore[attr-defined]
            return True

        parent = self.get_parent(el)
        return parent is not None and self.is_html and self.is_iframe(parent)  # type: ignore[attr-defined]

    def get_contents(self, el: bs4.Tag | None, no_iframe: bool = False) -> Iterator[bs4.element.PageElement]:
        """Yield the direct contents of `el`; yield nothing for `None` or a restricted `iframe`."""

        if el is None or (no_iframe and self.is_iframe(el)):
            return
        yield from el.contents

    def get_tag_children(
        self,
        el: bs4.Tag | None,
        start: int | None = None,
        reverse: bool = False,
        no_iframe: bool = False
    ) -> Iterator[bs4.Tag]:
        """Yield only the tag children of `el` (see `get_children`)."""

        return self.get_children(el, start, reverse, True, no_iframe)  # type: ignore[return-value]

    def get_children(
        self,
        el: bs4.Tag | None,
        start: int | None = None,
        reverse: bool = False,
        tags: bool = False,
        no_iframe: bool = False
    ) -> Iterator[bs4.element.PageElement]:
        """
        Yield children of `el`.

        Iteration begins at `start` (default: first child, or last child when
        `reverse` is set) and walks to the corresponding end of `el.contents`.
        With `tags` set, only `Tag` children are yielded.
        """

        if el is None or (no_iframe and self.is_iframe(el)):
            return

        final = len(el.contents) - 1
        if start is None:
            begin = final if reverse else 0
        else:
            begin = start

        # An out of range starting index yields nothing.
        if not 0 <= begin <= final:
            return

        positions = range(begin, -1, -1) if reverse else range(begin, final + 1)
        for position in positions:
            node = el.contents[position]
            if not tags or self.is_tag(node):
                yield node

    def get_tag_descendants(
        self,
        el: bs4.Tag | None,
        no_iframe: bool = False
    ) -> Iterator[bs4.Tag]:
        """Yield only the tag descendants of `el` (see `get_descendants`)."""

        yield from self.get_descendants(el, tags=True, no_iframe=no_iframe)  # type: ignore[misc]

    def get_descendants(
        self,
        el: bs4.Tag | None,
        tags: bool = False,
        no_iframe: bool = False
    ) -> Iterator[bs4.element.PageElement]:
        """
        Yield descendants of `el` in document order.

        With `tags` set, only `Tag` nodes are yielded. With `no_iframe` set, an
        `iframe` element itself is still yielded but its content is skipped.
        """

        if el is None or (no_iframe and self.is_iframe(el)):
            return

        # When set, descendants are skipped until this exact node is reached.
        resume_at = None
        for node in el.descendants:

            if resume_at is not None:
                if node is not resume_at:
                    continue
                resume_at = None

            if not isinstance(node, bs4.Tag):
                if not tags:
                    yield node
                continue

            if not (no_iframe and self.is_iframe(node)):
                yield node
                continue

            # Skip over the `iframe` content: resume at its next sibling, or,
            # failing that, at whatever follows its deepest last descendant.
            if node.next_sibling is not None:
                resume_at = node.next_sibling
            else:
                deepest = node  # type: bs4.element.PageElement
                while isinstance(deepest, bs4.Tag) and deepest.contents:
                    deepest = deepest.contents[-1]
                resume_at = deepest.next_element
            yield node
            if resume_at is None:
                break

    def get_parent(self, el: bs4.Tag | None, no_iframe: bool = False) -> bs4.Tag | None:
        """Return the parent of `el`, or `None` if there is none or it is a restricted `iframe`."""

        if el is None:
            return None

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):  # pragma: no cover
            return None
        return parent

    @staticmethod
    def get_tag_name(el: bs4.Tag | None) -> str | None:
        """Return the element's `name`, or `None` when there is no element."""

        return None if el is None else el.name

    @staticmethod
    def get_prefix_name(el: bs4.Tag) -> str | None:
        """Return the element's `prefix`."""

        return el.prefix

    @staticmethod
    def get_uri(el: bs4.Tag | None) -> str | None:
        """Return the element's namespace `URI`, or `None` when there is no element."""

        return None if el is None else el.namespace

    @classmethod
    def get_next_tag(cls, el: bs4.Tag) -> bs4.Tag | None:
        """Return the next sibling that is a tag, if any."""

        return cls.get_next(el, tags=True)  # type: ignore[return-value]

    @classmethod
    def get_next(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:
        """Return the next sibling; with `tags` set, the next sibling that is a tag."""

        node = el.next_sibling
        if not tags:
            return node

        while node is not None and not isinstance(node, bs4.Tag):
            node = node.next_sibling
        return node

    @classmethod
    def get_previous_tag(cls, el: bs4.Tag, tags: bool = True) -> bs4.Tag | None:
        """
        Return the previous sibling that is a tag, if any.

        The `tags` parameter is accepted but not consulted; tags are always requested.
        """

        return cls.get_previous(el, True)  # type: ignore[return-value]

    @classmethod
    def get_previous(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:
        """Return the previous sibling; with `tags` set, the previous sibling that is a tag."""

        node = el.previous_sibling
        if not tags:
            return node

        while node is not None and not isinstance(node, bs4.Tag):
            node = node.previous_sibling
        return node

    @staticmethod
    def has_html_ns(el: bs4.Tag | None) -> bool:
        """
        Return whether the element's own namespace is the XHTML namespace.

        This differs from `is_html_tag`, which reports whether an element is
        *treated* as being in the HTML namespace.
        """

        if el is None:
            return False

        ns = el.namespace
        return bool(ns and ns == NS_XHTML)

    @staticmethod
    def split_namespace(el: bs4.Tag | None, attr_name: str) -> tuple[str | None, str | None]:
        """Return the `namespace` and `name` attributes of `attr_name` (each `None` if absent)."""

        if el is None:  # pragma: no cover
            return None, None

        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

    @classmethod
    def normalize_value(cls, value: Any) -> str | Sequence[str]:
        """
        Normalize an attribute value to a string or a list of strings.

        `None` becomes the empty string, bytes are decoded as UTF-8, sequences
        are normalized item by item, and anything else is passed to `str`.
        """

        if value is None:
            return ''
        if isinstance(value, str):
            return value
        if isinstance(value, bytes):
            return value.decode("utf8")
        if not isinstance(value, Sequence):
            return str(value)

        # BeautifulSoup allows multi-valued attributes. A nested sequence is
        # almost certainly a user error, so it is simply stringified.
        return [
            str(item) if not isinstance(item, (str, bytes)) and isinstance(item, Sequence)
            else cast(str, cls.normalize_value(item))
            for item in value
        ]

    @classmethod
    def get_attribute_by_name(
        cls,
        el: bs4.Tag,
        name: str,
        default: str | Sequence[str] | None = None
    ) -> str | Sequence[str] | None:
        """
        Return the normalized value of attribute `name`, or `default` if absent.

        XML elements use an exact key lookup; otherwise attribute keys are
        lowercased before being compared with `name`.
        """

        if el._is_xml:
            try:
                return cls.normalize_value(el.attrs[name])
            except KeyError:
                return default

        for key, raw in el.attrs.items():
            if util.lower(key) == name:
                return cls.normalize_value(raw)
        return default

    @classmethod
    def iter_attributes(cls, el: bs4.Tag | None) -> Iterator[tuple[str, str | Sequence[str] | None]]:
        """Yield `(name, normalized value)` for every attribute of `el`."""

        if el is None:
            return
        for key, raw in el.attrs.items():
            yield key, cls.normalize_value(raw)

    @classmethod
    def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
        """Return the element's classes, splitting a plain string value on whitespace."""

        found = cls.get_attribute_by_name(el, 'class', [])
        if isinstance(found, str):
            found = RE_NOT_WS.findall(found)
        return cast(Sequence[str], found)

    def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
        """Return the concatenated content strings of all descendants of `el`."""

        return ''.join(
            node for node in self.get_descendants(el, no_iframe=no_iframe)  # type: ignore[misc]
            if self.is_content_string(node)
        )

    def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
        """Return the content strings that are direct children of `el`."""

        own = []
        for node in self.get_contents(el, no_iframe=no_iframe):
            if self.is_content_string(node):
                own.append(node)  # type: ignore[arg-type]
        return own

430 

431 

class Inputs:
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year: int, month: int, day: int) -> bool:
        """
        Return whether `day` exists in the given `month` of `year`.

        February gets 29 days in leap years (divisible by 4 and not by 100, or
        divisible by 400) and 28 otherwise.
        """

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year: int, week: int) -> bool:
        """
        Return whether `week` is a valid ISO week number for `year`.

        Per ISO 8601 rules, the last ISO week of a year is the week containing
        Dec 28, so the ISO week number of that date is the year's week count
        (52 or 53). Dec 31 cannot be used because it may already belong to
        week 01 of the following year.
        """

        # `datetime` only supports years 1-9999, but the week pattern accepts
        # years with more than four digits. The Gregorian calendar repeats
        # every 400 years (146097 days, a whole number of weeks), so an
        # equivalent year inside the supported range yields the same answer
        # and avoids a `ValueError` for large years.
        equivalent_year = 2000 + year % 400
        max_week = datetime(equivalent_year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month: int) -> bool:
        """Return whether `month` is in the range 1-12."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year: int) -> bool:
        """Return whether `year` is at least 1."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour: int) -> bool:
        """Return whether `hour` is in the range 0-23."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes: int) -> bool:
        """Return whether `minutes` is in the range 0-59."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
        """
        Parse an input value according to its input type.

        Supported `itype` values are "date", "month", "week", "time",
        "datetime-local", "number" and "range". The result is a tuple of the
        numeric components (most significant first), or `None` when `value` is
        `None`, the type is unsupported, the text does not match the expected
        format, or a component is out of range.
        """

        parsed = None  # type: tuple[float, ...] | None
        if value is None:
            return value
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = (float(m.group('value')),)
        return parsed

542 

543 

544class CSSMatch(_DocumentNav): 

545 """Perform CSS matching.""" 

546 

def __init__(
    self,
    selectors: ct.SelectorList,
    scope: bs4.Tag | None,
    namespaces: ct.Namespaces | None,
    flags: int
) -> None:
    """
    Set up a matcher for `selectors` relative to `scope`.

    Validates `scope`, stores the matching inputs, then climbs from `scope`
    to the top of the tree to work out the root element and whether the
    tree is XML and/or HTML.
    """

    self.assert_valid_input(scope)

    # Matching inputs.
    self.tag = scope
    self.selectors = selectors
    self.flags = flags
    self.namespaces = namespaces if namespaces is not None else {}  # type: ct.Namespaces | dict[str, str]
    self.iframe_restrict = False

    # Caches start out empty.
    self.cached_meta_lang = []  # type: list[tuple[str, str]]
    self.cached_default_forms = []  # type: list[tuple[bs4.Tag, bs4.Tag]]
    self.cached_indeterminate_forms = []  # type: list[tuple[bs4.Tag, str, bool]]

    # Climb to the topmost node of the tree.
    top = scope
    ancestor = self.get_parent(top)
    while ancestor:
        top = ancestor
        ancestor = self.get_parent(top)

    # Under a document object the root is its first tag child (if any);
    # otherwise the topmost node is itself the root.
    if self.is_doc(top):
        root = next(iter(self.get_tag_children(top)), None)  # type: bs4.Tag | None
    else:
        root = top

    self.root = root
    self.scope = root if scope is top else scope
    self.has_html_namespace = self.has_html_ns(root)

    # A document can be both XML and HTML (XHTML)
    self.is_xml = self.is_xml_tree(top)
    self.is_html = not self.is_xml or self.has_html_namespace

587 

def supports_namespaces(self) -> bool:
    """Return whether namespaces apply: the tree is XML or its root has the HTML namespace."""

    xml = self.is_xml
    return xml if xml else self.has_html_namespace

592 

def get_tag_ns(self, el: bs4.Tag | None) -> str:
    """
    Return the namespace to use for `el`.

    Without namespace support every element is treated as XHTML. With it,
    the element's own namespace `URI` is used, or the empty string if it has none.
    """

    if el is None:  # pragma: no cover
        return ''

    if not self.supports_namespaces():
        return NS_XHTML

    return self.get_uri(el) or ''

607 

def is_html_tag(self, el: bs4.Tag | None) -> bool:
    """Return whether the namespace resolved for `el` is the XHTML namespace."""

    resolved = self.get_tag_ns(el)
    return resolved == NS_XHTML

612 

def get_tag(self, el: bs4.Tag | None) -> str | None:
    """Return the tag name of `el`, lowercased unless the tree is XML."""

    name = self.get_tag_name(el)
    if name is None or self.is_xml:
        return name
    return util.lower(name)

618 

def get_prefix(self, el: bs4.Tag) -> str | None:
    """Return the namespace prefix of `el`, lowercased unless the tree is XML."""

    prefix = self.get_prefix_name(el)
    if prefix is None or self.is_xml:
        return prefix
    return util.lower(prefix)

624 

def find_bidi(self, el: bs4.Tag) -> int | None:
    """
    Determine directionality from the text under `el`.

    Children are scanned in order. The first character whose Unicode
    bidirectional class is `L`, `R` or `AL` decides the result: the LTR flag
    for `L`, the RTL flag otherwise. Returns `None` if no such character is found.
    """

    for node in self.get_children(el):

        if self.is_tag(node):
            declared = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)  # type: ignore[arg-type]
            tag_name = self.get_tag(node)  # type: ignore[arg-type]
            excluded = tag_name and tag_name in ('bdi', 'script', 'style', 'textarea', 'iframe')

            # Only descend into HTML elements that are not on the exclusion
            # list and that do not carry a recognized `dir` value themselves.
            if not excluded and self.is_html_tag(node) and declared is None:  # type: ignore[arg-type]
                nested = self.find_bidi(node)  # type: ignore[arg-type]
                if nested is not None:
                    return nested
            continue

        # Skip `doctype` comments, etc.
        if self.is_special_string(node):
            continue

        # First strongly directional character in this text node, if any.
        strong = next(
            (kind for kind in map(unicodedata.bidirectional, node) if kind in ('AL', 'R', 'L')),  # type: ignore[arg-type]
            None
        )
        if strong is not None:
            return ct.SEL_DIR_LTR if strong == 'L' else ct.SEL_DIR_RTL
    return None

661 

def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
    """
    Return whether `lang_tag` satisfies `lang_range`.

    Both values are lowercased and split on `-`. The first subtags must be
    equal, or the range must start with `*`. Each remaining range subtag must
    then be found, in order, among the remaining tag subtags. Tag subtags in
    between may be skipped unless they are single characters.
    """

    wanted = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    have = lang_tag.lower().split('-')
    wanted_count = len(wanted)
    have_count = len(have)
    first_range = wanted[0]
    first_tag = have[0]

    # Empty specified language should match unspecified language attributes
    if wanted_count == 1 and have_count == 1 and not first_range and first_range == first_tag:
        return True

    # Primary tag needs to match
    if first_range == '*':
        # A wildcard does not match a tag that is just the empty string.
        if have_count == 1 and not first_tag:
            return False
    elif first_range != first_tag:
        return False

    range_pos = 1
    tag_pos = 1

    # Match until we run out of ranges
    while range_pos < wanted_count:
        # Ran out of subtags, but we still have ranges
        if tag_pos >= have_count:
            return False

        current = wanted[range_pos]

        # Empty range
        if not current:
            return False

        candidate = have[tag_pos]
        if candidate == current:
            # Matched range
            range_pos += 1
        elif len(candidate) == 1:
            # Implicit wildcard cannot match singletons
            return False

        # Matched (explicitly or implicitly), so grab next subtag
        tag_pos += 1

    return True

717 

def match_attribute_name(
    self,
    el: bs4.Tag,
    attr: str,
    prefix: str | None
) -> str | Sequence[str] | None:
    """
    Find attribute `attr` on `el` and return its value, or `None` if absent.

    Without namespace support the attribute name is compared case-insensitively.
    With it, `prefix` is resolved through `self.namespaces`, and names are
    compared exactly for XML and case-insensitively otherwise.
    """

    if not self.supports_namespaces():
        for key, val in self.iter_attributes(el):
            if util.lower(attr) == util.lower(key):
                return val
        return None

    # If we have not defined namespaces, we can't very well find them, so don't bother trying.
    ns = self.namespaces.get(prefix) if prefix else None
    if prefix and ns is None and prefix != '*':
        return None

    for key, val in self.iter_attributes(el):

        # Get attribute parts
        attr_ns, local_name = self.split_namespace(el, key)

        # Can't match a prefix attribute as we haven't specified one to match
        # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
        if ns is None:
            whole_match = (attr == key) if self.is_xml else (util.lower(attr) == util.lower(key))
            if whole_match:
                return val
            continue

        # We can't match our desired prefix attribute as the attribute doesn't have a prefix
        if attr_ns is None or (ns != attr_ns and prefix != '*'):
            continue

        # Compare the local name: exact for XML, case-insensitive otherwise.
        local_match = (attr == local_name) if self.is_xml else (util.lower(attr) == util.lower(local_name))
        if local_match:
            return val

    return None

770 

def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
    """
    Return whether the namespace of `el` satisfies the selector's prefix.

    The outcome depends on `tag.prefix`: `None` requires the default
    namespace (when one is registered), the empty string requires no
    namespace, `*` accepts anything, and any other prefix must be registered
    and equal to the element's namespace.
    """

    prefix = tag.prefix
    element_ns = self.get_tag_ns(el)

    # We must match the default namespace if one is not provided
    if prefix is None:
        default_ns = self.namespaces.get('')
        return default_ns is None or element_ns == default_ns

    # If we specified `|tag`, we must not have a namespace.
    if prefix == '':
        return not element_ns

    if prefix == '*':
        return True

    # Verify prefix matches
    wanted_ns = self.namespaces.get(prefix)
    return wanted_ns is not None and element_ns == wanted_ns

791 

def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
    """
    Return whether `el` satisfies every attribute selector.

    Each attribute must exist on the element. If the selector carries a
    pattern, the value (multi-valued attributes joined with spaces) must match it.
    """

    for selector in attributes or ():
        found = self.match_attribute_name(el, selector.attribute, selector.prefix)

        # XML trees use the XML specific pattern when one is provided.
        if self.is_xml and selector.xml_type_pattern:
            pattern = selector.xml_type_pattern
        else:
            pattern = selector.pattern

        if found is None:
            return False

        text = found if isinstance(found, str) else ' '.join(found)
        if pattern is not None and pattern.match(text) is None:
            return False

    return True

810 

def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
    """
    Return whether the tag name of `el` satisfies the selector.

    A selector with no name, or the name `*`, matches every element. The
    selector name is lowercased first unless the tree is XML.
    """

    wanted = tag.name
    if wanted is None:
        return True

    if not self.is_xml:
        wanted = util.lower(wanted)
    return wanted in (self.get_tag(el), '*')

819 

def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
    """Return whether `el` matches both the namespace and the name of `tag`; `None` matches anything."""

    if tag is None:
        return True

    # Both checks are always performed, namespace first.
    namespace_ok = self.match_namespace(el, tag)
    name_ok = self.match_tagname(el, tag)
    return bool(namespace_ok and name_ok)

831 

def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
    """
    Match a backward looking relationship for `el`.

    Depending on the combinator of the first selector in `relation`, this
    tests any ancestor, the direct parent, any previous sibling tag, or the
    closest previous sibling tag against `relation`.
    """

    head = relation[0]
    found = False

    # I don't think this can ever happen, but it makes `mypy` happy
    if isinstance(head, ct.SelectorNull):  # pragma: no cover
        return found

    kind = head.rel_type
    restrict = self.iframe_restrict

    if kind == REL_PARENT:
        # Any ancestor may satisfy the relation.
        ancestor = self.get_parent(el, no_iframe=restrict)
        while ancestor and not found:
            found = self.match_selectors(ancestor, relation)
            ancestor = self.get_parent(ancestor, no_iframe=restrict)
    elif kind == REL_CLOSE_PARENT:
        # Only the direct parent may satisfy the relation.
        direct = self.get_parent(el, no_iframe=restrict)
        if direct:
            found = self.match_selectors(direct, relation)
    elif kind == REL_SIBLING:
        # Any previous sibling tag may satisfy the relation.
        earlier = self.get_previous_tag(el)
        while earlier and not found:
            found = self.match_selectors(earlier, relation)
            earlier = self.get_previous_tag(earlier)
    elif kind == REL_CLOSE_SIBLING:
        # Only the closest previous sibling tag may satisfy the relation.
        adjacent = self.get_previous_tag(el)
        if adjacent and self.is_tag(adjacent):
            found = self.match_selectors(adjacent, relation)

    return found

859 

def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
    """
    Return whether some tag under `parent` matches `relation`.

    Direct tag children are tested by default; with `recursive` set, all tag
    descendants are tested. The search stops at the first match.
    """

    if recursive:
        walker = self.get_tag_descendants  # type: Callable[..., Iterator[bs4.Tag]]
    else:
        walker = self.get_tag_children

    outcome = False
    for candidate in walker(parent, no_iframe=self.iframe_restrict):
        outcome = self.match_selectors(candidate, relation)
        if outcome:
            return outcome
    return outcome

873 

def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
    """
    Match a forward looking (`:has()`) relationship for `el`.

    Depending on the combinator of the first selector in `relation`, this
    tests any descendant, the direct children, any following sibling tag, or
    the closest following sibling tag against `relation`.
    """

    head = relation[0]
    found = False

    # I don't think this can ever happen, but it makes `mypy` happy
    if isinstance(head, ct.SelectorNull):  # pragma: no cover
        return found

    kind = head.rel_type

    if kind == REL_HAS_PARENT:
        # Search all descendants.
        found = self.match_future_child(el, relation, True)
    elif kind == REL_HAS_CLOSE_PARENT:
        # Search direct children only.
        found = self.match_future_child(el, relation)
    elif kind == REL_HAS_SIBLING:
        # Any following sibling tag may satisfy the relation.
        later = self.get_next_tag(el)
        while later and not found:
            found = self.match_selectors(later, relation)
            later = self.get_next_tag(later)
    elif kind == REL_HAS_CLOSE_SIBLING:
        # Only the closest following sibling tag may satisfy the relation.
        adjacent = self.get_next_tag(el)
        if adjacent and self.is_tag(adjacent):
            found = self.match_selectors(adjacent, relation)

    return found

896 

def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
    """
    Match the relationship of `el` to other elements.

    A relation whose first selector is null, or has no relation type, never
    matches. Relation types starting with `:` are forward looking; all
    others look backward.
    """

    head = relation[0]
    if isinstance(head, ct.SelectorNull) or head.rel_type is None:
        return False

    if head.rel_type.startswith(':'):
        matcher = self.match_future_relations
    else:
        matcher = self.match_past_relations
    return matcher(el, relation)

911 

def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
    """Return whether every entry in `ids` equals the element's `id` attribute (missing `id` counts as '')."""

    return not any(wanted != self.get_attribute_by_name(el, 'id', '') for wanted in ids)

921 

def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
    """Return whether every name in `classes` is among the element's classes."""

    present = self.get_classes(el)
    return not any(wanted not in present for wanted in classes)

932 

def match_root(self, el: bs4.Tag) -> bool:
    """
    Match element as root.

    Besides being a root element, `el` must have no sibling, in either
    direction, that is a tag, a CDATA node, or a content string containing
    non-whitespace text.
    """

    is_root = self.is_root(el)
    if not is_root:
        return is_root

    # Scan backwards first, then forwards.
    for step in (self.get_previous, self.get_next):
        sibling = step(el)  # type: Any
        while sibling is not None:
            if (
                self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
                self.is_cdata(sibling)
            ):
                return False
            sibling = step(sibling)

    return is_root

958 

def match_scope(self, el: bs4.Tag) -> bool:
    """Return whether `el` is the very object stored as the matcher's scope."""

    scope = self.scope
    return el is scope

963 

def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
    """Return whether `child` has the same tag name and the same namespace as `el`."""

    if self.get_tag(child) != self.get_tag(el):
        return False
    return self.get_tag_ns(child) == self.get_tag_ns(el)

971 

def match_nth(self, el: bs4.Tag, nth: tuple[ct.SelectorNth, ...]) -> bool:
    """
    Match `nth` elements.

    Every entry in `nth` has to match for the element to match; an empty tuple matches.

    For each entry, candidate positions are generated from `a * count + b` when the entry's
    `n` flag is set, or from the fixed value `a` when it is not. The parent's children are
    then walked (from the end when `last` is set) and only `bs4.Tag` children that pass the
    optional `of S` selectors and the optional `of-type` check are counted. The entry matches
    when the counted position that equals a candidate index belongs to `el` itself.
    """

    matched = True

    for n in nth:
        # Each entry starts out unmatched; a single failing entry ends the whole check.
        matched = False
        # With an `of S` selector list, the element itself has to satisfy it first.
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)  # type: bs4.Tag | None
        if parent is None:
            # No parent available: use the stand-in produced by `create_fake_parent`.
            parent = cast('bs4.Tag', self.create_fake_parent(el))
        last = n.last
        last_index = len(parent) - 1
        # `index` is the child position to start walking from: the end when counting from the last child.
        index = last_index if last else 0
        # Number of counted (qualifying) children seen so far; compared against `idx`.
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        # Direction of travel through the children: -1 walks backwards, 1 forwards.
        factor = -1 if last else 1
        # Current candidate index; `last_idx` remembers the previous one to detect a stalled sequence.
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    # Distance below zero before stepping `count`.
                    diff_low = 0 - idx
                    # Previously we were above the range and now we are below it: stop adjusting.
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    # Stepping did not bring us any closer: stop adjusting.
                    if diff >= diff_low:
                        break
                else:
                    # Distance above `last_index` before stepping `count`.
                    diff_high = idx - last_index
                    # Previously we were below the range and now we are above it: stop adjusting.
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    # Stepping did not bring us any closer: stop adjusting.
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None  # type: bs4.element.PageElement | None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0):
                # Track the position so the next pass resumes after this child.
                index += factor
                if not isinstance(child, bs4.Tag):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    if child is el:
                        matched = True
                    else:
                        # This candidate position belongs to another child; move on to the next candidate.
                        break
                # Once the element itself has been reached, the outcome for this entry is settled.
                if child is el:
                    break
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # The candidate index did not change, so no further positions can be produced.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched

1072 

def match_empty(self, el: bs4.Tag) -> bool:
    """
    Check if element is empty (if requested).

    An element is not empty as soon as one of its children is a tag, or is a content
    string that contains something other than whitespace.
    """

    for node in self.get_children(el):
        if self.is_tag(node):
            return False
        if self.is_content_string(node) and RE_NOT_EMPTY.search(node):  # type: ignore[call-overload]
            return False
    return True

1085 

def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
    """
    Match selectors.

    The element must match every selector list in `selectors`; an empty tuple matches.
    Evaluation stops at the first selector list that fails, since the result can no
    longer change after that point.
    """

    return all(self.match_selectors(el, sel) for sel in selectors)

1094 

def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
    """
    Match element if it contains text.

    Every entry in `contains` must be satisfied, and an entry is satisfied when at least
    one of its `text` values is found. Entries flagged `own` are checked against each
    string returned by `get_own_text`; all other entries are checked against the single
    string returned by `get_text`.

    The two text representations are fetched lazily and cached separately, so a selector
    that mixes `own` and non-`own` entries is always compared against the right one.
    Evaluation stops at the first entry that is not satisfied.
    """

    own_text = None  # type: Sequence[str] | None
    full_text = None  # type: str | None

    for contain_list in contains:
        if contain_list.own:
            if own_text is None:
                own_text = self.get_own_text(el, no_iframe=self.is_html)
            found = any(text in chunk for text in contain_list.text for chunk in own_text)
        else:
            if full_text is None:
                full_text = self.get_text(el, no_iframe=self.is_html)
            found = any(text in full_text for text in contain_list.text)
        if not found:
            return False
    return True

1122 

def match_default(self, el: bs4.Tag) -> bool:
    """
    Match default.

    The element matches when it is the first `input` or `button` descendant with
    `type="submit"` of its nearest HTML `form` ancestor. The result of that search is
    remembered per form in `self.cached_default_forms`.
    """

    # Find this input's form
    ancestor = self.get_parent(el, no_iframe=True)
    while ancestor:
        if self.get_tag(ancestor) == 'form' and self.is_html_tag(ancestor):
            break
        ancestor = self.get_parent(ancestor, no_iframe=True)
    else:
        # Ran out of ancestors without meeting a form.
        return False
    form = ancestor  # type: bs4.Tag

    # Look in form cache to see if we've already located its default button
    for known_form, default_button in self.cached_default_forms:
        if known_form is form:
            return default_button is el

    # We didn't have the form cached, so look for its default button
    for candidate in self.get_tag_descendants(form, no_iframe=True):
        tag_name = self.get_tag(candidate)
        # Can't do nested forms (haven't figured out why we never hit this)
        if tag_name == 'form':  # pragma: no cover
            break
        if tag_name not in ('input', 'button'):
            continue
        kind = self.get_attribute_by_name(candidate, 'type', '')
        if kind and util.lower(kind) == 'submit':
            self.cached_default_forms.append((form, candidate))
            return el is candidate
    return False

1162 

def match_indeterminate(self, el: bs4.Tag) -> bool:
    """
    Match an indeterminate radio button.

    The element matches when no *other* `input` in the same form carries `type="radio"`,
    the same `name` as the element, and a `checked` attribute. "Same form" means the
    nearest HTML `form` ancestor or, when there is none, the topmost reachable ancestor.
    Results are remembered per (form, name) pair in `self.cached_indeterminate_forms`.
    """

    name = cast(str, self.get_attribute_by_name(el, 'name'))

    def get_parent_form(el: bs4.Tag) -> bs4.Tag:
        """Find this input's form, falling back to the topmost reachable node."""

        parent = self.get_parent(el, no_iframe=True)
        if parent is None:
            # Nothing is reachable above the element (e.g. it is detached), so the element is the
            # top of its own tree. Without this guard `None` would be handed to the tag helpers.
            return el
        while True:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                return parent
            next_parent = self.get_parent(parent, no_iframe=True)
            if next_parent is None:
                # No form anywhere above: the topmost ancestor stands in for the form.
                return parent
            parent = next_parent

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    for cached_form, cached_name, cached_result in self.cached_indeterminate_forms:
        if cached_form is form and cached_name == name:
            return cached_result is True

    # We didn't have the form cached, so validate that the radio button is indeterminate
    checked = False
    for child in self.get_tag_descendants(form, no_iframe=True):
        if child is el:
            continue
        if self.get_tag(child) != 'input':
            continue
        is_radio = False
        check = False
        has_name = False
        for k, v in self.iter_attributes(child):
            if util.lower(k) == 'type' and util.lower(v) == 'radio':
                is_radio = True
            elif util.lower(k) == 'name' and v == name:
                has_name = True
            elif util.lower(k) == 'checked':
                check = True
            # A checked radio only counts when it belongs to the very same form as `el`.
            if is_radio and check and has_name and get_parent_form(child) is form:
                checked = True
                break
        if checked:
            break

    match = not checked
    self.cached_indeterminate_forms.append((form, name, match))
    return match

1224 

def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
    """
    Match languages.

    The language is taken from the first non-empty `lang` attribute (or namespaced
    `xml:lang` attribute, depending on namespace support) found on the element or one of
    its ancestors. When none is found, a previously cached result for the root is used,
    and otherwise the `head` of an HTML document is searched for a
    `<meta http-equiv="content-language" content="...">` tag; the outcome of that search
    is stored in `self.cached_meta_lang`.

    Each group of patterns in `langs` must have at least one pattern accepted by
    `extended_language_filter`. Returns `False` when no language could be determined.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el  # type: bs4.Tag | None
    found_lang = None
    last = None
    # Note: the loop keeps climbing while the found value is falsy, so an empty attribute value does not stop it.
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        if parent is None:
            # Ran out of ancestors: treat the last node visited as the root for the steps below.
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    if found_lang is None and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is not None and cast(str, root) is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root and root.name == 'html')):
        # Find head
        found = False
        # Descend one level per tag name: first to `html`, then to `head`.
        for tag in ('html', 'head'):
            found = False
            for child in self.get_tag_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found and parent is not None:
            for child2 in parent:
                # NOTE(review): the HTML check below is applied to `parent`, not `child2` - confirm this is intended.
                if isinstance(child2, bs4.Tag) and self.get_tag(child2) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child2):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        if c_lang and content:
                            found_lang = content
                            # Remember the language found for this root.
                            self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
                            break
                if found_lang is not None:
                    break
            if found_lang is None:
                # Cache an empty language so the cache lookup above answers for this root next time.
                self.cached_meta_lang.append((cast(str, root), ''))

    # If we determined a language, compare.
    if found_lang is not None:
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, cast(str, found_lang)):
                    match = True
            # Every group of patterns has to match; stop at the first one that does not.
            if not match:
                break

    return match

1310 

def match_dir(self, el: bs4.Tag | None, directionality: int) -> bool:
    """
    Check directionality.

    `directionality` is a combination of the `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL` flags.
    The element's direction comes from its `dir` attribute when that is `ltr` or `rtl`.
    For `dir="auto"` (and for `bdi` without `dir`) it is derived from the text; otherwise
    the check recurses into the parent. The root, and `input[type=tel]` without `dir`,
    default to left to right. Returns `False` when `el` is `None` or not an HTML tag.
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    if el is None or not self.is_html_tag(el):
        return False

    # Element has defined direction of left to right or right to left
    # (`None` means no recognized `dir` value, `0` is what `DIR_MAP` assigns to `auto`).
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=tel]` and no direction is assigned, assume left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            # A `textarea` holds its value as content strings; join them into one string.
            value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node))  # type: ignore[misc]
        else:
            value = cast(str, self.get_attribute_by_name(el, 'value', ''))
        if value:
            # The first character with a strong bidirectional class decides the direction.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        # No value to inspect: defer to the parent's direction.
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.find_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        # `find_bidi` gave no answer: defer to the parent's direction.
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Match parents direction
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

1369 

def match_range(self, el: bs4.Tag, condition: int) -> bool:
    """
    Match range.

    Behavior is modeled after what we see in browsers. Browsers seem to evaluate
    whether the value is out of range, and if it is not, it is in range. A missing
    value therefore never evaluates as out of range, which makes it in range.
    Personally, I feel like this should evaluate as neither in nor out of range.

    Without a valid `min` or `max` no range can be evaluated and nothing matches.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))
    lower_bound = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
    upper_bound = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))

    # There is no valid min or max, so we cannot evaluate a range
    if lower_bound is None and upper_bound is None:
        return False

    out_of_range = False
    value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
    if value is not None:
        has_lower = lower_bound is not None
        has_upper = upper_bound is not None
        if itype == "time" and has_lower and has_upper and lower_bound > upper_bound:
            # Time is periodic, so this is a reversed/discontinuous range
            out_of_range = upper_bound < value < lower_bound
        elif itype in ("date", "datetime-local", "month", "week", "number", "range", "time"):
            out_of_range = (has_lower and value < lower_bound) or (has_upper and value > upper_bound)

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range

1409 

def match_defined(self, el: bs4.Tag) -> bool:
    """
    Match defined.

    `:defined` is related to custom elements in a browser. Here an element counts as
    defined when it has a tag name and at least one of the following holds:

    - The name has no hyphen (it is not a custom element name).
    - The name contains a colon.
    - The element has a prefix.

    This of course requires the parser to provide us with the proper prefix info;
    if it doesn't, there is nothing we can do.
    """

    name = self.get_tag(el)
    if name is None:
        return False
    return '-' not in name or ':' in name or self.get_prefix(el) is not None

1432 

def match_placeholder_shown(self, el: bs4.Tag) -> bool:
    """
    Match placeholder shown according to HTML spec.

    - A text area is checked for content. A single newline does not count as content.

    """

    return self.get_text(el) in ('', '\n')

1447 

def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
    """
    Check if element matches one of the selectors.

    The selectors in the list are tried in order and the first one whose checks all
    pass decides the result. For a list flagged `is_not` the outcome is inverted: the
    element matches only if the last selector examined failed.

    Lists flagged `is_html` are only evaluated for HTML documents. While they are
    evaluated, `self.namespaces` and `self.iframe_restrict` are temporarily overridden;
    the previous values are restored even if one of the checks raises.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    try:
        if not is_html or self.is_html:
            for selector in selectors:
                match = is_not
                # We have an un-matchable situation (like `:focus`, as you cannot focus an element in this
                # environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if selector.contains and not self.match_contains(el, selector.contains):
                    continue
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match

1531 

def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
    """
    Match all tags under the targeted tag.

    Yields each matching descendant of `self.tag` in the order they are produced by
    `get_tag_descendants`. A `limit` below 1 means no limit; otherwise iteration stops
    once `limit` matches have been yielded.
    """

    remaining = limit if limit >= 1 else None

    for candidate in self.get_tag_descendants(self.tag):
        if not self.match(candidate):
            continue
        yield candidate
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            return

1544 

def closest(self) -> bs4.Tag | None:
    """
    Match closest ancestor.

    Starting with `self.tag` itself and moving up through its parents, return the first
    node that matches, or `None` when the top is reached without a match.
    """

    node = self.tag  # type: bs4.Tag | None
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None

1556 

def filter(self) -> list[bs4.Tag]:  # noqa A001
    """
    Filter tag's children.

    Returns the entries of `self.tag`'s contents that are `bs4.Tag` instances and match.
    """

    kept = []
    for node in self.get_contents(self.tag):
        if isinstance(node, bs4.Tag) and self.match(node):
            kept.append(node)
    return kept

1564 

def match(self, el: bs4.Tag) -> bool:
    """
    Match.

    The document object itself never matches; anything else has to be a tag and satisfy
    `self.selectors`.
    """

    if self.is_doc(el):
        return False
    return self.is_tag(el) and self.match_selectors(el, self.selectors)

1569 

1570 

class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Holds a compiled selector list together with the pattern, namespaces, custom
    selectors and flags it was built with. Each operation creates a `CSSMatch` for the
    given tag and delegates to it.
    """

    pattern: str
    selectors: ct.SelectorList
    namespaces: ct.Namespaces | None
    custom: dict[str, str]
    flags: int

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(
        self,
        pattern: str,
        selectors: ct.SelectorList,
        namespaces: ct.Namespaces | None,
        custom: ct.CustomSelectors | None,
        flags: int
    ):
        """Initialize."""

        # Keyword order is kept as-is: the base class receives the values in this order.
        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _matcher(self, tag: bs4.Tag) -> CSSMatch:
        """Build a `CSSMatch` for `tag` using this object's selectors, namespaces and flags."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags)

    def match(self, tag: bs4.Tag) -> bool:
        """Check whether `tag` matches the compiled selector."""

        return self._matcher(tag).match(tag)

    def closest(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Return the closest matching ancestor, starting with `tag` itself."""

        return self._matcher(tag).closest()

    def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]:  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so when a single tag is given, all of its children are from the same document
        and one `CSSMatch` is shared to take advantage of that optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, a new `CSSMatch` is used for each item in the iterable.
        """

        if isinstance(iterable, bs4.Tag):
            return self._matcher(iterable).filter()

        kept = []
        for node in iterable:
            # Strings are skipped outright; everything else goes through a fresh match.
            if not CSSMatch.is_navigable_string(node) and self.match(node):
                kept.append(node)
        return kept

    def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Select a single tag, or `None` when nothing matches."""

        found = self.select(tag, limit=1)
        if found:
            return found[0]
        return None

    def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
        """Select the specified tags and return them as a list."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
        """Iterate the specified tags lazily."""

        yield from self._matcher(tag).select(limit)

    def __repr__(self) -> str:  # pragma: no cover
        """Representation."""

        head = f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
        tail = f"custom={self.custom!r}, flags={self.flags!r})"
        return head + tail

    __str__ = __repr__

1652 

1653 

# Hand `SoupSieve` to `css_types.pickle_register`.
# NOTE(review): presumably this makes compiled selector objects picklable - confirm in `css_types`.
ct.pickle_register(SoupSieve)