Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/cssselect/xpath.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

340 statements  

1""" 

2cssselect.xpath 

3=============== 

4 

5Translation of parsed CSS selectors to XPath expressions. 

6 

7 

8:copyright: (c) 2007-2012 Ian Bicking and contributors. 

9See AUTHORS for more details. 

10:license: BSD, see LICENSE for more details. 

11 

12""" 

13 

14from __future__ import annotations 

15 

16import re 

17from typing import TYPE_CHECKING, cast 

18 

19from cssselect.parser import ( 

20 Attrib, 

21 Class, 

22 CombinedSelector, 

23 Element, 

24 Function, 

25 Hash, 

26 Matching, 

27 Negation, 

28 Pseudo, 

29 PseudoElement, 

30 Relation, 

31 Selector, 

32 SelectorError, 

33 SpecificityAdjustment, 

34 Tree, 

35 parse, 

36 parse_series, 

37) 

38 

39if TYPE_CHECKING: 

40 from collections.abc import Callable 

41 

42 # typing.Self requires Python 3.11 

43 from typing_extensions import Self 

44 

45 

class ExpressionError(SelectorError, RuntimeError):
    """Raised for a selector that is unknown or unsupported (e.g. a pseudo-class)."""

48 

49 

50#### XPath Helpers 

51 

52 

class XPathExpr:
    """Mutable builder for a single XPath location path.

    The rendered expression is ``path`` followed by ``element`` and, when a
    condition has been set, a trailing ``[condition]`` predicate.
    """

    def __init__(
        self,
        path: str = "",
        element: str = "*",
        condition: str = "",
        star_prefix: bool = False,
    ) -> None:
        # NOTE: ``star_prefix`` is accepted for signature compatibility but is
        # never read by this class; use :meth:`add_star_prefix` instead.
        self.condition = condition
        self.element = element
        self.path = path

    def __str__(self) -> str:
        rendered = str(self.path) + str(self.element)
        return f"{rendered}[{self.condition}]" if self.condition else rendered

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}[{self}]"

    def add_condition(self, condition: str, conjuction: str = "and") -> Self:
        """Combine *condition* with the current one and return ``self``.

        The first condition is stored as-is; later ones are joined to the
        existing condition with *conjuction*, both sides parenthesised.
        """
        if not self.condition:
            self.condition = condition
            return self
        self.condition = f"({self.condition}) {conjuction} ({condition})"
        return self

    def add_name_test(self) -> None:
        """Move a concrete element name into a ``name() = ...`` condition."""
        # A wildcard element carries no name test, so there is nothing to move.
        if self.element != "*":
            literal = GenericTranslator.xpath_literal(self.element)
            self.add_condition(f"name() = {literal}")
            self.element = "*"

    def add_star_prefix(self) -> None:
        """
        Append '*/' to the path to keep the context constrained
        to a single parent.
        """
        self.path = self.path + "*/"

    def join(
        self,
        combiner: str,
        other: XPathExpr,
        closing_combiner: str | None = None,
        has_inner_condition: bool = False,
    ) -> Self:
        """Append *other* to this expression through *combiner*; return ``self``.

        With ``has_inner_condition`` false, *other*'s condition becomes this
        expression's condition. With it true, *other*'s condition is embedded
        in the element as a ``[...]`` predicate and this expression's own
        condition attribute is left untouched. ``closing_combiner``, when
        given, is appended after the element.
        """
        joined_path = str(self) + combiner
        # Any "star prefix" on the right-hand side is redundant when joining.
        if other.path != "*/":
            joined_path += other.path
        self.path = joined_path

        closing = closing_combiner if closing_combiner else ""
        if has_inner_condition:
            predicate = "[" + other.condition + "]" if other.condition else ""
            self.element = other.element + predicate + closing
        else:
            self.element = other.element + closing
            self.condition = other.condition
        return self

119 

120 

# Split a string at runs of single quotes, keeping the quote runs as items.
split_at_single_quotes = re.compile("('+)").split

# Fast-path check for names that can be written into XPath unquoted.
# The XML spec is more permissive than this pattern, but we don't bother:
# http://www.w3.org/TR/REC-xml/#NT-NameStartChar
is_safe_name = re.compile("^[a-zA-Z_][a-zA-Z0-9_.-]*$").match

# Matches only strings that are non-empty and contain no whitespace.
is_non_whitespace = re.compile(r"^[^ \t\r\n\f]+$").match

130 

131 

132#### Translation 

133 

134 

class GenericTranslator:
    """
    Translator for "generic" XML documents.

    Everything is case-sensitive, no assumption is made on the meaning
    of element names and attribute names.

    """

    ####
    ####  HERE BE DRAGONS
    ####
    ####  You are welcome to hook into this to change some behavior,
    ####  but do so at your own risks.
    ####  Until it has received a lot more work and review,
    ####  I reserve the right to change this API in backward-incompatible ways
    ####  with any minor version of cssselect.
    ####  See https://github.com/scrapy/cssselect/pull/22
    ####  -- Simon Sapin.
    ####

    combinator_mapping = {
        " ": "descendant",
        ">": "child",
        "+": "direct_adjacent",
        "~": "indirect_adjacent",
    }

    attribute_operator_mapping = {
        "exists": "exists",
        "=": "equals",
        "~=": "includes",
        "|=": "dashmatch",
        "^=": "prefixmatch",
        "$=": "suffixmatch",
        "*=": "substringmatch",
        "!=": "different",  # XXX Not in Level 3 but meh
    }

    #: The attribute used for ID selectors depends on the document language:
    #: http://www.w3.org/TR/selectors/#id-selectors
    id_attribute = "id"

    #: The attribute used for ``:lang()`` depends on the document language:
    #: http://www.w3.org/TR/selectors/#lang-pseudo
    lang_attribute = "xml:lang"

    #: The case sensitivity of document language element names,
    #: attribute names, and attribute values in selectors depends
    #: on the document language.
    #: http://www.w3.org/TR/selectors/#casesens
    #:
    #: When a document language defines one of these as case-insensitive,
    #: cssselect assumes that the document parser makes the parsed values
    #: lower-case. Making the selector lower-case too makes the comparison
    #: case-insensitive.
    #:
    #: In HTML, element names and attributes names (but not attribute values)
    #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4
    #: and HTMLParser make them lower-case in their parse result, so
    #: the assumption holds.
    lower_case_element_names = False
    lower_case_attribute_names = False
    lower_case_attribute_values = False

    # class used to represent an xpath expression
    xpathexpr_cls = XPathExpr

    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate a *group of selectors* to XPath.

        Pseudo-elements are not supported here since XPath only knows
        about "real" elements.

        :param css:
            A *group of selectors* as a string.
        :param prefix:
            This string is prepended to the XPath expression for each selector.
            The default makes selectors scoped to the context node’s subtree.
        :raises:
            :class:`~cssselect.SelectorSyntaxError` on invalid selectors,
            :class:`ExpressionError` on unknown/unsupported selectors,
            including pseudo-elements.
        :returns:
            The equivalent XPath 1.0 expression as a string.

        """
        return " | ".join(
            self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True)
            for selector in parse(css)
        )

    def selector_to_xpath(
        self,
        selector: Selector,
        prefix: str = "descendant-or-self::",
        translate_pseudo_elements: bool = False,
    ) -> str:
        """Translate a parsed selector to XPath.


        :param selector:
            A parsed :class:`Selector` object.
        :param prefix:
            This string is prepended to the resulting XPath expression.
            The default makes selectors scoped to the context node’s subtree.
        :param translate_pseudo_elements:
            Unless this is set to ``True`` (as :meth:`css_to_xpath` does),
            the :attr:`~Selector.pseudo_element` attribute of the selector
            is ignored.
            It is the caller's responsibility to reject selectors
            with pseudo-elements, or to account for them somehow.
        :raises:
            :class:`ExpressionError` on unknown/unsupported selectors.
        :returns:
            The equivalent XPath 1.0 expression as a string.

        """
        tree = getattr(selector, "parsed_tree", None)
        if not tree:
            raise TypeError(f"Expected a parsed selector, got {selector!r}")
        xpath = self.xpath(tree)
        assert isinstance(xpath, self.xpathexpr_cls)  # help debug a missing 'return'
        if translate_pseudo_elements and selector.pseudo_element:
            xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element)
        return (prefix or "") + str(xpath)

    def xpath_pseudo_element(
        self, xpath: XPathExpr, pseudo_element: PseudoElement
    ) -> XPathExpr:
        """Translate a pseudo-element.

        Defaults to not supporting pseudo-elements at all,
        but can be overridden by sub-classes.

        """
        raise ExpressionError("Pseudo-elements are not supported.")

    @staticmethod
    def xpath_literal(s: str) -> str:
        """Return *s* as an XPath string literal.

        Uses single quotes when *s* has none, double quotes when *s* has no
        double quote, and otherwise a ``concat(...)`` of pieces split at the
        runs of single quotes.
        """
        s = str(s)
        if "'" not in s:
            s = f"'{s}'"
        elif '"' not in s:
            s = f'"{s}"'
        else:
            parts_quoted = [
                f'"{part}"' if "'" in part else f"'{part}'"
                for part in split_at_single_quotes(s)
                if part
            ]
            s = "concat({})".format(",".join(parts_quoted))
        return s

    def xpath(self, parsed_selector: Tree) -> XPathExpr:
        """Translate any parsed selector object."""
        # Dispatch to ``xpath_<lower-cased class name>``.
        type_name = type(parsed_selector).__name__
        method = cast(
            "Callable[[Tree], XPathExpr] | None",
            getattr(self, f"xpath_{type_name.lower()}", None),
        )
        if method is None:
            raise ExpressionError(f"{type_name} is not supported.")
        return method(parsed_selector)

    # Dispatched by parsed object type

    def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr:
        """Translate a combined selector."""
        combinator = self.combinator_mapping[combined.combinator]
        method = cast(
            "Callable[[XPathExpr, XPathExpr], XPathExpr]",
            getattr(self, f"xpath_{combinator}_combinator"),
        )
        return method(self.xpath(combined.selector), self.xpath(combined.subselector))

    def xpath_negation(self, negation: Negation) -> XPathExpr:
        """Translate a negation: add ``not(<subselector condition>)``."""
        xpath = self.xpath(negation.selector)
        sub_xpath = self.xpath(negation.subselector)
        sub_xpath.add_name_test()
        if sub_xpath.condition:
            return xpath.add_condition(f"not({sub_xpath.condition})")
        # The subselector has no condition at all, so its negation
        # is a predicate that is always false.
        return xpath.add_condition("0")

    def xpath_relation(self, relation: Relation) -> XPathExpr:
        """Translate a relation, dispatching on its combinator.

        The method called is ``xpath_relation_<name>_combinator`` where
        ``<name>`` comes from :attr:`combinator_mapping`.
        """
        xpath = self.xpath(relation.selector)
        combinator = relation.combinator
        subselector = relation.subselector
        right = self.xpath(subselector.parsed_tree)
        method = cast(
            "Callable[[XPathExpr, XPathExpr], XPathExpr]",
            getattr(
                self,
                f"xpath_relation_{self.combinator_mapping[cast('str', combinator.value)]}_combinator",
            ),
        )
        return method(xpath, right)

    def _xpath_any_of(self, xpath: XPathExpr, selector_list: list[Tree]) -> XPathExpr:
        """Require *xpath* to satisfy at least one selector of *selector_list*.

        The alternatives are first OR-ed together and only then added to
        *xpath* as a single condition, so that a condition already present on
        *xpath* is AND-ed with the whole group instead of being OR-ed with
        the individual alternatives. Alternatives that translate to no
        condition are skipped.
        """
        disjunction = ""
        for selector in selector_list:
            expr = self.xpath(selector)
            expr.add_name_test()
            if not expr.condition:
                continue
            if disjunction:
                disjunction = f"({disjunction}) or ({expr.condition})"
            else:
                disjunction = expr.condition
        if disjunction:
            xpath.add_condition(disjunction)
        return xpath

    def xpath_matching(self, matching: Matching) -> XPathExpr:
        """Translate a matching selector list (any of the alternatives)."""
        return self._xpath_any_of(self.xpath(matching.selector), matching.selector_list)

    def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr:
        """Translate a specificity adjustment (any of the alternatives)."""
        return self._xpath_any_of(self.xpath(matching.selector), matching.selector_list)

    def xpath_function(self, function: Function) -> XPathExpr:
        """Translate a functional pseudo-class."""
        method_name = "xpath_{}_function".format(function.name.replace("-", "_"))
        method = cast(
            "Callable[[XPathExpr, Function], XPathExpr] | None",
            getattr(self, method_name, None),
        )
        if not method:
            raise ExpressionError(f"The pseudo-class :{function.name}() is unknown")
        return method(self.xpath(function.selector), function)

    def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr:
        """Translate a pseudo-class."""
        method_name = "xpath_{}_pseudo".format(pseudo.ident.replace("-", "_"))
        method = cast(
            "Callable[[XPathExpr], XPathExpr] | None",
            getattr(self, method_name, None),
        )
        if not method:
            # TODO: better error message for pseudo-elements?
            raise ExpressionError(f"The pseudo-class :{pseudo.ident} is unknown")
        return method(self.xpath(pseudo.selector))

    def xpath_attrib(self, selector: Attrib) -> XPathExpr:
        """Translate an attribute selector."""
        operator = self.attribute_operator_mapping[selector.operator]
        method = cast(
            "Callable[[XPathExpr, str, str | None], XPathExpr]",
            getattr(self, f"xpath_attrib_{operator}"),
        )
        if self.lower_case_attribute_names:
            name = selector.attrib.lower()
        else:
            name = selector.attrib
        safe = is_safe_name(name)
        if selector.namespace:
            name = f"{selector.namespace}:{name}"
            safe = safe and is_safe_name(selector.namespace)
        if safe:
            attrib = "@" + name
        else:
            # Names outside the safe pattern are compared as quoted literals.
            attrib = f"attribute::*[name() = {self.xpath_literal(name)}]"
        if selector.value is None:
            value = None
        elif self.lower_case_attribute_values:
            value = cast("str", selector.value.value).lower()
        else:
            value = selector.value.value
        return method(self.xpath(selector.selector), attrib, value)

    def xpath_class(self, class_selector: Class) -> XPathExpr:
        """Translate a class selector."""
        # .foo is defined as [class~=foo] in the spec.
        xpath = self.xpath(class_selector.selector)
        return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name)

    def xpath_hash(self, id_selector: Hash) -> XPathExpr:
        """Translate an ID selector."""
        xpath = self.xpath(id_selector.selector)
        # Honour :attr:`id_attribute` so that sub-classes for other document
        # languages can change which attribute holds the ID.
        return self.xpath_attrib_equals(xpath, "@" + self.id_attribute, id_selector.id)

    def xpath_element(self, selector: Element) -> XPathExpr:
        """Translate a type or universal selector."""
        element = selector.element
        if not element:
            element = "*"
            safe = True
        else:
            safe = bool(is_safe_name(element))
            if self.lower_case_element_names:
                element = element.lower()
        if selector.namespace:
            # Namespace prefixes are case-sensitive.
            # http://www.w3.org/TR/css3-namespace/#prefixes
            element = f"{selector.namespace}:{element}"
            safe = safe and bool(is_safe_name(selector.namespace))
        xpath = self.xpathexpr_cls(element=element)
        if not safe:
            xpath.add_name_test()
        return xpath

    # CombinedSelector: dispatch by combinator

    def xpath_descendant_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a child, grand-child or further descendant of left"""
        return left.join("/descendant-or-self::*/", right)

    def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is an immediate child of left"""
        return left.join("/", right)

    def xpath_direct_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling immediately after left"""
        xpath = left.join("/following-sibling::", right)
        xpath.add_name_test()
        return xpath.add_condition("position() = 1")

    def xpath_indirect_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling after left, immediately or not"""
        return left.join("/following-sibling::", right)

    def xpath_relation_descendant_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a child, grand-child or further descendant of left; select left"""
        return left.join(
            "[descendant::", right, closing_combiner="]", has_inner_condition=True
        )

    def xpath_relation_child_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is an immediate child of left; select left"""
        # has_inner_condition keeps right's condition inside the predicate,
        # on the child, instead of applying it to the selected (left) element.
        return left.join(
            "[./", right, closing_combiner="]", has_inner_condition=True
        )

    def xpath_relation_direct_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling immediately after left; select left"""
        tests = []
        # A wildcard element has no name to compare against.
        if right.element != "*":
            tests.append(f"(name() = {self.xpath_literal(right.element)})")
        tests.append("(position() = 1)")
        # Keep right's own condition; it is evaluated on the sibling.
        if right.condition:
            tests.append(f"({right.condition})")
        return left.add_condition(f"following-sibling::*[{' and '.join(tests)}]")

    def xpath_relation_indirect_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling after left, immediately or not; select left"""
        # has_inner_condition keeps right's condition inside the predicate,
        # on the sibling, instead of applying it to the selected (left) element.
        return left.join(
            "[following-sibling::",
            right,
            closing_combiner="]",
            has_inner_condition=True,
        )

    # Function: dispatch by function/pseudo-class name

    def xpath_nth_child_function(
        self,
        xpath: XPathExpr,
        function: Function,
        last: bool = False,
        add_name_test: bool = True,
    ) -> XPathExpr:
        """Translate ``:nth-child()`` and, through the flags, its variants.

        :param last:
            Count following siblings instead of preceding ones.
        :param add_name_test:
            When true, count siblings of any name (``*``); when false, count
            only siblings matching ``xpath.element``.
        :raises:
            :class:`ExpressionError` when the arguments are not a valid series.
        """
        try:
            a, b = parse_series(function.arguments)
        except ValueError as ex:
            raise ExpressionError(f"Invalid series: '{function.arguments!r}'") from ex

        # From https://www.w3.org/TR/css3-selectors/#structural-pseudos:
        #
        # :nth-child(an+b)
        #       an+b-1 siblings before
        #
        # :nth-last-child(an+b)
        #       an+b-1 siblings after
        #
        # :nth-of-type(an+b)
        #       an+b-1 siblings with the same expanded element name before
        #
        # :nth-last-of-type(an+b)
        #       an+b-1 siblings with the same expanded element name after
        #
        # So,
        # for :nth-child and :nth-of-type
        #
        #    count(preceding-sibling::<nodetest>) = an+b-1
        #
        # for :nth-last-child and :nth-last-of-type
        #
        #    count(following-sibling::<nodetest>) = an+b-1
        #
        # therefore,
        #    count(...) - (b-1) ≡ 0 (mod a)
        #
        # if a == 0:
        # ~~~~~~~~~~
        #    count(...) = b-1
        #
        # if a < 0:
        # ~~~~~~~~~
        #    count(...) - b +1 <= 0
        # -> count(...) <= b-1
        #
        # if a > 0:
        # ~~~~~~~~~
        #    count(...) - b +1 >= 0
        # -> count(...) >= b-1

        # work with b-1 instead
        b_min_1 = b - 1

        # early-exit condition 1:
        # ~~~~~~~~~~~~~~~~~~~~~~~
        # for a == 1, nth-*(an+b) means n+b-1 siblings before/after,
        # and since n ∈ {0, 1, 2, ...}, if b-1<=0,
        # there is always an "n" matching any number of siblings (maybe none)
        if a == 1 and b_min_1 <= 0:
            return xpath

        # early-exit condition 2:
        # ~~~~~~~~~~~~~~~~~~~~~~~
        # an+b-1 siblings with a<0 and (b-1)<0 is not possible
        if a < 0 and b_min_1 < 0:
            return xpath.add_condition("0")

        # `add_name_test` boolean is inverted and somewhat counter-intuitive:
        #
        # nth_of_type() calls nth_child(add_name_test=False)
        nodetest = "*" if add_name_test else f"{xpath.element}"

        # count siblings before or after the element
        if not last:
            siblings_count = f"count(preceding-sibling::{nodetest})"
        else:
            siblings_count = f"count(following-sibling::{nodetest})"

        # special case of fixed position: nth-*(0n+b)
        # if a == 0:
        # ~~~~~~~~~~
        #    count(***-sibling::***) = b-1
        if a == 0:
            return xpath.add_condition(f"{siblings_count} = {b_min_1}")

        expressions = []

        if a > 0:
            # siblings count, an+b-1, is always >= 0,
            # so if a>0, and (b-1)<=0, an "n" exists to satisfy this,
            # therefore, the predicate is only interesting if (b-1)>0
            if b_min_1 > 0:
                expressions.append(f"{siblings_count} >= {b_min_1}")
        else:
            # if a<0, and (b-1)<0, no "n" satisfies this,
            # this is tested above as an early exit condition
            # otherwise,
            expressions.append(f"{siblings_count} <= {b_min_1}")

        # operations modulo 1 or -1 are simpler, one only needs to verify:
        #
        # - either:
        # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc.,
        #   i.e. count(***-sibling::***) >= (b-1)
        #
        # - or:
        # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc.,
        #   i.e. count(***-sibling::***) <= (b-1)
        # as we just did above.
        #
        if abs(a) != 1:
            # count(***-sibling::***) - (b-1) ≡ 0 (mod a)
            left = siblings_count

            # apply "modulo a" on 2nd term, -(b-1),
            # to simplify things like "(... +6) % -3",
            # and also make it positive with |a|
            b_neg = (-b_min_1) % abs(a)

            if b_neg != 0:
                left = f"({left} +{b_neg})"

            expressions.append(f"{left} mod {a} = 0")

        template = "(%s)" if len(expressions) > 1 else "%s"
        xpath.add_condition(
            " and ".join(template % expression for expression in expressions)
        )
        return xpath

    def xpath_nth_last_child_function(
        self, xpath: XPathExpr, function: Function
    ) -> XPathExpr:
        return self.xpath_nth_child_function(xpath, function, last=True)

    def xpath_nth_of_type_function(
        self, xpath: XPathExpr, function: Function
    ) -> XPathExpr:
        if xpath.element == "*":
            raise ExpressionError("*:nth-of-type() is not implemented")
        return self.xpath_nth_child_function(xpath, function, add_name_test=False)

    def xpath_nth_last_of_type_function(
        self, xpath: XPathExpr, function: Function
    ) -> XPathExpr:
        if xpath.element == "*":
            raise ExpressionError("*:nth-last-of-type() is not implemented")
        return self.xpath_nth_child_function(
            xpath, function, last=True, add_name_test=False
        )

    def xpath_contains_function(
        self, xpath: XPathExpr, function: Function
    ) -> XPathExpr:
        # Defined there, removed in later drafts:
        # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for :contains(), got {function.arguments!r}"
            )
        value = cast("str", function.arguments[0].value)
        return xpath.add_condition(f"contains(., {self.xpath_literal(value)})")

    def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for :lang(), got {function.arguments!r}"
            )
        value = cast("str", function.arguments[0].value)
        return xpath.add_condition(f"lang({self.xpath_literal(value)})")

    # Pseudo: dispatch by pseudo-class name

    def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        return xpath.add_condition("not(parent::*)")

    # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div")
    # Works only at the start of a selector
    # Needed to get immediate children of a processed selector in Scrapy
    # for product in response.css('.product'):
    #     description = product.css(':scope > div::text').get()
    def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        return xpath.add_condition("1")

    def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        return xpath.add_condition("count(preceding-sibling::*) = 0")

    def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        return xpath.add_condition("count(following-sibling::*) = 0")

    def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        if xpath.element == "*":
            raise ExpressionError("*:first-of-type is not implemented")
        return xpath.add_condition(f"count(preceding-sibling::{xpath.element}) = 0")

    def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        if xpath.element == "*":
            raise ExpressionError("*:last-of-type is not implemented")
        return xpath.add_condition(f"count(following-sibling::{xpath.element}) = 0")

    def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        return xpath.add_condition("count(parent::*/child::*) = 1")

    def xpath_only_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        if xpath.element == "*":
            raise ExpressionError("*:only-of-type is not implemented")
        return xpath.add_condition(f"count(parent::*/child::{xpath.element}) = 1")

    def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        return xpath.add_condition("not(*) and not(string-length())")

    def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr:
        """Common implementation for pseudo-classes that never match."""
        return xpath.add_condition("0")

    xpath_link_pseudo = pseudo_never_matches
    xpath_visited_pseudo = pseudo_never_matches
    xpath_hover_pseudo = pseudo_never_matches
    xpath_active_pseudo = pseudo_never_matches
    xpath_focus_pseudo = pseudo_never_matches
    xpath_target_pseudo = pseudo_never_matches
    xpath_enabled_pseudo = pseudo_never_matches
    xpath_disabled_pseudo = pseudo_never_matches
    xpath_checked_pseudo = pseudo_never_matches

    # Attrib: dispatch by attribute operator

    def xpath_attrib_exists(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        assert not value
        xpath.add_condition(name)
        return xpath

    def xpath_attrib_equals(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        assert value is not None
        xpath.add_condition(f"{name} = {self.xpath_literal(value)}")
        return xpath

    def xpath_attrib_different(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        assert value is not None
        # FIXME: this seems like a weird hack...
        if value:
            xpath.add_condition(f"not({name}) or {name} != {self.xpath_literal(value)}")
        else:
            xpath.add_condition(f"{name} != {self.xpath_literal(value)}")
        return xpath

    def xpath_attrib_includes(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        if value and is_non_whitespace(value):
            arg = self.xpath_literal(" " + value + " ")
            xpath.add_condition(
                f"{name} and contains(concat(' ', normalize-space({name}), ' '), {arg})"
            )
        else:
            # An empty value, or one containing whitespace, never matches.
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_dashmatch(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        assert value is not None
        arg = self.xpath_literal(value)
        arg_dash = self.xpath_literal(value + "-")
        # Weird, but true...
        xpath.add_condition(
            f"{name} and ({name} = {arg} or starts-with({name}, {arg_dash}))"
        )
        return xpath

    def xpath_attrib_prefixmatch(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        if value:
            xpath.add_condition(
                f"{name} and starts-with({name}, {self.xpath_literal(value)})"
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_suffixmatch(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        if value:
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            xpath.add_condition(
                f"{name} and substring({name}, string-length({name})-{len(value) - 1}) = {self.xpath_literal(value)}"
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_substringmatch(
        self, xpath: XPathExpr, name: str, value: str | None
    ) -> XPathExpr:
        if value:
            # Attribute selectors are case sensitive
            xpath.add_condition(
                f"{name} and contains({name}, {self.xpath_literal(value)})"
            )
        else:
            xpath.add_condition("0")
        return xpath

800 

801 

class HTMLTranslator(GenericTranslator):
    """
    Translator for (X)HTML documents.

    Has a more useful implementation of some pseudo-classes based on
    HTML-specific element names and attribute names, as described in
    the `HTML5 specification`_. It assumes no-quirks mode.
    The API is the same as :class:`GenericTranslator`.

    .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors

    :param xhtml:
        If false (the default), element names and attribute names
        are case-insensitive.

    """

    lang_attribute = "lang"

    def __init__(self, xhtml: bool = False) -> None:
        self.xhtml = xhtml  # Might be useful for sub-classes?
        if not xhtml:
            # See their definition in GenericTranslator.
            self.lower_case_element_names = True
            self.lower_case_attribute_names = True

    def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore[override]
        # FIXME: is this really all the elements?
        return xpath.add_condition(
            "(@selected and name(.) = 'option') or "
            "(@checked "
            "and (name(.) = 'input' or name(.) = 'command')"
            "and (@type = 'checkbox' or @type = 'radio'))"
        )

    def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:lang()`` using the nearest element carrying the language attribute.

        The comparison is a case-insensitive prefix match on
        :attr:`lang_attribute` followed by ``-``.

        :raises:
            :class:`ExpressionError` unless there is exactly one string or
            ident argument.
        """
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for :lang(), got {function.arguments!r}"
            )
        value = function.arguments[0].value
        assert value
        arg = self.xpath_literal(value.lower() + "-")
        return xpath.add_condition(
            # Filter ancestors on the same attribute that is read below, so
            # that overriding ``lang_attribute`` stays self-consistent.
            f"ancestor-or-self::*[@{self.lang_attribute}][1][starts-with(concat("
            # XPath 1.0 has no lower-case function...
            f"translate(@{self.lang_attribute}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
            "'abcdefghijklmnopqrstuvwxyz'), "
            f"'-'), {arg})]"
        )

    def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore[override]
        return xpath.add_condition(
            "@href and (name(.) = 'a' or name(.) = 'link' or name(.) = 'area')"
        )

    # Links are never visited, the implementation for :visited is the same
    # as in GenericTranslator

    def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore[override]
        # http://www.w3.org/TR/html5/section-index.html#attributes-1
        return xpath.add_condition(
            """
        (
            @disabled and
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea' or
                name(.) = 'command' or
                name(.) = 'fieldset' or
                name(.) = 'optgroup' or
                name(.) = 'option'
            )
        ) or (
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea'
            )
            and ancestor::fieldset[@disabled]
        )
        """
        )
        # FIXME: in the second half, add "and is not a descendant of that
        # fieldset element's first legend element child, if any."

    def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore[override]
        # http://www.w3.org/TR/html5/section-index.html#attributes-1
        return xpath.add_condition(
            """
        (
            @href and (
                name(.) = 'a' or
                name(.) = 'link' or
                name(.) = 'area'
            )
        ) or (
            (
                name(.) = 'command' or
                name(.) = 'fieldset' or
                name(.) = 'optgroup'
            )
            and not(@disabled)
        ) or (
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea' or
                name(.) = 'keygen'
            )
            and not (@disabled or ancestor::fieldset[@disabled])
        ) or (
            name(.) = 'option' and not(
                @disabled or ancestor::optgroup[@disabled]
            )
        )
        """
        )
        # FIXME: ... or "li elements that are children of menu elements,
        # and that have a child element that defines a command, if the first
        # such element's Disabled State facet is false (not disabled)".
        # FIXME: after ancestor::fieldset[@disabled], add "and is not a
        # descendant of that fieldset element's first legend element child,
        # if any."

929 # if any."