Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/cssselect/xpath.py: 75%

343 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-09 06:19 +0000

1# -*- coding: utf-8 -*- 

2""" 

3 cssselect.xpath 

4 =============== 

5 

6 Translation of parsed CSS selectors to XPath expressions. 

7 

8 

9 :copyright: (c) 2007-2012 Ian Bicking and contributors. 

10 See AUTHORS for more details. 

11 :license: BSD, see LICENSE for more details. 

12 

13""" 

14 

15import re 

16import typing 

17import warnings 

18from typing import Optional 

19 

20from cssselect.parser import ( 

21 parse, 

22 parse_series, 

23 PseudoElement, 

24 Selector, 

25 SelectorError, 

26 Tree, 

27 Element, 

28 Hash, 

29 Class, 

30 Function, 

31 Pseudo, 

32 Attrib, 

33 Negation, 

34 Relation, 

35 Matching, 

36 SpecificityAdjustment, 

37 CombinedSelector, 

38) 

39 

40 

@typing.no_type_check
def _unicode_safe_getattr(obj, name, default=None):
    """Deprecated shim around the built-in :func:`getattr`.

    Emits a :class:`DeprecationWarning` and then returns
    ``getattr(obj, name, default)``.
    """
    message = (
        "_unicode_safe_getattr is deprecated and will be removed in the"
        " next release, use getattr() instead"
    )
    # stacklevel=2 attributes the warning to the caller rather than this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return getattr(obj, name, default)

50 

51 

class ExpressionError(SelectorError, RuntimeError):
    """Unknown or unsupported selector (eg. pseudo-class).

    Raised by the translators in this module when a parsed selector
    cannot be expressed in XPath, e.g. an unknown pseudo-class or
    function, a pseudo-element, or an invalid ``an+b`` series.
    """

54 

55 

56#### XPath Helpers 

57 

58 

class XPathExpr:
    """Mutable builder for a single XPath step.

    The expression renders as ``path + element`` followed by
    ``[condition]`` when a condition is set.
    """

    def __init__(
        self, path: str = "", element: str = "*", condition: str = "", star_prefix: bool = False
    ) -> None:
        # ``star_prefix`` is accepted for signature compatibility only;
        # nothing in this class reads it.
        self.path, self.element, self.condition = path, element, condition

    def __str__(self) -> str:
        rendered = str(self.path) + str(self.element)
        if not self.condition:
            return rendered
        return rendered + "[%s]" % self.condition

    def __repr__(self) -> str:
        return "%s[%s]" % (self.__class__.__name__, self)

    def add_condition(self, condition: str, conjuction: str = "and") -> "XPathExpr":
        """Combine *condition* with the current one and return ``self``.

        When a condition already exists, both sides are parenthesised and
        joined with *conjuction*; otherwise *condition* is stored as-is.
        """
        existing = self.condition
        if existing:
            condition = "(%s) %s (%s)" % (existing, conjuction, condition)
        self.condition = condition
        return self

    def add_name_test(self) -> None:
        """Turn a named element test into a ``name() = ...`` condition.

        Does nothing for the universal element ``*``; otherwise the element
        becomes ``*`` and the name check moves into the condition.
        """
        if self.element != "*":
            quoted_name = GenericTranslator.xpath_literal(self.element)
            self.add_condition("name() = %s" % quoted_name)
            self.element = "*"

    def add_star_prefix(self) -> None:
        """
        Append '*/' to the path to keep the context constrained
        to a single parent.
        """
        self.path = self.path + "*/"

    def join(
        self,
        combiner: str,
        other: "XPathExpr",
        closing_combiner: Optional[str] = None,
        has_inner_condition: bool = False,
    ) -> "XPathExpr":
        """Append *other* to this expression using *combiner*; return ``self``.

        The current rendering of ``self`` plus *combiner* becomes the new
        path.  With ``has_inner_condition`` the condition of *other* is
        folded into the element text (before *closing_combiner*) and this
        expression's own condition is left untouched; without it,
        *closing_combiner* is appended to the element and the condition of
        *other* replaces the current one.
        """
        # Render first: the assignments below change what str(self) yields.
        new_path = str(self) + combiner
        # Any "star prefix" is redundant when joining.
        if other.path != "*/":
            new_path += other.path
        self.path = new_path

        suffix = closing_combiner if closing_combiner else ""
        if has_inner_condition:
            inner = "[" + other.condition + "]" if other.condition else ""
            self.element = other.element + inner + suffix
        else:
            self.element = other.element + suffix
            self.condition = other.condition
        return self

119 

120 

# Splits a string around runs of single quotes.  The capturing group makes
# ``re.split`` keep the quote runs themselves in the result.
split_at_single_quotes = re.compile(r"('+)").split

# The spec is actually more permissive than that, but don't bother.
# This is just for the fast path.
# http://www.w3.org/TR/REC-xml/#NT-NameStartChar
# Anchored with ``\Z`` rather than ``$``: ``$`` also matches just before a
# trailing newline, which would let "name\n" through as "safe".
is_safe_name = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_.-]*\Z").match

# Test that the string is not empty and does not contain whitespace.
# ``\Z`` (not ``$``) so that a trailing newline is rejected as well.
is_non_whitespace = re.compile(r"^[^ \t\r\n\f]+\Z").match

130 

131 

132#### Translation 

133 

134 

class GenericTranslator:
    """
    Translator for "generic" XML documents.

    Everything is case-sensitive, no assumption is made on the meaning
    of element names and attribute names.

    """

    ####
    ####  HERE BE DRAGONS
    ####
    ####  You are welcome to hook into this to change some behavior,
    ####  but do so at your own risks.
    ####  Until it has received a lot more work and review,
    ####  I reserve the right to change this API in backward-incompatible ways
    ####  with any minor version of cssselect.
    ####  See https://github.com/scrapy/cssselect/pull/22
    ####  -- Simon Sapin.
    ####

    # CSS combinator -> suffix of the ``xpath_*_combinator`` method name.
    combinator_mapping = {
        " ": "descendant",
        ">": "child",
        "+": "direct_adjacent",
        "~": "indirect_adjacent",
    }

    # CSS attribute operator -> suffix of the ``xpath_attrib_*`` method name.
    attribute_operator_mapping = {
        "exists": "exists",
        "=": "equals",
        "~=": "includes",
        "|=": "dashmatch",
        "^=": "prefixmatch",
        "$=": "suffixmatch",
        "*=": "substringmatch",
        "!=": "different",  # XXX Not in Level 3 but meh
    }

    #: The attribute used for ID selectors depends on the document language:
    #: http://www.w3.org/TR/selectors/#id-selectors
    id_attribute = "id"

    #: The attribute used for ``:lang()`` depends on the document language:
    #: http://www.w3.org/TR/selectors/#lang-pseudo
    lang_attribute = "xml:lang"

    #: The case sensitivity of document language element names,
    #: attribute names, and attribute values in selectors depends
    #: on the document language.
    #: http://www.w3.org/TR/selectors/#casesens
    #:
    #: When a document language defines one of these as case-insensitive,
    #: cssselect assumes that the document parser makes the parsed values
    #: lower-case. Making the selector lower-case too makes the comparison
    #: case-insensitive.
    #:
    #: In HTML, element names and attributes names (but not attribute values)
    #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4
    #: and HTMLParser make them lower-case in their parse result, so
    #: the assumption holds.
    lower_case_element_names = False
    lower_case_attribute_names = False
    lower_case_attribute_values = False

    # class used to represent an xpath expression
    xpathexpr_cls = XPathExpr

    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate a *group of selectors* to XPath.

        Pseudo-elements are not supported here since XPath only knows
        about "real" elements.

        :param css:
            A *group of selectors* as a string.
        :param prefix:
            This string is prepended to the XPath expression for each selector.
            The default makes selectors scoped to the context node’s subtree.
        :raises:
            :class:`~cssselect.SelectorSyntaxError` on invalid selectors,
            :class:`ExpressionError` on unknown/unsupported selectors,
            including pseudo-elements.
        :returns:
            The equivalent XPath 1.0 expression as a string.

        """
        return " | ".join(
            self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True)
            for selector in parse(css)
        )

    def selector_to_xpath(
        self,
        selector: Selector,
        prefix: str = "descendant-or-self::",
        translate_pseudo_elements: bool = False,
    ) -> str:
        """Translate a parsed selector to XPath.


        :param selector:
            A parsed :class:`Selector` object.
        :param prefix:
            This string is prepended to the resulting XPath expression.
            The default makes selectors scoped to the context node’s subtree.
        :param translate_pseudo_elements:
            Unless this is set to ``True`` (as :meth:`css_to_xpath` does),
            the :attr:`~Selector.pseudo_element` attribute of the selector
            is ignored.
            It is the caller's responsibility to reject selectors
            with pseudo-elements, or to account for them somehow.
        :raises:
            :class:`ExpressionError` on unknown/unsupported selectors.
        :returns:
            The equivalent XPath 1.0 expression as a string.

        """
        tree = getattr(selector, "parsed_tree", None)
        if not tree:
            raise TypeError("Expected a parsed selector, got %r" % (selector,))
        xpath = self.xpath(tree)
        assert isinstance(xpath, self.xpathexpr_cls)  # help debug a missing 'return'
        if translate_pseudo_elements and selector.pseudo_element:
            xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element)
        return (prefix or "") + str(xpath)

    def xpath_pseudo_element(self, xpath: XPathExpr, pseudo_element: PseudoElement) -> XPathExpr:
        """Translate a pseudo-element.

        Defaults to not supporting pseudo-elements at all,
        but can be overridden by sub-classes.

        """
        raise ExpressionError("Pseudo-elements are not supported.")

    @staticmethod
    def xpath_literal(s: str) -> str:
        """Return *s* quoted as an XPath 1.0 string literal.

        XPath 1.0 has no escape syntax inside string literals, so a value
        containing both quote characters is emitted as a ``concat(...)`` of
        pieces, each wrapped in whichever quote it does not contain.
        """
        s = str(s)
        if "'" not in s:
            return "'%s'" % s
        if '"' not in s:
            return '"%s"' % s
        # Runs of single quotes go in double quotes, everything else in
        # single quotes; empty pieces produced by the split are dropped.
        pieces = [
            ('"%s"' if "'" in part else "'%s'") % part
            for part in split_at_single_quotes(s)
            if part
        ]
        return "concat(%s)" % ",".join(pieces)

    def xpath(self, parsed_selector: Tree) -> XPathExpr:
        """Translate any parsed selector object.

        Dispatches to ``xpath_<lower-cased type name>``.

        :raises: :class:`ExpressionError` when no such method exists.
        """
        type_name = type(parsed_selector).__name__
        method = getattr(self, "xpath_%s" % type_name.lower(), None)
        if method is None:
            raise ExpressionError("%s is not supported." % type_name)
        return typing.cast(XPathExpr, method(parsed_selector))

    # Dispatched by parsed object type

    def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr:
        """Translate a combined selector."""
        combinator = self.combinator_mapping[combined.combinator]
        method = getattr(self, "xpath_%s_combinator" % combinator)
        return typing.cast(
            XPathExpr, method(self.xpath(combined.selector), self.xpath(combined.subselector))
        )

    def xpath_negation(self, negation: Negation) -> XPathExpr:
        """Translate ``:not()``.

        The sub-selector's name test is moved into its condition so the
        whole thing can be wrapped in ``not(...)``.  A sub-selector with no
        condition at all gets the never-true condition ``0``.
        """
        xpath = self.xpath(negation.selector)
        sub_xpath = self.xpath(negation.subselector)
        sub_xpath.add_name_test()
        if sub_xpath.condition:
            return xpath.add_condition("not(%s)" % sub_xpath.condition)
        else:
            return xpath.add_condition("0")

    def xpath_relation(self, relation: Relation) -> XPathExpr:
        """Translate a relational selector by dispatching on its combinator
        to the matching ``xpath_relation_*_combinator`` method."""
        xpath = self.xpath(relation.selector)
        combinator = relation.combinator
        subselector = relation.subselector
        right = self.xpath(subselector.parsed_tree)
        method = getattr(
            self,
            "xpath_relation_%s_combinator"
            % self.combinator_mapping[typing.cast(str, combinator.value)],
        )
        return typing.cast(XPathExpr, method(xpath, right))

    def _add_alternatives(self, xpath: XPathExpr, selector_list) -> XPathExpr:
        """AND onto *xpath* the requirement that any one alternative matches.

        Shared by the ``Matching`` and ``SpecificityAdjustment`` translations.

        The alternatives are OR-ed *among themselves* first and only then
        AND-ed with whatever condition *xpath* already carries; OR-ing each
        alternative straight onto *xpath* would turn ``a.x:is(.b)`` into
        "x or b".  When *xpath* has no condition yet the resulting text is
        the same as a plain left-to-right "or" fold.
        """
        # Translate everything up front so unsupported alternatives still
        # raise even when an earlier alternative is universal.
        alternatives = [self.xpath(selector) for selector in selector_list]
        combined = ""
        for alternative in alternatives:
            alternative.add_name_test()
            if not alternative.condition:
                if not alternative.path:
                    # A bare ``*`` matches every element, so the whole
                    # alternation is always true: nothing to add.
                    return xpath
                # Only the final step's condition is translated here; a
                # condition-less step behind a path contributes nothing.
                continue
            if combined:
                combined = "(%s) or (%s)" % (combined, alternative.condition)
            else:
                combined = alternative.condition
        if combined:
            xpath.add_condition(combined)
        return xpath

    def xpath_matching(self, matching: Matching) -> XPathExpr:
        """Translate a ``Matching`` node: the element must match at least
        one selector of ``matching.selector_list``."""
        return self._add_alternatives(self.xpath(matching.selector), matching.selector_list)

    def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr:
        """Translate a ``SpecificityAdjustment`` node; the translation is the
        same as for ``Matching``."""
        return self._add_alternatives(self.xpath(matching.selector), matching.selector_list)

    def xpath_function(self, function: Function) -> XPathExpr:
        """Translate a functional pseudo-class."""
        method_name = "xpath_%s_function" % function.name.replace("-", "_")
        method = getattr(self, method_name, None)
        if not method:
            raise ExpressionError("The pseudo-class :%s() is unknown" % function.name)
        return typing.cast(XPathExpr, method(self.xpath(function.selector), function))

    def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr:
        """Translate a pseudo-class."""
        method_name = "xpath_%s_pseudo" % pseudo.ident.replace("-", "_")
        method = getattr(self, method_name, None)
        if not method:
            # TODO: better error message for pseudo-elements?
            raise ExpressionError("The pseudo-class :%s is unknown" % pseudo.ident)
        return typing.cast(XPathExpr, method(self.xpath(pseudo.selector)))

    def xpath_attrib(self, selector: Attrib) -> XPathExpr:
        """Translate an attribute selector."""
        operator = self.attribute_operator_mapping[selector.operator]
        method = getattr(self, "xpath_attrib_%s" % operator)
        if self.lower_case_attribute_names:
            name = selector.attrib.lower()
        else:
            name = selector.attrib
        safe = is_safe_name(name)
        if selector.namespace:
            name = "%s:%s" % (selector.namespace, name)
            safe = safe and is_safe_name(selector.namespace)
        if safe:
            attrib = "@" + name
        else:
            # Names outside the "safe" pattern are compared as strings
            # instead of being spliced into the expression.
            attrib = "attribute::*[name() = %s]" % self.xpath_literal(name)
        if selector.value is None:
            value = None
        elif self.lower_case_attribute_values:
            value = typing.cast(str, selector.value.value).lower()
        else:
            value = selector.value.value
        return typing.cast(XPathExpr, method(self.xpath(selector.selector), attrib, value))

    def xpath_class(self, class_selector: Class) -> XPathExpr:
        """Translate a class selector."""
        # .foo is defined as [class~=foo] in the spec.
        xpath = self.xpath(class_selector.selector)
        return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name)

    def xpath_hash(self, id_selector: Hash) -> XPathExpr:
        """Translate an ID selector.

        The attribute compared against is :attr:`id_attribute`, so
        sub-classes can point ID selectors at another attribute.
        """
        xpath = self.xpath(id_selector.selector)
        return self.xpath_attrib_equals(xpath, "@" + self.id_attribute, id_selector.id)

    def xpath_element(self, selector: Element) -> XPathExpr:
        """Translate a type or universal selector."""
        element = selector.element
        if not element:
            element = "*"
            safe = True
        else:
            safe = bool(is_safe_name(element))
            if self.lower_case_element_names:
                element = element.lower()
        if selector.namespace:
            # Namespace prefixes are case-sensitive.
            # http://www.w3.org/TR/css3-namespace/#prefixes
            element = "%s:%s" % (selector.namespace, element)
            safe = safe and bool(is_safe_name(selector.namespace))
        xpath = self.xpathexpr_cls(element=element)
        if not safe:
            xpath.add_name_test()
        return xpath

    # CombinedSelector: dispatch by combinator

    def xpath_descendant_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is a child, grand-child or further descendant of left"""
        return left.join("/descendant-or-self::*/", right)

    def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is an immediate child of left"""
        return left.join("/", right)

    def xpath_direct_adjacent_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is a sibling immediately after left"""
        xpath = left.join("/following-sibling::", right)
        # The name test has to move into the predicate so that
        # ``position() = 1`` counts all following siblings.
        xpath.add_name_test()
        return xpath.add_condition("position() = 1")

    def xpath_indirect_adjacent_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is a sibling after left, immediately or not"""
        return left.join("/following-sibling::", right)

    def xpath_relation_descendant_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is a child, grand-child or further descendant of left; select left"""
        return left.join("[descendant::", right, closing_combiner="]", has_inner_condition=True)

    def xpath_relation_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is an immediate child of left; select left"""
        return left.join("[./", right, closing_combiner="]")

    def xpath_relation_direct_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling immediately after left; select left

        The predicate is built from up to three tests: the name of *right*
        (skipped for the universal ``*``, which a ``name()`` comparison
        could never equal), ``position() = 1``, and the condition carried
        by *right*, if any.
        """
        tests = []
        if right.element != "*":
            tests.append("name() = %s" % self.xpath_literal(right.element))
        tests.append("position() = 1")
        if right.condition:
            tests.append(right.condition)
        if len(tests) == 1:
            predicate = tests[0]
        else:
            predicate = " and ".join("(%s)" % test for test in tests)
        return left.add_condition("following-sibling::*[%s]" % predicate)

    def xpath_relation_indirect_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling after left, immediately or not; select left"""
        return left.join("[following-sibling::", right, closing_combiner="]")

    # Function: dispatch by function/pseudo-class name

    def xpath_nth_child_function(
        self, xpath: XPathExpr, function: Function, last: bool = False, add_name_test: bool = True
    ) -> XPathExpr:
        """Translate ``:nth-child()`` and, via the flags, its three variants.

        :param last:
            Count following siblings instead of preceding ones
            (the ``:nth-last-*`` forms).
        :param add_name_test:
            When true, siblings of any name are counted; when false, only
            siblings matching ``xpath.element`` (the ``*-of-type`` forms).
        :raises:
            :class:`ExpressionError` when the arguments are not a valid
            ``an+b`` series.
        """
        try:
            a, b = parse_series(function.arguments)
        except ValueError:
            raise ExpressionError("Invalid series: '%r'" % function.arguments)

        # From https://www.w3.org/TR/css3-selectors/#structural-pseudos:
        #
        # :nth-child(an+b)
        #       an+b-1 siblings before
        #
        # :nth-last-child(an+b)
        #       an+b-1 siblings after
        #
        # :nth-of-type(an+b)
        #       an+b-1 siblings with the same expanded element name before
        #
        # :nth-last-of-type(an+b)
        #       an+b-1 siblings with the same expanded element name after
        #
        # So,
        # for :nth-child and :nth-of-type
        #
        #    count(preceding-sibling::<nodetest>) = an+b-1
        #
        # for :nth-last-child and :nth-last-of-type
        #
        #    count(following-sibling::<nodetest>) = an+b-1
        #
        # therefore,
        #    count(...) - (b-1) ≡ 0 (mod a)
        #
        # if a == 0:
        # ~~~~~~~~~~
        #    count(...) = b-1
        #
        # if a < 0:
        # ~~~~~~~~~
        #    count(...) - b +1 <= 0
        # -> count(...) <= b-1
        #
        # if a > 0:
        # ~~~~~~~~~
        #    count(...) - b +1 >= 0
        # -> count(...) >= b-1

        # work with b-1 instead
        b_min_1 = b - 1

        # early-exit condition 1:
        # ~~~~~~~~~~~~~~~~~~~~~~~
        # for a == 1, nth-*(an+b) means n+b-1 siblings before/after,
        # and since n ∈ {0, 1, 2, ...}, if b-1<=0,
        # there is always an "n" matching any number of siblings (maybe none)
        if a == 1 and b_min_1 <= 0:
            return xpath

        # early-exit condition 2:
        # ~~~~~~~~~~~~~~~~~~~~~~~
        # an+b-1 siblings with a<0 and (b-1)<0 is not possible
        if a < 0 and b_min_1 < 0:
            return xpath.add_condition("0")

        # `add_name_test` boolean is inverted and somewhat counter-intuitive:
        #
        # nth_of_type() calls nth_child(add_name_test=False)
        if add_name_test:
            nodetest = "*"
        else:
            nodetest = "%s" % xpath.element

        # count siblings before or after the element
        if not last:
            siblings_count = "count(preceding-sibling::%s)" % nodetest
        else:
            siblings_count = "count(following-sibling::%s)" % nodetest

        # special case of fixed position: nth-*(0n+b)
        # if a == 0:
        # ~~~~~~~~~~
        #    count(***-sibling::***) = b-1
        if a == 0:
            return xpath.add_condition("%s = %s" % (siblings_count, b_min_1))

        expressions = []

        if a > 0:
            # siblings count, an+b-1, is always >= 0,
            # so if a>0, and (b-1)<=0, an "n" exists to satisfy this,
            # therefore, the predicate is only interesting if (b-1)>0
            if b_min_1 > 0:
                expressions.append("%s >= %s" % (siblings_count, b_min_1))
        else:
            # if a<0, and (b-1)<0, no "n" satisfies this,
            # this is tested above as an early exit condition
            # otherwise,
            expressions.append("%s <= %s" % (siblings_count, b_min_1))

        # operations modulo 1 or -1 are simpler, one only needs to verify:
        #
        # - either:
        # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc.,
        #   i.e. count(***-sibling::***) >= (b-1)
        #
        # - or:
        # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc.,
        #   i.e. count(***-sibling::***) <= (b-1)
        # as we just did above.
        #
        if abs(a) != 1:
            # count(***-sibling::***) - (b-1) ≡ 0 (mod a)
            left = siblings_count

            # apply "modulo a" on 2nd term, -(b-1),
            # to simplify things like "(... +6) % -3",
            # and also make it positive with |a|
            b_neg = (-b_min_1) % abs(a)

            if b_neg != 0:
                b_neg_as_str = "+%s" % b_neg
                left = "(%s %s)" % (left, b_neg_as_str)

            expressions.append("%s mod %s = 0" % (left, a))

        # Parenthesise the parts only when there is more than one to join.
        if len(expressions) > 1:
            template = "(%s)"
        else:
            template = "%s"
        xpath.add_condition(" and ".join(template % expression for expression in expressions))
        return xpath

    def xpath_nth_last_child_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:nth-last-child()``."""
        return self.xpath_nth_child_function(xpath, function, last=True)

    def xpath_nth_of_type_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:nth-of-type()``; not available on the universal ``*``."""
        if xpath.element == "*":
            raise ExpressionError("*:nth-of-type() is not implemented")
        return self.xpath_nth_child_function(xpath, function, add_name_test=False)

    def xpath_nth_last_of_type_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:nth-last-of-type()``; not available on the universal ``*``."""
        if xpath.element == "*":
            raise ExpressionError("*:nth-last-of-type() is not implemented")
        return self.xpath_nth_child_function(xpath, function, last=True, add_name_test=False)

    def xpath_contains_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:contains()``, taking one string or ident argument."""
        # Defined there, removed in later drafts:
        # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                "Expected a single string or ident for :contains(), got %r" % function.arguments
            )
        value = typing.cast(str, function.arguments[0].value)
        return xpath.add_condition("contains(., %s)" % self.xpath_literal(value))

    def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:lang()`` to the XPath ``lang()`` function."""
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                "Expected a single string or ident for :lang(), got %r" % function.arguments
            )
        value = typing.cast(str, function.arguments[0].value)
        return xpath.add_condition("lang(%s)" % (self.xpath_literal(value)))

    # Pseudo: dispatch by pseudo-class name

    def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:root`` -- the element has no parent element."""
        return xpath.add_condition("not(parent::*)")

    # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div")
    # Works only at the start of a selector
    # Needed to get immediate children of a processed selector in Scrapy
    # for product in response.css('.product'):
    #     description = product.css(':scope > div::text').get()
    def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:scope`` -- adds the condition ``1``."""
        return xpath.add_condition("1")

    def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:first-child`` -- no preceding sibling elements."""
        return xpath.add_condition("count(preceding-sibling::*) = 0")

    def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:last-child`` -- no following sibling elements."""
        return xpath.add_condition("count(following-sibling::*) = 0")

    def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:first-of-type`` -- no preceding sibling with the same element test."""
        if xpath.element == "*":
            raise ExpressionError("*:first-of-type is not implemented")
        return xpath.add_condition("count(preceding-sibling::%s) = 0" % xpath.element)

    def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:last-of-type`` -- no following sibling with the same element test."""
        if xpath.element == "*":
            raise ExpressionError("*:last-of-type is not implemented")
        return xpath.add_condition("count(following-sibling::%s) = 0" % xpath.element)

    def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:only-child`` -- the parent has exactly one child element."""
        return xpath.add_condition("count(parent::*/child::*) = 1")

    def xpath_only_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:only-of-type`` -- the parent has exactly one child with this element test."""
        if xpath.element == "*":
            raise ExpressionError("*:only-of-type is not implemented")
        return xpath.add_condition("count(parent::*/child::%s) = 1" % xpath.element)

    def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:empty`` -- no child elements and an empty string value."""
        return xpath.add_condition("not(*) and not(string-length())")

    def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr:
        """Common implementation for pseudo-classes that never match."""
        return xpath.add_condition("0")

    xpath_link_pseudo = pseudo_never_matches
    xpath_visited_pseudo = pseudo_never_matches
    xpath_hover_pseudo = pseudo_never_matches
    xpath_active_pseudo = pseudo_never_matches
    xpath_focus_pseudo = pseudo_never_matches
    xpath_target_pseudo = pseudo_never_matches
    xpath_enabled_pseudo = pseudo_never_matches
    xpath_disabled_pseudo = pseudo_never_matches
    xpath_checked_pseudo = pseudo_never_matches

    # Attrib: dispatch by attribute operator

    def xpath_attrib_exists(self, xpath: XPathExpr, name: str, value: Optional[str]) -> XPathExpr:
        """``[attr]`` -- the attribute is present."""
        assert not value
        xpath.add_condition(name)
        return xpath

    def xpath_attrib_equals(self, xpath: XPathExpr, name: str, value: Optional[str]) -> XPathExpr:
        """``[attr=value]`` -- the attribute equals *value* exactly."""
        assert value is not None
        xpath.add_condition("%s = %s" % (name, self.xpath_literal(value)))
        return xpath

    def xpath_attrib_different(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr!=value]`` -- for a non-empty *value* the attribute is
        missing or differs; for an empty *value* it must differ from ``''``."""
        assert value is not None
        # FIXME: this seems like a weird hack...
        if value:
            xpath.add_condition("not(%s) or %s != %s" % (name, name, self.xpath_literal(value)))
        else:
            xpath.add_condition("%s != %s" % (name, self.xpath_literal(value)))
        return xpath

    def xpath_attrib_includes(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr~=value]`` -- *value* is one of the whitespace-separated
        words of the attribute.  An empty value or one containing
        whitespace yields the never-true condition ``0``."""
        if value and is_non_whitespace(value):
            # Pad both sides with spaces so only whole words match.
            xpath.add_condition(
                "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
                % (name, name, self.xpath_literal(" " + value + " "))
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_dashmatch(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr|=value]`` -- the attribute equals *value* or starts with
        *value* followed by ``-``."""
        assert value is not None
        # Weird, but true...
        xpath.add_condition(
            "%s and (%s = %s or starts-with(%s, %s))"
            % (name, name, self.xpath_literal(value), name, self.xpath_literal(value + "-"))
        )
        return xpath

    def xpath_attrib_prefixmatch(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr^=value]`` -- the attribute starts with *value*; an empty
        value yields the never-true condition ``0``."""
        if value:
            xpath.add_condition(
                "%s and starts-with(%s, %s)" % (name, name, self.xpath_literal(value))
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_suffixmatch(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr$=value]`` -- the attribute ends with *value*; an empty
        value yields the never-true condition ``0``."""
        if value:
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            # so compare the last len(value) characters instead.
            xpath.add_condition(
                "%s and substring(%s, string-length(%s)-%s) = %s"
                % (name, name, name, len(value) - 1, self.xpath_literal(value))
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_substringmatch(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr*=value]`` -- the attribute contains *value*; an empty
        value yields the never-true condition ``0``."""
        if value:
            # Attribute selectors are case sensitive
            xpath.add_condition(
                "%s and contains(%s, %s)" % (name, name, self.xpath_literal(value))
            )
        else:
            xpath.add_condition("0")
        return xpath

761 

762 

class HTMLTranslator(GenericTranslator):
    """
    Translator for (X)HTML documents.

    Has a more useful implementation of some pseudo-classes based on
    HTML-specific element names and attribute names, as described in
    the `HTML5 specification`_. It assumes no-quirks mode.
    The API is the same as :class:`GenericTranslator`.

    .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors

    :param xhtml:
        If false (the default), element names and attribute names
        are case-insensitive.

    """

    lang_attribute = "lang"

    def __init__(self, xhtml: bool = False) -> None:
        self.xhtml = xhtml  # Might be useful for sub-classes?
        if not xhtml:
            # See their definition in GenericTranslator.
            self.lower_case_element_names = True
            self.lower_case_attribute_names = True

    def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore
        """``:checked`` -- a selected ``option``, or a checked ``input`` /
        ``command`` of type ``checkbox`` or ``radio``."""
        # FIXME: is this really all the elements?
        return xpath.add_condition(
            "(@selected and name(.) = 'option') or "
            "(@checked "
            "and (name(.) = 'input' or name(.) = 'command')"
            "and (@type = 'checkbox' or @type = 'radio'))"
        )

    def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:lang()`` using the :attr:`lang_attribute` attribute.

        The nearest ancestor-or-self element carrying that attribute is
        taken, and its lower-cased value followed by ``-`` must start with
        the lower-cased argument followed by ``-``.

        :raises:
            :class:`ExpressionError` unless there is exactly one string or
            ident argument.
        """
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                "Expected a single string or ident for :lang(), got %r" % function.arguments
            )
        value = function.arguments[0].value
        assert value
        # Use ``lang_attribute`` for both the ancestor lookup and the value
        # comparison so that a sub-class overriding it stays consistent.
        return xpath.add_condition(
            "ancestor-or-self::*[@%s][1][starts-with(concat("
            # XPath 1.0 has no lower-case function...
            "translate(@%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
            "'abcdefghijklmnopqrstuvwxyz'), "
            "'-'), %s)]"
            % (
                self.lang_attribute,
                self.lang_attribute,
                self.xpath_literal(value.lower() + "-"),
            )
        )

    def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore
        """``:link`` -- an ``a``, ``link`` or ``area`` element with ``href``."""
        return xpath.add_condition(
            "@href and (name(.) = 'a' or name(.) = 'link' or name(.) = 'area')"
        )

    # Links are never visited, the implementation for :visited is the same
    # as in GenericTranslator

    def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore
        """``:disabled`` -- a form-related element with ``@disabled``, or a
        form control inside a disabled ``fieldset``."""
        # http://www.w3.org/TR/html5/section-index.html#attributes-1
        return xpath.add_condition(
            """
        (
            @disabled and
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea' or
                name(.) = 'command' or
                name(.) = 'fieldset' or
                name(.) = 'optgroup' or
                name(.) = 'option'
            )
        ) or (
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea'
            )
            and ancestor::fieldset[@disabled]
        )
        """
        )
        # FIXME: in the second half, add "and is not a descendant of that
        # fieldset element's first legend element child, if any."

    def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore
        """``:enabled`` -- links with ``href``, and form-related elements
        that are neither disabled themselves nor inside a disabled
        ``fieldset`` / ``optgroup``."""
        # http://www.w3.org/TR/html5/section-index.html#attributes-1
        return xpath.add_condition(
            """
        (
            @href and (
                name(.) = 'a' or
                name(.) = 'link' or
                name(.) = 'area'
            )
        ) or (
            (
                name(.) = 'command' or
                name(.) = 'fieldset' or
                name(.) = 'optgroup'
            )
            and not(@disabled)
        ) or (
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea' or
                name(.) = 'keygen'
            )
            and not (@disabled or ancestor::fieldset[@disabled])
        ) or (
            name(.) = 'option' and not(
                @disabled or ancestor::optgroup[@disabled]
            )
        )
        """
        )
        # FIXME: ... or "li elements that are children of menu elements,
        # and that have a child element that defines a command, if the first
        # such element's Disabled State facet is false (not disabled)".
        # FIXME: after ancestor::fieldset[@disabled], add "and is not a
        # descendant of that fieldset element's first legend element child,
        # if any."