Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bleach/_vendor/html5lib/filters/sanitizer.py: 24%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

122 statements  

1"""Deprecated from html5lib 1.1. 

2 

3See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for 

4information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_ 

5is recommended as a replacement. Please let us know in the aforementioned issue 

6if Bleach is unsuitable for your needs. 

7 

8""" 

9from __future__ import absolute_import, division, unicode_literals 

10 

11import re 

12import warnings 

13from xml.sax.saxutils import escape, unescape 

14 

15from bleach.six_shim import urllib_parse as urlparse 

16 

17from . import base 

18from ..constants import namespaces, prefixes 

19 

20__all__ = ["Filter"] 

21 

22 

# Text shared by the import-time warning below and the warning raised each
# time a Filter is constructed.
_deprecation_msg = (
    "html5lib's sanitizer is deprecated; see "
    "https://github.com/html5lib/html5lib-python/issues/443 and please let "
    "us know if Bleach is unsuitable for your needs"
)

# Emitted once, when this module is first imported.
warnings.warn(_deprecation_msg, DeprecationWarning)

30 

# Default element allowlist.  Each member is a ``(namespace, local name)``
# pair whose namespace is looked up in the ``namespaces`` mapping; the names
# are grouped per namespace below.
allowed_elements = frozenset().union(
    # HTML elements
    ((namespaces['html'], tag) for tag in (
        'a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside',
        'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
        'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
        'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog',
        'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
        'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
        'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu',
        'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
        'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section',
        'select', 'small', 'sound', 'source', 'spacer', 'span', 'strike',
        'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time',
        'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'wbr',
    )),
    # MathML elements
    ((namespaces['mathml'], tag) for tag in (
        'maction', 'math', 'merror', 'mfrac', 'mi', 'mmultiscripts', 'mn',
        'mo', 'mover', 'mpadded', 'mphantom', 'mprescripts', 'mroot', 'mrow',
        'mspace', 'msqrt', 'mstyle', 'msub', 'msubsup', 'msup', 'mtable',
        'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'none',
    )),
    # SVG elements
    ((namespaces['svg'], tag) for tag in (
        'a', 'animate', 'animateColor', 'animateMotion', 'animateTransform',
        'clipPath', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
        'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
        'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
        'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
        'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use',
    )),
)

195 

# Default attribute allowlist.  Each member is a ``(namespace, local name)``
# pair; attributes without a namespace use ``None``.  Names that occur in
# more than one vocabulary are listed in each group and collapse to a single
# member of the resulting frozenset.
allowed_attributes = frozenset().union(
    # HTML attributes
    ((None, attr) for attr in (
        'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
        'alt', 'autocomplete', 'autofocus', 'axis', 'background', 'balance',
        'bgcolor', 'bgproperties', 'border', 'bordercolor',
        'bordercolordark', 'bordercolorlight', 'bottompadding',
        'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
        'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
        'cols', 'colspan', 'compact', 'contenteditable', 'controls',
        'coords', 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime',
        'default', 'delay', 'dir', 'disabled', 'draggable', 'dynsrc',
        'enctype', 'end', 'face', 'for', 'form', 'frame', 'galleryimg',
        'gutter', 'headers', 'height', 'hidefocus', 'hidden', 'high', 'href',
        'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype',
        'label', 'leftspacing', 'lang', 'list', 'longdesc', 'loop',
        'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
        'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
        'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping',
        'point-size', 'poster', 'pqg', 'preload', 'prompt', 'radiogroup',
        'readonly', 'rel', 'repeat-max', 'repeat-min', 'replace', 'required',
        'rev', 'rightspacing', 'rows', 'rowspan', 'rules', 'scope',
        'selected', 'shape', 'size', 'span', 'src', 'start', 'step', 'style',
        'summary', 'suppress', 'tabindex', 'target', 'template', 'title',
        'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
        'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
    )),
    # MathML attributes
    ((None, attr) for attr in (
        'actiontype', 'align', 'columnalign', 'columnlines', 'columnspacing',
        'columnspan', 'depth', 'display', 'displaystyle', 'equalcolumns',
        'equalrows', 'fence', 'fontstyle', 'fontweight', 'frame', 'height',
        'linethickness', 'lspace', 'mathbackground', 'mathcolor',
        'mathvariant', 'maxsize', 'minsize', 'other', 'rowalign', 'rowlines',
        'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
        'separator', 'stretchy', 'width',
    )),
    # SVG attributes
    ((None, attr) for attr in (
        'accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
        'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
        'fill-opacity', 'fill-rule', 'font-family', 'font-size',
        'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
        'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
        'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
        'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
        'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
        'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
        'opacity', 'orient', 'origin', 'overline-position',
        'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
        'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
        'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
        'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
        'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
        'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
        'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
        'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
        'transform', 'type', 'u1', 'u2', 'underline-position',
        'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
        'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
        'x-height', 'x1', 'x2', 'y', 'y1', 'y2', 'zoomAndPan',
    )),
    # Namespace-qualified attributes (XLink and XML)
    ((namespaces['xlink'], attr) for attr in (
        'actuate', 'arcrole', 'href', 'role', 'show', 'title', 'type',
    )),
    ((namespaces['xml'], attr) for attr in (
        'base', 'lang', 'space',
    )),
)

524 

# Attributes whose values are treated as URIs; Filter.allowed_token checks
# the scheme of each such value against the allowed protocols.
attr_val_is_uri = frozenset().union(
    ((None, attr) for attr in (
        'href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background',
        'datasrc', 'dynsrc', 'lowsrc', 'ping',
    )),
    (
        (namespaces['xlink'], 'href'),
        (namespaces['xml'], 'base'),
    ),
)

540 

# SVG attributes whose values may carry ``url(...)`` references, stored as
# ``(None, attribute name)`` pairs.
svg_attr_val_allows_ref = frozenset(
    (None, attr) for attr in (
        'clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker',
        'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke',
    )
)

# SVG elements that may only carry local ``xlink:href`` values, stored as
# ``(None, element name)`` pairs.
svg_allow_local_href = frozenset(
    (None, element) for element in (
        'altGlyph', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient',
        'pattern', 'radialGradient', 'textpath', 'tref', 'set', 'use',
    )
)

572 

# CSS property names that Filter.sanitize_css keeps as-is.
allowed_css_properties = frozenset([
    'azimuth', 'background-color', 'border-bottom-color', 'border-collapse',
    'border-color', 'border-left-color', 'border-right-color',
    'border-top-color', 'clear', 'color', 'cursor', 'direction', 'display',
    'elevation', 'float', 'font', 'font-family', 'font-size', 'font-style',
    'font-variant', 'font-weight', 'height', 'letter-spacing', 'line-height',
    'overflow', 'pause', 'pause-after', 'pause-before', 'pitch',
    'pitch-range', 'richness', 'speak', 'speak-header', 'speak-numeral',
    'speak-punctuation', 'speech-rate', 'stress', 'text-align',
    'text-decoration', 'text-indent', 'unicode-bidi', 'vertical-align',
    'voice-family', 'volume', 'white-space', 'width',
])

# Keywords accepted in the values of background/border/margin/padding
# shorthand properties by Filter.sanitize_css.
allowed_css_keywords = frozenset([
    'auto', 'aqua', 'black', 'block', 'blue', 'bold', 'both', 'bottom',
    'brown', 'center', 'collapse', 'dashed', 'dotted', 'fuchsia', 'gray',
    'green', '!important', 'italic', 'left', 'lime', 'maroon', 'medium',
    'none', 'navy', 'normal', 'nowrap', 'olive', 'pointer', 'purple', 'red',
    'right', 'solid', 'silver', 'teal', 'top', 'transparent', 'underline',
    'white', 'yellow',
])

# SVG presentation properties that Filter.sanitize_css keeps in inline styles.
allowed_svg_properties = frozenset([
    'fill', 'fill-opacity', 'fill-rule', 'stroke', 'stroke-width',
    'stroke-linecap', 'stroke-linejoin', 'stroke-opacity',
])

# URI schemes accepted in attributes that hold URI values.
allowed_protocols = frozenset([
    'ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher',
    'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim',
    'rsync', 'tag', 'ssh', 'sftp', 'rtsp', 'afs', 'data',
])

# Content types accepted for ``data:`` URIs.
allowed_content_types = frozenset([
    'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp',
    'text/plain',
])

709 

710 

# Pattern applied to the path component of a ``data:`` URI (see its use on
# ``uri.path`` in Filter.allowed_token).  The named group ``content_type``
# captures the leading ``<type>/<subtype>`` token; an optional ``;charset=``
# and/or ``;base64`` parameter may follow in either order, and everything
# after the comma is accepted as the payload.  Compiled with re.VERBOSE, so
# whitespace and ``#`` comments inside the pattern are ignored.
data_content_type = re.compile(r'''
                                ^
                                # Match a content type <application>/<type>
                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                                # Match any character set and encoding
                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                  |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                                # Assume the rest is data
                                ,.*
                                $
                                ''',
                               re.VERBOSE)

723 

724 

class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes

    Tokens for elements outside ``allowed_elements`` are converted into
    ``Characters`` tokens holding the tag's markup as text, comment tokens
    are dropped, and the attributes of allowed elements are filtered by
    name, URI scheme and (for ``style``) CSS content.
    """
    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that can have local
            hrefs--these are removed.  Members may be ``(None, name)`` pairs
            (the form used by the module default) or bare element names.

        """
        super(Filter, self).__init__(source)

        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        """Yield each sanitized token, skipping tokens that sanitize to a falsy value."""
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Sanitize a single token.

        :arg token: token dict with at least a ``"type"`` key; tag tokens
            also carry ``"name"`` and ``"namespace"``

        :returns: the sanitized tag token, ``None`` for ``Comment`` tokens,
            or the token unchanged for every other type

        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            # A tag without a namespace is looked up as an HTML element.
            if ((namespace, name) in self.allowed_elements or
                (namespace is None and
                 (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            else:
                return self.disallowed_token(token)
        elif token_type == "Comment":
            # Comments are dropped: returning None makes __iter__ skip them.
            pass
        else:
            return token

    def allowed_token(self, token):
        """Filter the attributes of a token whose element is allowed.

        Removes attributes that are not allowed, attributes whose URI value
        is rejected, non-local ``url(...)`` references in SVG attributes and
        non-local ``xlink:href`` values on the elements listed in
        ``svg_allow_local_href``; the ``style`` attribute is passed through
        :meth:`sanitize_css`.

        :arg token: tag token; modified in place

        :returns: the same token

        """
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values.  The decision is
            # taken once per attribute so that it is deleted at most once; a
            # second ``del`` on the same key would raise KeyError.
            for attr in (attr_names & self.attr_val_is_uri):
                if not self._uri_value_is_allowed(attrs[attr]):
                    del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))

            # svg_allow_local_href holds (None, name) pairs by default, so a
            # bare-name membership test alone never matches; accept both forms.
            name = token["name"]
            xlink_href = (namespaces['xlink'], 'href')
            if ((name in self.svg_allow_local_href or
                 (None, name) in self.svg_allow_local_href) and
                    xlink_href in attrs and
                    re.search(r'^\s*[^#\s].*', attrs[xlink_href])):
                del attrs[xlink_href]

            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def _uri_value_is_allowed(self, value):
        """Return True if a URI attribute value may be kept.

        A value is rejected when it cannot be parsed, when its scheme is not
        in ``allowed_protocols``, or when it is a ``data`` URI whose content
        type is malformed or not in ``allowed_content_types``.  Values
        without a scheme are kept.
        """
        # I don't have a clue where this regexp comes from or why it matches those
        # characters, nor why we call unescape. I just know it's always been here.
        # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
        # this will do is remove *more* than it otherwise would.
        val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                               unescape(value)).lower()
        # remove replacement characters from unescaped characters
        val_unescaped = val_unescaped.replace("\ufffd", "")
        try:
            uri = urlparse.urlparse(val_unescaped)
        except ValueError:
            return False
        if not uri.scheme:
            return True
        if uri.scheme not in self.allowed_protocols:
            return False
        if uri.scheme == 'data':
            m = data_content_type.match(uri.path)
            return bool(m) and m.group('content_type') in self.allowed_content_types
        return True

    def disallowed_token(self, token):
        """Turn a disallowed tag token into a ``Characters`` token.

        The tag is rebuilt as text (``<name attr="value">``, ``</name>`` or
        a ``/>``-terminated form when ``selfClosing`` is set), stored in
        ``token["data"]``, and the ``"name"`` key is removed.

        :arg token: tag token; modified in place

        :returns: the same token, now of type ``Characters``

        """
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # Namespaced attributes are written as "prefix:name".
                qualified = name if ns is None else "%s:%s" % (prefixes[ns], name)
                attrs.append(' %s="%s"' % (qualified, escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to the allowed declarations.

        :arg style: value of a ``style`` attribute

        :returns: the kept ``prop: value;`` declarations joined by spaces,
            or ``''`` when the input fails either structural check

        """
        # disallow urls
        style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                # Shorthand families: keep the declaration only if every
                # whitespace-separated part is an allowed keyword, a hex
                # colour, an rgb() value or a short length.
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

911 break 

912 else: 

913 clean.append(prop + ': ' + value + ';') 

914 elif prop.lower() in self.allowed_svg_properties: 

915 clean.append(prop + ': ' + value + ';') 

916 

917 return ' '.join(clean)