Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bleach/_vendor/html5lib/filters/sanitizer.py: 24%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

122 statements  

1"""Deprecated from html5lib 1.1. 

2 

3See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for 

4information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_ 

5is recommended as a replacement. Please let us know in the aforementioned issue 

6if Bleach is unsuitable for your needs. 

7 

8""" 

9from __future__ import absolute_import, division, unicode_literals 

10 

11import re 

12import warnings 

13from xml.sax.saxutils import escape, unescape 

14 

15from bleach.six_shim import urllib_parse as urlparse 

16 

17from . import base 

18from ..constants import namespaces, prefixes 

19 

20__all__ = ["Filter"] 

21 

22 

# Deprecation notice text. It is emitted once here when the module is
# imported and again by ``Filter.__init__`` each time a filter is built.
_deprecation_msg = (
    "html5lib's sanitizer is deprecated; see "
    "https://github.com/html5lib/html5lib-python/issues/443 and please let "
    "us know if Bleach is unsuitable for your needs"
)

warnings.warn(message=_deprecation_msg, category=DeprecationWarning)

30 

# Default element whitelist as ``(namespace URI, local name)`` pairs,
# grouped by the namespace prefix the names belong to.
allowed_elements = frozenset(
    (namespaces[prefix], tag)
    for prefix, tags in (
        ('html', (
            'a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside',
            'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
            'caption', 'center', 'cite', 'code', 'col', 'colgroup',
            'command', 'datagrid', 'datalist', 'dd', 'del', 'details',
            'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source',
            'fieldset', 'figcaption', 'figure', 'footer', 'font', 'form',
            'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img',
            'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm',
            'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol',
            'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q',
            's', 'samp', 'section', 'select', 'small', 'sound', 'source',
            'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
            'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr',
            'tt', 'u', 'ul', 'var', 'video', 'wbr',
        )),
        ('mathml', (
            'maction', 'math', 'merror', 'mfrac', 'mi', 'mmultiscripts',
            'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mprescripts',
            'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 'msubsup',
            'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover',
            'none',
        )),
        ('svg', (
            'a', 'animate', 'animateColor', 'animateMotion',
            'animateTransform', 'clipPath', 'circle', 'defs', 'desc',
            'ellipse', 'font-face', 'font-face-name', 'font-face-src', 'g',
            'glyph', 'hkern', 'linearGradient', 'line', 'marker', 'metadata',
            'missing-glyph', 'mpath', 'path', 'polygon', 'polyline',
            'radialGradient', 'rect', 'set', 'stop', 'svg', 'switch', 'text',
            'title', 'tspan', 'use',
        )),
    )
    for tag in tags
)

195 

# Default attribute whitelist as ``(namespace URI or None, local name)``
# pairs. A ``None`` prefix below means "no namespace"; any other prefix is
# looked up in ``namespaces``. Names shared between sections collapse into a
# single entry because the result is a frozenset.
allowed_attributes = frozenset(
    (None if prefix is None else namespaces[prefix], attr)
    for prefix, attrs in (
        # HTML attributes
        (None, (
            'abbr', 'accept', 'accept-charset', 'accesskey', 'action',
            'align', 'alt', 'autocomplete', 'autofocus', 'axis',
            'background', 'balance', 'bgcolor', 'bgproperties', 'border',
            'bordercolor', 'bordercolordark', 'bordercolorlight',
            'bottompadding', 'cellpadding', 'cellspacing', 'ch', 'challenge',
            'char', 'charoff', 'choff', 'charset', 'checked', 'cite',
            'class', 'clear', 'color', 'cols', 'colspan', 'compact',
            'contenteditable', 'controls', 'coords', 'data', 'datafld',
            'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir',
            'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face',
            'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
            'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang',
            'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label',
            'leftspacing', 'lang', 'list', 'longdesc', 'loop', 'loopcount',
            'loopend', 'loopstart', 'low', 'lowsrc', 'max', 'maxlength',
            'media', 'method', 'min', 'multiple', 'name', 'nohref',
            'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping',
            'point-size', 'poster', 'pqg', 'preload', 'prompt', 'radiogroup',
            'readonly', 'rel', 'repeat-max', 'repeat-min', 'replace',
            'required', 'rev', 'rightspacing', 'rows', 'rowspan', 'rules',
            'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
            'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
            'template', 'title', 'toppadding', 'type', 'unselectable',
            'usemap', 'urn', 'valign', 'value', 'variable', 'volume',
            'vspace', 'vrml', 'width', 'wrap',
        )),
        ('xml', ('lang',)),
        # MathML attributes
        (None, (
            'actiontype', 'align', 'columnalign', 'columnlines',
            'columnspacing', 'columnspan', 'depth', 'display',
            'displaystyle', 'equalcolumns', 'equalrows', 'fence',
            'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
            'lspace', 'mathbackground', 'mathcolor', 'mathvariant',
            'maxsize', 'minsize', 'other', 'rowalign', 'rowlines',
            'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
            'separator', 'stretchy', 'width',
        )),
        ('xlink', ('href', 'show', 'type')),
        # SVG attributes
        (None, (
            'accent-height', 'accumulate', 'additive', 'alphabetic',
            'arabic-form', 'ascent', 'attributeName', 'attributeType',
            'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
            'class', 'clip-path', 'color', 'color-rendering', 'content',
            'cx', 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end',
            'fill', 'fill-opacity', 'fill-rule', 'font-family', 'font-size',
            'font-stretch', 'font-style', 'font-variant', 'font-weight',
            'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits',
            'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', 'id',
            'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
            'lang', 'marker-end', 'marker-mid', 'marker-start',
            'markerHeight', 'markerUnits', 'markerWidth', 'mathematical',
            'max', 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
            'overline-position', 'overline-thickness', 'panose-1', 'path',
            'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
            'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
            'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
            'stemh', 'stemv', 'stop-color', 'stop-opacity',
            'strikethrough-position', 'strikethrough-thickness', 'stroke',
            'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
            'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
            'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
            'transform', 'type', 'u1', 'u2', 'underline-position',
            'underline-thickness', 'unicode', 'unicode-range',
            'units-per-em', 'values', 'version', 'viewBox', 'visibility',
            'width', 'widths', 'x', 'x-height', 'x1', 'x2',
            'y', 'y1', 'y2', 'zoomAndPan',
        )),
        ('xlink', (
            'actuate', 'arcrole', 'href', 'role', 'show', 'title', 'type',
        )),
        ('xml', ('base', 'lang', 'space')),
    )
    for attr in attrs
)

524 

# Attributes whose value is treated as a URI: ``Filter.allowed_token``
# checks the scheme of each of these against the allowed protocols.
attr_val_is_uri = frozenset(
    (None, attr)
    for attr in (
        'href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background',
        'datasrc', 'dynsrc', 'lowsrc', 'ping', 'formaction',
    )
) | frozenset((
    (namespaces['xlink'], 'href'),
    (namespaces['xml'], 'base'),
))

541 

# Un-namespaced SVG attributes whose value may contain ``url(...)``
# references; ``Filter.allowed_token`` blanks out any that do not start
# with ``#``.
svg_attr_val_allows_ref = frozenset(
    (None, attr)
    for attr in (
        'clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker',
        'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke',
    )
)

555 

# SVG element names, stored as ``(None, name)`` pairs, that ``Filter``
# consults before deciding whether to drop an ``xlink:href`` attribute.
svg_allow_local_href = frozenset(
    (None, element)
    for element in (
        'altGlyph', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient',
        'pattern', 'radialGradient', 'textpath', 'tref', 'set', 'use',
    )
)

573 

# CSS property names that ``Filter.sanitize_css`` lets through unchanged.
allowed_css_properties = frozenset([
    'azimuth', 'background-color',
    'border-bottom-color', 'border-collapse', 'border-color',
    'border-left-color', 'border-right-color', 'border-top-color',
    'clear', 'color', 'cursor', 'direction', 'display', 'elevation', 'float',
    'font', 'font-family', 'font-size', 'font-style', 'font-variant',
    'font-weight',
    'height', 'letter-spacing', 'line-height', 'overflow',
    'pause', 'pause-after', 'pause-before', 'pitch', 'pitch-range',
    'richness',
    'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
    'speech-rate', 'stress',
    'text-align', 'text-decoration', 'text-indent',
    'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
    'white-space', 'width',
])

622 

# Keywords accepted as values of the background/border/margin/padding
# families of CSS properties in ``Filter.sanitize_css``.
allowed_css_keywords = frozenset([
    # colour names
    'aqua', 'black', 'blue', 'brown', 'fuchsia', 'gray', 'green', 'lime',
    'maroon', 'navy', 'olive', 'purple', 'red', 'silver', 'teal', 'white',
    'yellow',
    # everything else
    'auto', 'block', 'bold', 'both', 'bottom', 'center', 'collapse',
    'dashed', 'dotted', '!important', 'italic', 'left', 'medium', 'none',
    'normal', 'nowrap', 'pointer', 'right', 'solid', 'top', 'transparent',
    'underline',
])

664 

# SVG presentation properties that ``Filter.sanitize_css`` lets through.
allowed_svg_properties = frozenset([
    'fill', 'fill-opacity', 'fill-rule',
    'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
    'stroke-opacity',
])

675 

# URI schemes permitted in attributes listed in ``attr_val_is_uri``.
allowed_protocols = frozenset([
    'ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher',
    'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim',
    'rsync', 'tag', 'ssh', 'sftp', 'rtsp', 'afs', 'data',
])

701 

# Content types accepted for ``data:`` URIs.
allowed_content_types = frozenset([
    'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp',
    'text/plain',
])

710 

711 

# Pattern applied by ``Filter`` to the path of a ``data:`` URI, i.e. the
# text after the scheme. It requires a ``type/subtype`` content type
# (captured as the named group ``content_type``), then optional
# ``;charset=...`` and ``;base64`` parameters in either order, then a comma
# followed by the payload. Compiled with ``re.VERBOSE``, so the whitespace
# and ``#`` comments inside the pattern are not part of the match.
data_content_type = re.compile(r'''
                                ^
                                # Match a content type <application>/<type>
                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                                # Match any character set and encoding
                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                  |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                                # Assume the rest is data
                                ,.*
                                $
                                ''',
                               re.VERBOSE)

724 

725 

class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""

    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        Emits a ``DeprecationWarning`` every time it is called.

        :arg source: the token stream to wrap; handed to ``base.Filter``

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that may only carry a
            local (``#fragment``) ``xlink:href``--any other ``xlink:href``
            value on these elements is removed. Entries may be plain element
            names or ``(None, name)`` pairs.

        """
        super(Filter, self).__init__(source)

        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        """Yield the sanitized form of each source token, skipping dropped ones."""
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Sanitize one token.

        Tag tokens are passed to ``allowed_token`` or ``disallowed_token``
        depending on ``allowed_elements``. Comment tokens are dropped (``None``
        is returned). Every other token is returned unchanged.
        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            # A tag without a namespace is looked up as an HTML element.
            if ((namespace, name) in self.allowed_elements or
                (namespace is None and
                 (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            return self.disallowed_token(token)
        if token_type == "Comment":
            return None
        return token

    def _is_allowed_uri_value(self, value):
        """Return True if ``value`` is acceptable for a URI-valued attribute.

        The value is unescaped, stripped of backticks, whitespace, control
        characters and U+FFFD, and lowercased before being parsed. It is
        rejected when it cannot be parsed, when its scheme is not in
        ``allowed_protocols``, or when it is a ``data`` URI whose content type
        is missing, malformed or not in ``allowed_content_types``. A value
        with no scheme is accepted.
        """
        # I don't have a clue where this regexp comes from or why it matches those
        # characters, nor why we call unescape. I just know it's always been here.
        # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
        # this will do is remove *more* than it otherwise would.
        val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                               unescape(value)).lower()
        # remove replacement characters from unescaped characters
        val_unescaped = val_unescaped.replace("\ufffd", "")
        try:
            uri = urlparse.urlparse(val_unescaped)
        except ValueError:
            return False
        if not uri.scheme:
            return True
        if uri.scheme not in self.allowed_protocols:
            return False
        if uri.scheme == 'data':
            m = data_content_type.match(uri.path)
            if not m:
                return False
            return m.group('content_type') in self.allowed_content_types
        return True

    def allowed_token(self, token):
        """Clean the attributes of a token whose element is allowed.

        Modifies ``token["data"]`` in place and returns the token. Tokens
        without a ``"data"`` key are returned untouched.
        """
        if "data" not in token:
            return token

        attrs = token["data"]
        attr_names = set(attrs.keys())

        # Remove forbidden attributes
        for to_remove in attr_names.difference(self.allowed_attributes):
            del attrs[to_remove]
            attr_names.remove(to_remove)

        # Remove attributes with disallowed URL values. The verdict is
        # computed first and the attribute deleted at most once; deleting in
        # several independent branches raised KeyError when ``data`` was
        # missing from ``allowed_protocols``.
        for attr in attr_names.intersection(self.attr_val_is_uri):
            if not self._is_allowed_uri_value(attrs[attr]):
                del attrs[attr]

        # Blank out url(...) references that do not point at a local #fragment.
        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(attrs[attr]))

        # The default ``svg_allow_local_href`` holds ``(None, name)`` pairs
        # while ``token["name"]`` is a plain string, so both spellings are
        # checked; testing the bare name alone never matched the defaults.
        xlink_href = (namespaces['xlink'], 'href')
        name = token["name"]
        if ((name in self.svg_allow_local_href or
             (None, name) in self.svg_allow_local_href) and
                xlink_href in attrs and
                re.search(r'^\s*[^#\s].*', attrs[xlink_href])):
            del attrs[xlink_href]

        style_key = (None, 'style')
        if style_key in attrs:
            attrs[style_key] = self.sanitize_css(attrs[style_key])

        token["data"] = attrs
        return token

    def disallowed_token(self, token):
        """Turn a disallowed tag token into a ``Characters`` token.

        The token's ``"data"`` becomes the tag's textual markup (attribute
        values passed through ``escape``), its ``"type"`` becomes
        ``"Characters"`` and its ``"name"`` key is removed. The token is
        modified in place and returned.
        """
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # Namespaced attributes are rendered as "prefix:name".
                qualified = name if ns is None else "%s:%s" % (prefixes[ns], name)
                attrs.append(' %s="%s"' % (qualified, escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to its allowed CSS declarations.

        ``url(...)`` values are blanked first. If the remaining text fails
        either of the two syntax checks the result is the empty string.
        Otherwise each ``property: value`` pair is kept when the property is in
        ``allowed_css_properties`` or ``allowed_svg_properties``, or when it
        belongs to the background/border/margin/padding families and every
        whitespace-separated word of its value is an allowed keyword, a hex
        colour, an ``rgb(`` expression or a short number with an optional
        unit. Kept pairs are returned as ``"prop: value;"`` joined by spaces.
        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        value_word = r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$"  # noqa

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                # Shorthand families: every word of the value must be safe.
                if all(keyword in self.allowed_css_keywords or
                       re.match(value_word, keyword)
                       for keyword in value.split()):
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)