Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfinterp.py: 95%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

694 statements  

1import logging 

2import re 

3from io import BytesIO 

4from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast 

5 

6from pdfminer import settings 

7from pdfminer.casting import safe_cmyk, safe_float, safe_int, safe_matrix, safe_rgb 

8from pdfminer.cmapdb import CMap, CMapBase, CMapDB 

9from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace 

10from pdfminer.pdfdevice import PDFDevice, PDFTextSeq 

11from pdfminer.pdfexceptions import PDFException, PDFValueError 

12from pdfminer.pdffont import ( 

13 PDFCIDFont, 

14 PDFFont, 

15 PDFFontError, 

16 PDFTrueTypeFont, 

17 PDFType1Font, 

18 PDFType3Font, 

19) 

20from pdfminer.pdfpage import PDFPage 

21from pdfminer.pdftypes import ( 

22 LITERALS_ASCII85_DECODE, 

23 PDFObjRef, 

24 PDFStream, 

25 dict_value, 

26 list_value, 

27 resolve1, 

28 stream_value, 

29) 

30from pdfminer.psexceptions import PSEOF, PSTypeError 

31from pdfminer.psparser import ( 

32 KWD, 

33 LIT, 

34 PSKeyword, 

35 PSLiteral, 

36 PSStackParser, 

37 PSStackType, 

38 keyword_name, 

39 literal_name, 

40) 

41from pdfminer.utils import ( 

42 MATRIX_IDENTITY, 

43 Matrix, 

44 PathSegment, 

45 Point, 

46 Rect, 

47 choplist, 

48 mult_matrix, 

49) 

50 

51log = logging.getLogger(__name__) 

52 

53 

# Exception type derived from PDFException. It is not raised anywhere in the
# part of the module visible here -- NOTE(review): confirm where it is used.
class PDFResourceError(PDFException):
    pass

56 

57 

# Raised by the page interpreter in strict mode (settings.STRICT) when a
# content stream names a color space or font id that is not in the resources.
class PDFInterpreterError(PDFException):
    pass

60 

61 

# Literal name objects used by this module. They are compared by identity
# (``is`` / ``is not``) in get_procset and get_font, so LIT() is presumably
# interning names -- NOTE(review): confirm against psparser.LIT.
LITERAL_PDF = LIT("PDF")
LITERAL_TEXT = LIT("Text")
LITERAL_FONT = LIT("Font")
LITERAL_FORM = LIT("Form")
LITERAL_IMAGE = LIT("Image")

67 

68 

class PDFTextState:
    """Text-related state of the page interpreter.

    Stores the selected font and the scalar text parameters (size, spacing,
    scaling, leading, render mode, rise) together with the text matrix and
    the text line position.
    """

    matrix: Matrix
    linematrix: Point

    def __init__(self) -> None:
        self.font: Optional[PDFFont] = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100
        self.leading: float = 0
        self.render: int = 0
        self.rise: float = 0
        # reset() provides the initial values of matrix and linematrix.
        self.reset()

    def __repr__(self) -> str:
        """Return a debug string listing every attribute with its repr."""
        names = (
            "font",
            "fontsize",
            "charspace",
            "wordspace",
            "scaling",
            "leading",
            "render",
            "rise",
            "matrix",
            "linematrix",
        )
        listing = ", ".join(f"{name}={getattr(self, name)!r}" for name in names)
        return f"<PDFTextState: {listing}>"

    def copy(self) -> "PDFTextState":
        """Return a new PDFTextState carrying the same attribute values."""
        duplicate = PDFTextState()
        for name in (
            "font",
            "fontsize",
            "charspace",
            "wordspace",
            "scaling",
            "leading",
            "render",
            "rise",
            "matrix",
            "linematrix",
        ):
            setattr(duplicate, name, getattr(self, name))
        return duplicate

    def reset(self) -> None:
        """Set the text matrix to identity and the line position to (0, 0)."""
        self.matrix, self.linematrix = MATRIX_IDENTITY, (0, 0)

122 

123 

# Type alias for a color value held in PDFGraphicState.scolor / ncolor:
# a single float or a 3- or 4-tuple of floats, as annotated per member below.
Color = Union[
    float,  # Greyscale
    Tuple[float, float, float],  # R, G, B
    Tuple[float, float, float, float],  # C, M, Y, K
]

129 

130 

class PDFGraphicState:
    """Graphics-state parameters tracked by the page interpreter.

    Holds the line-drawing parameters plus the current stroking and
    non-stroking colors, each with the color space it is expressed in.
    """

    def __init__(self) -> None:
        self.linewidth: float = 0
        self.linecap: Optional[object] = None
        self.linejoin: Optional[object] = None
        self.miterlimit: Optional[object] = None
        self.dash: Optional[Tuple[object, object]] = None
        self.intent: Optional[object] = None
        self.flatness: Optional[object] = None

        # stroking color
        self.scolor: Color = 0
        self.scs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

        # non stroking color
        self.ncolor: Color = 0
        self.ncs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

    def copy(self) -> "PDFGraphicState":
        """Return a new PDFGraphicState carrying every attribute of this one.

        The color spaces (scs / ncs) are copied along with the colors.
        Otherwise a state saved by ``q`` and restored by ``Q`` would fall back
        to DeviceGray, and later SC/SCN/sc/scn operators would pop the wrong
        number of color components.
        """
        obj = PDFGraphicState()
        obj.linewidth = self.linewidth
        obj.linecap = self.linecap
        obj.linejoin = self.linejoin
        obj.miterlimit = self.miterlimit
        obj.dash = self.dash
        obj.intent = self.intent
        obj.flatness = self.flatness
        obj.scolor = self.scolor
        obj.scs = self.scs
        obj.ncolor = self.ncolor
        obj.ncs = self.ncs
        return obj

    def __repr__(self) -> str:
        """Return a debug string of the line parameters and both colors."""
        return (
            "<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
            " miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
            " stroking color=%r, non stroking color=%r>"
            % (
                self.linewidth,
                self.linecap,
                self.linejoin,
                self.miterlimit,
                self.dash,
                self.intent,
                self.flatness,
                self.scolor,
                self.ncolor,
            )
        )

179 

180 

class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching: bool = True) -> None:
        self._cached_fonts: Dict[object, PDFFont] = {}
        self.caching = caching

    def get_procset(self, procs: Sequence[object]) -> None:
        """Iterate over the ProcSet entries; no entry triggers any action."""
        for proc in procs:
            if proc is not LITERAL_PDF and proc is not LITERAL_TEXT:
                pass

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Look up a CMap by name.

        If the CMap is unknown, re-raise when ``strict`` is true; otherwise
        return an empty ``CMap``.
        """
        try:
            cmap = CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict:
                raise
            cmap = CMap()
        return cmap

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Return the font for ``spec``, reusing a cached one when possible.

        :param objid: cache key (falsy values bypass the cache entirely)
        :param spec: font dictionary
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if settings.STRICT and spec["Type"] is not LITERAL_FONT:
            raise PDFFontError("Type is not /Font")

        # Work out which kind of font object to create.
        if "Subtype" in spec:
            subtype = literal_name(spec["Subtype"])
        elif settings.STRICT:
            raise PDFFontError("Font Subtype is not specified.")
        else:
            subtype = "Type1"

        if subtype in ("Type1", "MMType1"):
            font = PDFType1Font(self, spec)
        elif subtype == "TrueType":
            font = PDFTrueTypeFont(self, spec)
        elif subtype == "Type3":
            font = PDFType3Font(self, spec)
        elif subtype in ("CIDFontType0", "CIDFontType2"):
            font = PDFCIDFont(self, spec)
        elif subtype == "Type0":
            # Composite font: build from the first descendant, carrying over
            # the parent's Encoding / ToUnicode entries when present.
            dfonts = list_value(spec["DescendantFonts"])
            assert dfonts
            subspec = dict_value(dfonts[0]).copy()
            for key in ("Encoding", "ToUnicode"):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(None, subspec)
        elif settings.STRICT:
            raise PDFFontError("Invalid Font spec: %r" % spec)
        else:
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font

251 

252 

class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
    """Parser that reads a sequence of content streams as one token stream.

    Inline images (``BI ... ID <data> EI``) are turned into PDFStream objects.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        self.streams = streams
        self.istream = 0
        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
        # all the methods that would attempt to access self.fp without first
        # calling self.fillfp().
        PSStackParser.__init__(self, None)  # type: ignore[arg-type]

    def fillfp(self) -> None:
        """Open the next stream as ``self.fp`` if no stream is currently open.

        :raises PSEOF: when all streams have been consumed.
        """
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF("Unexpected EOF, file truncated?")
            self.fp = BytesIO(strm.get_data())

    def seek(self, pos: int) -> None:
        """Make sure a stream is open, then seek within it."""
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self) -> None:
        """Refill the buffer, moving on to the next stream when one is exhausted."""
        if self.charpos < len(self.buf):
            return
        while 1:
            self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            # Current stream is exhausted; drop it so fillfp() opens the next.
            self.fp = None  # type: ignore[assignment]
        self.charpos = 0

    def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
        """Read raw bytes from ``pos`` up to ``target`` followed by whitespace.

        :param pos: position to seek to before reading
        :param target: end-of-data marker
        :return: ``(pos, data)`` where ``data`` excludes the marker, the
            whitespace byte after it and one trailing EOL before it.
        """
        self.seek(pos)
        # i counts how many bytes of ``target`` (plus the trailing whitespace)
        # have been matched so far; 0 means "searching for target[0]".
        i = 0
        data = b""
        while i <= len(target):
            self.fillbuf()
            if i:
                ci = self.buf[self.charpos]
                c = bytes((ci,))
                data += c
                self.charpos += 1
                if (
                    len(target) <= i
                    and c.isspace()
                    or i < len(target)
                    and c == (bytes((target[i],)))
                ):
                    i += 1
                else:
                    i = 0
            else:
                try:
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos : j + 1]
                    self.charpos = j + 1
                    i = 1
                except ValueError:
                    data += self.buf[self.charpos :]
                    self.charpos = len(self.buf)
        data = data[: -(len(target) + 1)]  # strip the last part
        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
        return (pos, data)

    def flush(self) -> None:
        """Move everything on the parser stack to the results."""
        self.add_results(*self.popall())

    KEYWORD_BI = KWD(b"BI")
    KEYWORD_ID = KWD(b"ID")
    KEYWORD_EI = KWD(b"EI")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token, assembling inline images on ``BI``/``ID``.

        A PSTypeError raised while building the inline image is swallowed
        unless ``settings.STRICT`` is set.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type("inline")
                if len(objs) % 2 != 0:
                    error_msg = f"Invalid dictionary construct: {objs!r}"
                    raise PSTypeError(error_msg)
                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
                eos = b"EI"
                filters = d.get("F")
                if isinstance(filters, PSLiteral):
                    filters = [filters]
                # Tolerate an empty or non-list /F value instead of raising
                # IndexError/TypeError on malformed input.
                if (
                    isinstance(filters, list)
                    and filters
                    and filters[0] in LITERALS_ASCII85_DECODE
                ):
                    eos = b"~>"
                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
                if eos != b"EI":  # it may be necessary for decoding
                    data += eos
                obj = PDFStream(d, data)
                self.push((pos, obj))
                if eos == b"EI":  # otherwise it is still in the stream
                    self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))

357 

358 

# Used in this module to annotate the interpreter's argument stack and the
# operands passed to the do_* operator methods.
PDFStackT = PSStackType[PDFStream]
"""Types that may appear on the PDF argument stack."""

361 

362 

363class PDFPageInterpreter: 

364 """Processor for the content of a PDF page 

365 

366 Reference: PDF Reference, Appendix A, Operator Summary 

367 """ 

368 

def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
    """Remember the resource manager and the output device to drive."""
    self.rsrcmgr, self.device = rsrcmgr, device

372 

def dup(self) -> "PDFPageInterpreter":
    """Return a new interpreter of the same class sharing rsrcmgr and device."""
    interpreter_cls = self.__class__
    return interpreter_cls(self.rsrcmgr, self.device)

375 

def init_resources(self, resources: Dict[object, object]) -> None:
    """Prepare the fonts and XObjects listed in the Resource attribute.

    Also rebuilds the color-space map, starting from the predefined spaces.
    """
    self.resources = resources
    self.fontmap: Dict[object, PDFFont] = {}
    self.xobjmap = {}
    self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
    if not resources:
        return

    def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
        # A color space is either a bare name or a list [name, params...].
        is_list = isinstance(spec, list)
        name = literal_name(spec[0] if is_list else spec)
        has_params = is_list and len(spec) >= 2
        if name == "ICCBased" and has_params:
            return PDFColorSpace(name, stream_value(spec[1])["N"])
        if name == "DeviceN" and has_params:
            return PDFColorSpace(name, len(list_value(spec[1])))
        return PREDEFINED_COLORSPACE.get(name)

    for category, value in dict_value(resources).items():
        log.debug("Resource: %r: %r", category, value)
        if category == "Font":
            for fontid, spec in dict_value(value).items():
                # Only indirect references give a usable cache key.
                objid = spec.objid if isinstance(spec, PDFObjRef) else None
                self.fontmap[fontid] = self.rsrcmgr.get_font(objid, dict_value(spec))
        elif category == "ColorSpace":
            for csid, spec in dict_value(value).items():
                colorspace = get_colorspace(resolve1(spec))
                if colorspace is None:
                    continue
                self.csmap[csid] = colorspace
        elif category == "ProcSet":
            self.rsrcmgr.get_procset(list_value(value))
        elif category == "XObject":
            for xobjid, xobjstrm in dict_value(value).items():
                self.xobjmap[xobjid] = xobjstrm

416 

def init_state(self, ctm: Matrix) -> None:
    """Initialize the text and graphic states for rendering a page."""
    # Stack of saved (ctm, textstate, graphicstate) triples for q/Q.
    self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
    # Stack of operands waiting for the next operator.
    self.argstack: List[PDFStackT] = []
    self.curpath: List[PathSegment] = []
    self.ctm = ctm
    self.device.set_ctm(self.ctm)
    self.textstate = PDFTextState()
    self.graphicstate = PDFGraphicState()

428 

def push(self, obj: PDFStackT) -> None:
    """Append one operand to the argument stack."""
    stack = self.argstack
    stack.append(obj)

431 

def pop(self, n: int) -> List[PDFStackT]:
    """Remove and return the last ``n`` operands (fewer if the stack is short).

    The stack attribute is rebound to a new list rather than mutated in place.
    """
    if n != 0:
        stack = self.argstack
        taken, self.argstack = stack[-n:], stack[:-n]
        return taken
    return []

438 

def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
    """Return the CTM together with copies of the text and graphic states."""
    text_snapshot = self.textstate.copy()
    graphic_snapshot = self.graphicstate.copy()
    return (self.ctm, text_snapshot, graphic_snapshot)

441 

def set_current_state(
    self,
    state: Tuple[Matrix, PDFTextState, PDFGraphicState],
) -> None:
    """Install a saved (ctm, textstate, graphicstate) triple and update the device CTM."""
    ctm, textstate, graphicstate = state
    self.ctm = ctm
    self.textstate = textstate
    self.graphicstate = graphicstate
    self.device.set_ctm(self.ctm)

448 

def do_q(self) -> None:
    """Save graphics state"""
    stack = self.gstack
    stack.append(self.get_current_state())

452 

def do_Q(self) -> None:
    """Restore graphics state"""
    # An unbalanced Q (nothing saved) is silently ignored.
    if not self.gstack:
        return
    saved = self.gstack.pop()
    self.set_current_state(saved)

457 

def do_cm(
    self,
    a1: PDFStackT,
    b1: PDFStackT,
    c1: PDFStackT,
    d1: PDFStackT,
    e1: PDFStackT,
    f1: PDFStackT,
) -> None:
    """Concatenate matrix to current transformation matrix"""
    matrix = safe_matrix(a1, b1, c1, d1, e1, f1)
    if matrix is not None:
        self.ctm = mult_matrix(matrix, self.ctm)
        self.device.set_ctm(self.ctm)
        return

    # Unparseable operands: leave the CTM untouched and report it.
    log.warning(
        f"Cannot concatenate matrix to current transformation matrix because not all values in {(a1, b1, c1, d1, e1, f1)!r} can be parsed as floats"
    )

477 

def do_w(self, linewidth: PDFStackT) -> None:
    """Set line width"""
    width = safe_float(linewidth)
    if width is not None:
        self.graphicstate.linewidth = width
        return
    log.warning(
        f"Cannot set line width because {linewidth!r} is an invalid float value"
    )

487 

def do_J(self, linecap: PDFStackT) -> None:
    """Set line cap style"""
    state = self.graphicstate
    state.linecap = linecap

491 

def do_j(self, linejoin: PDFStackT) -> None:
    """Set line join style"""
    state = self.graphicstate
    state.linejoin = linejoin

495 

def do_M(self, miterlimit: PDFStackT) -> None:
    """Set miter limit"""
    state = self.graphicstate
    state.miterlimit = miterlimit

499 

def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
    """Set line dash pattern"""
    pattern = (dash, phase)
    self.graphicstate.dash = pattern

503 

def do_ri(self, intent: PDFStackT) -> None:
    """Set color rendering intent"""
    state = self.graphicstate
    state.intent = intent

507 

def do_i(self, flatness: PDFStackT) -> None:
    """Set flatness tolerance"""
    state = self.graphicstate
    state.flatness = flatness

511 

def do_gs(self, name: PDFStackT) -> None:
    """Set parameters from graphics state parameter dictionary

    Not implemented: the operand is accepted and ignored.
    """
    # TODO

515 

def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
    """Begin new subpath"""
    x_f, y_f = safe_float(x), safe_float(y)
    if x_f is not None and y_f is not None:
        self.curpath.append(("m", x_f, y_f))
        return

    # Report the raw operands; nothing is added to the path.
    raw = ("m", x, y)
    log.warning(
        f"Cannot start new subpath because not all values in {raw!r} can be parsed as floats"
    )

529 

def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
    """Append straight line segment to path"""
    x_f, y_f = safe_float(x), safe_float(y)
    if x_f is not None and y_f is not None:
        self.curpath.append(("l", x_f, y_f))
        return

    raw = ("l", x, y)
    log.warning(
        f"Cannot append straight line segment to path because not all values in {raw!r} can be parsed as floats"
    )

542 

543 def do_c( 

544 self, 

545 x1: PDFStackT, 

546 y1: PDFStackT, 

547 x2: PDFStackT, 

548 y2: PDFStackT, 

549 x3: PDFStackT, 

550 y3: PDFStackT, 

551 ) -> None: 

552 """Append curved segment to path (three control points)""" 

553 x1_f = safe_float(x1) 

554 y1_f = safe_float(y1) 

555 x2_f = safe_float(x2) 

556 y2_f = safe_float(y2) 

557 x3_f = safe_float(x3) 

558 y3_f = safe_float(y3) 

559 if ( 

560 x1_f is None 

561 or y1_f is None 

562 or x2_f is None 

563 or y2_f is None 

564 or x3_f is None 

565 or y3_f is None 

566 ): 

567 point = ("c", x1, y1, x2, y2, x3, y3) 

568 log.warning( 

569 f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats" 

570 ) 

571 else: 

572 point = ("c", x1_f, y1_f, x2_f, y2_f, x3_f, y3_f) 

573 self.curpath.append(point) 

574 

def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
    """Append curved segment to path (initial point replicated)"""
    coords = [safe_float(value) for value in (x2, y2, x3, y3)]
    if any(value is None for value in coords):
        raw = ("v", x2, y2, x3, y3)
        log.warning(
            f"Cannot append curved segment to path because not all values in {raw!r} can be parsed as floats"
        )
        return
    self.curpath.append(("v", *coords))

589 

def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
    """Append curved segment to path (final point replicated)"""
    coords = [safe_float(value) for value in (x1, y1, x3, y3)]
    if any(value is None for value in coords):
        raw = ("y", x1, y1, x3, y3)
        log.warning(
            f"Cannot append curved segment to path because not all values in {raw!r} can be parsed as floats"
        )
        return
    self.curpath.append(("y", *coords))

604 

def do_h(self) -> None:
    """Close subpath"""
    close_segment = ("h",)
    self.curpath.append(close_segment)

608 

def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
    """Append rectangle to path"""
    x_f, y_f = safe_float(x), safe_float(y)
    w_f, h_f = safe_float(w), safe_float(h)

    if x_f is None or y_f is None or w_f is None or h_f is None:
        values = (x, y, w, h)
        log.warning(
            f"Cannot append rectangle to path because not all values in {values!r} can be parsed as floats"
        )
        return

    # Emit the rectangle as a closed four-corner subpath.
    far_x = x_f + w_f
    far_y = y_f + h_f
    for segment in (
        ("m", x_f, y_f),
        ("l", far_x, y_f),
        ("l", far_x, far_y),
        ("l", x_f, far_y),
        ("h",),
    ):
        self.curpath.append(segment)

627 

def do_S(self) -> None:
    """Stroke path"""
    stroke, fill, evenodd = True, False, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    # Start a fresh path object; the painted list is not reused.
    self.curpath = []

632 

def do_s(self) -> None:
    """Close and stroke path"""
    self.do_h()  # close the subpath first
    self.do_S()  # then stroke it

637 

def do_f(self) -> None:
    """Fill path using nonzero winding number rule"""
    stroke, fill, evenodd = False, True, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []

642 

def do_F(self) -> None:
    """Fill path using nonzero winding number rule (obsolete)

    ``F`` is defined as equivalent to ``f``, so delegate to it. Previously
    this was a no-op, leaving the current path unpainted and uncleared so
    that it leaked into the next painting operator.
    """
    self.do_f()

645 

def do_f_a(self) -> None:
    """Fill path using even-odd rule"""
    stroke, fill, evenodd = False, True, True
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []

650 

def do_B(self) -> None:
    """Fill and stroke path using nonzero winding number rule"""
    stroke, fill, evenodd = True, True, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []

655 

def do_B_a(self) -> None:
    """Fill and stroke path using even-odd rule"""
    stroke, fill, evenodd = True, True, True
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []

660 

def do_b(self) -> None:
    """Close, fill, and stroke path using nonzero winding number rule"""
    self.do_h()  # close
    self.do_B()  # fill + stroke (nonzero winding)

665 

def do_b_a(self) -> None:
    """Close, fill, and stroke path using even-odd rule"""
    self.do_h()  # close
    self.do_B_a()  # fill + stroke (even-odd)

670 

def do_n(self) -> None:
    """End path without filling or stroking"""
    # Discard the accumulated segments by starting a new, empty path list.
    self.curpath = list()

674 

def do_W(self) -> None:
    """Set clipping path using nonzero winding number rule

    Accepted but ignored: this method has no body.
    """

677 

def do_W_a(self) -> None:
    """Set clipping path using even-odd rule

    Accepted but ignored: this method has no body.
    """

680 

def do_CS(self, name: PDFStackT) -> None:
    """Set color space for stroking operations

    Introduced in PDF 1.1

    An unknown color-space name raises PDFInterpreterError in strict mode
    and is ignored otherwise.
    """
    try:
        colorspace = self.csmap[literal_name(name)]
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
    else:
        self.graphicstate.scs = colorspace

691 

def do_cs(self, name: PDFStackT) -> None:
    """Set color space for nonstroking operations

    An unknown color-space name raises PDFInterpreterError in strict mode
    and is ignored otherwise.
    """
    try:
        colorspace = self.csmap[literal_name(name)]
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
    else:
        self.graphicstate.ncs = colorspace

699 

def do_G(self, gray: PDFStackT) -> None:
    """Set gray level for stroking operations"""
    level = safe_float(gray)
    if level is None:
        log.warning(
            f"Cannot set gray level because {gray!r} is an invalid float value"
        )
        return

    state = self.graphicstate
    state.scolor = level
    state.scs = self.csmap["DeviceGray"]

711 

def do_g(self, gray: PDFStackT) -> None:
    """Set gray level for nonstroking operations"""
    level = safe_float(gray)
    if level is None:
        log.warning(
            f"Cannot set gray level because {gray!r} is an invalid float value"
        )
        return

    state = self.graphicstate
    state.ncolor = level
    state.ncs = self.csmap["DeviceGray"]

723 

def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
    """Set RGB color for stroking operations"""
    color = safe_rgb(r, g, b)
    if color is None:
        log.warning(
            f"Cannot set RGB stroke color because not all values in {(r, g, b)!r} can be parsed as floats"
        )
        return

    state = self.graphicstate
    state.scolor = color
    state.scs = self.csmap["DeviceRGB"]

735 

def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
    """Set RGB color for nonstroking operations"""
    color = safe_rgb(r, g, b)
    if color is None:
        log.warning(
            f"Cannot set RGB non-stroke color because not all values in {(r, g, b)!r} can be parsed as floats"
        )
        return

    state = self.graphicstate
    state.ncolor = color
    state.ncs = self.csmap["DeviceRGB"]

747 

def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
    """Set CMYK color for stroking operations"""
    color = safe_cmyk(c, m, y, k)
    if color is None:
        log.warning(
            f"Cannot set CMYK stroke color because not all values in {(c, m, y, k)!r} can be parsed as floats"
        )
        return

    state = self.graphicstate
    state.scolor = color
    state.scs = self.csmap["DeviceCMYK"]

759 

def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
    """Set CMYK color for nonstroking operations"""
    color = safe_cmyk(c, m, y, k)
    if color is None:
        log.warning(
            f"Cannot set CMYK non-stroke color because not all values in {(c, m, y, k)!r} can be parsed as floats"
        )
        return

    state = self.graphicstate
    state.ncolor = color
    state.ncs = self.csmap["DeviceCMYK"]

771 

def do_SCN(self) -> None:
    """Set color for stroking operations.

    Pops as many operands as the current stroking color space has
    components and stores them as the stroking color. Only 1 (gray),
    3 (RGB) and 4 (CMYK) components are supported; anything else, or
    operands that cannot be converted, is logged and leaves the color
    unchanged.
    """
    n = self.graphicstate.scs.ncomponents

    components = self.pop(n)
    if len(components) != n:
        # ``!r`` is a conversion, not a format spec: "{components:!r}" would
        # raise TypeError here instead of logging.
        log.warning(
            f"Cannot set stroke color because expected {n} components but got {components!r}"
        )

    elif len(components) == 1:
        gray = components[0]
        gray_f = safe_float(gray)
        if gray_f is None:
            log.warning(
                f"Cannot set gray stroke color because {gray!r} is an invalid float value"
            )
        else:
            self.graphicstate.scolor = gray_f

    elif len(components) == 3:
        rgb = safe_rgb(*components)

        if rgb is None:
            log.warning(
                f"Cannot set RGB stroke color because components {components!r} cannot be parsed as RGB"
            )
        else:
            self.graphicstate.scolor = rgb

    elif len(components) == 4:
        cmyk = safe_cmyk(*components)

        if cmyk is None:
            log.warning(
                f"Cannot set CMYK stroke color because components {components!r} cannot be parsed as CMYK"
            )
        else:
            self.graphicstate.scolor = cmyk

    else:
        log.warning(
            f"Cannot set stroke color because {len(components)} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
        )

816 

def do_scn(self) -> None:
    """Set color for nonstroking operations

    Pops as many operands as the current non-stroking color space has
    components and stores them as the non-stroking color. Only 1 (gray),
    3 (RGB) and 4 (CMYK) components are supported; anything else, or
    operands that cannot be converted, is logged and leaves the color
    unchanged.
    """
    n = self.graphicstate.ncs.ncomponents

    components = self.pop(n)
    if len(components) != n:
        # ``!r`` is a conversion, not a format spec: "{components:!r}" would
        # raise TypeError here instead of logging.
        log.warning(
            f"Cannot set non-stroke color because expected {n} components but got {components!r}"
        )

    elif len(components) == 1:
        gray = components[0]
        gray_f = safe_float(gray)
        if gray_f is None:
            log.warning(
                f"Cannot set gray non-stroke color because {gray!r} is an invalid float value"
            )
        else:
            self.graphicstate.ncolor = gray_f

    elif len(components) == 3:
        rgb = safe_rgb(*components)

        if rgb is None:
            log.warning(
                f"Cannot set RGB non-stroke color because components {components!r} cannot be parsed as RGB"
            )
        else:
            self.graphicstate.ncolor = rgb

    elif len(components) == 4:
        cmyk = safe_cmyk(*components)

        if cmyk is None:
            log.warning(
                f"Cannot set CMYK non-stroke color because components {components!r} cannot be parsed as CMYK"
            )
        else:
            self.graphicstate.ncolor = cmyk

    else:
        log.warning(
            f"Cannot set non-stroke color because {len(components)} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
        )

861 

def do_SC(self) -> None:
    """Set color for stroking operations"""
    # SC is handled exactly like SCN.
    handler = self.do_SCN
    handler()

865 

def do_sc(self) -> None:
    """Set color for nonstroking operations"""
    # sc is handled exactly like scn.
    handler = self.do_scn
    handler()

869 

def do_sh(self, name: object) -> None:
    """Paint area defined by shading pattern

    Accepted but ignored: this method has no body.
    """

872 

def do_BT(self) -> None:
    """Begin text object

    Initializing the text matrix, Tm, and the text line matrix, Tlm, to
    the identity matrix. Text objects cannot be nested; a second BT cannot
    appear before an ET.
    """
    state = self.textstate
    state.reset()

881 

def do_ET(self) -> None:
    """End a text object

    No action is taken: this method has no body.
    """

884 

def do_BX(self) -> None:
    """Begin compatibility section

    No action is taken: this method has no body.
    """

887 

def do_EX(self) -> None:
    """End compatibility section

    No action is taken: this method has no body.
    """

890 

def do_MP(self, tag: PDFStackT) -> None:
    """Define marked-content point"""
    if not isinstance(tag, PSLiteral):
        log.warning(
            f"Cannot define marked-content point because {tag!r} is not a PSLiteral"
        )
        return
    self.device.do_tag(tag)

899 

def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
    """Define marked-content point with property list"""
    if not isinstance(tag, PSLiteral):
        log.warning(
            f"Cannot define marked-content point with property list because {tag!r} is not a PSLiteral"
        )
        return
    self.device.do_tag(tag, props)

908 

def do_BMC(self, tag: PDFStackT) -> None:
    """Begin marked-content sequence"""
    if not isinstance(tag, PSLiteral):
        log.warning(
            f"Cannot begin marked-content sequence because {tag!r} is not a PSLiteral"
        )
        return
    self.device.begin_tag(tag)

917 

def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
    """Begin marked-content sequence with property list"""
    if not isinstance(tag, PSLiteral):
        log.warning(
            f"Cannot begin marked-content sequence with property list because {tag!r} is not a PSLiteral"
        )
        return
    self.device.begin_tag(tag, props)

926 

def do_EMC(self) -> None:
    """End marked-content sequence"""
    device = self.device
    device.end_tag()

930 

def do_Tc(self, space: PDFStackT) -> None:
    """Set character spacing.

    Character spacing is used by the Tj, TJ, and ' operators.

    :param space: a number expressed in unscaled text space units.
    """
    charspace = safe_float(space)
    if charspace is not None:
        self.textstate.charspace = charspace
        return
    log.warning(
        f"Could not set character spacing because {space!r} is an invalid float value"
    )

945 

def do_Tw(self, space: PDFStackT) -> None:
    """Set the word spacing.

    Word spacing is used by the Tj, TJ, and ' operators.

    An operand that cannot be converted to a float is logged and the
    current word spacing is left unchanged.

    :param space: a number expressed in unscaled text space units
    """
    wordspace = safe_float(space)
    if wordspace is None:
        # Message typo fixed ("becuase" -> "because").
        log.warning(
            f"Could not set word spacing because {space!r} is an invalid float value"
        )
    else:
        self.textstate.wordspace = wordspace

960 

def do_Tz(self, scale: PDFStackT) -> None:
    """Set the horizontal scaling.

    :param scale: is a number specifying the percentage of the normal width
    """
    scaling = safe_float(scale)
    if scaling is not None:
        self.textstate.scaling = scaling
        return
    log.warning(
        f"Could not set horizontal scaling because {scale!r} is an invalid float value"
    )

974 

def do_TL(self, leading: PDFStackT) -> None:
    """Set the text leading.

    Text leading is used only by the T*, ', and " operators.

    :param leading: a number expressed in unscaled text space units
    """
    value = safe_float(leading)
    if value is not None:
        # The text state stores the leading with its sign flipped.
        self.textstate.leading = -value
        return
    log.warning(
        f"Could not set text leading because {leading!r} is an invalid float value"
    )

989 

def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
    """Set the text font

    :param fontid: the name of a font resource in the Font subdictionary
        of the current resource dictionary
    :param fontsize: size is a number representing a scale factor.
    """
    try:
        self.textstate.font = self.fontmap[literal_name(fontid)]
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError("Undefined Font id: %r" % fontid)
        # Non-strict mode: fall back to a font built from an empty spec.
        self.textstate.font = self.rsrcmgr.get_font(None, {})

    size = safe_float(fontsize)
    if size is not None:
        self.textstate.fontsize = size
        return
    log.warning(
        f"Could not set text font because {fontsize!r} is an invalid float value"
    )

1011 

def do_Tr(self, render: PDFStackT) -> None:
    """Set the text rendering mode"""
    mode = safe_int(render)
    if mode is not None:
        self.textstate.render = mode
        return
    log.warning(
        f"Could not set text rendering mode because {render!r} is an invalid int value"
    )

1022 

def do_Ts(self, rise: PDFStackT) -> None:
    """Handle the ``Ts`` operator: set the text rise.

    An operand that ``safe_float`` cannot convert is reported with a warning
    and the current rise is kept.

    :param rise: a number expressed in unscaled text space units
    """
    parsed = safe_float(rise)
    if parsed is not None:
        self.textstate.rise = parsed
        return

    log.warning(
        f"Could not set text rise because {rise!r} is an invalid float value"
    )

1036 

def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
    """Handle the ``Td`` operator: move to the start of the next line.

    The translation part of the text matrix is offset by (tx, ty), and the
    line matrix is reset to ``(0, 0)`` in every non-raising case.

    :param tx: horizontal offset from the start of the current line
    :param ty: vertical offset from the start of the current line
    :raises PDFValueError: if either offset is not a valid float and
        ``settings.STRICT`` is set
    """
    dx = safe_float(tx)
    dy = safe_float(ty)

    if dx is None or dy is None:
        # In lenient mode an invalid offset leaves the text matrix untouched.
        if settings.STRICT:
            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
    else:
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (
            a,
            b,
            c,
            d,
            dx * a + dy * c + e,
            dx * b + dy * d + f,
        )

    self.textstate.linematrix = (0, 0)

1054 

def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
    """Handle the ``TD`` operator: move to the start of the next line.

    Offset from the start of the current line by (tx , ty). As a side effect, this
    operator sets the leading parameter in the text state: whenever ``ty`` is
    a valid float it is stored as ``textstate.leading``, even if ``tx`` is
    invalid. The line matrix is reset to ``(0, 0)`` in every non-raising case.

    :param tx: horizontal offset from the start of the current line
    :param ty: vertical offset from the start of the current line
    :raises PDFValueError: if either offset is not a valid float and
        ``settings.STRICT`` is set
    """
    tx_ = safe_float(tx)
    ty_ = safe_float(ty)

    if tx_ is not None and ty_ is not None:
        (a, b, c, d, e, f) = self.textstate.matrix
        e_new = tx_ * a + ty_ * c + e
        f_new = tx_ * b + ty_ * d + f
        self.textstate.matrix = (a, b, c, d, e_new, f_new)

    elif settings.STRICT:
        # This must be an f-string; a plain literal would emit "{tx}"/"{ty}"
        # verbatim instead of the offending operands.
        raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for TD")

    if ty_ is not None:
        self.textstate.leading = ty_

    self.textstate.linematrix = (0, 0)

1077 

def do_Tm(
    self,
    a: PDFStackT,
    b: PDFStackT,
    c: PDFStackT,
    d: PDFStackT,
    e: PDFStackT,
    f: PDFStackT,
) -> None:
    """Handle the ``Tm`` operator: set text matrix and text line matrix.

    The six operands are passed to ``safe_matrix``. When it returns ``None``
    a warning is logged and neither the text matrix nor the line matrix is
    changed.
    """
    values = (a, b, c, d, e, f)
    matrix = safe_matrix(*values)

    if matrix is None:
        log.warning(
            f"Could not set text matrix because not all values in {values!r} can be parsed as floats"
        )
        return

    self.textstate.matrix = matrix
    self.textstate.linematrix = (0, 0)

1098 

def do_T_a(self) -> None:
    """Handle the ``T*`` operator: move to the start of the next text line.

    The translation components of the text matrix are shifted by the stored
    leading along the matrix's second row, and the line matrix is reset to
    ``(0, 0)``.
    """
    state = self.textstate
    (a, b, c, d, e, f) = state.matrix
    shifted_e = state.leading * c + e
    shifted_f = state.leading * d + f
    state.matrix = (a, b, c, d, shifted_e, shifted_f)
    state.linematrix = (0, 0)

1111 

def do_TJ(self, seq: PDFStackT) -> None:
    """Handle the ``TJ`` operator: show text, allowing individual glyph positioning.

    The sequence is handed to ``self.device.render_string`` together with
    the text state, the graphic state's ``ncs`` and a copy of the graphic
    state.

    :param seq: the operand, passed to the device as a ``PDFTextSeq``
    :raises PDFInterpreterError: if no font is selected and
        ``settings.STRICT`` is set; in lenient mode the call is a no-op
    """
    if self.textstate.font is not None:
        self.device.render_string(
            self.textstate,
            cast(PDFTextSeq, seq),
            self.graphicstate.ncs,
            self.graphicstate.copy(),
        )
        return

    if settings.STRICT:
        raise PDFInterpreterError("No font specified!")

1124 

def do_Tj(self, s: PDFStackT) -> None:
    """Handle the ``Tj`` operator: show text.

    Delegates to :meth:`do_TJ` with the operand wrapped in a one-element list.

    :param s: the string operand to show
    """
    single_item_seq = [s]
    self.do_TJ(single_item_seq)

1128 

def do__q(self, s: PDFStackT) -> None:
    """Handle the ' (single quote) operator: move to next line and show text.

    Calls :meth:`do_T_a` first and then :meth:`do_TJ` with the operand
    wrapped in a one-element list.

    :param s: the string operand to show
    """
    self.do_T_a()
    single_item_seq = [s]
    self.do_TJ(single_item_seq)

1136 

def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
    """Set word and character spacing, move to next line, and show text

    The " (double quote) operator. The PDF specification defines it as
    having the same effect as ``aw Tw  ac Tc  string '``, so the
    line advance (:meth:`do_T_a`) must happen before the text is shown.

    :param aw: word spacing, forwarded to :meth:`do_Tw`
    :param ac: character spacing, forwarded to :meth:`do_Tc`
    :param s: the string operand to show
    """
    self.do_Tw(aw)
    self.do_Tc(ac)
    # Advance to the next line before showing the text; without this step
    # the string is rendered on the current line.
    self.do_T_a()
    self.do_TJ([s])

1145 

def do_BI(self) -> None:
    """Handle the ``BI`` operator (begin inline image object).

    Intentionally a no-op in this method.
    """

1148 

def do_ID(self) -> None:
    """Handle the ``ID`` operator (begin inline image data).

    Intentionally a no-op in this method.
    """

1151 

def do_EI(self, obj: PDFStackT) -> None:
    """Handle the ``EI`` operator: end inline image object.

    When the operand is a ``PDFStream`` carrying both "W" and "H" entries,
    it is rendered on the device inside a unit-square figure whose id is
    derived from ``id(obj)``. Any other operand is ignored.

    :param obj: the operand popped for the operator
    """
    renderable = isinstance(obj, PDFStream) and "W" in obj and "H" in obj
    if not renderable:
        return

    figure_name = str(id(obj))
    self.device.begin_figure(figure_name, (0, 0, 1, 1), MATRIX_IDENTITY)
    self.device.render_image(figure_name, obj)
    self.device.end_figure(figure_name)

1159 

def do_Do(self, xobjid_arg: PDFStackT) -> None:
    """Handle the ``Do`` operator: invoke named XObject.

    Form XObjects that have a "BBox" are rendered by a duplicate interpreter
    inside a device figure; image XObjects that have "Width" and "Height"
    are passed to the device's ``render_image``. Every other XObject is
    ignored.

    :param xobjid_arg: the name of the XObject to look up in ``self.xobjmap``
    :raises PDFInterpreterError: if the name is unknown and
        ``settings.STRICT`` is set; in lenient mode the call is a no-op
    """
    xobjid = literal_name(xobjid_arg)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
        return

    log.debug("Processing xobj: %r", xobj)
    subtype = xobj.get("Subtype")

    if subtype is LITERAL_FORM and "BBox" in xobj:
        interpreter = self.dup()
        bbox = cast(Rect, list_value(xobj["BBox"]))
        matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.
        xobjres = xobj.get("Resources")
        resources = dict_value(xobjres) if xobjres else self.resources.copy()
        self.device.begin_figure(xobjid, bbox, matrix)
        interpreter.render_contents(
            resources,
            [xobj],
            ctm=mult_matrix(matrix, self.ctm),
        )
        self.device.end_figure(xobjid)
        return

    if subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        self.device.render_image(xobjid, xobj)
        self.device.end_figure(xobjid)

    # Any other XObject type is unsupported and silently skipped.

1197 

def process_page(self, page: PDFPage) -> None:
    """Render one page on the device.

    A transformation matrix is chosen from the page's ``rotate`` value
    (90, 180, 270, anything else) and its ``mediabox``. The page's content
    streams are then rendered between the device's ``begin_page`` and
    ``end_page`` calls.

    :param page: the page to process
    """
    log.debug("Processing page: %r", page)
    (x0, y0, x1, y1) = page.mediabox
    rotation = page.rotate

    if rotation == 90:
        ctm = (0, -1, 1, 0, -y0, x1)
    elif rotation == 180:
        ctm = (-1, 0, 0, -1, x1, y1)
    elif rotation == 270:
        ctm = (0, 1, -1, 0, y1, -x0)
    else:
        # Any other rotation value only moves the media box origin to (0, 0).
        ctm = (1, 0, 0, 1, -x0, -y0)

    self.device.begin_page(page, ctm)
    self.render_contents(page.resources, page.contents, ctm=ctm)
    self.device.end_page(page)

1212 

def render_contents(
    self,
    resources: Dict[object, object],
    streams: Sequence[object],
    ctm: Matrix = MATRIX_IDENTITY,
) -> None:
    """Render the content streams.

    Initialises the resources and the interpreter state for ``ctm``, then
    executes the streams. This method may be called recursively.

    :param resources: the resource dictionary passed to ``init_resources``
    :param streams: the content streams to execute
    :param ctm: the transformation matrix passed to ``init_state``
    """
    log.debug(
        "render_contents: resources=%r, streams=%r, ctm=%r", resources, streams, ctm
    )
    self.init_resources(resources)
    self.init_state(ctm)
    stream_list = list_value(streams)
    self.execute(stream_list)

1232 

def execute(self, streams: Sequence[object]) -> None:
    """Parse the content streams and dispatch each operator to its handler.

    Non-keyword objects are pushed onto the operand stack. A keyword is
    mapped to a ``do_<name>`` method (with ``*``, ``"`` and ``'`` spelled as
    ``_a``, ``_w`` and ``_q``), which is called with as many popped operands
    as it declares parameters. A handler that needs operands is skipped when
    fewer operands than required were popped.

    :param streams: the content streams handed to ``PDFContentParser``
    :raises PDFInterpreterError: for an operator without a handler when
        ``settings.STRICT`` is set
    """
    try:
        parser = PDFContentParser(streams)
    except PSEOF:
        # empty page
        return

    # Operator characters that cannot appear in a Python identifier. None of
    # the replacements contains a mapped character, so one translate pass is
    # equivalent to chaining the three replacements.
    operator_chars = str.maketrans({"*": "_a", '"': "_w", "'": "_q"})

    while True:
        try:
            (_, obj) = parser.nextobject()
        except PSEOF:
            break

        if not isinstance(obj, PSKeyword):
            self.push(obj)
            continue

        name = keyword_name(obj)
        method = "do_%s" % name.translate(operator_chars)

        if not hasattr(self, method):
            if settings.STRICT:
                error_msg = "Unknown operator: %r" % name
                raise PDFInterpreterError(error_msg)
            continue

        func = getattr(self, method)
        # Bound methods still count ``self`` in co_argcount.
        nargs = func.__code__.co_argcount - 1
        if not nargs:
            log.debug("exec: %s", name)
            func()
            continue

        args = self.pop(nargs)
        log.debug("exec: %s %r", name, args)
        if len(args) == nargs:
            func(*args)