Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/matplotlib/dviread.py: 27%

1"""

2A module for reading dvi files output by TeX. Several limitations make

3this not (currently) useful as a general-purpose dvi preprocessor, but

4it is currently used by the pdf backend for processing usetex text.

6Interface::

8 with Dvi(filename, 72) as dvi:

9 # iterate over pages:

10 for page in dvi:

11 w, h, d = page.width, page.height, page.descent

12 for x, y, font, glyph, width in page.text:

13 fontname = font.texname

14 pointsize = font.size

15 ...

16 for x, y, height, width in page.boxes:

17 ...

18"""

20from collections import namedtuple

21import enum

22from functools import lru_cache, partial, wraps

23import logging

24import os

25from pathlib import Path

26import re

27import struct

28import subprocess

29import sys

31import numpy as np

33from matplotlib import _api, cbook

35_log = logging.getLogger(__name__)

37# Many dvi related files are looked for by external processes, require

38# additional parsing, and are used many times per rendering, which is why they

39# are cached using lru_cache().

41# Dvi is a bytecode format documented in

42# https://ctan.org/pkg/dvitype

43# https://texdoc.org/serve/dvitype.pdf/0

44#

45# The file consists of a preamble, some number of pages, a postamble,

46# and a finale. Different opcodes are allowed in different contexts,

47# so the Dvi object has a parser state:

48#

49# pre: expecting the preamble

50# outer: between pages (followed by a page or the postamble,

51# also e.g. font definitions are allowed)

52# inpage: processing a page

53# post_post: state after the postamble (our current implementation

54# just stops reading)

55# finale: the finale (unimplemented in our current implementation)

# Parser states of a Dvi reader; members are numbered from 1 in this order.
_dvistate = enum.Enum(
    'DviState', ['pre', 'outer', 'inpage', 'post_post', 'finale'])

# A page carries two kinds of marks (text and boxes) plus its dimensions.
Page = namedtuple('Page', ['text', 'boxes', 'height', 'width', 'descent'])
Box = namedtuple('Box', ['x', 'y', 'height', 'width'])

64# Also a namedtuple, for backcompat.

class Text(namedtuple('Text', 'x y font glyph width')):
    """
    One glyph placed by the dvi file.

    *x* and *y* give the glyph position.  *font*, *glyph* and *width* stay
    public for back-compatibility; code that wants to render the glyph itself
    should rather load the font at `font_path` with size `font_size`, apply
    the transformations described by `font_effects`, and then load the glyph
    designated by `glyph_name_or_index`.
    """

    def _get_pdftexmap_entry(self):
        """Return the :file:`pdftex.map` entry of this glyph's font."""
        fontmap = PsfontsMap(find_tex_file("pdftex.map"))
        return fontmap[self.font.texname]

    @property
    def font_path(self):
        """The `~pathlib.Path` to the font for this glyph."""
        entry = self._get_pdftexmap_entry()
        if entry.filename is not None:
            return Path(entry.filename)
        # The map entry names no font file at all.
        message = ("No usable font file found for {} ({}); "
                   "the font may lack a Type-1 version")
        raise ValueError(message.format(entry.psname.decode("ascii"),
                                        entry.texname.decode("ascii")))

    @property
    def font_size(self):
        """The font size."""
        return self.font.size

    @property
    def font_effects(self):
        """
        The "font effects" dict for this glyph.

        It holds the SlantFont and ExtendFont values (when present) that
        :file:`pdftex.map` lists for this glyph's font.
        """
        entry = self._get_pdftexmap_entry()
        return entry.effects

    @property
    def glyph_name_or_index(self):
        """
        Either the glyph name or the native charmap glyph index.

        When :file:`pdftex.map` gives an encoding for the font, that encoding
        maps dvi glyph indices to Adobe glyph names, and the name is returned;
        callers can turn it into a glyph index
        (FT_Get_Name_Index/get_name_index) and load it with
        FT_Load_Glyph/load_glyph.

        Without an encoding, the dvi index refers directly to the font's
        "native" charmap and is returned unchanged; load it with
        FT_Load_Char/load_char after selecting the native charmap.
        """
        entry = self._get_pdftexmap_entry()
        if entry.encoding is None:
            return self.glyph
        return _parse_enc(entry.encoding)[self.glyph]

124

125

126# Opcode argument parsing

127#

128# Each of the following functions takes a Dvi object and delta, which is the

129# difference between the opcode and the minimum opcode with the same meaning.

130# Dvi opcodes often encode the number of argument bytes in this delta.

_arg_mapping = {
    # raw: Hand delta back unchanged.
    'raw': lambda dvi, delta: delta,
    # u1: One byte, unsigned.
    'u1': lambda dvi, delta: dvi._arg(1, signed=False),
    # u4: Four bytes, unsigned.
    'u4': lambda dvi, delta: dvi._arg(4, signed=False),
    # s4: Four bytes, signed.
    's4': lambda dvi, delta: dvi._arg(4, signed=True),
    # slen: delta bytes, signed; None (and nothing read) if delta is zero.
    'slen': lambda dvi, delta: dvi._arg(delta, signed=True) if delta else None,
    # slen1: (delta + 1) bytes, signed.
    'slen1': lambda dvi, delta: dvi._arg(delta + 1, signed=True),
    # ulen1: (delta + 1) bytes, unsigned.
    'ulen1': lambda dvi, delta: dvi._arg(delta + 1, signed=False),
    # olen1: (delta + 1) bytes; unsigned for 1-3 bytes, signed for 4 bytes.
    'olen1': lambda dvi, delta: dvi._arg(delta + 1, signed=(delta == 3)),
}

150

151

def _dispatch(table, min, max=None, state=None, args=('raw',)):
    """
    Decorator factory registering a method as the handler of some opcodes.

    The decorated method is wrapped so that, when called with the opcode
    byte, it first checks the Dvi state (if *state* is given), then reads its
    arguments from the file as described by *args*, and finally calls the
    original method with those arguments.  The wrapper is stored in *table*
    for every opcode from *min* to *max* inclusive.

    Parameters
    ----------
    table : dict[int, callable]
        The dispatch table to be filled in.

    min, max : int
        Range of opcodes that calls the registered function; *max* defaults to
        *min*.

    state : _dvistate, optional
        State of the Dvi object in which these opcodes are allowed.

    args : list[str], default: ['raw']
        Sequence of argument specifications:

        - 'raw': opcode minus minimum
        - 'u1': read one unsigned byte
        - 'u4': read four bytes, treat as an unsigned number
        - 's4': read four bytes, treat as a signed number
        - 'slen': read (opcode - minimum) bytes, treat as signed
        - 'slen1': read (opcode - minimum + 1) bytes, treat as signed
        - 'ulen1': read (opcode - minimum + 1) bytes, treat as unsigned
        - 'olen1': read (opcode - minimum + 1) bytes, treat as unsigned
          if under four bytes, signed if four bytes
    """
    def decorate(method):
        # Resolve the argument readers once, at decoration time.
        readers = [_arg_mapping[spec] for spec in args]

        @wraps(method)
        def wrapper(self, byte):
            if state is not None and self.state != state:
                raise ValueError("state precondition failed")
            delta = byte - min
            return method(self, *(read(self, delta) for read in readers))

        if max is None:
            # Single opcode: stored without checking for a previous entry.
            table[min] = wrapper
            return wrapper
        for opcode in range(min, max + 1):
            assert table[opcode] is None
            table[opcode] = wrapper
        return wrapper

    return decorate

200

201

class Dvi:
    """
    A reader for a dvi ("device-independent") file, as produced by TeX.

    The current implementation can only iterate through pages in order,
    and does not even attempt to verify the postamble.

    This class can be used as a context manager to close the underlying
    file upon exit. Pages can be read via iteration. Here is an overly
    simple way to extract text without trying to detect whitespace::

        >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi:
        ...     for page in dvi:
        ...         print(''.join(chr(t.glyph) for t in page.text))
    """
    # Dispatch table: one slot per possible opcode byte.  The slots are filled
    # in at class-creation time by the @_dispatch decorators below.
    _dtable = [None] * 256
    # Class-local decorator with the table argument already bound; inside this
    # class body it shadows the module-level _dispatch.
    _dispatch = partial(_dispatch, _dtable)

    def __init__(self, filename, dpi):
        """
        Read the data from the file named *filename* and convert
        TeX's internal units to units of *dpi* per inch.
        *dpi* only sets the units and does not limit the resolution.
        Use None to return TeX's internal units.
        """
        _log.debug('Dvi: %s', filename)
        self.file = open(filename, 'rb')
        self.dpi = dpi
        # Font number -> DviFont, filled in by font definitions.
        self.fonts = {}
        self.state = _dvistate.pre

    def __enter__(self):
        """Context manager enter method, does nothing."""
        return self

    def __exit__(self, etype, evalue, etrace):
        """
        Context manager exit method, closes the underlying file if it is open.
        """
        self.close()

    def __iter__(self):
        """
        Iterate through the pages of the file.

        Yields
        ------
        Page
            Details of all the text and box objects on the page.
            The Page tuple contains lists of Text and Box tuples and
            the page dimensions, and the Text and Box tuples contain
            coordinates transformed into a standard Cartesian
            coordinate system at the dpi value given when initializing.
            The coordinates are floating point numbers, but otherwise
            precision is not lost and coordinate values are not clipped to
            integers.
        """
        while self._read():
            yield self._output()

    def close(self):
        """Close the underlying file if it is open."""
        if not self.file.closed:
            self.file.close()

    def _output(self):
        """
        Output the text and boxes belonging to the most recent page.
        page = dvi._output()

        Returns a `Page`.  An empty page has zero width, height and descent.
        If ``self.dpi`` is None, the collected text and boxes are returned
        with raw dvi coordinates; otherwise all coordinates are scaled to dpi
        units and shifted relative to the page's bounding box.
        """
        # Bounding box over all glyphs and boxes; maxy includes glyph depth,
        # maxy_pure does not.
        minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf
        maxy_pure = -np.inf
        for elt in self.text + self.boxes:
            if isinstance(elt, Box):
                x, y, h, w = elt
                e = 0  # zero depth
            else:  # glyph
                x, y, font, g, w = elt
                h, e = font._height_depth_of(g)
            minx = min(minx, x)
            miny = min(miny, y - h)
            maxx = max(maxx, x + w)
            maxy = max(maxy, y + e)
            maxy_pure = max(maxy_pure, y)
        if self._baseline_v is not None:
            # Prefer the baseline position recorded by _read.
            maxy_pure = self._baseline_v  # This should normally be the case.
            self._baseline_v = None

        if not self.text and not self.boxes:  # Avoid infs/nans from inf+/-inf.
            return Page(text=[], boxes=[], width=0, height=0, descent=0)

        if self.dpi is None:
            # special case for ease of debugging: output raw dvi coordinates
            return Page(text=self.text, boxes=self.boxes,
                        width=maxx-minx, height=maxy_pure-miny,
                        descent=maxy-maxy_pure)

        # convert from TeX's "scaled points" to dpi units
        d = self.dpi / (72.27 * 2**16)
        descent = (maxy - maxy_pure) * d

        # y is flipped (maxy - y) so that it increases upwards in the output.
        text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d)
                for (x, y, f, g, w) in self.text]
        boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d)
                 for (x, y, h, w) in self.boxes]

        return Page(text=text, boxes=boxes, width=(maxx-minx)*d,
                    height=(maxy_pure-miny)*d, descent=descent)

    def _read(self):
        """
        Read one page from the file. Return True if successful,
        False if there were no more pages.

        As a side effect, ``self._baseline_v`` is set to the vertical position
        identified as the baseline (or left as None if none was identified).
        """
        # Pages appear to start with the sequence
        #   bop (begin of page)
        #   xxx comment
        #   <push, ..., pop>  # if using chemformula
        #   down
        #   push
        #     down
        #     <push, push, xxx, right, xxx, pop, pop>  # if using xcolor
        #     down
        #     push
        #       down (possibly multiple)
        #       push  <=  here, v is the baseline position.
        #         etc.
        # (dviasm is useful to explore this structure.)
        # Thus, we use the vertical position at the first time the stack depth
        # reaches 3, while at least four "downs" have been executed (excluding
        # those popped out (corresponding to the chemformula preamble)), as the
        # baseline (the "down" count is necessary to handle xcolor).
        # down_stack mirrors the push/pop stack; each entry counts the "down"
        # opcodes seen so far at that nesting level (inherited on push).
        down_stack = [0]
        self._baseline_v = None
        while True:
            # NOTE(review): at end of file read(1) returns b"" and the [0]
            # raises IndexError; a well-formed file ends via the postamble.
            byte = self.file.read(1)[0]
            self._dtable[byte](self, byte)
            # Handlers are identified by the name of the wrapped method.
            name = self._dtable[byte].__name__
            if name == "_push":
                down_stack.append(down_stack[-1])
            elif name == "_pop":
                down_stack.pop()
            elif name == "_down":
                down_stack[-1] += 1
            # self.stack only exists while inside a page, hence the getattr.
            if (self._baseline_v is None
                    and len(getattr(self, "stack", [])) == 3
                    and down_stack[-1] >= 4):
                self._baseline_v = self.v
            if byte == 140:                         # end of page
                return True
            if self.state is _dvistate.post_post:   # end of file
                self.close()
                return False

    def _arg(self, nbytes, signed=False):
        """
        Read and return a big-endian integer *nbytes* long.
        Signedness is determined by the *signed* keyword.
        """
        return int.from_bytes(self.file.read(nbytes), "big", signed=signed)

    @_dispatch(min=0, max=127, state=_dvistate.inpage)
    def _set_char_immediate(self, char):
        """Opcodes 0-127: place glyph *char* (the opcode) and advance h."""
        self._put_char_real(char)
        self.h += self.fonts[self.f]._width_of(char)

    @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',))
    def _set_char(self, char):
        """Opcodes 128-131: place glyph *char* (read from file), advance h."""
        self._put_char_real(char)
        self.h += self.fonts[self.f]._width_of(char)

    @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4'))
    def _set_rule(self, a, b):
        """Opcode 132: place a rule of height *a*, width *b*; advance h."""
        self._put_rule_real(a, b)
        self.h += b

    @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',))
    def _put_char(self, char):
        """Opcodes 133-136: place glyph *char* without moving h."""
        self._put_char_real(char)

    def _put_char_real(self, char):
        """
        Record glyph *char* of the current font at the current position.

        For a regular font one `Text` is appended to ``self.text``.  For a
        virtual font, the text and boxes of the virtual glyph's packet are
        appended instead, scaled by the font's scale and offset by (h, v).
        """
        font = self.fonts[self.f]
        if font._vf is None:
            self.text.append(Text(self.h, self.v, font, char,
                                  font._width_of(char)))
        else:
            scale = font._scale
            for x, y, f, g, w in font._vf[char].text:
                # Rescale the font referenced by the packet by this font's
                # scale.
                newf = DviFont(scale=_mul2012(scale, f._scale),
                               tfm=f._tfm, texname=f.texname, vf=f._vf)
                self.text.append(Text(self.h + _mul2012(x, scale),
                                      self.v + _mul2012(y, scale),
                                      newf, g, newf._width_of(g)))
            self.boxes.extend([Box(self.h + _mul2012(x, scale),
                                   self.v + _mul2012(y, scale),
                                   _mul2012(a, scale), _mul2012(b, scale))
                               for x, y, a, b in font._vf[char].boxes])

    @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4'))
    def _put_rule(self, a, b):
        """Opcode 137: place a rule of height *a*, width *b*; h unchanged."""
        self._put_rule_real(a, b)

    def _put_rule_real(self, a, b):
        """Record a `Box` at (h, v); rules with a <= 0 or b <= 0 are dropped."""
        if a > 0 and b > 0:
            self.boxes.append(Box(self.h, self.v, a, b))

    @_dispatch(138)
    def _nop(self, _):
        """Opcode 138: do nothing."""
        pass

    @_dispatch(139, state=_dvistate.outer, args=('s4',)*11)
    def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p):
        """
        Opcode 139 (begin of page): enter the page state and reset the
        position registers, the stack and the collected text/boxes.  The
        eleven arguments are read from the file but not used.
        """
        self.state = _dvistate.inpage
        self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
        self.stack = []
        self.text = []          # list of Text objects
        self.boxes = []         # list of Box objects

    @_dispatch(140, state=_dvistate.inpage)
    def _eop(self, _):
        """
        Opcode 140 (end of page): return to the outer state and delete the
        per-page registers and stack (text and boxes are kept for _output).
        """
        self.state = _dvistate.outer
        del self.h, self.v, self.w, self.x, self.y, self.z, self.stack

    @_dispatch(141, state=_dvistate.inpage)
    def _push(self, _):
        """Opcode 141: save (h, v, w, x, y, z) on the stack."""
        self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z))

    @_dispatch(142, state=_dvistate.inpage)
    def _pop(self, _):
        """Opcode 142: restore (h, v, w, x, y, z) from the stack."""
        self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop()

    @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',))
    def _right(self, b):
        """Opcodes 143-146: move right by *b*."""
        self.h += b

    @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',))
    def _right_w(self, new_w):
        """Opcodes 147-151: optionally update w, then move right by w."""
        if new_w is not None:
            self.w = new_w
        self.h += self.w

    @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',))
    def _right_x(self, new_x):
        """Opcodes 152-156: optionally update x, then move right by x."""
        if new_x is not None:
            self.x = new_x
        self.h += self.x

    @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',))
    def _down(self, a):
        """Opcodes 157-160: move down by *a*."""
        self.v += a

    @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',))
    def _down_y(self, new_y):
        """Opcodes 161-165: optionally update y, then move down by y."""
        if new_y is not None:
            self.y = new_y
        self.v += self.y

    @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',))
    def _down_z(self, new_z):
        """Opcodes 166-170: optionally update z, then move down by z."""
        if new_z is not None:
            self.z = new_z
        self.v += self.z

    @_dispatch(min=171, max=234, state=_dvistate.inpage)
    def _fnt_num_immediate(self, k):
        """Opcodes 171-234: select font number *k* (opcode minus 171)."""
        self.f = k

    @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',))
    def _fnt_num(self, new_f):
        """Opcodes 235-238: select font number *new_f* (read from file)."""
        self.f = new_f

    @_dispatch(min=239, max=242, args=('ulen1',))
    def _xxx(self, datalen):
        """
        Opcodes 239-242: read a "special" of *datalen* bytes and log it at
        debug level, with non-printable bytes shown as ``<hex>``.  The
        special is otherwise ignored.
        """
        special = self.file.read(datalen)
        _log.debug(
            'Dvi._xxx: encountered special: %s',
            ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch
                     for ch in special]))

    @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1'))
    def _fnt_def(self, k, c, s, d, a, l):
        """Opcodes 243-246: define font number *k*; see `_fnt_def_real`."""
        self._fnt_def_real(k, c, s, d, a, l)

    def _fnt_def_real(self, k, c, s, d, a, l):
        """
        Register font number *k* in ``self.fonts``.

        *c* is the checksum stored in the dvi file, *s* the scale passed to
        `DviFont`, and *a* + *l* the number of name bytes to read, of which
        the last *l* are used as the font name.  *d* is not used.

        Raises ValueError if both checksums are nonzero and disagree.
        """
        n = self.file.read(a + l)
        # NOTE(review): if l == 0, n[-0:] is the whole of n rather than empty.
        fontname = n[-l:].decode('ascii')
        tfm = _tfmfile(fontname)
        # A zero checksum on either side disables the comparison.
        if c != 0 and tfm.checksum != 0 and c != tfm.checksum:
            raise ValueError('tfm checksum mismatch: %s' % n)
        try:
            vf = _vffile(fontname)
        except FileNotFoundError:
            # No virtual font file: treat the font as a regular font.
            vf = None
        self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf)

    @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1'))
    def _pre(self, i, num, den, mag, k):
        """
        Opcode 247 (preamble): skip the *k*-byte comment, validate the format
        version *i* and the units *num*, *den*, *mag*, then enter the outer
        state.  Raises ValueError for anything but the standard values.
        """
        self.file.read(k)  # comment in the dvi file
        if i != 2:
            raise ValueError("Unknown dvi format %d" % i)
        if num != 25400000 or den != 7227 * 2**16:
            raise ValueError("Nonstandard units in dvi file")
            # meaning: TeX always uses those exact values, so it
            # should be enough for us to support those
            # (There are 72.27 pt to an inch so 7227 pt =
            # 7227 * 2**16 sp to 100 in. The numerator is multiplied
            # by 10^5 to get units of 10**-7 meters.)
        if mag != 1000:
            raise ValueError("Nonstandard magnification in dvi file")
            # meaning: LaTeX seems to frown on setting \mag, so
            # I think we can assume this is constant
        self.state = _dvistate.outer

    @_dispatch(248, state=_dvistate.outer)
    def _post(self, _):
        """Opcode 248 (postamble): switch to post_post; nothing is read."""
        self.state = _dvistate.post_post
        # TODO: actually read the postamble and finale?
        # currently post_post just triggers closing the file

    @_dispatch(249)
    def _post_post(self, _):
        """Opcode 249: not implemented."""
        raise NotImplementedError

    @_dispatch(min=250, max=255)
    def _malformed(self, offset):
        """Opcodes 250-255: always raise ValueError (unknown command)."""
        raise ValueError(f"unknown command: byte {250 + offset}")

529

530

class DviFont:
    """
    A font referenced from a DVI file.

    Instances carry the font's texname and size, compare equal when both of
    those match, and know glyph widths in the same units as the AFM file.
    The underscore-prefixed attributes are for use by dviread.py itself and
    take no part in comparison.

    The size is in Adobe points (converted from TeX points).

    Parameters
    ----------
    scale : float
        Factor by which the font is scaled from its natural size.
    tfm : Tfm
        TeX font metrics for this font
    texname : bytes
       Name of the font as used internally by TeX and friends, as an ASCII
       bytestring.  This is usually very different from any external font
       names; `PsfontsMap` can be used to find the external name of the font.
    vf : Vf
       A TeX "virtual font" file, or None if this font is not virtual.

    Attributes
    ----------
    texname : bytes
    size : float
       Size of the font in Adobe points, converted from the slightly
       smaller TeX points.
    widths : list
       Widths of glyphs in glyph-space units, typically 1/1000ths of
       the point size.

    """
    __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm')

    def __init__(self, scale, tfm, texname, vf):
        _api.check_isinstance(bytes, texname=texname)
        self._scale = scale
        self._tfm = tfm
        self.texname = texname
        self._vf = vf
        self.size = scale * (72.0 / (72.27 * 2**16))
        # One entry per char code up to the largest one in the tfm width
        # table; an empty table gives an empty list.
        glyph_count = max(tfm.width, default=-1) + 1
        widths = []
        for code in range(glyph_count):
            raw_width = tfm.width.get(code, 0)
            widths.append((1000 * raw_width) >> 20)
        self.widths = widths

    def __eq__(self, other):
        if type(self) is not type(other):
            return False
        return self.texname == other.texname and self.size == other.size

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        cls_name = type(self).__name__
        return f"<{cls_name}: {self.texname}>"

    def _width_of(self, char):
        """Width of char in dvi units."""
        raw_width = self._tfm.width.get(char, None)
        if raw_width is None:
            _log.debug('No width for char %d in font %s.', char, self.texname)
            return 0
        return _mul2012(raw_width, self._scale)

    def _height_depth_of(self, char):
        """Height and depth of char in dvi units."""
        def scaled(table, label):
            # Missing metrics are logged and count as zero.
            raw = table.get(char, None)
            if raw is None:
                _log.debug('No %s for char %d in font %s',
                           label, char, self.texname)
                return 0
            return _mul2012(raw, self._scale)

        height = scaled(self._tfm.height, "height")
        depth = scaled(self._tfm.depth, "depth")
        # cmsyXX (symbols font) glyph 0 ("minus") has a nonzero descent
        # so that TeX aligns equations properly
        # (https://tex.stackexchange.com/q/526103/)
        # but we actually care about the rasterization depth to align
        # the dvipng-generated images.
        if re.match(br'^cmsy\d+$', self.texname) and char == 0:
            depth = 0
        return [height, depth]

620

621

class Vf(Dvi):
    r"""
    A virtual font (\*.vf file) containing subroutines for dvi files.

    The whole file is parsed on construction and then closed; afterwards the
    instance only serves lookups of the parsed character packets.

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    The virtual font format is a derivative of dvi:
    http://mirrors.ctan.org/info/knuth/virtual-fonts
    This class reuses some of the machinery of `Dvi`
    but replaces the `_read` loop and dispatch mechanism.

    Examples
    --------
    ::

        vf = Vf(filename)
        glyph = vf[code]
        glyph.text, glyph.boxes, glyph.width
    """

    def __init__(self, filename):
        super().__init__(filename, 0)
        try:
            # Number of the first font defined in the file; every packet
            # starts with this font selected.
            self._first_font = None
            # Character code -> Page describing that character's packet.
            self._chars = {}
            self._read()
        finally:
            self.close()

    def __getitem__(self, code):
        """Return the `Page` of character *code*; KeyError if undefined."""
        return self._chars[code]

    def _read(self):
        """
        Parse the entire vf file, filling ``self._chars``.

        Outside a packet, bytes are interpreted as vf commands (short or long
        character packet headers, font definitions, preamble, postamble).
        Inside a packet, bytes are executed as dvi instructions through
        `Dvi`'s dispatch table until the packet's announced length is
        consumed.  Returns None once the postamble is reached.

        Raises
        ------
        ValueError
            On an unknown or misplaced opcode, or if a packet's instructions
            run past its announced length.
        """
        packet_char = packet_width = packet_ends = None
        while True:
            byte = self.file.read(1)[0]
            # If we are in a packet, execute the dvi instructions
            if self.state is _dvistate.inpage:
                byte_at = self.file.tell() - 1
                if byte_at == packet_ends:
                    self._finalize_packet(packet_char, packet_width)
                    packet_char = packet_width = None
                    # fall through to out-of-packet code: *byte* is the first
                    # byte after the packet.
                elif byte_at > packet_ends:
                    raise ValueError("Packet length mismatch in vf file")
                else:
                    # bop, eop, font definitions, preamble and postamble are
                    # not allowed inside a packet.
                    if byte in (139, 140) or byte >= 243:
                        raise ValueError(
                            "Inappropriate opcode %d in vf file" % byte)
                    Dvi._dtable[byte](self, byte)
                    continue

            # We are outside a packet
            if byte < 242:          # a short packet (length given by byte)
                packet_char, packet_width = self._arg(1), self._arg(3)
                packet_ends = self._init_packet(byte)
                self.state = _dvistate.inpage
            elif byte == 242:       # a long packet
                packet_len, packet_char, packet_width = \
                    [self._arg(x) for x in (4, 4, 4)]
                # As for short packets, remember where the packet ends and
                # start executing its dvi instructions; otherwise the packet
                # body would be misread as top-level vf commands and the
                # character would never be recorded.
                packet_ends = self._init_packet(packet_len)
                self.state = _dvistate.inpage
            elif 243 <= byte <= 246:
                # Font number: 1-3 bytes unsigned, 4 bytes signed.
                k = self._arg(byte - 242, byte == 246)
                c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)]
                self._fnt_def_real(k, c, s, d, a, l)
                if self._first_font is None:
                    self._first_font = k
            elif byte == 247:       # preamble
                i, k = self._arg(1), self._arg(1)
                x = self.file.read(k)
                cs, ds = self._arg(4), self._arg(4)
                self._pre(i, x, cs, ds)
            elif byte == 248:       # postamble (just some number of 248s)
                break
            else:
                raise ValueError("Unknown vf opcode %d" % byte)

    def _init_packet(self, pl):
        """
        Reset the dvi registers for a new packet of *pl* bytes and return the
        file offset at which the packet ends.

        Raises ValueError if the reader is not in the outer state.
        """
        if self.state != _dvistate.outer:
            raise ValueError("Misplaced packet in vf file")
        self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
        self.stack, self.text, self.boxes = [], [], []
        self.f = self._first_font
        return self.file.tell() + pl

    def _finalize_packet(self, packet_char, packet_width):
        """Store the finished packet under *packet_char*; leave the packet."""
        self._chars[packet_char] = Page(
            text=self.text, boxes=self.boxes, width=packet_width,
            height=None, descent=None)
        self.state = _dvistate.outer

    def _pre(self, i, x, cs, ds):
        """
        Handle the vf preamble: format byte *i* must be 202; the comment *x*
        is logged at debug level if non-empty.  *cs* (checksum) and *ds*
        (design size) are not used.
        """
        if self.state is not _dvistate.pre:
            raise ValueError("pre command in middle of vf file")
        if i != 202:
            raise ValueError("Unknown vf format %d" % i)
        if len(x):
            _log.debug('vf file comment: %s', x)
        self.state = _dvistate.outer
        # cs = checksum, ds = design size

732

733

def _mul2012(num1, num2):
    """Multiply two numbers in 20.12 fixed point format."""
    # Kept as its own function so callers never have to think about how
    # ">>" binds relative to "*" and "+".
    product = num1 * num2
    return product >> 20

738

739

class Tfm:
    """
    A TeX Font Metric file.

    Only the small part of the format that the Dvi class needs is read.

    Parameters
    ----------
    filename : str or path-like

    Attributes
    ----------
    checksum : int
       Used for verifying against the dvi file.
    design_size : int
       Design size of the font (unknown units)
    width, height, depth : dict
       Dimensions of each character, need to be scaled by the factor
       specified in the dvi file. These are dicts because indexing may
       not start from 0.
    """
    __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth')

    def __init__(self, filename):
        _log.debug('opening tfm file %s', filename)
        with open(filename, 'rb') as file:
            header1 = file.read(24)
            # Six big-endian unsigned shorts starting at byte offset 2.
            lh, bc, ec, nw, nh, nd = struct.unpack('!6H', header1[2:14])
            _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d',
                       lh, bc, ec, nw, nh, nd)
            header2 = file.read(4*lh)
            checksum, design_size = struct.unpack('!2I', header2[:8])
            self.checksum = checksum
            self.design_size = design_size
            # there is also encoding information etc.
            char_info = file.read(4*(ec-bc+1))
            # Three consecutive tables of big-endian signed 32-bit values.
            width_table = struct.unpack(f'!{nw}i', file.read(4*nw))
            height_table = struct.unpack(f'!{nh}i', file.read(4*nh))
            depth_table = struct.unpack(f'!{nd}i', file.read(4*nd))
        self.width, self.height, self.depth = {}, {}, {}
        for position, code in enumerate(range(bc, ec+1)):
            # Each character has a 4-byte record: byte 0 indexes the width
            # table; byte 1 packs the height index (high nibble) and the
            # depth index (low nibble).
            record = 4 * position
            width_index = char_info[record]
            packed = char_info[record + 1]
            self.width[code] = width_table[width_index]
            self.height[code] = height_table[packed >> 4]
            self.depth[code] = depth_table[packed & 0xf]

784

785

# One parsed map-file entry, as returned by `PsfontsMap.__getitem__`.
PsFont = namedtuple('PsFont', 'texname psname effects encoding filename')


class PsfontsMap:
    """
    A psfonts.map formatted file, mapping TeX fonts to PS fonts.

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    For historical reasons, TeX knows many Type-1 fonts by different
    names than the outside world. (For one thing, the names have to
    fit in eight characters.) Also, TeX's native fonts are not Type-1
    but Metafont, which is nontrivial to convert to PostScript except
    as a bitmap. While high-quality conversions to Type-1 format exist
    and are shipped with modern TeX distributions, we need to know
    which Type-1 fonts are the counterparts of which native fonts. For
    these reasons a mapping is needed from internal font names to font
    file names.

    A texmf tree typically includes mapping files called e.g.
    :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`.
    The file :file:`psfonts.map` is used by :program:`dvips`,
    :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map`
    by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding
    the 35 PostScript fonts (i.e., have no filename for them, as in
    the Times-Bold example below), while the pdf-related files perhaps
    only avoid the "Base 14" pdf fonts. But the user may have
    configured these files differently.

    Examples
    --------
    >>> map = PsfontsMap(find_tex_file('pdftex.map'))
    >>> entry = map[b'ptmbo8r']
    >>> entry.texname
    b'ptmbo8r'
    >>> entry.psname
    b'Times-Bold'
    >>> entry.encoding
    '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc'
    >>> entry.effects
    {'slant': 0.16700000000000001}
    >>> entry.filename
    """
    __slots__ = ('_filename', '_unparsed', '_parsed')

    # Create a filename -> PsfontsMap cache, so that calling
    # `PsfontsMap(filename)` with the same filename a second time immediately
    # returns the same object.
    @lru_cache
    def __new__(cls, filename):
        self = object.__new__(cls)
        self._filename = os.fsdecode(filename)
        # Some TeX distributions have enormous pdftex.map files which would
        # take hundreds of milliseconds to parse, but it is easy enough to just
        # store the unparsed lines (keyed by the first word, which is the
        # texname) and parse them on-demand.
        with open(filename, 'rb') as file:
            self._unparsed = {}
            for line in file:
                tfmname = line.split(b' ', 1)[0]
                self._unparsed.setdefault(tfmname, []).append(line)
        self._parsed = {}
        return self

    def __getitem__(self, texname):
        """
        Return the `PsFont` entry for *texname* (bytes).

        The stored lines for *texname* are parsed on first access, stopping
        at the first line that yields a usable entry.

        Raises
        ------
        LookupError
            If the map file has no usable entry for *texname*.
        """
        assert isinstance(texname, bytes)
        if texname in self._unparsed:
            for line in self._unparsed.pop(texname):
                if self._parse_and_cache_line(line):
                    break
        try:
            return self._parsed[texname]
        except KeyError:
            raise LookupError(
                f"An associated PostScript font (required by Matplotlib) "
                f"could not be found for TeX font {texname.decode('ascii')!r} "
                f"in {self._filename!r}; this problem can often be solved by "
                f"installing a suitable PostScript font package in your TeX "
                f"package manager") from None

    def _parse_and_cache_line(self, line):
        """
        Parse a line in the font mapping file.

        The format is (partially) documented at
        http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf
        https://tug.org/texinfohtml/dvips.html#psfonts_002emap
        Each line can have the following fields:

        - tfmname (first, only required field),
        - psname (defaults to tfmname, must come immediately after tfmname if
          present),
        - fontflags (integer, must come immediately after psname if present,
          ignored by us),
        - special (SlantFont and ExtendFont, only field that is double-quoted),
        - fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always
          precedes a font, <[ always precedes an encoding, < can precede either
          but then an encoding file must have extension .enc; < and << also
          request different font subsetting behaviors but we ignore that; < can
          be separated from the filename by whitespace).

        special, fontfile, and encodingfile can appear in any order.

        Returns True if the line produced an entry in ``self._parsed``, and
        None if the line was ignored (comment, malformed, or rejected by one
        of the sanity checks).
        """
        # If the map file specifies multiple encodings for a font, we
        # follow pdfTeX in choosing the last one specified. Such
        # entries are probably mistakes but they have occurred.
        # https://tex.stackexchange.com/q/10826/

        if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
            return
        tfmname = basename = special = encodingfile = fontfile = None
        is_subsetted = is_t1 = is_truetype = False
        matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line)
        for match in matches:
            quoted, unquoted = match.groups()
            if unquoted:
                if unquoted.startswith(b"<<"):  # font
                    fontfile = unquoted[2:]
                elif unquoted.startswith(b"<["):  # encoding
                    encodingfile = unquoted[2:]
                elif unquoted.startswith(b"<"):  # font or encoding
                    # <foo => foo
                    word = unquoted[1:]
                    if not word:
                        # < by itself => read the next word
                        following = next(matches, None)
                        if following is not None:
                            word = next(
                                filter(None, following.groups()), None)
                        if not word:
                            # "<" with no filename after it: the entry is
                            # malformed, so ignore the line instead of
                            # letting StopIteration escape to the caller.
                            return
                    if word.endswith(b".enc"):
                        encodingfile = word
                    else:
                        fontfile = word
                        is_subsetted = True
                elif tfmname is None:
                    tfmname = unquoted
                elif basename is None:
                    basename = unquoted
            elif quoted:
                special = quoted
        effects = {}
        if special:
            # The operand precedes its keyword ("0.167 SlantFont"), so walk
            # the words backwards and pull the operand after each keyword.
            words = reversed(special.split())
            for word in words:
                if word == b"SlantFont":
                    key = "slant"
                elif word == b"ExtendFont":
                    key = "extend"
                else:
                    continue
                operand = next(words, None)
                if operand is None:
                    # Keyword without a value: malformed entry, ignore it
                    # rather than leaking StopIteration.
                    return
                effects[key] = float(operand)

        # Verify some properties of the line that would cause it to be ignored
        # otherwise.
        if fontfile is not None:
            if fontfile.endswith((b".ttf", b".ttc")):
                is_truetype = True
            elif not fontfile.endswith(b".otf"):
                is_t1 = True
        elif basename is not None:
            is_t1 = True
        if is_truetype and is_subsetted and encodingfile is None:
            return
        if not is_t1 and ("slant" in effects or "extend" in effects):
            return
        if abs(effects.get("slant", 0)) > 1:
            return
        if abs(effects.get("extend", 0)) > 2:
            return

        if basename is None:
            basename = tfmname
        if encodingfile is not None:
            encodingfile = find_tex_file(encodingfile)
        if fontfile is not None:
            fontfile = find_tex_file(fontfile)
        self._parsed[tfmname] = PsFont(
            texname=tfmname, psname=basename, effects=effects,
            encoding=encodingfile, filename=fontfile)
        return True

964

965

966def _parse_enc(path):

967 r"""

968 Parse a \*.enc file referenced from a psfonts.map style file.

969

970 The format supported by this function is a tiny subset of PostScript.

971

972 Parameters

973 ----------

974 path : `os.PathLike`

975

976 Returns

977 -------

978 list

979 The nth entry of the list is the PostScript glyph name of the nth

980 glyph.

981 """

982 no_comments = re.sub("%.*", "", Path(path).read_text(encoding="ascii"))

983 array = re.search(r"(?s)\[(.*)\]", no_comments).group(1)

984 lines = [line for line in array.split() if line]

985 if all(line.startswith("/") for line in lines):

986 return [line[1:] for line in lines]

987 else:

988 raise ValueError(f"Failed to parse {path} as Postscript encoding")

989

990

class _LuatexKpsewhich:
    """
    Look up files through a ``luatex --luaonly`` subprocess running the
    bundled ``kpsewhich.lua`` script.

    Queries are written to the subprocess one file name per line, and one
    line of output is read back per query.
    """

    @lru_cache  # A singleton.
    def __new__(cls):
        # Caching __new__ makes every instantiation return the same object,
        # so the subprocess is started only once.
        instance = object.__new__(cls)
        instance._proc = instance._new_proc()
        return instance

    def _new_proc(self):
        """Start the luatex subprocess and return its `subprocess.Popen`."""
        command = ["luatex", "--luaonly",
                   str(cbook._get_data_path("kpsewhich.lua"))]
        return subprocess.Popen(
            command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    def search(self, filename):
        """
        Return the path reported for *filename*, or None if the subprocess
        answers ``nil``.
        """
        if self._proc.poll() is not None:  # Dead, restart it.
            self._proc = self._new_proc()
        query = os.fsencode(filename) + b"\n"
        self._proc.stdin.write(query)
        self._proc.stdin.flush()
        reply = self._proc.stdout.readline().rstrip()
        if reply == b"nil":
            return None
        return os.fsdecode(reply)

1011

1012

@lru_cache
def find_tex_file(filename):
    """
    Find a file in the texmf tree using kpathsea_.

    The kpathsea library, provided by most existing TeX distributions, both
    on Unix-like systems and on Windows (MikTeX), is invoked via a long-lived
    luatex process if luatex is installed, or via kpsewhich otherwise.

    .. _kpathsea: https://www.tug.org/kpathsea/

    Parameters
    ----------
    filename : str or path-like
        Name of the file to look up.  A `bytes` name is decoded as utf-8
        first.

    Returns
    -------
    str
        The path reported by kpathsea.

    Raises
    ------
    FileNotFoundError
        If the file is not found.
    """

    # we expect these to always be ascii encoded, but use utf-8
    # out of caution
    if isinstance(filename, bytes):
        filename = filename.decode('utf-8', errors='replace')

    try:
        luatex = _LuatexKpsewhich()
    except FileNotFoundError:
        luatex = None  # Fallback to directly calling kpsewhich, as below.

    if luatex:
        located = luatex.search(filename)
    else:
        if sys.platform == 'win32':
            # On Windows only, kpathsea can use utf-8 for cmd args and output.
            # The `command_line_encoding` environment variable is set to force
            # it to always use utf-8 encoding.  See Matplotlib issue #11848.
            run_options = dict(
                env={**os.environ, 'command_line_encoding': 'utf-8'},
                encoding='utf-8')
        else:  # On POSIX, run through the equivalent of os.fsdecode().
            run_options = dict(
                encoding=sys.getfilesystemencoding(),
                errors='surrogateescape')

        try:
            output = cbook._check_and_log_subprocess(
                ['kpsewhich', filename], _log, **run_options)
            located = output.rstrip('\n')
        except (FileNotFoundError, RuntimeError):
            located = None

    # Both None and an empty reply count as "not found".
    if not located:
        raise FileNotFoundError(
            f"Matplotlib's TeX implementation searched for a file named "
            f"{filename!r} in your texmf tree, but could not find it")
    return located

1070

1071

@lru_cache
def _fontfile(cls, suffix, texname):
    """
    Return ``cls(path)``, where *path* is the location of the file named
    ``texname + suffix`` as reported by `find_tex_file`.

    Results are memoized per ``(cls, suffix, texname)`` by `lru_cache`.
    """
    located = find_tex_file(texname + suffix)
    return cls(located)


# Loaders bound to one font-file class and its file extension each.
_tfmfile = partial(_fontfile, Tfm, ".tfm")
_vffile = partial(_fontfile, Vf, ".vf")

1079

1080

if __name__ == '__main__':
    # Command-line entry point: print the text and boxes of every page of
    # the dvi file given as first argument.
    from argparse import ArgumentParser
    import itertools

    cli = ArgumentParser()
    cli.add_argument("filename")
    cli.add_argument("dpi", nargs="?", type=float, default=None)
    options = cli.parse_args()
    with Dvi(options.filename, options.dpi) as dvi:
        # NOTE(review): fontmap is not referenced below; the call is kept
        # because locating and opening pdftex.map may itself fail.
        fontmap = PsfontsMap(find_tex_file('pdftex.map'))
        for page in dvi:
            print(f"=== new page === "
                  f"(w: {page.width}, h: {page.height}, d: {page.descent})")
            # Consecutive glyphs that share a font are listed under a
            # single font header.
            runs = itertools.groupby(page.text, lambda item: item.font)
            for font, run in runs:
                print(f"font: {font.texname.decode('latin-1')!r}\t"
                      f"scale: {font._scale / 2 ** 20}")
                print("x", "y", "glyph", "chr", "w", "(glyphs)", sep="\t")
                for item in run:
                    char = chr(item.glyph)
                    if not char.isprintable():
                        char = "."
                    print(item.x, item.y, item.glyph, char, item.width,
                          sep="\t")
            if page.boxes:
                print("x", "y", "h", "w", "", "(boxes)", sep="\t")
                for box in page.boxes:
                    print(box.x, box.y, box.height, box.width, sep="\t")