1"""
A module for reading dvi files output by TeX. Several limitations make
this not (currently) useful as a general-purpose dvi preprocessor, but
it is currently used by the pdf backend for processing usetex text.

Interface::

  with Dvi(filename, 72) as dvi:
      # iterate over pages:
      for page in dvi:
          w, h, d = page.width, page.height, page.descent
          for x, y, font, glyph, width in page.text:
              fontname = font.texname
              pointsize = font.size
              ...
          for x, y, height, width in page.boxes:
              ...
18"""
19
20from collections import namedtuple
21import enum
22from functools import lru_cache, partial, wraps
23import logging
24import os
25from pathlib import Path
26import re
27import struct
28import subprocess
29import sys
30
31import numpy as np
32
33from matplotlib import _api, cbook
34
# Module-level logger (named after this module); the classes below use it
# for debug-level messages only.
_log = logging.getLogger(__name__)
36
37# Many dvi related files are looked for by external processes, require
38# additional parsing, and are used many times per rendering, which is why they
39# are cached using lru_cache().
40
# Dvi is a bytecode format documented in
# https://ctan.org/pkg/dvitype
# https://texdoc.org/serve/dvitype.pdf/0
#
# The file consists of a preamble, some number of pages, a postamble,
# and a finale. Different opcodes are allowed in different contexts,
# so the Dvi object has a parser state:
#
#   pre:       expecting the preamble
#   outer:     between pages (followed by a page or the postamble,
#              also e.g. font definitions are allowed)
#   inpage:    processing a page
#   post_post: state after the postamble (our current implementation
#              just stops reading)
#   finale:    the finale (unimplemented in our current implementation)

_dvistate = enum.Enum(
    'DviState', ['pre', 'outer', 'inpage', 'post_post', 'finale'])
58
# The marks on a page are text (glyphs) and boxes; besides those, a page
# records its own dimensions.
Page = namedtuple('Page', ['text', 'boxes', 'height', 'width', 'descent'])
Box = namedtuple('Box', ['x', 'y', 'height', 'width'])
62
63
# Also a namedtuple, for backcompat.
class Text(namedtuple('Text', 'x y font glyph width')):
    """
    A glyph in the dvi file.

    The *x* and *y* attributes directly position the glyph.  The *font*,
    *glyph*, and *width* attributes are kept public for back-compatibility,
    but users wanting to draw the glyph themselves are encouraged to instead
    load the font specified by `font_path` at `font_size`, warp it with the
    effects specified by `font_effects`, and load the glyph specified by
    `glyph_name_or_index`.
    """

    def _get_pdftexmap_entry(self):
        """Return the :file:`pdftex.map` entry for this glyph's font."""
        fontmap = PsfontsMap(find_tex_file("pdftex.map"))
        return fontmap[self.font.texname]

    @property
    def font_path(self):
        """The `~pathlib.Path` to the font for this glyph."""
        entry = self._get_pdftexmap_entry()
        if entry.filename is not None:
            return Path(entry.filename)
        # The map entry names no font file at all.
        psname = entry.psname.decode("ascii")
        texname = entry.texname.decode("ascii")
        raise ValueError(
            f"No usable font file found for {psname} ({texname}); "
            f"the font may lack a Type-1 version")

    @property
    def font_size(self):
        """The font size."""
        return self.font.size

    @property
    def font_effects(self):
        """
        The "font effects" dict for this glyph.

        This dict contains the values for this glyph of SlantFont and
        ExtendFont (if any), read off :file:`pdftex.map`.
        """
        entry = self._get_pdftexmap_entry()
        return entry.effects

    @property
    def glyph_name_or_index(self):
        """
        Either the glyph name or the native charmap glyph index.

        If :file:`pdftex.map` specifies an encoding for this glyph's font, that
        is a mapping of glyph indices to Adobe glyph names; use it to convert
        dvi indices to glyph names.  Callers can then convert glyph names to
        glyph indices (with FT_Get_Name_Index/get_name_index), and load the
        glyph using FT_Load_Glyph/load_glyph.

        If :file:`pdftex.map` specifies no encoding, the indices directly map
        to the font's "native" charmap; glyphs should directly load using
        FT_Load_Char/load_char after selecting the native charmap.
        """
        entry = self._get_pdftexmap_entry()
        if entry.encoding is None:
            return self.glyph
        return _parse_enc(entry.encoding)[self.glyph]
124
125
# Opcode argument parsing
#
# Each entry below is a callable taking a Dvi object and *delta*, which is
# the difference between the opcode and the minimum opcode with the same
# meaning.  Dvi opcodes often encode the number of argument bytes in this
# delta.
_arg_mapping = {
    # raw: Return delta as is.
    'raw': lambda dvi, delta: delta,
    # u1: Read 1 byte as an unsigned number.
    'u1': lambda dvi, delta: dvi._arg(1, signed=False),
    # u4: Read 4 bytes as an unsigned number.
    'u4': lambda dvi, delta: dvi._arg(4, signed=False),
    # s4: Read 4 bytes as a signed number.
    's4': lambda dvi, delta: dvi._arg(4, signed=True),
    # slen: Read delta bytes as a signed number; when delta is zero nothing
    # is read and None is returned.
    'slen': lambda dvi, delta: (
        dvi._arg(delta, signed=True) if delta else None),
    # slen1: Read (delta + 1) bytes as a signed number.
    'slen1': lambda dvi, delta: dvi._arg(delta + 1, signed=True),
    # ulen1: Read (delta + 1) bytes as an unsigned number.
    'ulen1': lambda dvi, delta: dvi._arg(delta + 1, signed=False),
    # olen1: Read (delta + 1) bytes as an unsigned number if less than 4
    # bytes, as a signed number if 4 bytes.
    'olen1': lambda dvi, delta: dvi._arg(delta + 1, signed=(delta == 3)),
}
150
151
def _dispatch(table, min, max=None, state=None, args=('raw',)):
    """
    Decorator for dispatch by opcode.  Sets the values in *table*
    from *min* to *max* to this method, adds a check that the Dvi state
    matches *state* if not None, reads arguments from the file according
    to *args*.

    Parameters
    ----------
    table : list
        The dispatch table to be filled in, indexed by opcode.  Free slots
        must hold None.

    min, max : int
        Range of opcodes that calls the registered function; *max* defaults to
        *min*.

    state : _dvistate, optional
        State of the Dvi object in which these opcodes are allowed.

    args : list[str], default: ['raw']
        Sequence of argument specifications:

        - 'raw': opcode minus minimum
        - 'u1': read one unsigned byte
        - 'u4': read four bytes, treat as an unsigned number
        - 's4': read four bytes, treat as a signed number
        - 'slen': read (opcode - minimum) bytes, treat as signed
        - 'slen1': read (opcode - minimum + 1) bytes, treat as signed
        - 'ulen1': read (opcode - minimum + 1) bytes, treat as unsigned
        - 'olen1': read (opcode - minimum + 1) bytes, treat as unsigned
          if under four bytes, signed if four bytes

    Returns
    -------
    callable
        A decorator.  It returns the wrapper that was stored in *table*; the
        wrapper is called as ``wrapper(dvi, opcode)``.

    Raises
    ------
    ValueError
        At decoration time, if any opcode in the requested range already has
        a handler (nothing is registered in that case).  At call time, if
        *state* is given and does not match the Dvi object's state.
    """
    # *min* and *max* shadow the builtins, but they are part of the keyword
    # interface used by callers, so they are only aliased locally.
    first = min
    last = first if max is None else max

    def decorate(method):
        get_args = [_arg_mapping[x] for x in args]

        @wraps(method)
        def wrapper(self, byte):
            if state is not None and self.state != state:
                raise ValueError("state precondition failed")
            return method(self, *[f(self, byte - first) for f in get_args])

        opcodes = range(first, last + 1)
        # Check every slot before touching the table, for single opcodes as
        # well as ranges, so a clash never leaves a half-registered handler
        # or silently replaces an existing one.
        taken = [i for i in opcodes if table[i] is not None]
        if taken:
            raise ValueError(
                f"opcode(s) {taken} already registered in dispatch table")
        for i in opcodes:
            table[i] = wrapper
        return wrapper
    return decorate
200
201
class Dvi:
    """
    A reader for a dvi ("device-independent") file, as produced by TeX.

    The current implementation can only iterate through pages in order,
    and does not even attempt to verify the postamble.

    This class can be used as a context manager to close the underlying
    file upon exit. Pages can be read via iteration. Here is an overly
    simple way to extract text without trying to detect whitespace::

        >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi:
        ...     for page in dvi:
        ...         print(''.join(chr(t.glyph) for t in page.text))
    """
    # Dispatch table: one slot per possible opcode byte.  The slots are
    # filled in by the ``@_dispatch`` decorators on the handlers below.
    _dtable = [None] * 256
    # The module-level decorator with the table pre-bound; within the class
    # body this name shadows the module-level ``_dispatch``.
    _dispatch = partial(_dispatch, _dtable)

    def __init__(self, filename, dpi):
        """
        Open the file named *filename* for reading and arrange for
        TeX's internal units to be converted to units of *dpi* per inch.
        *dpi* only sets the units and does not limit the resolution.
        Use None to return TeX's internal units.

        No data is read here; pages are read when iterating.
        """
        _log.debug('Dvi: %s', filename)
        self.file = open(filename, 'rb')
        self.dpi = dpi
        self.fonts = {}  # font number -> DviFont, filled by _fnt_def_real
        self.state = _dvistate.pre

    def __enter__(self):
        """Context manager enter method, does nothing."""
        return self

    def __exit__(self, etype, evalue, etrace):
        """
        Context manager exit method, closes the underlying file if it is open.
        """
        self.close()

    def __iter__(self):
        """
        Iterate through the pages of the file.

        Yields
        ------
        Page
            Details of all the text and box objects on the page.
            The Page tuple contains lists of Text and Box tuples and
            the page dimensions, and the Text and Box tuples contain
            coordinates transformed into a standard Cartesian
            coordinate system at the dpi value given when initializing.
            The coordinates are floating point numbers, but otherwise
            precision is not lost and coordinate values are not clipped to
            integers.
        """
        while self._read():
            yield self._output()

    def close(self):
        """Close the underlying file if it is open."""
        if not self.file.closed:
            self.file.close()

    def _output(self):
        """
        Output the text and boxes belonging to the most recent page.

        Returns
        -------
        Page
            An all-zero, empty Page if the page has neither text nor boxes.
            If ``self.dpi`` is None, the raw lists with dimensions in dvi
            units.  Otherwise new Text and Box tuples whose coordinates are
            scaled to dpi units and expressed relative to the left edge and
            the baseline of the page contents.
        """
        # Bounding box of everything on the page, in dvi units (v grows
        # downwards, so "max y" is the bottom).
        minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf
        # Lowest reference point ignoring glyph depths; fallback baseline.
        maxy_pure = -np.inf
        for elt in self.text + self.boxes:
            if isinstance(elt, Box):
                x, y, h, w = elt
                e = 0  # zero depth
            else:  # glyph
                x, y, font, g, w = elt
                h, e = font._height_depth_of(g)
            minx = min(minx, x)
            miny = min(miny, y - h)
            maxx = max(maxx, x + w)
            maxy = max(maxy, y + e)
            maxy_pure = max(maxy_pure, y)
        if self._baseline_v is not None:
            maxy_pure = self._baseline_v  # This should normally be the case.
            self._baseline_v = None

        if not self.text and not self.boxes:  # Avoid infs/nans from inf+/-inf.
            return Page(text=[], boxes=[], width=0, height=0, descent=0)

        if self.dpi is None:
            # special case for ease of debugging: output raw dvi coordinates
            return Page(text=self.text, boxes=self.boxes,
                        width=maxx-minx, height=maxy_pure-miny,
                        descent=maxy-maxy_pure)

        # convert from TeX's "scaled points" to dpi units
        d = self.dpi / (72.27 * 2**16)
        descent = (maxy - maxy_pure) * d

        # (maxy-y)*d - descent equals (maxy_pure-y)*d: the vertical axis is
        # flipped to point upwards, with zero at the baseline.
        text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d)
                for (x, y, f, g, w) in self.text]
        boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d)
                 for (x, y, h, w) in self.boxes]

        return Page(text=text, boxes=boxes, width=(maxx-minx)*d,
                    height=(maxy_pure-miny)*d, descent=descent)

    def _read(self):
        """
        Read one page from the file. Return True if successful,
        False if there were no more pages.
        """
        # Pages appear to start with the sequence
        #   bop (begin of page)
        #   xxx comment
        #   <push, ..., pop>  # if using chemformula
        #   down
        #   push
        #     down
        #     <push, push, xxx, right, xxx, pop, pop>  # if using xcolor
        #     down
        #     push
        #       down (possibly multiple)
        #       push  <=  here, v is the baseline position.
        #         etc.
        # (dviasm is useful to explore this structure.)
        # Thus, we use the vertical position at the first time the stack depth
        # reaches 3, while at least three "downs" have been executed (excluding
        # those popped out (corresponding to the chemformula preamble)), as the
        # baseline (the "down" count is necessary to handle xcolor).
        # NOTE(review): the condition below requires a count of at least 4,
        # not 3 -- confirm which of the two is intended.
        down_stack = [0]
        self._baseline_v = None
        while True:
            byte = self.file.read(1)[0]
            self._dtable[byte](self, byte)
            # The handlers are wrapped with functools.wraps, so __name__ is
            # the name of the handler method registered for this opcode.
            name = self._dtable[byte].__name__
            if name == "_push":
                down_stack.append(down_stack[-1])
            elif name == "_pop":
                down_stack.pop()
            elif name == "_down":
                down_stack[-1] += 1
            # ``stack`` only exists between bop and eop, hence the getattr.
            if (self._baseline_v is None
                    and len(getattr(self, "stack", [])) == 3
                    and down_stack[-1] >= 4):
                self._baseline_v = self.v
            if byte == 140:                         # end of page
                return True
            if self.state is _dvistate.post_post:   # end of file
                self.close()
                return False

    def _arg(self, nbytes, signed=False):
        """
        Read and return a big-endian integer *nbytes* long.
        Signedness is determined by the *signed* keyword.
        """
        return int.from_bytes(self.file.read(nbytes), "big", signed=signed)

    @_dispatch(min=0, max=127, state=_dvistate.inpage)
    def _set_char_immediate(self, char):
        """Typeset glyph *char* (the opcode itself) and advance ``h``."""
        self._put_char_real(char)
        self.h += self.fonts[self.f]._width_of(char)

    @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',))
    def _set_char(self, char):
        """Typeset glyph *char* (read from the file) and advance ``h``."""
        self._put_char_real(char)
        self.h += self.fonts[self.f]._width_of(char)

    @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4'))
    def _set_rule(self, a, b):
        """Record a rule of height *a* and width *b*, and advance ``h``."""
        self._put_rule_real(a, b)
        self.h += b

    @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',))
    def _put_char(self, char):
        """Typeset glyph *char* without moving the current position."""
        self._put_char_real(char)

    def _put_char_real(self, char):
        """
        Append glyph *char* of the current font at the current position.

        For a virtual font, the text and boxes of the glyph's packet are
        appended instead, with positions and sizes multiplied by the font's
        scale and offset by the current position.
        """
        font = self.fonts[self.f]
        if font._vf is None:
            self.text.append(Text(self.h, self.v, font, char,
                                  font._width_of(char)))
        else:
            scale = font._scale
            for x, y, f, g, w in font._vf[char].text:
                # Same underlying font as *f*, with the two scales combined.
                newf = DviFont(scale=_mul2012(scale, f._scale),
                               tfm=f._tfm, texname=f.texname, vf=f._vf)
                self.text.append(Text(self.h + _mul2012(x, scale),
                                      self.v + _mul2012(y, scale),
                                      newf, g, newf._width_of(g)))
            self.boxes.extend([Box(self.h + _mul2012(x, scale),
                                   self.v + _mul2012(y, scale),
                                   _mul2012(a, scale), _mul2012(b, scale))
                               for x, y, a, b in font._vf[char].boxes])

    @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4'))
    def _put_rule(self, a, b):
        """Record a rule of height *a* and width *b* without moving."""
        self._put_rule_real(a, b)

    def _put_rule_real(self, a, b):
        """Append a Box at the current position; skipped unless a, b > 0."""
        if a > 0 and b > 0:
            self.boxes.append(Box(self.h, self.v, a, b))

    @_dispatch(138)
    def _nop(self, _):
        """Do nothing."""
        pass

    @_dispatch(139, state=_dvistate.outer, args=('s4',)*11)
    def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p):
        """
        Begin a page: reset the position registers, the stack and the lists
        of marks.  The eleven arguments read from the file are not used.
        """
        self.state = _dvistate.inpage
        self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
        self.stack = []
        self.text = []          # list of Text objects
        self.boxes = []         # list of Box objects

    @_dispatch(140, state=_dvistate.inpage)
    def _eop(self, _):
        """End the page; ``text`` and ``boxes`` are kept for `_output`."""
        self.state = _dvistate.outer
        del self.h, self.v, self.w, self.x, self.y, self.z, self.stack

    @_dispatch(141, state=_dvistate.inpage)
    def _push(self, _):
        """Save the six position registers on the stack."""
        self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z))

    @_dispatch(142, state=_dvistate.inpage)
    def _pop(self, _):
        """Restore the six position registers from the stack."""
        self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop()

    @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',))
    def _right(self, b):
        """Move right by *b*."""
        self.h += b

    @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',))
    def _right_w(self, new_w):
        """Move right by ``w``, first setting ``w`` if *new_w* is given."""
        if new_w is not None:
            self.w = new_w
        self.h += self.w

    @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',))
    def _right_x(self, new_x):
        """Move right by ``x``, first setting ``x`` if *new_x* is given."""
        if new_x is not None:
            self.x = new_x
        self.h += self.x

    @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',))
    def _down(self, a):
        """Move down by *a*."""
        self.v += a

    @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',))
    def _down_y(self, new_y):
        """Move down by ``y``, first setting ``y`` if *new_y* is given."""
        if new_y is not None:
            self.y = new_y
        self.v += self.y

    @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',))
    def _down_z(self, new_z):
        """Move down by ``z``, first setting ``z`` if *new_z* is given."""
        if new_z is not None:
            self.z = new_z
        self.v += self.z

    @_dispatch(min=171, max=234, state=_dvistate.inpage)
    def _fnt_num_immediate(self, k):
        """Select font number *k* (the opcode minus 171)."""
        self.f = k

    @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',))
    def _fnt_num(self, new_f):
        """Select font number *new_f* (read from the file)."""
        self.f = new_f

    @_dispatch(min=239, max=242, args=('ulen1',))
    def _xxx(self, datalen):
        """Read a *datalen*-byte special and log it; it is otherwise unused."""
        special = self.file.read(datalen)
        _log.debug(
            'Dvi._xxx: encountered special: %s',
            ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch
                     for ch in special]))

    @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1'))
    def _fnt_def(self, k, c, s, d, a, l):
        """Define font number *k*; see `_fnt_def_real`."""
        self._fnt_def_real(k, c, s, d, a, l)

    def _fnt_def_real(self, k, c, s, d, a, l):
        """
        Read a font name of *a* + *l* bytes and store a DviFont under *k*.

        *c* is compared against the tfm checksum (unless either is zero) and
        *s* becomes the font's scale; *d* is not used.

        Raises
        ------
        ValueError
            If the checksums are both nonzero and differ.
        """
        n = self.file.read(a + l)
        # The last *l* bytes are used as the font name for file lookups; the
        # full byte string *n* is what is stored as the texname.
        fontname = n[-l:].decode('ascii')
        # _tfmfile and _vffile are defined elsewhere in this module.
        tfm = _tfmfile(fontname)
        if c != 0 and tfm.checksum != 0 and c != tfm.checksum:
            raise ValueError('tfm checksum mismatch: %s' % n)
        try:
            vf = _vffile(fontname)
        except FileNotFoundError:
            vf = None
        self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf)

    @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1'))
    def _pre(self, i, num, den, mag, k):
        """
        Process the preamble: skip the *k*-byte comment and check the format
        id *i*, the units *num*/*den* and the magnification *mag*.

        Raises
        ------
        ValueError
            If any of these is not the one value supported here.
        """
        self.file.read(k)  # comment in the dvi file
        if i != 2:
            raise ValueError("Unknown dvi format %d" % i)
        if num != 25400000 or den != 7227 * 2**16:
            raise ValueError("Nonstandard units in dvi file")
            # meaning: TeX always uses those exact values, so it
            # should be enough for us to support those
            # (There are 72.27 pt to an inch so 7227 pt =
            # 7227 * 2**16 sp to 100 in. The numerator is multiplied
            # by 10^5 to get units of 10**-7 meters.)
        if mag != 1000:
            raise ValueError("Nonstandard magnification in dvi file")
            # meaning: LaTeX seems to frown on setting \mag, so
            # I think we can assume this is constant
        self.state = _dvistate.outer

    @_dispatch(248, state=_dvistate.outer)
    def _post(self, _):
        """Enter the post_post state, which makes `_read` stop."""
        self.state = _dvistate.post_post
        # TODO: actually read the postamble and finale?
        # currently post_post just triggers closing the file

    @_dispatch(249)
    def _post_post(self, _):
        """Not implemented; reading stops before this opcode is reached."""
        raise NotImplementedError

    @_dispatch(min=250, max=255)
    def _malformed(self, offset):
        """Reject the opcodes 250-255, which have no handler of their own."""
        raise ValueError(f"unknown command: byte {250 + offset}")
529
530
class DviFont:
    """
    Encapsulation of a font that a DVI file can refer to.

    This class holds a font's texname and size, supports comparison,
    and knows the widths of glyphs in the same units as the AFM file.
    There are also internal attributes (for use by dviread.py) that
    are *not* used for comparison.

    The size is in Adobe points (converted from TeX points).

    Parameters
    ----------
    scale : float
        Factor by which the font is scaled from its natural size.
    tfm : Tfm
        TeX font metrics for this font
    texname : bytes
       Name of the font as used internally by TeX and friends, as an ASCII
       bytestring.  This is usually very different from any external font
       names; `PsfontsMap` can be used to find the external name of the font.
    vf : Vf
       A TeX "virtual font" file, or None if this font is not virtual.

    Attributes
    ----------
    texname : bytes
    size : float
       Size of the font in Adobe points, converted from the slightly
       smaller TeX points.
    widths : list
       Widths of glyphs in glyph-space units, typically 1/1000ths of
       the point size.

    """
    __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm')

    def __init__(self, scale, tfm, texname, vf):
        _api.check_isinstance(bytes, texname=texname)
        self._scale = scale
        self._tfm = tfm
        self.texname = texname
        self._vf = vf
        self.size = scale * (72.0 / (72.27 * 2**16))
        # One entry per character code from 0 up to the largest code in the
        # tfm width table; max() raises ValueError if that table is empty.
        try:
            glyph_count = max(tfm.width) + 1
        except ValueError:
            glyph_count = 0
        self.widths = [(1000 * tfm.width.get(code, 0)) >> 20
                       for code in range(glyph_count)]

    def __eq__(self, other):
        # Only texname and size take part in the comparison.
        if type(self) is not type(other):
            return False
        return self.texname == other.texname and self.size == other.size

    def __ne__(self, other):
        equal = self.__eq__(other)
        return not equal

    def __repr__(self):
        return f"<{type(self).__name__}: {self.texname}>"

    def _width_of(self, char):
        """Width of char in dvi units."""
        raw = self._tfm.width.get(char, None)
        if raw is None:
            _log.debug('No width for char %d in font %s.', char, self.texname)
            return 0
        return _mul2012(raw, self._scale)

    def _height_depth_of(self, char):
        """Height and depth of char in dvi units."""
        dims = []
        tables = ((self._tfm.height, "height"), (self._tfm.depth, "depth"))
        for table, label in tables:
            raw = table.get(char, None)
            if raw is not None:
                dims.append(_mul2012(raw, self._scale))
                continue
            _log.debug('No %s for char %d in font %s',
                       label, char, self.texname)
            dims.append(0)
        # cmsyXX (symbols font) glyph 0 ("minus") has a nonzero descent
        # so that TeX aligns equations properly
        # (https://tex.stackexchange.com/q/526103/)
        # but we actually care about the rasterization depth to align
        # the dvipng-generated images.
        if re.match(br'^cmsy\d+$', self.texname) and char == 0:
            dims[-1] = 0
        return dims
620
621
class Vf(Dvi):
    r"""
    A virtual font (\*.vf file) containing subroutines for dvi files.

    The whole file is read when the object is constructed, and the
    underlying file is closed again before the constructor returns.

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    The virtual font format is a derivative of dvi:
    http://mirrors.ctan.org/info/knuth/virtual-fonts
    This class reuses some of the machinery of `Dvi`
    but replaces the `_read` loop and dispatch mechanism.

    Examples
    --------
    ::

        vf = Vf(filename)
        glyph = vf[code]
        glyph.text, glyph.boxes, glyph.width
    """

    def __init__(self, filename):
        super().__init__(filename, 0)
        try:
            self._first_font = None
            self._chars = {}
            self._read()
        finally:
            self.close()

    def __getitem__(self, code):
        """Return the Page recorded for character *code*."""
        return self._chars[code]

    def _read(self):
        """
        Read the entire vf file, up to the postamble.

        Each character packet is executed with the dvi opcode handlers and
        its result is stored in ``self._chars``; font definitions are stored
        in ``self.fonts``.  Returns None.

        Raises
        ------
        ValueError
            On an opcode that is unknown or not allowed at that point, or if
            the instructions of a packet run past the packet's stated length.
        """
        packet_char, packet_ends = None, None
        packet_len, packet_width = None, None
        while True:
            byte = self.file.read(1)[0]
            # If we are in a packet, execute the dvi instructions
            if self.state is _dvistate.inpage:
                byte_at = self.file.tell()-1
                if byte_at == packet_ends:
                    self._finalize_packet(packet_char, packet_width)
                    packet_len, packet_char, packet_width = None, None, None
                    # fall through to out-of-packet code
                elif byte_at > packet_ends:
                    raise ValueError("Packet length mismatch in vf file")
                else:
                    if byte in (139, 140) or byte >= 243:
                        raise ValueError(
                            "Inappropriate opcode %d in vf file" % byte)
                    Dvi._dtable[byte](self, byte)
                    continue

            # We are outside a packet
            if byte < 242:          # a short packet (length given by byte)
                packet_len = byte
                packet_char, packet_width = self._arg(1), self._arg(3)
                packet_ends = self._init_packet(byte)
                self.state = _dvistate.inpage
            elif byte == 242:       # a long packet
                packet_len, packet_char, packet_width = \
                    [self._arg(x) for x in (4, 4, 4)]
                # As for short packets, remember where the packet ends and
                # switch to executing dvi instructions; otherwise the
                # packet's own bytes would be misread as vf-level opcodes.
                packet_ends = self._init_packet(packet_len)
                self.state = _dvistate.inpage
            elif 243 <= byte <= 246:
                k = self._arg(byte - 242, byte == 246)
                c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)]
                self._fnt_def_real(k, c, s, d, a, l)
                # The first font defined is the initial font of each packet.
                if self._first_font is None:
                    self._first_font = k
            elif byte == 247:       # preamble
                i, k = self._arg(1), self._arg(1)
                x = self.file.read(k)
                cs, ds = self._arg(4), self._arg(4)
                self._pre(i, x, cs, ds)
            elif byte == 248:       # postamble (just some number of 248s)
                break
            else:
                raise ValueError("Unknown vf opcode %d" % byte)

    def _init_packet(self, pl):
        """
        Reset the position registers, stack, marks and current font for a
        new packet of *pl* bytes.

        Must be called once the packet's header has been read, because the
        returned end position is computed from the current file position.

        Returns
        -------
        int
            File offset of the first byte after the packet.

        Raises
        ------
        ValueError
            If the parser is not in the ``outer`` state.
        """
        if self.state != _dvistate.outer:
            raise ValueError("Misplaced packet in vf file")
        self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
        self.stack, self.text, self.boxes = [], [], []
        self.f = self._first_font
        return self.file.tell() + pl

    def _finalize_packet(self, packet_char, packet_width):
        """
        Store the marks collected for *packet_char* as a Page (height and
        descent are None) and return to the ``outer`` state.
        """
        self._chars[packet_char] = Page(
            text=self.text, boxes=self.boxes, width=packet_width,
            height=None, descent=None)
        self.state = _dvistate.outer

    def _pre(self, i, x, cs, ds):
        """
        Process the vf preamble: format id *i* and comment *x*.

        The checksum *cs* and design size *ds* are not used.

        Raises
        ------
        ValueError
            If a preamble was already seen, or if *i* is not 202.
        """
        if self.state is not _dvistate.pre:
            raise ValueError("pre command in middle of vf file")
        if i != 202:
            raise ValueError("Unknown vf format %d" % i)
        if len(x):
            _log.debug('vf file comment: %s', x)
        self.state = _dvistate.outer
        # cs = checksum, ds = design size
732
733
def _mul2012(num1, num2):
    """Multiply two numbers in 20.12 fixed point format."""
    # Kept as a separate function, with the product formed first, so that
    # callers never have to reason about the precedence of >>.
    product = num1 * num2
    return product >> 20
738
739
class Tfm:
    """
    A TeX Font Metric file.

    This implementation covers only the bare minimum needed by the Dvi class.

    Parameters
    ----------
    filename : str or path-like

    Attributes
    ----------
    checksum : int
       Used for verifying against the dvi file.
    design_size : int
       Design size of the font (unknown units)
    width, height, depth : dict
       Dimensions of each character, need to be scaled by the factor
       specified in the dvi file. These are dicts because indexing may
       not start from 0.
    """
    __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth')

    def __init__(self, filename):
        _log.debug('opening tfm file %s', filename)
        with open(filename, 'rb') as stream:
            preamble = stream.read(24)
            # Six big-endian 16-bit counts starting at byte 2.
            lh, bc, ec, nw, nh, nd = struct.unpack('!6H', preamble[2:14])
            _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d',
                       lh, bc, ec, nw, nh, nd)
            header = stream.read(4*lh)
            self.checksum, self.design_size = struct.unpack(
                '!2I', header[:8])
            # there is also encoding information etc.
            char_info = stream.read(4*(ec-bc+1))
            # The width, height and depth tables follow one another, each
            # holding signed 32-bit values.
            widths, heights, depths = [
                struct.unpack(f'!{count}i', stream.read(4*count))
                for count in (nw, nh, nd)]
        self.width = {}
        self.height = {}
        self.depth = {}
        for char in range(bc, ec+1):
            # Four char_info bytes per character: byte 0 is the width index,
            # byte 1 packs the height index (high nibble) and the depth
            # index (low nibble).
            base = 4 * (char - bc)
            width_index = char_info[base]
            height_depth = char_info[base + 1]
            self.width[char] = widths[width_index]
            self.height[char] = heights[height_depth >> 4]
            self.depth[char] = depths[height_depth & 0xf]
784
785
# One parsed entry of a psfonts.map-style file, as stored by PsfontsMap.
PsFont = namedtuple(
    'PsFont', ['texname', 'psname', 'effects', 'encoding', 'filename'])
787
788
class PsfontsMap:
    """
    A psfonts.map formatted file, mapping TeX fonts to PS fonts.

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    For historical reasons, TeX knows many Type-1 fonts by different
    names than the outside world. (For one thing, the names have to
    fit in eight characters.) Also, TeX's native fonts are not Type-1
    but Metafont, which is nontrivial to convert to PostScript except
    as a bitmap. While high-quality conversions to Type-1 format exist
    and are shipped with modern TeX distributions, we need to know
    which Type-1 fonts are the counterparts of which native fonts. For
    these reasons a mapping is needed from internal font names to font
    file names.

    A texmf tree typically includes mapping files called e.g.
    :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`.
    The file :file:`psfonts.map` is used by :program:`dvips`,
    :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map`
    by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding
    the 35 PostScript fonts (i.e., have no filename for them, as in
    the Times-Bold example below), while the pdf-related files perhaps
    only avoid the "Base 14" pdf fonts. But the user may have
    configured these files differently.

    Examples
    --------
    >>> map = PsfontsMap(find_tex_file('pdftex.map'))
    >>> entry = map[b'ptmbo8r']
    >>> entry.texname
    b'ptmbo8r'
    >>> entry.psname
    b'Times-Bold'
    >>> entry.encoding
    '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc'
    >>> entry.effects
    {'slant': 0.16700000000000001}
    >>> entry.filename
    """
    __slots__ = ('_filename', '_unparsed', '_parsed')

    # Create a filename -> PsfontsMap cache, so that calling
    # `PsfontsMap(filename)` with the same filename a second time immediately
    # returns the same object.
    @lru_cache
    def __new__(cls, filename):
        instance = object.__new__(cls)
        instance._filename = os.fsdecode(filename)
        # Some TeX distributions have enormous pdftex.map files which would
        # take hundreds of milliseconds to parse, but it is easy enough to just
        # store the unparsed lines (keyed by the first word, which is the
        # texname) and parse them on-demand.
        pending = {}
        with open(filename, 'rb') as stream:
            for raw_line in stream:
                key = raw_line.split(b' ', 1)[0]
                pending.setdefault(key, []).append(raw_line)
        instance._unparsed = pending
        instance._parsed = {}
        return instance

    def __getitem__(self, texname):
        assert isinstance(texname, bytes)
        # Parse the stored lines for this name on first access, stopping at
        # the first line that yields an entry; the lines are dropped from
        # the unparsed store either way.
        for raw_line in self._unparsed.pop(texname, ()):
            if self._parse_and_cache_line(raw_line):
                break
        try:
            return self._parsed[texname]
        except KeyError:
            raise LookupError(
                f"An associated PostScript font (required by Matplotlib) "
                f"could not be found for TeX font {texname.decode('ascii')!r} "
                f"in {self._filename!r}; this problem can often be solved by "
                f"installing a suitable PostScript font package in your TeX "
                f"package manager") from None

    def _parse_and_cache_line(self, line):
        """
        Parse a line in the font mapping file.

        The format is (partially) documented at
        http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf
        https://tug.org/texinfohtml/dvips.html#psfonts_002emap
        Each line can have the following fields:

        - tfmname (first, only required field),
        - psname (defaults to tfmname, must come immediately after tfmname if
          present),
        - fontflags (integer, must come immediately after psname if present,
          ignored by us),
        - special (SlantFont and ExtendFont, only field that is double-quoted),
        - fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always
          precedes a font, <[ always precedes an encoding, < can precede either
          but then an encoding file must have extension .enc; < and << also
          request different font subsetting behaviors but we ignore that; < can
          be separated from the filename by whitespace).

        special, fontfile, and encodingfile can appear in any order.

        Returns True if an entry was stored in ``self._parsed``, and None if
        the line was skipped.
        """
        # If the map file specifies multiple encodings for a font, we
        # follow pdfTeX in choosing the last one specified. Such
        # entries are probably mistakes but they have occurred.
        # https://tex.stackexchange.com/q/10826/

        if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
            return
        tfmname = basename = special = encodingfile = fontfile = None
        is_subsetted = is_t1 = is_truetype = False
        tokens = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line)
        for token in tokens:
            quoted, bare = token.groups()
            if not bare:
                if quoted:
                    special = quoted
                continue
            if bare.startswith(b"<<"):  # font
                fontfile = bare[2:]
            elif bare.startswith(b"<["):  # encoding
                encodingfile = bare[2:]
            elif bare.startswith(b"<"):  # font or encoding
                word = bare[1:]  # <foo => foo
                if not word:
                    # < by itself => read the next word
                    word = next(filter(None, next(tokens).groups()))
                if word.endswith(b".enc"):
                    encodingfile = word
                else:
                    fontfile = word
                    is_subsetted = True
            elif tfmname is None:
                tfmname = bare
            elif basename is None:
                basename = bare

        effects = {}
        if special:
            effect_keys = {b"SlantFont": "slant", b"ExtendFont": "extend"}
            # Walking the words backwards puts each keyword before its value.
            words = reversed(special.split())
            for word in words:
                key = effect_keys.get(word)
                if key is not None:
                    effects[key] = float(next(words))

        # Verify some properties of the line that would cause it to be ignored
        # otherwise.
        if fontfile is None:
            if basename is not None:
                is_t1 = True
        elif fontfile.endswith((b".ttf", b".ttc")):
            is_truetype = True
        elif not fontfile.endswith(b".otf"):
            is_t1 = True
        ignored = (
            (is_truetype and is_subsetted and encodingfile is None)
            or (not is_t1 and ("slant" in effects or "extend" in effects))
            or abs(effects.get("slant", 0)) > 1
            or abs(effects.get("extend", 0)) > 2)
        if ignored:
            return

        if basename is None:
            basename = tfmname
        if encodingfile is not None:
            encodingfile = find_tex_file(encodingfile)
        if fontfile is not None:
            fontfile = find_tex_file(fontfile)
        self._parsed[tfmname] = PsFont(
            texname=tfmname, psname=basename, effects=effects,
            encoding=encodingfile, filename=fontfile)
        return True
964
965
966def _parse_enc(path):
967 r"""
968 Parse a \*.enc file referenced from a psfonts.map style file.
969
970 The format supported by this function is a tiny subset of PostScript.
971
972 Parameters
973 ----------
974 path : `os.PathLike`
975
976 Returns
977 -------
978 list
979 The nth entry of the list is the PostScript glyph name of the nth
980 glyph.
981 """
982 no_comments = re.sub("%.*", "", Path(path).read_text(encoding="ascii"))
983 array = re.search(r"(?s)\[(.*)\]", no_comments).group(1)
984 lines = [line for line in array.split() if line]
985 if all(line.startswith("/") for line in lines):
986 return [line[1:] for line in lines]
987 else:
988 raise ValueError(f"Failed to parse {path} as Postscript encoding")
989
990
991class _LuatexKpsewhich:
992 @lru_cache # A singleton.
993 def __new__(cls):
994 self = object.__new__(cls)
995 self._proc = self._new_proc()
996 return self
997
998 def _new_proc(self):
999 return subprocess.Popen(
1000 ["luatex", "--luaonly",
1001 str(cbook._get_data_path("kpsewhich.lua"))],
1002 stdin=subprocess.PIPE, stdout=subprocess.PIPE)
1003
1004 def search(self, filename):
1005 if self._proc.poll() is not None: # Dead, restart it.
1006 self._proc = self._new_proc()
1007 self._proc.stdin.write(os.fsencode(filename) + b"\n")
1008 self._proc.stdin.flush()
1009 out = self._proc.stdout.readline().rstrip()
1010 return None if out == b"nil" else os.fsdecode(out)
1011
1012
@lru_cache
def find_tex_file(filename):
    """
    Find a file in the texmf tree using kpathsea_.

    The kpathsea library, provided by most existing TeX distributions, both
    on Unix-like systems and on Windows (MikTeX), is invoked via a long-lived
    luatex process if luatex is installed, or via kpsewhich otherwise.

    Successful lookups are memoized by `functools.lru_cache`.

    .. _kpathsea: https://www.tug.org/kpathsea/

    Parameters
    ----------
    filename : str or path-like
        A `bytes` name is also accepted and is decoded as utf-8.

    Returns
    -------
    str
        The path reported by kpathsea.

    Raises
    ------
    FileNotFoundError
        If the file is not found.
    """

    # we expect these to always be ascii encoded, but use utf-8
    # out of caution
    if isinstance(filename, bytes):
        filename = filename.decode('utf-8', errors='replace')

    try:
        luatex = _LuatexKpsewhich()
    except FileNotFoundError:
        # luatex could not be started; fall back to directly calling
        # kpsewhich, as below.
        luatex = None

    if luatex:
        path = luatex.search(filename)
    else:
        if sys.platform == 'win32':
            # On Windows only, kpathsea can use utf-8 for cmd args and output.
            # The `command_line_encoding` environment variable is set to force
            # it to always use utf-8 encoding. See Matplotlib issue #11848.
            run_options = {
                'env': {**os.environ, 'command_line_encoding': 'utf-8'},
                'encoding': 'utf-8',
            }
        else:
            # On POSIX, run through the equivalent of os.fsdecode().
            run_options = {
                'encoding': sys.getfilesystemencoding(),
                'errors': 'surrogateescape',
            }

        try:
            output = cbook._check_and_log_subprocess(
                ['kpsewhich', filename], _log, **run_options)
        except (FileNotFoundError, RuntimeError):
            path = None
        else:
            path = output.rstrip('\n')

    # Both None and an empty reply count as "not found".
    if not path:
        raise FileNotFoundError(
            f"Matplotlib's TeX implementation searched for a file named "
            f"{filename!r} in your texmf tree, but could not find it")
    return path
1070
1071
@lru_cache
def _fontfile(cls, suffix, texname):
    """
    Locate the file named ``texname + suffix`` and load it with *cls*.

    The file is looked up with `find_tex_file` and the resulting path is
    passed to *cls*; results are memoized by `functools.lru_cache`, keyed on
    all three arguments.
    """
    located = find_tex_file(texname + suffix)
    return cls(located)
1075
1076
# Loaders taking a single TeX font name, built on `_fontfile`: `_tfmfile`
# constructs a `Tfm` from the matching ".tfm" file and `_vffile` constructs a
# `Vf` from the matching ".vf" file.
_tfmfile = partial(_fontfile, Tfm, ".tfm")
_vffile = partial(_fontfile, Vf, ".vf")
1079
1080
if __name__ == '__main__':
    # Debugging aid: dump the text and boxes of every page of a dvi file.
    # Usage: python dviread.py FILENAME [DPI]
    from argparse import ArgumentParser
    import itertools

    parser = ArgumentParser()
    parser.add_argument("filename")
    parser.add_argument("dpi", nargs="?", type=float, default=None)
    args = parser.parse_args()
    # Nothing printed below consults the font map, so pdftex.map is no
    # longer looked up here; a missing map must not abort the dump.
    with Dvi(args.filename, args.dpi) as dvi:
        for page in dvi:
            print(f"=== new page === "
                  f"(w: {page.width}, h: {page.height}, d: {page.descent})")
            # Consecutive glyphs sharing a font are listed under one header.
            for font, group in itertools.groupby(
                    page.text, lambda text: text.font):
                print(f"font: {font.texname.decode('latin-1')!r}\t"
                      f"scale: {font._scale / 2 ** 20}")
                print("x", "y", "glyph", "chr", "w", "(glyphs)", sep="\t")
                for text in group:
                    char = chr(text.glyph)
                    # Non-printable code points are shown as ".".
                    print(text.x, text.y, text.glyph,
                          char if char.isprintable() else ".",
                          text.width, sep="\t")
            if page.boxes:
                print("x", "y", "h", "w", "", "(boxes)", sep="\t")
                for box in page.boxes:
                    print(box.x, box.y, box.height, box.width, sep="\t")