Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_font.py: 73%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

26 statements  

1"""Font constants and classes for "layout" mode text operations""" 

2 

3from dataclasses import dataclass, field 

4from typing import Any, Union 

5 

6from ..._codecs import adobe_glyphs 

7from ..._font import FontDescriptor 

8from ...generic import DictionaryObject 

9 

10 

@dataclass
class Font:
    """
    Font record consumed by the "layout" mode text extraction code.

    Attributes:
        subtype (str): font subtype, e.g. "/Type3"
        space_width (int | float): width of a space character
        encoding (str | dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (DictionaryObject): the font dictionary
        font_descriptor (FontDescriptor): not an ``__init__`` argument. Starts
            as a default ``FontDescriptor()`` and is replaced in
            ``__post_init__`` by ``FontDescriptor.from_font_resource(...)``
            when the font is interpretable. Its ``character_widths`` mapping
            is what ``word_width`` reads.
        interpretable (bool): Default True. Switched to False for a Type3
            font that has no '/ToUnicode' entry and whose '/CharProcs' names
            are not all standard adobe glyphs, i.e. when glyphs cannot be
            translated to characters.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, dict[int, str]]
    char_map: dict[Any, Any]
    font_dictionary: DictionaryObject
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """Decide whether the font is interpretable and, if so, load its descriptor."""
        # A Type3 font without "/ToUnicode" can only be turned into character
        # codes reliably when every name in /CharProcs is a standard adobe
        # glyph. See §9.10.2 of the PDF 1.7 standard.
        lacks_unicode_map = (
            self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary
        )
        if lacks_unicode_map:
            glyph_names = self.font_dictionary.get("/CharProcs") or []
            self.interpretable = all(name in adobe_glyphs for name in glyph_names)

        # Building the descriptor is skipped for uninterpretable fonts to
        # save some overhead; they keep the default FontDescriptor().
        if self.interpretable:
            self.font_descriptor = FontDescriptor.from_font_resource(
                self.font_dictionary,
                self.encoding,
                self.char_map,
            )

    def word_width(self, word: str) -> float:
        """
        Return the summed width of every character in ``word``.

        Widths come from ``font_descriptor.character_widths``; a character
        missing from that mapping counts as twice ``space_width``.
        """
        total = 0.0
        for char in word:
            char_width = self.font_descriptor.character_widths.get(
                char, self.space_width * 2
            )
            total += char_width
        return total

    @staticmethod
    def to_dict(font_instance: "Font") -> dict[str, Any]:
        """
        Convert a Font into a plain dict for json.dumps serialization.

        Keys are the dataclass field names; values are the attribute values
        taken as-is from ``font_instance``.
        """
        as_dict: dict[str, Any] = {}
        for field_name in font_instance.__dataclass_fields__:
            as_dict[field_name] = getattr(font_instance, field_name)
        return as_dict