Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/convert.py: 71%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

56 statements  

1import base64 

2from typing import Any, Callable, Dict, List, Optional, Tuple 

3 

4from pdfminer.psparser import PSLiteral 

5 

6from .utils import decode_text 

7 

# Text encodings attempted, in this order, when converting raw ``bytes``
# values to ``str`` (see ``Serializer.do_bytes``).
ENCODINGS_TO_TRY = [
    "utf-8",
    "latin-1",
    "utf-16",
    "utf-16le",
]

# Attributes that ``get_attr_filter`` always keeps: they are added to any
# include-list and may not appear in an exclude-list.
CSV_COLS_REQUIRED = [
    "object_type",
]

# Column names meant to be placed first in CSV output, judging by the name.
# NOTE(review): the consumer of this list is not visible in this module --
# confirm against the CSV-writing code.
CSV_COLS_TO_PREPEND = [
    "page_number",
    "x0",
    "x1",
    "y0",
    "y1",
    "doctop",
    "top",
    "bottom",
    "width",
    "height",
]

31 

32 

def get_attr_filter(
    include_attrs: Optional[List[str]] = None, exclude_attrs: Optional[List[str]] = None
) -> Callable[[str], bool]:
    """Build a predicate that decides whether an attribute name is kept.

    Args:
        include_attrs: If given, only these attributes (plus everything in
            ``CSV_COLS_REQUIRED``) pass the filter. Any iterable of strings
            is accepted.
        exclude_attrs: If given, every attribute except these passes the
            filter. Any iterable of strings is accepted; it is consumed
            exactly once.

    Returns:
        A callable taking an attribute name and returning ``True`` when the
        attribute should be kept. With neither argument, everything is kept.

    Raises:
        ValueError: If both arguments are supplied, or if ``exclude_attrs``
            names an attribute listed in ``CSV_COLS_REQUIRED``.
    """
    if include_attrs is not None and exclude_attrs is not None:
        raise ValueError(
            "Cannot specify `include_attrs` and `exclude_attrs` at the same time."
        )

    if include_attrs is not None:
        # Set union (rather than list concatenation) so that tuples and other
        # iterables work as well as lists.
        incl = set(CSV_COLS_REQUIRED).union(include_attrs)
        return lambda attr: attr in incl

    if exclude_attrs is not None:
        # Materialize once so one-shot iterables are not exhausted before the
        # filter is built.
        excl = set(exclude_attrs)
        nonexcludable = excl.intersection(CSV_COLS_REQUIRED)
        if nonexcludable:
            # Sorted so the message does not depend on set iteration order.
            raise ValueError(
                f"Cannot exclude these required properties: {sorted(nonexcludable)}"
            )
        return lambda attr: attr not in excl

    return lambda attr: True

56 

57 

def to_b64(data: bytes) -> str:
    """Return the Base64 encoding of ``data`` as an ASCII ``str``."""
    return base64.b64encode(data).decode("ascii")

60 

61 

class Serializer:
    """Convert arbitrary (possibly nested) values into plain Python data.

    ``serialize`` dispatches on the exact type of its argument: ``None``,
    ``int`` and ``str`` pass through unchanged, types with a matching
    ``do_<TypeName>`` method are converted by that method, and everything
    else is turned into its ``str()`` representation.
    """

    def __init__(
        self,
        precision: Optional[int] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
    ):
        """Configure float rounding and attribute filtering.

        Args:
            precision: Number of digits floats are rounded to; ``None``
                leaves floats untouched.
            include_attrs: Passed to ``get_attr_filter``.
            exclude_attrs: Passed to ``get_attr_filter``.

        Raises:
            ValueError: Propagated from ``get_attr_filter`` for invalid
                include/exclude combinations.
        """
        self.precision = precision
        self.attr_filter = get_attr_filter(
            include_attrs=include_attrs, exclude_attrs=exclude_attrs
        )

    def serialize(self, obj: Any) -> Any:
        """Return a converted copy of ``obj`` (see the class docstring)."""
        if obj is None:
            return None

        # Exact type, not isinstance: ``bool`` is a subclass of ``int`` but
        # must reach ``do_bool`` rather than pass through as-is.
        t = type(obj)

        # Basic types don't need to be converted
        if t in (int, str):
            return obj

        # Use one of the custom converters, if possible
        fn = getattr(self, f"do_{t.__name__}", None)
        if fn is not None:
            return fn(obj)

        # Otherwise, just use the string-representation
        return str(obj)

    def do_float(self, x: float) -> float:
        """Round ``x`` to ``self.precision`` digits, if a precision is set."""
        return x if self.precision is None else round(x, self.precision)

    def do_bool(self, x: bool) -> int:
        """Represent booleans as ``0`` / ``1``."""
        return int(x)

    def do_list(self, obj: List[Any]) -> List[Any]:
        """Serialize every element, returning a new list."""
        return [self.serialize(x) for x in obj]

    def do_tuple(self, obj: Tuple[Any, ...]) -> Tuple[Any, ...]:
        """Serialize every element, returning a new tuple."""
        return tuple(self.serialize(x) for x in obj)

    def do_dict(self, obj: Dict[str, Any]) -> Dict[str, Any]:
        """Serialize every value, returning a new dict.

        The attribute filter is applied only to dicts that carry an
        ``"object_type"`` key; all other dicts keep every key.
        """
        if "object_type" in obj:
            return {k: self.serialize(v) for k, v in obj.items() if self.attr_filter(k)}
        return {k: self.serialize(v) for k, v in obj.items()}

    def do_PDFStream(self, obj: Any) -> Dict[str, Optional[str]]:
        """Return ``{"rawdata": <base64 or None>}`` for a stream-like object.

        ``obj.rawdata`` is Base64-encoded when truthy; otherwise the value
        is ``None``.
        """
        # NOTE(review): presumably a pdfminer ``PDFStream`` -- only the
        # ``rawdata`` attribute is relied on here.
        return {"rawdata": to_b64(obj.rawdata) if obj.rawdata else None}

    def do_PSLiteral(self, obj: PSLiteral) -> str:
        """Return the literal's name decoded via ``decode_text``."""
        return decode_text(obj.name)

    def do_bytes(self, obj: bytes) -> Optional[str]:
        """Decode ``obj`` using the first encoding in ``ENCODINGS_TO_TRY`` that works.

        Encodings are tried in list order; a failed attempt moves on to the
        next candidate instead of giving up after the first one.

        Raises:
            UnicodeDecodeError: If every candidate encoding fails, the error
                from the first encoding is raised.
        """
        for e in ENCODINGS_TO_TRY:
            try:
                return obj.decode(e)
            except UnicodeDecodeError:  # pragma: no cover
                # Move on to the next candidate; returning here would mean
                # only the first encoding is ever attempted.
                continue
        # If none of the decodings work, raise whatever error
        # decoding with utf-8 causes
        obj.decode(ENCODINGS_TO_TRY[0])  # pragma: no cover
        return None  # pragma: no cover