Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/convert.py: 71%

1import base64

2from typing import Any, Callable, Dict, List, Optional, Tuple

4from pdfminer.psparser import PSLiteral

6from .utils import decode_text

# Codecs attempted, in this order, when raw ``bytes`` values are turned into text.
ENCODINGS_TO_TRY = ["utf-8", "latin-1", "utf-16", "utf-16le"]

# Attribute names that are always kept by an include-list and that may never
# appear in an exclude-list.
CSV_COLS_REQUIRED = ["object_type"]

# NOTE(review): not referenced in this part of the file; presumably the
# columns placed first in CSV output -- confirm against the caller.
CSV_COLS_TO_PREPEND = [
    "page_number",
    "x0", "x1",
    "y0", "y1",
    "doctop", "top", "bottom",
    "width", "height",
]

def get_attr_filter(
    include_attrs: Optional[List[str]] = None, exclude_attrs: Optional[List[str]] = None
) -> Callable[[str], bool]:
    """Build a predicate that decides whether an attribute name is kept.

    Args:
        include_attrs: If given, only these names (plus everything in
            ``CSV_COLS_REQUIRED``) are accepted.
        exclude_attrs: If given, every name except these is accepted.

    Returns:
        A callable taking an attribute name and returning ``True`` when the
        attribute should be kept. With neither argument, everything is kept.

    Raises:
        ValueError: If both arguments are supplied, or if ``exclude_attrs``
            names an entry of ``CSV_COLS_REQUIRED``.
    """
    # The two modes are mutually exclusive.
    if include_attrs is not None and exclude_attrs is not None:
        raise ValueError(
            "Cannot specify `include_attrs` and `exclude_attrs` at the same time."
        )

    if include_attrs is not None:
        allowed = set(CSV_COLS_REQUIRED + include_attrs)

        def keep_if_allowed(attr: str) -> bool:
            return attr in allowed

        return keep_if_allowed

    if exclude_attrs is None:

        def keep_everything(attr: str) -> bool:
            return True

        return keep_everything

    # Required columns must survive filtering, so refuse to exclude them.
    required_hits = set(exclude_attrs).intersection(set(CSV_COLS_REQUIRED))
    if len(required_hits):
        raise ValueError(
            f"Cannot exclude these required properties: {list(required_hits)}"
        )
    blocked = set(exclude_attrs)

    def keep_unless_blocked(attr: str) -> bool:
        return attr not in blocked

    return keep_unless_blocked

def to_b64(data: bytes) -> str:
    """Return the Base64 encoding of ``data`` as an ASCII text string."""
    encoded = base64.b64encode(data)
    return str(encoded, "ascii")

class Serializer:
    """Convert parsed objects into plain, serialization-friendly values.

    ``serialize`` dispatches on the *exact* type of its argument: ``None``,
    ``int`` and ``str`` pass through unchanged, a type named ``T`` is handled
    by a ``do_T`` method when one exists, and anything else is converted with
    ``str()``.
    """

    def __init__(
        self,
        precision: Optional[int] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
    ):
        """Configure float rounding and attribute filtering.

        Args:
            precision: Number of digits floats are rounded to; ``None``
                leaves floats untouched.
            include_attrs: Forwarded to ``get_attr_filter``.
            exclude_attrs: Forwarded to ``get_attr_filter``.

        Raises:
            ValueError: Propagated from ``get_attr_filter`` for invalid
                include/exclude combinations.
        """
        self.precision = precision
        self.attr_filter = get_attr_filter(
            include_attrs=include_attrs, exclude_attrs=exclude_attrs
        )

    def serialize(self, obj: Any) -> Any:
        """Return a serializable representation of ``obj``."""
        if obj is None:
            return None

        # Dispatch on the exact type (not isinstance) so that, for example,
        # ``bool`` reaches ``do_bool`` instead of being treated as ``int``.
        t = type(obj)

        # Basic types don't need to be converted
        if t in (int, str):
            return obj

        # Use one of the custom converters, if possible
        fn = getattr(self, f"do_{t.__name__}", None)
        if fn is not None:
            return fn(obj)

        # Otherwise, just use the string-representation
        return str(obj)

    def do_float(self, x: float) -> float:
        """Round ``x`` to ``self.precision`` digits, if a precision is set."""
        return x if self.precision is None else round(x, self.precision)

    def do_bool(self, x: bool) -> int:
        """Represent booleans as ``0`` / ``1``."""
        return int(x)

    def do_list(self, obj: List[Any]) -> List[Any]:
        """Serialize every element, preserving order."""
        return [self.serialize(x) for x in obj]

    def do_tuple(self, obj: Tuple[Any, ...]) -> Tuple[Any, ...]:
        """Serialize every element, returning a tuple of the same length."""
        return tuple(self.serialize(x) for x in obj)

    def do_dict(self, obj: Dict[str, Any]) -> Dict[str, Any]:
        """Serialize the values of ``obj``.

        Dicts that carry an ``"object_type"`` key additionally have their
        keys filtered through ``self.attr_filter``; other dicts keep all keys.
        """
        if "object_type" in obj:
            return {k: self.serialize(v) for k, v in obj.items() if self.attr_filter(k)}
        return {k: self.serialize(v) for k, v in obj.items()}

    def do_PDFStream(self, obj: Any) -> Dict[str, Optional[str]]:
        """Expose a stream's ``rawdata`` as Base64 text (``None`` if empty)."""
        return {"rawdata": to_b64(obj.rawdata) if obj.rawdata else None}

    def do_PSLiteral(self, obj: PSLiteral) -> str:
        """Return the literal's ``name`` passed through ``decode_text``."""
        return decode_text(obj.name)

    def do_bytes(self, obj: bytes) -> Optional[str]:
        """Decode ``obj`` using the first encoding in ``ENCODINGS_TO_TRY`` that works.

        Raises:
            UnicodeDecodeError: If every candidate encoding fails; the error
                raised is the one produced by the first encoding in the list.
        """
        for encoding in ENCODINGS_TO_TRY:
            try:
                return obj.decode(encoding)
            except UnicodeDecodeError:  # pragma: no cover
                # Move on to the next candidate. Returning here would stop
                # after the first failure and leave the rest of the list
                # untried.
                continue
        # If none of the decodings work, raise whatever error
        # decoding with utf-8 causes
        obj.decode(ENCODINGS_TO_TRY[0])  # pragma: no cover
        return None  # pragma: no cover

127 return None # pragma: no cover