Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/convert.py: 71%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import base64
2from typing import Any, Callable, Dict, List, Optional, Tuple
4from pdfminer.psparser import PSLiteral
6from .utils import decode_text
# Text encodings attempted, in this order, when turning raw bytes into str.
ENCODINGS_TO_TRY = ["utf-8", "latin-1", "utf-16", "utf-16le"]

# Attributes that an include/exclude filter must always keep.
CSV_COLS_REQUIRED = ["object_type"]

# Attribute names listed here are, judging by the constant's name, meant to be
# placed ahead of the remaining columns in CSV output (the consumer is not
# visible in this part of the file).
CSV_COLS_TO_PREPEND = [
    "page_number",
    "x0", "x1", "y0", "y1",
    "doctop", "top", "bottom",
    "width", "height",
]
def get_attr_filter(
    include_attrs: Optional[List[str]] = None, exclude_attrs: Optional[List[str]] = None
) -> Callable[[str], bool]:
    """Build a predicate that says whether an attribute name should be kept.

    Args:
        include_attrs: If given, only these names (plus everything in
            ``CSV_COLS_REQUIRED``) pass the filter.
        exclude_attrs: If given, every name except these passes the filter.

    Returns:
        A callable taking an attribute name and returning ``True`` when the
        attribute should be kept. With neither argument set, it accepts
        every name.

    Raises:
        ValueError: If both arguments are supplied, or if ``exclude_attrs``
            names something listed in ``CSV_COLS_REQUIRED``.
    """
    wants_include = include_attrs is not None
    wants_exclude = exclude_attrs is not None

    # The two modes are mutually exclusive.
    if wants_include and wants_exclude:
        raise ValueError(
            "Cannot specify `include_attrs` and `exclude_attrs` at the same time."
        )

    if wants_include:
        # Required columns are always allowed, even when not asked for.
        allowed = set(CSV_COLS_REQUIRED + include_attrs)
        return lambda attr: attr in allowed

    if wants_exclude:
        blocked = set(exclude_attrs)
        protected = blocked.intersection(set(CSV_COLS_REQUIRED))
        if protected:
            raise ValueError(
                f"Cannot exclude these required properties: {list(protected)}"
            )
        return lambda attr: attr not in blocked

    # No filtering requested: keep everything.
    return lambda attr: True
def to_b64(data: bytes) -> str:
    """Return the standard Base64 encoding of ``data`` as an ASCII string."""
    encoded = base64.b64encode(data)
    return str(encoded, "ascii")
class Serializer:
    """Convert values into plain Python data by dispatching on their type.

    ``serialize`` looks up a method named ``do_<TypeName>`` for the exact
    type of the value it is given; values with no matching method are
    converted with ``str()``.
    """

    def __init__(
        self,
        precision: Optional[int] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
    ):
        """Configure float rounding and attribute filtering.

        Args:
            precision: Number of decimal places floats are rounded to;
                ``None`` leaves floats untouched.
            include_attrs: Passed to ``get_attr_filter``.
            exclude_attrs: Passed to ``get_attr_filter``.

        Raises:
            ValueError: Propagated from ``get_attr_filter`` for invalid
                include/exclude combinations.
        """
        self.precision = precision
        self.attr_filter = get_attr_filter(
            include_attrs=include_attrs, exclude_attrs=exclude_attrs
        )

    def serialize(self, obj: Any) -> Any:
        """Return a serialized form of ``obj``.

        ``None``, ``int`` and ``str`` values are returned unchanged. Other
        values go through the matching ``do_<TypeName>`` method when one
        exists, and through ``str()`` otherwise.
        """
        if obj is None:
            return None

        # Dispatch on the exact type, so subclasses are not treated as their
        # base type (e.g. ``bool`` goes to ``do_bool`` and is not passed
        # through as an ``int``).
        t = type(obj)

        # Basic types don't need to be converted
        if t in (int, str):
            return obj

        # Use one of the custom converters, if possible
        fn = getattr(self, f"do_{t.__name__}", None)
        if fn is not None:
            return fn(obj)

        # Otherwise, just use the string-representation
        return str(obj)

    def do_float(self, x: float) -> float:
        """Round ``x`` to ``self.precision`` places, if a precision is set."""
        return x if self.precision is None else round(x, self.precision)

    def do_bool(self, x: bool) -> int:
        """Represent a boolean as ``0`` or ``1``."""
        return int(x)

    def do_list(self, obj: List[Any]) -> List[Any]:
        """Serialize every element of a list, returning a new list."""
        return [self.serialize(x) for x in obj]

    def do_tuple(self, obj: Tuple[Any, ...]) -> Tuple[Any, ...]:
        """Serialize every element of a tuple, returning a new tuple."""
        return tuple(self.serialize(x) for x in obj)

    def do_dict(self, obj: Dict[str, Any]) -> Dict[str, Any]:
        """Serialize the values of a dict.

        The attribute filter is applied only to dicts that carry an
        ``"object_type"`` key; any other dict keeps all of its keys.
        """
        if "object_type" in obj:
            return {k: self.serialize(v) for k, v in obj.items() if self.attr_filter(k)}
        return {k: self.serialize(v) for k, v in obj.items()}

    def do_PDFStream(self, obj: Any) -> Dict[str, Optional[str]]:
        """Return the stream's ``rawdata`` Base64-encoded, or ``None`` if empty."""
        return {"rawdata": to_b64(obj.rawdata) if obj.rawdata else None}

    def do_PSLiteral(self, obj: "PSLiteral") -> str:
        """Return the literal's ``name`` passed through ``decode_text``."""
        return decode_text(obj.name)

    def do_bytes(self, obj: bytes) -> Optional[str]:
        """Decode ``obj`` with the first encoding in ``ENCODINGS_TO_TRY`` that works.

        Each encoding is tried in order. A failed attempt moves on to the
        next encoding rather than giving up, so the later entries of
        ``ENCODINGS_TO_TRY`` are actually used.

        Raises:
            UnicodeDecodeError: If no encoding can decode ``obj``. With
                ``"latin-1"`` in the list this should not happen, because
                latin-1 can decode any byte sequence.
        """
        for encoding in ENCODINGS_TO_TRY:
            try:
                return obj.decode(encoding)
            except UnicodeDecodeError:  # pragma: no cover
                # Fall through to the next candidate encoding.
                continue
        # If none of the decodings work, raise whatever error
        # decoding with utf-8 causes
        obj.decode(ENCODINGS_TO_TRY[0])  # pragma: no cover
        return None  # pragma: no cover