1from typing import Any, List, Optional, Union
2
3from pdfminer.pdftypes import PDFObjRef
4from pdfminer.psparser import PSLiteral
5from pdfminer.utils import PDFDocEncoding
6
7from .exceptions import MalformedPDFException
8
9
def decode_text(s: Union[bytes, str]) -> str:
    """
    Convert a PDF text string (bytes or str) into a Python str.

    Bytes that begin with the UTF-16BE byte-order mark (FE FF) are decoded
    as UTF-16BE, ignoring undecodable sequences. Anything else is mapped
    element by element through the PDFDocEncoding lookup table.
    """
    has_utf16_bom = isinstance(s, bytes) and s.startswith(b"\xfe\xff")
    if has_utf16_bom:
        # Drop the two-byte BOM before decoding.
        return s[2:].decode("utf-16be", "ignore")

    decoded = []
    try:
        for item in s:
            # Iterating bytes yields ints; iterating str yields characters.
            code = ord(item) if isinstance(item, str) else item
            decoded.append(PDFDocEncoding[code])
    except IndexError:
        # A code point outside the table: fall back to the plain str() form.
        return str(s)
    return "".join(decoded)
22
23
def resolve_and_decode(obj: Any) -> Any:
    """
    Recursively resolve and decode metadata values.

    Any object exposing a ``resolve`` attribute is first replaced by the
    result of calling it. Then:

    - lists are rebuilt with every element processed recursively;
    - dicts are rebuilt with every value processed recursively (keys are
      kept as they are);
    - ``PSLiteral`` instances are replaced by their decoded ``name``;
    - ``str`` / ``bytes`` values are passed through ``decode_text``;
    - anything else is returned unchanged.

    The input is never modified: lists and dicts are returned as new
    containers.
    """
    if hasattr(obj, "resolve"):
        obj = obj.resolve()

    if isinstance(obj, list):
        return [resolve_and_decode(item) for item in obj]

    if isinstance(obj, dict):
        # Build a fresh dict instead of overwriting the values in place.
        # Writing back into the source dict would alter the caller's (or the
        # resolved) object, and a later pass over that same object would feed
        # already-decoded strings through decode_text again.
        return {key: resolve_and_decode(value) for key, value in obj.items()}

    if isinstance(obj, PSLiteral):
        return decode_text(obj.name)

    if isinstance(obj, (str, bytes)):
        return decode_text(obj)

    return obj
40
41
42def decode_psl_list(_list: List[Union[PSLiteral, str]]) -> List[str]:
43 return [
44 decode_text(value.name) if isinstance(value, PSLiteral) else value
45 for value in _list
46 ]
47
48
def resolve(x: Any) -> Any:
    """
    Resolve a single PDFObjRef; return any other value unchanged.

    Only one level is resolved: the result of ``x.resolve()`` is returned
    as-is, without inspecting its contents.
    """
    return x.resolve() if isinstance(x, PDFObjRef) else x
54
55
def get_dict_type(d: Any) -> Optional[str]:
    """
    Return the value stored under the "Type" key of a dict.

    Returns None when ``d`` is not a dict or has no "Type" entry. A
    PSLiteral entry is returned as its decoded name; any other entry is
    returned as stored.
    """
    if isinstance(d, dict):
        type_entry = d.get("Type")
        if isinstance(type_entry, PSLiteral):
            return decode_text(type_entry.name)
        return type_entry
    return None
64
65
def resolve_all(x: Any) -> Any:
    """
    Resolve an object and everything nested inside it.

    - A PDFObjRef is resolved and the result is processed recursively,
      except that a reference to a dict whose type is "Page" is returned
      unresolved.
    - Lists and tuples are rebuilt (same container type) from their
      recursively processed items.
    - Dicts are rebuilt with recursively processed values; for a dict whose
      type is "Annot", the "Parent" value is kept as-is.
    - Any other value is returned unchanged.

    Raises MalformedPDFException if following a reference exhausts the
    recursion limit.
    """
    if isinstance(x, PDFObjRef):
        target = x.resolve()

        # Leave references to Page dicts unresolved to avoid infinite
        # recursion.
        if get_dict_type(target) == "Page":
            return x

        try:
            return resolve_all(target)
        except RecursionError as e:
            raise MalformedPDFException(e)

    if isinstance(x, (list, tuple)):
        container = type(x)
        return container(resolve_all(item) for item in x)

    if isinstance(x, dict):
        # Keys listed here keep their original (unresolved) value.
        skipped = ["Parent"] if get_dict_type(x) == "Annot" else []
        resolved_items = {}
        for key, value in x.items():
            resolved_items[key] = value if key in skipped else resolve_all(value)
        return resolved_items

    return x