Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/utils/pdfinternals.py: 56%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

55 statements  

1from typing import Any, List, Optional, Union 

2 

3from pdfminer.pdftypes import PDFObjRef 

4from pdfminer.psparser import PSLiteral 

5from pdfminer.utils import PDFDocEncoding 

6 

7from .exceptions import MalformedPDFException 

8 

9 

def decode_text(s: Union[bytes, str]) -> str:
    """
    Convert a PDFDocEncoding-encoded value into a Python ``str``.

    Bytes that begin with the UTF-16 big-endian byte-order mark
    (``0xFE 0xFF``) are decoded as UTF-16BE with decoding errors ignored.
    Any other input is translated element by element through the
    ``PDFDocEncoding`` table; if a lookup raises ``IndexError``, the
    plain ``str(s)`` of the input is returned instead.
    """
    if isinstance(s, bytes) and s.startswith(b"\xfe\xff"):
        # Drop the two-byte BOM before decoding the remainder.
        return s[2:].decode("utf-16be", "ignore")

    pieces = []
    try:
        for item in s:
            # Iterating bytes yields ints; iterating str yields characters.
            code = ord(item) if isinstance(item, str) else item
            pieces.append(PDFDocEncoding[code])
    except IndexError:
        return str(s)
    return "".join(pieces)

22 

23 

def resolve_and_decode(obj: Any) -> Any:
    """Recursively resolve and decode metadata values.

    Any object exposing a ``resolve`` attribute is resolved first. Lists
    and dicts are then rebuilt with every element / value processed
    recursively, ``PSLiteral`` names and ``str`` / ``bytes`` values are
    passed through ``decode_text``, and everything else is returned
    unchanged.

    The input containers are never modified: a new list or dict is
    returned, so calling this function repeatedly on the same object
    does not re-decode values that were already decoded.
    """
    if hasattr(obj, "resolve"):
        obj = obj.resolve()

    if isinstance(obj, list):
        return [resolve_and_decode(item) for item in obj]
    if isinstance(obj, dict):
        # Build a fresh dict instead of overwriting the values of the
        # resolved object in place, which would alter the caller's data.
        return {key: resolve_and_decode(value) for key, value in obj.items()}
    if isinstance(obj, PSLiteral):
        return decode_text(obj.name)
    if isinstance(obj, (str, bytes)):
        return decode_text(obj)

    return obj

40 

41 

def decode_psl_list(_list: List[Union[PSLiteral, str]]) -> List[str]:
    """
    Return a new list in which every ``PSLiteral`` entry is replaced by
    its decoded name; all other entries are copied over unchanged.
    """
    decoded = []
    for entry in _list:
        if isinstance(entry, PSLiteral):
            entry = decode_text(entry.name)
        decoded.append(entry)
    return decoded

47 

48 

def resolve(x: Any) -> Any:
    """
    Resolve ``x`` one level if it is a ``PDFObjRef``; otherwise return
    ``x`` itself.
    """
    return x.resolve() if isinstance(x, PDFObjRef) else x

54 

55 

def get_dict_type(d: Any) -> Optional[str]:
    """
    Return the ``"Type"`` entry of ``d``.

    Returns ``None`` when ``d`` is not a dict. A ``PSLiteral`` entry is
    returned as its decoded name; any other entry (including a missing
    one, which yields ``None``) is returned as-is.
    """
    if isinstance(d, dict):
        type_entry = d.get("Type")
        if isinstance(type_entry, PSLiteral):
            return decode_text(type_entry.name)
        return type_entry
    return None

64 

65 

def resolve_all(x: Any) -> Any:
    """
    Resolve ``x`` and, recursively, everything nested inside it.

    ``PDFObjRef`` objects are resolved and their targets processed in
    turn, lists and tuples are rebuilt with the same type, and dicts are
    rebuilt with resolved values. A reference whose target is a dict of
    type ``"Page"`` is returned unresolved, and the ``"Parent"`` entry of
    an ``"Annot"`` dict is copied over without being resolved.

    Raises ``MalformedPDFException`` if resolving a reference's target
    hits Python's recursion limit.
    """
    if isinstance(x, PDFObjRef):
        target = x.resolve()

        # Avoid infinite recursion
        if get_dict_type(target) == "Page":
            return x

        try:
            return resolve_all(target)
        except RecursionError as e:
            raise MalformedPDFException(e)

    if isinstance(x, (list, tuple)):
        return type(x)(resolve_all(item) for item in x)

    if isinstance(x, dict):
        # Keys listed here keep their original, unresolved value.
        untouched = ["Parent"] if get_dict_type(x) == "Annot" else []
        resolved_items = {}
        for key, value in x.items():
            resolved_items[key] = value if key in untouched else resolve_all(value)
        return resolved_items

    return x