1"""
2Page labels are shown by PDF viewers as "the page number".
3
4A page has a numeric index, starting at 0. Additionally, the page
5has a label. In the most simple case:
6
7 label = index + 1
8
9However, the title page and the table of contents might have Roman numerals as
10page labels. This makes things more complicated.
11
12Example 1
13---------
14
15>>> reader.root_object["/PageLabels"]["/Nums"]
16[0, IndirectObject(18, 0, 139929798197504),
17 8, IndirectObject(19, 0, 139929798197504)]
18>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
19{'/S': '/r'}
20>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
21{'/S': '/D'}
22
23Example 2
24---------
25The following is a document with pages labeled
26i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
27
281 0 obj
29 << /Type /Catalog
30 /PageLabels << /Nums [
31 0 << /S /r >>
32 4 << /S /D >>
33 7 << /S /D
34 /P ( A- )
35 /St 8
36 >>
37 % A number tree containing
38 % three page label dictionaries
39 ]
40 >>
41 ...
42 >>
43endobj
44
45
46§12.4.2 PDF Specification 1.7 and 2.0
47=====================================
48
49Entries in a page label dictionary
50----------------------------------
51The /S key:
52D Decimal Arabic numerals
53R Uppercase Roman numerals
54r Lowercase Roman numerals
55A Uppercase letters (A to Z for the first 26 pages,
56 AA to ZZ for the next 26, and so on)
57a Lowercase letters (a to z for the first 26 pages,
58 aa to zz for the next 26, and so on)
59"""
60
61from typing import Iterator, List, Optional, Tuple, cast
62
63from ._protocols import PdfCommonDocProtocol
64from ._utils import logger_warning
65from .generic import (
66 ArrayObject,
67 DictionaryObject,
68 NullObject,
69 NumberObject,
70 is_null_or_none,
71)
72
73
74def number2uppercase_roman_numeral(num: int) -> str:
75 roman = [
76 (1000, "M"),
77 (900, "CM"),
78 (500, "D"),
79 (400, "CD"),
80 (100, "C"),
81 (90, "XC"),
82 (50, "L"),
83 (40, "XL"),
84 (10, "X"),
85 (9, "IX"),
86 (5, "V"),
87 (4, "IV"),
88 (1, "I"),
89 ]
90
91 def roman_num(num: int) -> Iterator[str]:
92 for decimal, roman_repr in roman:
93 x, _ = divmod(num, decimal)
94 yield roman_repr * x
95 num -= decimal * x
96 if num <= 0:
97 break
98
99 return "".join(list(roman_num(num)))
100
101
102def number2lowercase_roman_numeral(number: int) -> str:
103 return number2uppercase_roman_numeral(number).lower()
104
105
106def number2uppercase_letter(number: int) -> str:
107 if number <= 0:
108 raise ValueError("Expecting a positive number")
109 alphabet = [chr(i) for i in range(ord("A"), ord("Z") + 1)]
110 rep = ""
111 while number > 0:
112 remainder = number % 26
113 if remainder == 0:
114 remainder = 26
115 rep = alphabet[remainder - 1] + rep
116 # update
117 number -= remainder
118 number = number // 26
119 return rep
120
121
122def number2lowercase_letter(number: int) -> str:
123 return number2uppercase_letter(number).lower()
124
125
126def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
127 # [Nums] shall be an array of the form
128 # [ key_1 value_1 key_2 value_2 ... key_n value_n ]
129 # where each key_i is an integer and the corresponding
130 # value_i shall be the object associated with that key.
131 # The keys shall be sorted in numerical order,
132 # analogously to the arrangement of keys in a name tree
133 # as described in 7.9.6, "Name Trees."
134 nums = cast(ArrayObject, dictionary_object["/Nums"])
135 i = 0
136 value = None
137 start_index = 0
138 while i < len(nums):
139 start_index = nums[i]
140 value = nums[i + 1].get_object()
141 if i + 2 == len(nums):
142 break
143 if nums[i + 2] > index:
144 break
145 i += 2
146 m = {
147 None: lambda _: "",
148 "/D": lambda n: str(n),
149 "/R": number2uppercase_roman_numeral,
150 "/r": number2lowercase_roman_numeral,
151 "/A": number2uppercase_letter,
152 "/a": number2lowercase_letter,
153 }
154 # if /Nums array is not following the specification or if /Nums is empty
155 if not isinstance(value, dict):
156 return str(index + 1) # Fallback
157 start = value.get("/St", 1)
158 prefix = value.get("/P", "")
159 return prefix + m[value.get("/S")](index - start_index + start)
160
161
162def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
163 """
164 See 7.9.7 "Number Trees".
165
166 Args:
167 reader: The PdfReader
168 index: The index of the page
169
170 Returns:
171 The label of the page, e.g. "iv" or "4".
172
173 """
174 root = cast(DictionaryObject, reader.root_object)
175 if "/PageLabels" not in root:
176 return str(index + 1) # Fallback
177 number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
178 if "/Nums" in number_tree:
179 return get_label_from_nums(number_tree, index)
180 if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
181 # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
182 # Limit maximum depth.
183 level = 0
184 while level < 100:
185 kids = cast(List[DictionaryObject], number_tree["/Kids"])
186 for kid in kids:
187 # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
188 limits = cast(List[int], kid["/Limits"])
189 if limits[0] <= index <= limits[1]:
190 if not is_null_or_none(kid.get("/Kids", None)):
191 # Recursive definition.
192 level += 1
193 if level == 100: # pragma: no cover
194 raise NotImplementedError(
195 "Too deep nesting is not supported."
196 )
197 number_tree = kid
198 # Exit the inner `for` loop and continue at the next level with the
199 # next iteration of the `while` loop.
200 break
201 return get_label_from_nums(kid, index)
202 else:
203 # When there are no kids, make sure to exit the `while` loop directly
204 # and continue with the fallback.
205 break
206
207 logger_warning(f"Could not reliably determine page label for {index}.", __name__)
208 return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree
209
210
211def nums_insert(
212 key: NumberObject,
213 value: DictionaryObject,
214 nums: ArrayObject,
215) -> None:
216 """
217 Insert a key, value pair in a Nums array.
218
219 See 7.9.7 "Number Trees".
220
221 Args:
222 key: number key of the entry
223 value: value of the entry
224 nums: Nums array to modify
225
226 """
227 if len(nums) % 2 != 0:
228 raise ValueError("A nums like array must have an even number of elements")
229
230 i = len(nums)
231 while i != 0 and key <= nums[i - 2]:
232 i = i - 2
233
234 if i < len(nums) and key == nums[i]:
235 nums[i + 1] = value
236 else:
237 nums.insert(i, key)
238 nums.insert(i + 1, value)
239
240
241def nums_clear_range(
242 key: NumberObject,
243 page_index_to: int,
244 nums: ArrayObject,
245) -> None:
246 """
247 Remove all entries in a number tree in a range after an entry.
248
249 See 7.9.7 "Number Trees".
250
251 Args:
252 key: number key of the entry before the range
253 page_index_to: The page index of the upper limit of the range
254 nums: Nums array to modify
255
256 """
257 if len(nums) % 2 != 0:
258 raise ValueError("A nums like array must have an even number of elements")
259 if page_index_to < key:
260 raise ValueError("page_index_to must be greater or equal than key")
261
262 i = nums.index(key) + 2
263 while i < len(nums) and nums[i] <= page_index_to:
264 nums.pop(i)
265 nums.pop(i)
266
267
268def nums_next(
269 key: NumberObject,
270 nums: ArrayObject,
271) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
272 """
273 Return the (key, value) pair of the entry after the given one.
274
275 See 7.9.7 "Number Trees".
276
277 Args:
278 key: number key of the entry
279 nums: Nums array
280
281 """
282 if len(nums) % 2 != 0:
283 raise ValueError("A nums like array must have an even number of elements")
284
285 i = nums.index(key) + 2
286 if i < len(nums):
287 return (nums[i], nums[i + 1])
288 return (None, None)