1"""
2Page labels are shown by PDF viewers as "the page number".
3
4A page has a numeric index, starting at 0. Additionally, the page
5has a label. In the most simple case:
6
7 label = index + 1
8
9However, the title page and the table of contents might have Roman numerals as
10page labels. This makes things more complicated.
11
12Example 1
13---------
14
15>>> reader.root_object["/PageLabels"]["/Nums"]
16[0, IndirectObject(18, 0, 139929798197504),
17 8, IndirectObject(19, 0, 139929798197504)]
18>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
19{'/S': '/r'}
20>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
21{'/S': '/D'}
22
23Example 2
24---------
25The following is a document with pages labeled
26i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
27
281 0 obj
29 << /Type /Catalog
30 /PageLabels << /Nums [
31 0 << /S /r >>
32 4 << /S /D >>
33 7 << /S /D
34 /P ( A- )
35 /St 8
36 >>
37 % A number tree containing
38 % three page label dictionaries
39 ]
40 >>
41 ...
42 >>
43endobj
44
45
46§12.4.2 PDF Specification 1.7 and 2.0
47=====================================
48
49Entries in a page label dictionary
50----------------------------------
51The /S key:
52D Decimal Arabic numerals
53R Uppercase Roman numerals
54r Lowercase Roman numerals
55A Uppercase letters (A to Z for the first 26 pages,
56 AA to ZZ for the next 26, and so on)
57a Lowercase letters (a to z for the first 26 pages,
58 aa to zz for the next 26, and so on)
59"""
60
61from collections.abc import Iterator
62from typing import Optional, cast
63
64from ._protocols import PdfCommonDocProtocol
65from ._utils import logger_warning
66from .generic import (
67 ArrayObject,
68 DictionaryObject,
69 NullObject,
70 NumberObject,
71 is_null_or_none,
72)
73
74
75def number2uppercase_roman_numeral(num: int) -> str:
76 roman = [
77 (1000, "M"),
78 (900, "CM"),
79 (500, "D"),
80 (400, "CD"),
81 (100, "C"),
82 (90, "XC"),
83 (50, "L"),
84 (40, "XL"),
85 (10, "X"),
86 (9, "IX"),
87 (5, "V"),
88 (4, "IV"),
89 (1, "I"),
90 ]
91
92 def roman_num(num: int) -> Iterator[str]:
93 for decimal, roman_repr in roman:
94 x, _ = divmod(num, decimal)
95 yield roman_repr * x
96 num -= decimal * x
97 if num <= 0:
98 break
99
100 return "".join(list(roman_num(num)))
101
102
103def number2lowercase_roman_numeral(number: int) -> str:
104 return number2uppercase_roman_numeral(number).lower()
105
106
107def number2uppercase_letter(number: int) -> str:
108 if number <= 0:
109 raise ValueError("Expecting a positive number")
110 alphabet = [chr(i) for i in range(ord("A"), ord("Z") + 1)]
111 rep = ""
112 while number > 0:
113 remainder = number % 26
114 if remainder == 0:
115 remainder = 26
116 rep = alphabet[remainder - 1] + rep
117 # update
118 number -= remainder
119 number = number // 26
120 return rep
121
122
123def number2lowercase_letter(number: int) -> str:
124 return number2uppercase_letter(number).lower()
125
126
127def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
128 # [Nums] shall be an array of the form
129 # [ key_1 value_1 key_2 value_2 ... key_n value_n ]
130 # where each key_i is an integer and the corresponding
131 # value_i shall be the object associated with that key.
132 # The keys shall be sorted in numerical order,
133 # analogously to the arrangement of keys in a name tree
134 # as described in 7.9.6, "Name Trees."
135 nums = cast(ArrayObject, dictionary_object["/Nums"])
136 i = 0
137 value = None
138 start_index = 0
139 while i < len(nums):
140 start_index = nums[i]
141 value = nums[i + 1].get_object()
142 if i + 2 == len(nums):
143 break
144 if nums[i + 2] > index:
145 break
146 i += 2
147 m = {
148 None: lambda _: "",
149 "/D": lambda n: str(n),
150 "/R": number2uppercase_roman_numeral,
151 "/r": number2lowercase_roman_numeral,
152 "/A": number2uppercase_letter,
153 "/a": number2lowercase_letter,
154 }
155 # if /Nums array is not following the specification or if /Nums is empty
156 if not isinstance(value, dict):
157 return str(index + 1) # Fallback
158 start = value.get("/St", 1)
159 prefix = value.get("/P", "")
160 return prefix + m[value.get("/S")](index - start_index + start)
161
162
163def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
164 """
165 See 7.9.7 "Number Trees".
166
167 Args:
168 reader: The PdfReader
169 index: The index of the page
170
171 Returns:
172 The label of the page, e.g. "iv" or "4".
173
174 """
175 root = cast(DictionaryObject, reader.root_object)
176 if "/PageLabels" not in root:
177 return str(index + 1) # Fallback
178 number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
179 if "/Nums" in number_tree:
180 return get_label_from_nums(number_tree, index)
181 if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
182 # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
183 # Limit maximum depth.
184 level = 0
185 while level < 100:
186 kids = cast(list[DictionaryObject], number_tree["/Kids"])
187 for kid in kids:
188 # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
189 limits = cast(list[int], kid["/Limits"])
190 if limits[0] <= index <= limits[1]:
191 if not is_null_or_none(kid.get("/Kids", None)):
192 # Recursive definition.
193 level += 1
194 if level == 100: # pragma: no cover
195 raise NotImplementedError(
196 "Too deep nesting is not supported."
197 )
198 number_tree = kid
199 # Exit the inner `for` loop and continue at the next level with the
200 # next iteration of the `while` loop.
201 break
202 return get_label_from_nums(kid, index)
203 else:
204 # When there are no kids, make sure to exit the `while` loop directly
205 # and continue with the fallback.
206 break
207
208 logger_warning(f"Could not reliably determine page label for {index}.", __name__)
209 return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree
210
211
212def nums_insert(
213 key: NumberObject,
214 value: DictionaryObject,
215 nums: ArrayObject,
216) -> None:
217 """
218 Insert a key, value pair in a Nums array.
219
220 See 7.9.7 "Number Trees".
221
222 Args:
223 key: number key of the entry
224 value: value of the entry
225 nums: Nums array to modify
226
227 """
228 if len(nums) % 2 != 0:
229 raise ValueError("A nums like array must have an even number of elements")
230
231 i = len(nums)
232 while i != 0 and key <= nums[i - 2]:
233 i = i - 2
234
235 if i < len(nums) and key == nums[i]:
236 nums[i + 1] = value
237 else:
238 nums.insert(i, key)
239 nums.insert(i + 1, value)
240
241
242def nums_clear_range(
243 key: NumberObject,
244 page_index_to: int,
245 nums: ArrayObject,
246) -> None:
247 """
248 Remove all entries in a number tree in a range after an entry.
249
250 See 7.9.7 "Number Trees".
251
252 Args:
253 key: number key of the entry before the range
254 page_index_to: The page index of the upper limit of the range
255 nums: Nums array to modify
256
257 """
258 if len(nums) % 2 != 0:
259 raise ValueError("A nums like array must have an even number of elements")
260 if page_index_to < key:
261 raise ValueError("page_index_to must be greater or equal than key")
262
263 i = nums.index(key) + 2
264 while i < len(nums) and nums[i] <= page_index_to:
265 nums.pop(i)
266 nums.pop(i)
267
268
269def nums_next(
270 key: NumberObject,
271 nums: ArrayObject,
272) -> tuple[Optional[NumberObject], Optional[DictionaryObject]]:
273 """
274 Return the (key, value) pair of the entry after the given one.
275
276 See 7.9.7 "Number Trees".
277
278 Args:
279 key: number key of the entry
280 nums: Nums array
281
282 """
283 if len(nums) % 2 != 0:
284 raise ValueError("A nums like array must have an even number of elements")
285
286 i = nums.index(key) + 2
287 if i < len(nums):
288 return (nums[i], nums[i + 1])
289 return (None, None)