/src/mupdf/source/pdf/pdf-label.c
Line | Count | Source |
1 | | // Copyright (C) 2004-2025 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "mupdf/pdf.h" |
25 | | |
26 | | #include <stdarg.h> |
27 | | #include <stdlib.h> |
28 | | #include <string.h> |
29 | | |
typedef struct pdf_object_labels pdf_object_labels;
typedef struct pdf_object_label_node pdf_object_label_node;

/*
 * One entry in a singly linked list of labels.
 * Entries are created by add_object_label() and pushed onto the list
 * that pdf_object_labels keeps for each object number.
 */
struct pdf_object_label_node
{
	/* Object number stored with this entry (the 'b' argument of add_object_label). */
	int num;
	/* Path text for this entry; a copy owned by the labels pool. */
	char *path;
	/* Following entry in the same list, or NULL at the end. */
	struct pdf_object_label_node *next;
};
39 | | |
40 | | struct pdf_object_labels |
41 | | { |
42 | | fz_pool *pool; |
43 | | int object_count; |
44 | | int root, info, encrypt; |
45 | | unsigned short *pages; |
46 | | char *seen; |
47 | | pdf_object_label_node **nodes; |
48 | | }; |
49 | | |
50 | | static void |
51 | | add_object_label(fz_context *ctx, pdf_object_labels *g, char *path, int a, int b) |
52 | 0 | { |
53 | 0 | pdf_object_label_node *node, **root; |
54 | |
|
55 | 0 | if (a < 0 || a >= g->object_count) |
56 | 0 | return; |
57 | | |
58 | 0 | node = fz_pool_alloc(ctx, g->pool, sizeof(pdf_object_label_node)); |
59 | 0 | node->path = fz_pool_strdup(ctx, g->pool, path); |
60 | 0 | node->num = b; |
61 | |
|
62 | 0 | root = &g->nodes[a]; |
63 | 0 | node->next = *root; |
64 | 0 | *root = node; |
65 | 0 | } |
66 | | |
67 | | static void |
68 | | scan_object_label_rec(fz_context *ctx, pdf_object_labels *g, char *root_path, pdf_obj *obj, int top) |
69 | 0 | { |
70 | 0 | char path[100]; |
71 | 0 | int i, n; |
72 | 0 | if (pdf_is_indirect(ctx, obj)) |
73 | 0 | ; |
74 | 0 | else if (pdf_is_dict(ctx, obj)) |
75 | 0 | { |
76 | 0 | n = pdf_dict_len(ctx, obj); |
77 | 0 | for (i = 0; i < n; ++i) |
78 | 0 | { |
79 | 0 | pdf_obj *key = pdf_dict_get_key(ctx, obj, i); |
80 | 0 | pdf_obj *val = pdf_dict_get_val(ctx, obj, i); |
81 | 0 | if (val && key != PDF_NAME(Parent) && key != PDF_NAME(P) && key != PDF_NAME(Prev) && key != PDF_NAME(Last)) |
82 | 0 | { |
83 | 0 | if (pdf_is_indirect(ctx, val)) |
84 | 0 | { |
85 | 0 | fz_snprintf(path, sizeof path, "%s/%s", root_path, pdf_to_name(ctx, key)); |
86 | 0 | add_object_label(ctx, g, path, pdf_to_num(ctx, val), top); |
87 | 0 | } |
88 | 0 | else if (pdf_is_dict(ctx, val) || pdf_is_array(ctx, val)) |
89 | 0 | { |
90 | 0 | fz_snprintf(path, sizeof path, "%s/%s", root_path, pdf_to_name(ctx, key)); |
91 | 0 | scan_object_label_rec(ctx, g, path, val, top); |
92 | 0 | } |
93 | 0 | } |
94 | 0 | } |
95 | 0 | } |
96 | 0 | else if (pdf_is_array(ctx, obj)) |
97 | 0 | { |
98 | 0 | n = pdf_array_len(ctx, obj); |
99 | 0 | for (i = 0; i < n; ++i) |
100 | 0 | { |
101 | 0 | pdf_obj *val = pdf_array_get(ctx, obj, i); |
102 | 0 | if (val) |
103 | 0 | { |
104 | 0 | if (pdf_is_indirect(ctx, val)) |
105 | 0 | { |
106 | 0 | fz_snprintf(path, sizeof path, "%s/%d", root_path, i+1); |
107 | 0 | add_object_label(ctx, g, path, pdf_to_num(ctx, val), top); |
108 | 0 | } |
109 | 0 | else if (pdf_is_dict(ctx, val) || pdf_is_array(ctx, val)) |
110 | 0 | { |
111 | 0 | fz_snprintf(path, sizeof path, "%s/%d", root_path, i+1); |
112 | 0 | scan_object_label_rec(ctx, g, path, val, top); |
113 | 0 | } |
114 | 0 | } |
115 | 0 | } |
116 | 0 | } |
117 | 0 | } |
118 | | |
119 | | static void |
120 | | scan_object_label(fz_context *ctx, pdf_document *doc, pdf_object_labels *g, int num) |
121 | 0 | { |
122 | 0 | pdf_obj *obj = pdf_load_object(ctx, doc, num); |
123 | 0 | fz_try(ctx) |
124 | 0 | scan_object_label_rec(ctx, g, "", obj, num); |
125 | 0 | fz_always(ctx) |
126 | 0 | pdf_drop_obj(ctx, obj); |
127 | 0 | fz_catch(ctx) |
128 | 0 | fz_rethrow(ctx); |
129 | 0 | } |
130 | | |
131 | | pdf_object_labels * |
132 | | pdf_load_object_labels(fz_context *ctx, pdf_document *doc) |
133 | 0 | { |
134 | 0 | pdf_object_labels *g = NULL; |
135 | 0 | fz_pool *pool; |
136 | 0 | int i, n, page_count; |
137 | |
|
138 | 0 | n = pdf_count_objects(ctx, doc); |
139 | |
|
140 | 0 | pool = fz_new_pool(ctx); |
141 | 0 | fz_try(ctx) |
142 | 0 | { |
143 | 0 | g = fz_pool_alloc(ctx, pool, sizeof(pdf_object_labels)); |
144 | 0 | g->pool = pool; |
145 | 0 | g->object_count = n; |
146 | 0 | g->root = pdf_to_num(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root))); |
147 | 0 | g->info = pdf_to_num(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info))); |
148 | 0 | g->encrypt = pdf_to_num(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt))); |
149 | 0 | g->seen = fz_pool_alloc(ctx, pool, n); |
150 | 0 | g->nodes = fz_pool_alloc(ctx, pool, g->object_count * sizeof(pdf_object_label_node*)); |
151 | 0 | g->pages = fz_pool_alloc(ctx, pool, g->object_count * sizeof(unsigned short)); |
152 | |
|
153 | 0 | page_count = pdf_count_pages(ctx, doc); |
154 | 0 | for (i = 0; i < page_count; ++i) |
155 | 0 | g->pages[pdf_to_num(ctx, pdf_lookup_page_obj(ctx, doc, i))] = i+1; |
156 | |
|
157 | 0 | for (i = 1; i < n; ++i) |
158 | 0 | scan_object_label(ctx, doc, g, i); |
159 | 0 | } |
160 | 0 | fz_catch(ctx) |
161 | 0 | { |
162 | 0 | fz_drop_pool(ctx, pool); |
163 | 0 | } |
164 | 0 | return g; |
165 | 0 | } |
166 | | |
167 | | void |
168 | | pdf_drop_object_labels(fz_context *ctx, pdf_object_labels *g) |
169 | 0 | { |
170 | 0 | if (g) |
171 | 0 | fz_drop_pool(ctx, g->pool); |
172 | 0 | } |
173 | | |
/*
 * Format a string and place it immediately in front of 'path'.
 *
 * The label is assembled backwards: 'path' points at the text built so
 * far, somewhere inside the buffer that starts at 'path_buffer', and
 * the formatted text is copied (without a terminator) into the bytes
 * just before it.
 *
 * Returns the new start of the text. If the formatted text would not
 * fit while still leaving 3 spare bytes, "..." is placed in front
 * instead and a pointer to that marker is returned.
 */
static char *
prepend(char *path_buffer, char *path, const char *fmt, ...)
{
	char buf[256];
	size_t z, room;
	va_list args;

	va_start(args, fmt);
	z = fz_vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	/* The formatter may report a length longer than what actually fits
	 * in buf (snprintf convention); never copy more than buf holds. */
	if (z >= sizeof(buf))
		z = sizeof(buf) - 1;

	/* Bytes available in front of the current text. Comparing sizes
	 * avoids forming a pointer beyond the buffer. */
	room = (size_t)(path - path_buffer);

	/* We always want to leave ourselves at least 3 chars for
	 * a future "..." */
	if (room >= z + 3)
	{
		path -= z;
		memcpy(path, buf, z);
		return path;
	}

	/* Not even the marker fits: leave the text as it is rather than
	 * write before the start of the buffer. */
	if (room < 3)
		return path;

	/* Just put ... in now. */
	path -= 3;
	memcpy(path, "...", 3);

	return path;
}
202 | | |
203 | | static void |
204 | | find_paths(fz_context *ctx, pdf_object_labels *g, int here, char *path_buffer, char *leaf_path, pdf_label_object_fn *callback, void *arg) |
205 | 0 | { |
206 | 0 | pdf_object_label_node *node; |
207 | 0 | int next; |
208 | 0 | if (here == g->root) |
209 | 0 | { |
210 | 0 | prepend(path_buffer, leaf_path, "trailer/Root"); |
211 | 0 | callback(ctx, arg, prepend(path_buffer, leaf_path, "trailer/Root")); |
212 | 0 | return; |
213 | 0 | } |
214 | 0 | if (here == g->info) |
215 | 0 | { |
216 | 0 | callback(ctx, arg, prepend(path_buffer, leaf_path, "trailer/Info")); |
217 | 0 | return; |
218 | 0 | } |
219 | 0 | if (here == g->encrypt) |
220 | 0 | { |
221 | 0 | callback(ctx, arg, prepend(path_buffer, leaf_path, "trailer/Encrypt")); |
222 | 0 | return; |
223 | 0 | } |
224 | 0 | if (g->pages[here]) |
225 | 0 | { |
226 | 0 | callback(ctx, arg, prepend(path_buffer, leaf_path, "pages/%d", g->pages[here])); |
227 | 0 | } |
228 | 0 | for (node = g->nodes[here]; node; node = node->next) |
229 | 0 | { |
230 | 0 | next = node->num; |
231 | 0 | if (next < 1 || next >= g->object_count) |
232 | 0 | continue; |
233 | 0 | if (g->seen[next]) |
234 | 0 | continue; |
235 | 0 | if (g->pages[next]) |
236 | 0 | { |
237 | 0 | callback(ctx, arg, prepend(path_buffer, leaf_path, "pages/%d%s", g->pages[next], node->path)); |
238 | 0 | } |
239 | 0 | else |
240 | 0 | { |
241 | 0 | char *p = prepend(path_buffer, leaf_path, "%s", node->path); |
242 | 0 | g->seen[next] = 1; |
243 | | // if we've run out of room in the path buffer, send this and stop. |
244 | 0 | if (p[0] == '.' && p[1] == '.' && p[2] == '.') |
245 | 0 | callback(ctx, arg, p); |
246 | 0 | else |
247 | 0 | find_paths(ctx, g, next, path_buffer, p, callback, arg); |
248 | 0 | g->seen[next] = 0; |
249 | 0 | } |
250 | 0 | } |
251 | 0 | } |
252 | | |
253 | | void |
254 | | pdf_label_object(fz_context *ctx, pdf_object_labels *g, int num, pdf_label_object_fn *callback, void *arg) |
255 | 0 | { |
256 | 0 | int i; |
257 | 0 | char path[4096]; |
258 | |
|
259 | 0 | if (num < 1 || num >= g->object_count) |
260 | 0 | return; |
261 | 0 | for (i = 1; i < g->object_count; ++i) |
262 | 0 | g->seen[i] = 0; |
263 | 0 | path[sizeof(path)-1] = 0; |
264 | 0 | find_paths(ctx, g, num, path, &path[sizeof(path)-1], callback, arg); |
265 | 0 | } |