/src/qpdf/libqpdf/QPDF_linearization.cc
Line | Count | Source |
1 | | // See doc/linearization. |
2 | | |
3 | | #include <qpdf/QPDF_private.hh> |
4 | | |
5 | | #include <qpdf/BitStream.hh> |
6 | | #include <qpdf/BitWriter.hh> |
7 | | #include <qpdf/InputSource_private.hh> |
8 | | #include <qpdf/Pipeline_private.hh> |
9 | | #include <qpdf/Pl_Buffer.hh> |
10 | | #include <qpdf/Pl_Flate.hh> |
11 | | #include <qpdf/Pl_String.hh> |
12 | | #include <qpdf/QPDFExc.hh> |
13 | | #include <qpdf/QPDFObjectHandle_private.hh> |
14 | | #include <qpdf/QPDFWriter_private.hh> |
15 | | #include <qpdf/QTC.hh> |
16 | | #include <qpdf/QUtil.hh> |
17 | | #include <qpdf/Util.hh> |
18 | | |
19 | | #include <algorithm> |
20 | | #include <cmath> |
21 | | #include <cstring> |
22 | | #include <utility> |
23 | | |
24 | | using namespace qpdf; |
25 | | using namespace std::literals; |
26 | | |
27 | | using Lin = QPDF::Doc::Linearization; |
28 | | |
29 | | template <class T, class int_type> |
30 | | static void |
31 | | load_vector_int( |
32 | | BitStream& bit_stream, int nitems, std::vector<T>& vec, int bits_wanted, int_type T::* field) |
33 | 0 | { |
34 | 0 | bool append = vec.empty(); |
35 | | // nitems times, read bits_wanted from the given bit stream, storing results in the ith vector |
36 | | // entry. |
37 | |
|
38 | 0 | for (size_t i = 0; i < QIntC::to_size(nitems); ++i) { |
39 | 0 | if (append) { |
40 | 0 | vec.push_back(T()); |
41 | 0 | } |
42 | 0 | vec.at(i).*field = bit_stream.getBitsInt(QIntC::to_size(bits_wanted)); |
43 | 0 | } |
44 | 0 | util::assertion( |
45 | 0 | std::cmp_equal(vec.size(), nitems), "vector has wrong size in load_vector_int" // |
46 | 0 | ); |
47 | | // The PDF spec says that each hint table starts at a byte boundary. Each "row" actually must |
48 | | // start on a byte boundary. |
49 | 0 | bit_stream.skipToNextByte(); |
50 | 0 | } Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::Doc::Linearization::HPageOffsetEntry, int>(BitStream&, int, std::__1::vector<QPDF::Doc::Linearization::HPageOffsetEntry, std::__1::allocator<QPDF::Doc::Linearization::HPageOffsetEntry> >&, int, int QPDF::Doc::Linearization::HPageOffsetEntry::*) Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::Doc::Linearization::HPageOffsetEntry, long long>(BitStream&, int, std::__1::vector<QPDF::Doc::Linearization::HPageOffsetEntry, std::__1::allocator<QPDF::Doc::Linearization::HPageOffsetEntry> >&, int, long long QPDF::Doc::Linearization::HPageOffsetEntry::*) Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::Doc::Linearization::HSharedObjectEntry, int>(BitStream&, int, std::__1::vector<QPDF::Doc::Linearization::HSharedObjectEntry, std::__1::allocator<QPDF::Doc::Linearization::HSharedObjectEntry> >&, int, int QPDF::Doc::Linearization::HSharedObjectEntry::*) |
51 | | |
52 | | template <class T> |
53 | | static void |
54 | | load_vector_vector( |
55 | | BitStream& bit_stream, |
56 | | int nitems1, |
57 | | std::vector<T>& vec1, |
58 | | int T::* nitems2, |
59 | | int bits_wanted, |
60 | | std::vector<int> T::* vec2) |
61 | 0 | { |
62 | | // nitems1 times, read nitems2 (from the ith element of vec1) items into the vec2 vector field |
63 | | // of the ith item of vec1. |
64 | 0 | for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) { |
65 | 0 | for (int i2 = 0; i2 < vec1.at(i1).*nitems2; ++i2) { |
66 | 0 | (vec1.at(i1).*vec2).push_back(bit_stream.getBitsInt(QIntC::to_size(bits_wanted))); |
67 | 0 | } |
68 | 0 | } |
69 | 0 | bit_stream.skipToNextByte(); |
70 | 0 | } |
71 | | |
72 | | Lin::ObjUser::ObjUser(user_e type) : |
73 | 0 | ou_type(type) |
74 | 0 | { |
75 | 0 | qpdf_expect(type == ou_root); |
76 | 0 | } |
77 | | |
78 | | Lin::ObjUser::ObjUser(user_e type, size_t pageno) : |
79 | 0 | ou_type(type), |
80 | 0 | pageno(pageno) |
81 | 0 | { |
82 | 0 | qpdf_expect(type == ou_page || type == ou_thumb); |
83 | 0 | } |
84 | | |
85 | | Lin::ObjUser::ObjUser(user_e type, std::string const& key) : |
86 | 0 | ou_type(type), |
87 | 0 | key(key) |
88 | 0 | { |
89 | 0 | qpdf_expect(type == ou_trailer_key || type == ou_root_key); |
90 | 0 | } |
91 | | |
92 | | bool |
93 | | Lin::ObjUser::operator<(ObjUser const& rhs) const |
94 | 0 | { |
95 | 0 | if (ou_type < rhs.ou_type) { |
96 | 0 | return true; |
97 | 0 | } |
98 | 0 | if (ou_type == rhs.ou_type) { |
99 | 0 | if (pageno < rhs.pageno) { |
100 | 0 | return true; |
101 | 0 | } |
102 | 0 | if (pageno == rhs.pageno) { |
103 | 0 | return key < rhs.key; |
104 | 0 | } |
105 | 0 | } |
106 | 0 | return false; |
107 | 0 | } |
108 | | |
109 | | Lin::UpdateObjectMapsFrame::UpdateObjectMapsFrame( |
110 | | ObjUser const& ou, QPDFObjectHandle oh, bool top) : |
111 | 0 | ou(ou), |
112 | 0 | oh(oh), |
113 | 0 | top(top) |
114 | 0 | { |
115 | 0 | } |
116 | | |
117 | | void |
118 | | QPDF::optimize( |
119 | | std::map<int, int> const& object_stream_data, |
120 | | bool allow_changes, |
121 | | std::function<int(QPDFObjectHandle&)> skip_stream_parameters) |
122 | 0 | { |
123 | 0 | m->lin.optimize_internal(object_stream_data, allow_changes, skip_stream_parameters); |
124 | 0 | } |
125 | | |
126 | | void |
127 | | Lin::optimize( |
128 | | QPDFWriter::ObjTable const& obj, std::function<int(QPDFObjectHandle&)> skip_stream_parameters) |
129 | 0 | { |
130 | 0 | optimize_internal(obj, true, skip_stream_parameters); |
131 | 0 | } |
132 | | |
133 | | template <typename T> |
134 | | void |
135 | | Lin::optimize_internal( |
136 | | T const& object_stream_data, |
137 | | bool allow_changes, |
138 | | std::function<int(QPDFObjectHandle&)> skip_stream_parameters) |
139 | 0 | { |
140 | 0 | if (!obj_user_to_objects_.empty()) { |
141 | | // already optimized |
142 | 0 | return; |
143 | 0 | } |
144 | | |
145 | | // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force |
146 | | // it to be so if it exists and is direct. (This has been seen in the wild.) |
147 | 0 | QPDFObjectHandle root = qpdf.getRoot(); |
148 | 0 | if (root.getKey("/Outlines").isDictionary()) { |
149 | 0 | QPDFObjectHandle outlines = root.getKey("/Outlines"); |
150 | 0 | if (!outlines.isIndirect()) { |
151 | 0 | root.replaceKey("/Outlines", qpdf.makeIndirectObject(outlines)); |
152 | 0 | } |
153 | 0 | } |
154 | | |
155 | | // Traverse pages tree pushing all inherited resources down to the page level. This also |
156 | | // initializes m->all_pages. |
157 | 0 | m->pages.pushInheritedAttributesToPage(allow_changes, false); |
158 | | // Traverse pages |
159 | |
|
160 | 0 | size_t n = 0; |
161 | 0 | for (auto const& page: m->pages) { |
162 | 0 | updateObjectMaps(ObjUser(ObjUser::ou_page, n), page, skip_stream_parameters); |
163 | 0 | ++n; |
164 | 0 | } |
165 | | |
166 | | // Traverse document-level items |
167 | 0 | for (auto const& [key, value]: m->trailer.as_dictionary()) { |
168 | 0 | if (key == "/Root") { |
169 | | // handled separately |
170 | 0 | } else { |
171 | 0 | if (!value.null()) { |
172 | 0 | updateObjectMaps( |
173 | 0 | ObjUser(ObjUser::ou_trailer_key, key), value, skip_stream_parameters); |
174 | 0 | } |
175 | 0 | } |
176 | 0 | } |
177 | |
|
178 | 0 | for (auto const& [key, value]: root.as_dictionary()) { |
179 | | // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but |
180 | | // we are going to disregard that specification for now. There is loads of evidence that |
181 | | // pdlin and Acrobat both disregard things like this from time to time, so this is almost |
182 | | // certain not to cause any problems. |
183 | 0 | if (!value.null()) { |
184 | 0 | updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), value, skip_stream_parameters); |
185 | 0 | } |
186 | 0 | } |
187 | |
|
188 | 0 | ObjUser root_ou = ObjUser(ObjUser::ou_root); |
189 | 0 | auto root_og = root.id_gen(); |
190 | 0 | obj_user_to_objects_[root_ou].insert(root_og); |
191 | 0 | object_to_obj_users_[root_og].insert(root_ou); |
192 | |
|
193 | 0 | filterCompressedObjects(object_stream_data); |
194 | 0 | } Unexecuted instantiation: void QPDF::Doc::Linearization::optimize_internal<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&, bool, std::__1::function<int (QPDFObjectHandle&)>) Unexecuted instantiation: void QPDF::Doc::Linearization::optimize_internal<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&, bool, std::__1::function<int (QPDFObjectHandle&)>) |
195 | | |
196 | | void |
197 | | Lin::updateObjectMaps( |
198 | | ObjUser const& first_ou, |
199 | | QPDFObjectHandle first_oh, |
200 | | std::function<int(QPDFObjectHandle&)> skip_stream_parameters) |
201 | 0 | { |
202 | 0 | QPDFObjGen::set visited; |
203 | 0 | std::vector<UpdateObjectMapsFrame> pending; |
204 | 0 | pending.emplace_back(first_ou, first_oh, true); |
205 | | // Traverse the object tree from this point taking care to avoid crossing page boundaries. |
206 | 0 | std::unique_ptr<ObjUser> thumb_ou; |
207 | 0 | while (!pending.empty()) { |
208 | 0 | auto cur = pending.back(); |
209 | 0 | pending.pop_back(); |
210 | |
|
211 | 0 | bool is_page_node = false; |
212 | |
|
213 | 0 | if (cur.oh.isDictionaryOfType("/Page")) { |
214 | 0 | is_page_node = true; |
215 | 0 | if (!cur.top) { |
216 | 0 | continue; |
217 | 0 | } |
218 | 0 | } |
219 | | |
220 | 0 | if (cur.oh.indirect()) { |
221 | 0 | QPDFObjGen og(cur.oh.getObjGen()); |
222 | 0 | if (!visited.add(og)) { |
223 | 0 | QTC::TC("qpdf", "QPDF opt loop detected"); |
224 | 0 | continue; |
225 | 0 | } |
226 | 0 | obj_user_to_objects_[cur.ou].insert(og); |
227 | 0 | object_to_obj_users_[og].insert(cur.ou); |
228 | 0 | } |
229 | | |
230 | 0 | if (cur.oh.isArray()) { |
231 | 0 | for (auto const& item: cur.oh.as_array()) { |
232 | 0 | pending.emplace_back(cur.ou, item, false); |
233 | 0 | } |
234 | 0 | } else if (cur.oh.isDictionary() || cur.oh.isStream()) { |
235 | 0 | QPDFObjectHandle dict = cur.oh; |
236 | 0 | bool is_stream = cur.oh.isStream(); |
237 | 0 | int ssp = 0; |
238 | 0 | if (is_stream) { |
239 | 0 | dict = cur.oh.getDict(); |
240 | 0 | if (skip_stream_parameters) { |
241 | 0 | ssp = skip_stream_parameters(cur.oh); |
242 | 0 | } |
243 | 0 | } |
244 | |
|
245 | 0 | for (auto& [key, value]: dict.as_dictionary()) { |
246 | 0 | if (value.null()) { |
247 | 0 | continue; |
248 | 0 | } |
249 | | |
250 | 0 | if (is_page_node && (key == "/Thumb")) { |
251 | | // Traverse page thumbnail dictionaries as a special case. There can only ever |
252 | | // be one /Thumb key on a page, and we see at most one page node per call. |
253 | 0 | thumb_ou = std::make_unique<ObjUser>(ObjUser::ou_thumb, cur.ou.pageno); |
254 | 0 | pending.emplace_back(*thumb_ou, dict.getKey(key), false); |
255 | 0 | } else if (is_page_node && (key == "/Parent")) { |
256 | | // Don't traverse back up the page tree |
257 | 0 | } else if ( |
258 | 0 | ((ssp >= 1) && (key == "/Length")) || |
259 | 0 | ((ssp >= 2) && ((key == "/Filter") || (key == "/DecodeParms")))) { |
260 | | // Don't traverse into stream parameters that we are not going to write. |
261 | 0 | } else { |
262 | 0 | pending.emplace_back(cur.ou, value, false); |
263 | 0 | } |
264 | 0 | } |
265 | 0 | } |
266 | 0 | } |
267 | 0 | } |
268 | | |
269 | | void |
270 | | Lin::filterCompressedObjects(std::map<int, int> const& object_stream_data) |
271 | 0 | { |
272 | 0 | if (object_stream_data.empty()) { |
273 | 0 | return; |
274 | 0 | } |
275 | | |
276 | | // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed |
277 | | // objects. If something is a user of a compressed object, then it is really a user of the |
278 | | // object stream that contains it. |
279 | | |
280 | 0 | std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects; |
281 | 0 | std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users; |
282 | |
|
283 | 0 | for (auto const& [ou, ogs]: obj_user_to_objects_) { |
284 | 0 | for (auto const& og: ogs) { |
285 | 0 | auto i2 = object_stream_data.find(og.getObj()); |
286 | 0 | if (i2 == object_stream_data.end()) { |
287 | 0 | t_obj_user_to_objects[ou].insert(og); |
288 | 0 | } else { |
289 | 0 | t_obj_user_to_objects[ou].insert({i2->second, 0}); |
290 | 0 | } |
291 | 0 | } |
292 | 0 | } |
293 | |
|
294 | 0 | for (auto const& [og, ous]: object_to_obj_users_) { |
295 | 0 | for (auto const& ou: ous) { |
296 | 0 | auto i2 = object_stream_data.find(og.getObj()); |
297 | 0 | if (i2 == object_stream_data.end()) { |
298 | 0 | t_object_to_obj_users[og].insert(ou); |
299 | 0 | } else { |
300 | 0 | t_object_to_obj_users[{i2->second, 0}].insert(ou); |
301 | 0 | } |
302 | 0 | } |
303 | 0 | } |
304 | |
|
305 | 0 | obj_user_to_objects_ = std::move(t_obj_user_to_objects); |
306 | 0 | object_to_obj_users_ = std::move(t_object_to_obj_users); |
307 | 0 | } |
308 | | |
309 | | void |
310 | | Lin::filterCompressedObjects(QPDFWriter::ObjTable const& obj) |
311 | 0 | { |
312 | 0 | if (obj.getStreamsEmpty()) { |
313 | 0 | return; |
314 | 0 | } |
315 | | |
316 | | // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed |
317 | | // objects. If something is a user of a compressed object, then it is really a user of the |
318 | | // object stream that contains it. |
319 | | |
320 | 0 | std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects; |
321 | 0 | std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users; |
322 | |
|
323 | 0 | for (auto const& [ou, ogs]: obj_user_to_objects_) { |
324 | 0 | for (auto const& og: ogs) { |
325 | 0 | if (obj.contains(og)) { |
326 | 0 | if (auto const& i2 = obj[og].object_stream; i2 <= 0) { |
327 | 0 | t_obj_user_to_objects[ou].insert(og); |
328 | 0 | } else { |
329 | 0 | t_obj_user_to_objects[ou].insert(QPDFObjGen(i2, 0)); |
330 | 0 | } |
331 | 0 | } |
332 | 0 | } |
333 | 0 | } |
334 | |
|
335 | 0 | for (auto const& [og, ous]: object_to_obj_users_) { |
336 | 0 | if (obj.contains(og)) { |
337 | | // Loop over obj_users. |
338 | 0 | for (auto const& ou: ous) { |
339 | 0 | if (auto i2 = obj[og].object_stream; i2 <= 0) { |
340 | 0 | t_object_to_obj_users[og].insert(ou); |
341 | 0 | } else { |
342 | 0 | t_object_to_obj_users[{i2, 0}].insert(ou); |
343 | 0 | } |
344 | 0 | } |
345 | 0 | } |
346 | 0 | } |
347 | |
|
348 | 0 | obj_user_to_objects_ = std::move(t_obj_user_to_objects); |
349 | 0 | object_to_obj_users_ = std::move(t_object_to_obj_users); |
350 | 0 | } |
351 | | |
352 | | void |
353 | | Lin::linearizationWarning(std::string_view msg) |
354 | 0 | { |
355 | 0 | linearization_warnings_ = true; |
356 | 0 | warn(qpdf_e_linearization, "", 0, std::string(msg)); |
357 | 0 | } |
358 | | |
359 | | bool |
360 | | QPDF::checkLinearization() |
361 | 0 | { |
362 | 0 | return m->lin.check(); |
363 | 0 | } |
364 | | |
365 | | bool |
366 | | Lin::check() |
367 | 0 | { |
368 | 0 | try { |
369 | 0 | readLinearizationData(); |
370 | 0 | checkLinearizationInternal(); |
371 | 0 | return !linearization_warnings_; |
372 | 0 | } catch (std::runtime_error& e) { |
373 | 0 | linearizationWarning( |
374 | 0 | "error encountered while checking linearization data: " + std::string(e.what())); |
375 | 0 | return false; |
376 | 0 | } |
377 | 0 | } |
378 | | |
379 | | bool |
380 | | QPDF::isLinearized() |
381 | 0 | { |
382 | 0 | return m->lin.linearized(); |
383 | 0 | } |
384 | | |
385 | | bool |
386 | | Lin::linearized() |
387 | 0 | { |
388 | | // If the first object in the file is a dictionary with a suitable /Linearized key and has an /L |
389 | | // key that accurately indicates the file size, initialize m->lindict and return true. |
390 | | |
391 | | // A linearized PDF spec's first object will be contained within the first 1024 bytes of the |
392 | | // file and will be a dictionary with a valid /Linearized key. This routine looks for that and |
393 | | // does no additional validation. |
394 | | |
395 | | // The PDF spec says the linearization dictionary must be completely contained within the first |
396 | | // 1024 bytes of the file. Add a byte for a null terminator. |
397 | 0 | auto buffer = m->file->read(1024, 0); |
398 | 0 | size_t pos = 0; |
399 | 0 | while (true) { |
400 | | // Find a digit or end of buffer |
401 | 0 | pos = buffer.find_first_of("0123456789"sv, pos); |
402 | 0 | if (pos == std::string::npos) { |
403 | 0 | return false; |
404 | 0 | } |
405 | | // Seek to the digit. Then skip over digits for a potential |
406 | | // next iteration. |
407 | 0 | m->file->seek(toO(pos), SEEK_SET); |
408 | |
|
409 | 0 | auto t1 = m->objects.readToken(*m->file, 20); |
410 | 0 | if (!(t1.isInteger() && m->objects.readToken(*m->file, 6).isInteger() && |
411 | 0 | m->objects.readToken(*m->file, 4).isWord("obj"))) { |
412 | 0 | pos = buffer.find_first_not_of("0123456789"sv, pos); |
413 | 0 | if (pos == std::string::npos) { |
414 | 0 | return false; |
415 | 0 | } |
416 | 0 | continue; |
417 | 0 | } |
418 | | |
419 | 0 | Dictionary candidate = qpdf.getObject(toI(QUtil::string_to_ll(t1.getValue().data())), 0); |
420 | 0 | auto linkey = candidate["/Linearized"]; |
421 | 0 | if (!(linkey.isNumber() && toI(floor(linkey.getNumericValue())) == 1)) { |
422 | 0 | return false; |
423 | 0 | } |
424 | | |
425 | 0 | m->file->seek(0, SEEK_END); |
426 | 0 | Integer L = candidate["/L"]; |
427 | 0 | if (L != m->file->tell()) { |
428 | 0 | return false; |
429 | 0 | } |
430 | 0 | linp_.file_size = L; |
431 | 0 | lindict_ = candidate; |
432 | 0 | return true; |
433 | 0 | } |
434 | 0 | } |
435 | | |
436 | | void |
437 | | Lin::readLinearizationData() |
438 | 0 | { |
439 | 0 | util::assertion( |
440 | 0 | linearized(), "called readLinearizationData for file that is not linearized" // |
441 | 0 | ); |
442 | | |
443 | | // This function throws an exception (which is trapped by checkLinearization()) for any errors |
444 | | // that prevent loading. |
445 | | |
446 | | // /L is read and stored in linp by isLinearized() |
447 | 0 | Array H = lindict_["/H"]; // hint table offset/length for primary and overflow hint tables |
448 | 0 | auto H_size = H.size(); |
449 | 0 | Integer H_0 = H[0]; // hint table offset |
450 | 0 | Integer H_1 = H[1]; // hint table length |
451 | 0 | Integer H_2 = H[2]; // hint table offset for overflow hint table |
452 | 0 | Integer H_3 = H[3]; // hint table length for overflow hint table |
453 | 0 | Integer O = lindict_["/O"]; |
454 | 0 | Integer E = lindict_["/E"]; |
455 | 0 | Integer N = lindict_["/N"]; |
456 | 0 | Integer T = lindict_["/T"]; |
457 | 0 | auto P_oh = lindict_["/P"]; |
458 | 0 | Integer P = P_oh; // first page number |
459 | 0 | QTC::TC("qpdf", "QPDF P absent in lindict", P ? 0 : 1); |
460 | |
|
461 | 0 | no_ci_stop_if( |
462 | 0 | !(H && O && E && N && T && (P || P_oh.null())), |
463 | 0 | "some keys in linearization dictionary are of the wrong type", |
464 | 0 | "linearization dictionary" // |
465 | 0 | ); |
466 | |
|
467 | 0 | no_ci_stop_if( |
468 | 0 | !(H_size == 2 || H_size == 4), |
469 | 0 | "H has the wrong number of items", |
470 | 0 | "linearization dictionary" // |
471 | 0 | ); |
472 | |
|
473 | 0 | no_ci_stop_if( |
474 | 0 | !(H_0 && H_1 && (H_size == 2 || (H_2 && H_3))), |
475 | 0 | "some H items are of the wrong type", |
476 | 0 | "linearization dictionary" // |
477 | 0 | ); |
478 | | |
479 | | // Store linearization parameter data |
480 | | |
481 | | // Various places in the code use linp.npages, which is initialized from N, to pre-allocate |
482 | | // memory, so make sure it's accurate and bail right now if it's not. |
483 | 0 | no_ci_stop_if( |
484 | 0 | N != pages.size(), |
485 | 0 | "/N does not match number of pages", |
486 | 0 | "linearization dictionary" // |
487 | 0 | ); |
488 | | |
489 | | // file_size initialized by isLinearized() |
490 | 0 | linp_.first_page_object = O.value<int>(); |
491 | 0 | linp_.first_page_end = E; |
492 | 0 | linp_.npages = N.value<size_t>(); |
493 | 0 | linp_.xref_zero_offset = T; |
494 | 0 | linp_.first_page = P ? P.value<int>() : 0; |
495 | 0 | linp_.H_offset = H_0; |
496 | 0 | linp_.H_length = H_1; |
497 | | |
498 | | // Read hint streams |
499 | |
|
500 | 0 | Pl_Buffer pb("hint buffer"); |
501 | 0 | auto H0 = readHintStream(pb, H_0, H_1.value<size_t>()); |
502 | 0 | if (H_2) { |
503 | 0 | (void)readHintStream(pb, H_2, H_3.value<size_t>()); |
504 | 0 | } |
505 | | |
506 | | // PDF 1.4 hint tables that we ignore: |
507 | | |
508 | | // /T thumbnail |
509 | | // /A thread information |
510 | | // /E named destination |
511 | | // /V interactive form |
512 | | // /I information dictionary |
513 | | // /C logical structure |
514 | | // /L page label |
515 | | |
516 | | // Individual hint table offsets |
517 | 0 | Integer HS = H0["/S"]; // shared object |
518 | 0 | Integer HO = H0["/O"]; // outline |
519 | |
|
520 | 0 | auto hbp = pb.getBufferSharedPointer(); |
521 | 0 | Buffer* hb = hbp.get(); |
522 | 0 | unsigned char const* h_buf = hb->getBuffer(); |
523 | 0 | size_t h_size = hb->getSize(); |
524 | |
|
525 | 0 | readHPageOffset(BitStream(h_buf, h_size)); |
526 | |
|
527 | 0 | size_t HSi = HS.value<size_t>(); |
528 | 0 | if (HSi < 0 || HSi >= h_size) { |
529 | 0 | throw damagedPDF("linearization hint table", "/S (shared object) offset is out of bounds"); |
530 | 0 | } |
531 | 0 | readHSharedObject(BitStream(h_buf + HSi, h_size - HSi)); |
532 | |
|
533 | 0 | if (HO) { |
534 | 0 | no_ci_stop_if( |
535 | 0 | HO < 0 || HO >= h_size, |
536 | 0 | "/O (outline) offset is out of bounds", |
537 | 0 | "linearization dictionary" // |
538 | 0 | ); |
539 | 0 | size_t HOi = HO.value<size_t>(); |
540 | 0 | readHGeneric(BitStream(h_buf + HO, h_size - HOi), outline_hints_); |
541 | 0 | } |
542 | 0 | } |
543 | | |
544 | | Dictionary |
545 | | Lin::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length) |
546 | 0 | { |
547 | 0 | auto H = m->objects.readObjectAtOffset(offset, "linearization hint stream", false); |
548 | 0 | ObjCache& oc = m->obj_cache[H]; |
549 | 0 | qpdf_offset_t min_end_offset = oc.end_before_space; |
550 | 0 | qpdf_offset_t max_end_offset = oc.end_after_space; |
551 | 0 | no_ci_stop_if( |
552 | 0 | !H.isStream(), "hint table is not a stream", "linearization dictionary" // |
553 | 0 | ); |
554 | |
|
555 | 0 | Dictionary Hdict = H.getDict(); |
556 | | |
557 | | // Some versions of Acrobat make /Length indirect and place it immediately after the stream, |
558 | | // increasing length to cover it, even though the specification says all objects in the |
559 | | // linearization parameter dictionary must be direct. We have to get the file position of the |
560 | | // end of length in this case. |
561 | 0 | if (Hdict["/Length"].indirect()) { |
562 | 0 | ObjCache& oc2 = m->obj_cache[Hdict["/Length"]]; |
563 | 0 | min_end_offset = oc2.end_before_space; |
564 | 0 | max_end_offset = oc2.end_after_space; |
565 | 0 | } else { |
566 | 0 | QTC::TC("qpdf", "QPDF hint table length direct"); |
567 | 0 | } |
568 | 0 | qpdf_offset_t computed_end = offset + toO(length); |
569 | 0 | no_ci_stop_if( |
570 | 0 | computed_end < min_end_offset || computed_end > max_end_offset, |
571 | 0 | "hint table length mismatch (expected = " + std::to_string(computed_end) + "; actual = " + |
572 | 0 | std::to_string(min_end_offset) + ".." + std::to_string(max_end_offset) + ")", |
573 | 0 | "linearization dictionary" // |
574 | 0 | ); |
575 | 0 | H.pipeStreamData(&pl, 0, qpdf_dl_specialized); |
576 | 0 | return Hdict; |
577 | 0 | } |
578 | | |
579 | | void |
580 | | Lin::readHPageOffset(BitStream h) |
581 | 0 | { |
582 | | // All comments referring to the PDF spec refer to the spec for version 1.4. |
583 | |
|
584 | 0 | HPageOffset& t = page_offset_hints_; |
585 | |
|
586 | 0 | t.min_nobjects = h.getBitsInt(32); // 1 |
587 | 0 | t.first_page_offset = h.getBitsInt(32); // 2 |
588 | 0 | t.nbits_delta_nobjects = h.getBitsInt(16); // 3 |
589 | 0 | t.min_page_length = h.getBitsInt(32); // 4 |
590 | 0 | t.nbits_delta_page_length = h.getBitsInt(16); // 5 |
591 | 0 | t.min_content_offset = h.getBitsInt(32); // 6 |
592 | 0 | t.nbits_delta_content_offset = h.getBitsInt(16); // 7 |
593 | 0 | t.min_content_length = h.getBitsInt(32); // 8 |
594 | 0 | t.nbits_delta_content_length = h.getBitsInt(16); // 9 |
595 | 0 | t.nbits_nshared_objects = h.getBitsInt(16); // 10 |
596 | 0 | t.nbits_shared_identifier = h.getBitsInt(16); // 11 |
597 | 0 | t.nbits_shared_numerator = h.getBitsInt(16); // 12 |
598 | 0 | t.shared_denominator = h.getBitsInt(16); // 13 |
599 | |
|
600 | 0 | std::vector<HPageOffsetEntry>& entries = t.entries; |
601 | 0 | entries.clear(); |
602 | 0 | int nitems = toI(linp_.npages); |
603 | 0 | load_vector_int(h, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects); |
604 | 0 | load_vector_int( |
605 | 0 | h, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length); |
606 | 0 | load_vector_int( |
607 | 0 | h, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects); |
608 | 0 | load_vector_vector( |
609 | 0 | h, |
610 | 0 | nitems, |
611 | 0 | entries, |
612 | 0 | &HPageOffsetEntry::nshared_objects, |
613 | 0 | t.nbits_shared_identifier, |
614 | 0 | &HPageOffsetEntry::shared_identifiers); |
615 | 0 | load_vector_vector( |
616 | 0 | h, |
617 | 0 | nitems, |
618 | 0 | entries, |
619 | 0 | &HPageOffsetEntry::nshared_objects, |
620 | 0 | t.nbits_shared_numerator, |
621 | 0 | &HPageOffsetEntry::shared_numerators); |
622 | 0 | load_vector_int( |
623 | 0 | h, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset); |
624 | 0 | load_vector_int( |
625 | 0 | h, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length); |
626 | 0 | } |
627 | | |
628 | | void |
629 | | Lin::readHSharedObject(BitStream h) |
630 | 0 | { |
631 | 0 | HSharedObject& t = shared_object_hints_; |
632 | |
|
633 | 0 | t.first_shared_obj = h.getBitsInt(32); // 1 |
634 | 0 | t.first_shared_offset = h.getBitsInt(32); // 2 |
635 | 0 | t.nshared_first_page = h.getBitsInt(32); // 3 |
636 | 0 | t.nshared_total = h.getBitsInt(32); // 4 |
637 | 0 | t.nbits_nobjects = h.getBitsInt(16); // 5 |
638 | 0 | t.min_group_length = h.getBitsInt(32); // 6 |
639 | 0 | t.nbits_delta_group_length = h.getBitsInt(16); // 7 |
640 | |
|
641 | 0 | QTC::TC( |
642 | 0 | "qpdf", |
643 | 0 | "QPDF lin nshared_total > nshared_first_page", |
644 | 0 | (t.nshared_total > t.nshared_first_page) ? 1 : 0); |
645 | |
|
646 | 0 | std::vector<HSharedObjectEntry>& entries = t.entries; |
647 | 0 | entries.clear(); |
648 | 0 | int nitems = t.nshared_total; |
649 | 0 | load_vector_int( |
650 | 0 | h, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length); |
651 | 0 | load_vector_int(h, nitems, entries, 1, &HSharedObjectEntry::signature_present); |
652 | 0 | for (size_t i = 0; i < toS(nitems); ++i) { |
653 | 0 | if (entries.at(i).signature_present) { |
654 | | // Skip 128-bit MD5 hash. These are not supported by acrobat, so they should probably |
655 | | // never be there. We have no test case for this. |
656 | 0 | for (int j = 0; j < 4; ++j) { |
657 | 0 | (void)h.getBits(32); |
658 | 0 | } |
659 | 0 | } |
660 | 0 | } |
661 | 0 | load_vector_int(h, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one); |
662 | 0 | } |
663 | | |
664 | | void |
665 | | Lin::readHGeneric(BitStream h, HGeneric& t) |
666 | 0 | { |
667 | 0 | t.first_object = h.getBitsInt(32); // 1 |
668 | 0 | t.first_object_offset = h.getBitsInt(32); // 2 |
669 | 0 | t.nobjects = h.getBitsInt(32); // 3 |
670 | 0 | t.group_length = h.getBitsInt(32); // 4 |
671 | 0 | } |
672 | | |
673 | | void |
674 | | Lin::checkLinearizationInternal() |
675 | 0 | { |
676 | | // All comments referring to the PDF spec refer to the spec for version 1.4. |
677 | | |
678 | | // Check all values in linearization parameter dictionary |
679 | |
|
680 | 0 | LinParameters& p = linp_; |
681 | | |
682 | | // L: file size in bytes -- checked by isLinearized |
683 | | |
684 | | // O: object number of first page |
685 | 0 | auto const& all_pages = pages.all(); |
686 | 0 | if (p.first_page_object != all_pages.at(0).getObjectID()) { |
687 | 0 | linearizationWarning("first page object (/O) mismatch"); |
688 | 0 | } |
689 | | |
690 | | // N: number of pages |
691 | 0 | size_t npages = all_pages.size(); |
692 | 0 | if (std::cmp_not_equal(p.npages, npages)) { |
693 | | // Not tested in the test suite |
694 | 0 | linearizationWarning("page count (/N) mismatch"); |
695 | 0 | } |
696 | |
|
697 | 0 | int i = 0; |
698 | 0 | for (auto const& page: all_pages) { |
699 | 0 | if (m->xref_table[page].getType() == 2) { |
700 | 0 | linearizationWarning( |
701 | 0 | "page dictionary for page " + std::to_string(i) + " is compressed"); |
702 | 0 | } |
703 | 0 | ++i; |
704 | 0 | } |
705 | | |
706 | | // T: offset of whitespace character preceding xref entry for object 0 |
707 | 0 | m->file->seek(p.xref_zero_offset, SEEK_SET); |
708 | 0 | while (true) { |
709 | 0 | char ch; |
710 | 0 | m->file->read(&ch, 1); |
711 | 0 | if (!(ch == ' ' || ch == '\r' || ch == '\n')) { |
712 | 0 | m->file->seek(-1, SEEK_CUR); |
713 | 0 | break; |
714 | 0 | } |
715 | 0 | } |
716 | 0 | if (m->file->tell() != objects.first_xref_item_offset()) { |
717 | 0 | linearizationWarning( |
718 | 0 | "space before first xref item (/T) mismatch (computed = " + |
719 | 0 | std::to_string(objects.first_xref_item_offset()) + |
720 | 0 | "; file = " + std::to_string(m->file->tell())); |
721 | 0 | } |
722 | | |
723 | | // P: first page number -- Implementation note 124 says Acrobat ignores this value, so we will |
724 | | // too. |
725 | | |
726 | | // Check numbering of compressed objects in each xref section. For linearized files, all |
727 | | // compressed objects are supposed to be at the end of the containing xref section if any object |
728 | | // streams are in use. |
729 | |
|
730 | 0 | if (objects.uncompressed_after_compressed()) { |
731 | 0 | linearizationWarning( |
732 | 0 | "linearized file contains an uncompressed object after a compressed " |
733 | 0 | "one in a cross-reference stream"); |
734 | 0 | } |
735 | | |
736 | | // Further checking requires optimization and order calculation. Don't allow optimization to |
737 | | // make changes. If it has to, then the file is not properly linearized. We use the xref table |
738 | | // to figure out which objects are compressed and which are uncompressed. |
739 | 0 | { // local scope |
740 | 0 | std::map<int, int> object_stream_data; |
741 | 0 | for (auto const& [og, entry]: m->xref_table) { |
742 | 0 | if (entry.getType() == 2) { |
743 | 0 | object_stream_data[og.getObj()] = entry.getObjStreamNumber(); |
744 | 0 | } |
745 | 0 | } |
746 | 0 | optimize_internal(object_stream_data, false, nullptr); |
747 | 0 | calculateLinearizationData(object_stream_data); |
748 | 0 | } |
749 | | |
750 | | // E: offset of end of first page -- Implementation note 123 says Acrobat includes on extra |
751 | | // object here by mistake. pdlin fails to place thumbnail images in section 9, so when |
752 | | // thumbnails are present, it also gets the wrong value for /E. It also doesn't count outlines |
753 | | // here when it should even though it places them in part 6. This code fails to put thread |
754 | | // information dictionaries in part 9, so it actually gets the wrong value for E when threads |
755 | | // are present. In that case, it would probably agree with pdlin. As of this writing, the test |
756 | | // suite doesn't contain any files with threads. |
757 | |
|
758 | 0 | no_ci_stop_if( |
759 | 0 | part6_.empty(), "linearization part 6 unexpectedly empty" // |
760 | 0 | ); |
761 | 0 | qpdf_offset_t min_E = -1; |
762 | 0 | qpdf_offset_t max_E = -1; |
763 | 0 | for (auto const& oh: part6_) { |
764 | 0 | QPDFObjGen og(oh.getObjGen()); |
765 | | // All objects have to have been dereferenced to be classified. |
766 | 0 | util::assertion(m->obj_cache.contains(og), "linearization part6 object not in cache"); |
767 | 0 | ObjCache const& oc = m->obj_cache[og]; |
768 | 0 | min_E = std::max(min_E, oc.end_before_space); |
769 | 0 | max_E = std::max(max_E, oc.end_after_space); |
770 | 0 | } |
771 | 0 | if (p.first_page_end < min_E || p.first_page_end > max_E) { |
772 | 0 | linearizationWarning( |
773 | 0 | "end of first page section (/E) mismatch: /E = " + std::to_string(p.first_page_end) + |
774 | 0 | "; computed = " + std::to_string(min_E) + ".." + std::to_string(max_E)); |
775 | 0 | } |
776 | | |
777 | | // Check hint tables |
778 | |
|
779 | 0 | std::map<int, int> shared_idx_to_obj; |
780 | 0 | checkHSharedObject(all_pages, shared_idx_to_obj); |
781 | 0 | checkHPageOffset(all_pages, shared_idx_to_obj); |
782 | 0 | checkHOutlines(); |
783 | 0 | } |
784 | | |
785 | | qpdf_offset_t |
786 | | Lin::maxEnd(ObjUser const& ou) |
787 | 0 | { |
788 | 0 | no_ci_stop_if( |
789 | 0 | !obj_user_to_objects_.contains(ou), |
790 | 0 | "no entry in object user table for requested object user" // |
791 | 0 | ); |
792 | |
|
793 | 0 | qpdf_offset_t end = 0; |
794 | 0 | for (auto const& og: obj_user_to_objects_[ou]) { |
795 | 0 | no_ci_stop_if( |
796 | 0 | !m->obj_cache.contains(og), "unknown object referenced in object user table" // |
797 | 0 | ); |
798 | 0 | end = std::max(end, m->obj_cache[og].end_after_space); |
799 | 0 | } |
800 | 0 | return end; |
801 | 0 | } |
802 | | |
803 | | qpdf_offset_t |
804 | | Lin::getLinearizationOffset(QPDFObjGen og) |
805 | 0 | { |
806 | 0 | QPDFXRefEntry const& entry = m->xref_table[og]; |
807 | 0 | auto typ = entry.getType(); |
808 | 0 | if (typ == 1) { |
809 | 0 | return entry.getOffset(); |
810 | 0 | } |
811 | 0 | no_ci_stop_if( |
812 | 0 | typ != 2, "getLinearizationOffset called for xref entry not of type 1 or 2" // |
813 | 0 | ); |
814 | | // For compressed objects, return the offset of the object stream that contains them. |
815 | 0 | return getLinearizationOffset({entry.getObjStreamNumber(), 0}); |
816 | 0 | } |
817 | | |
818 | | QPDFObjectHandle |
819 | | Lin::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data) |
820 | 0 | { |
821 | 0 | if (obj.null() || !object_stream_data.contains(obj.getObjectID())) { |
822 | 0 | return obj; |
823 | 0 | } |
824 | 0 | return qpdf.getObject((*(object_stream_data.find(obj.getObjectID()))).second, 0); |
825 | 0 | } |
826 | | |
827 | | QPDFObjectHandle |
828 | | Lin::getUncompressedObject(QPDFObjectHandle& oh, QPDFWriter::ObjTable const& obj) |
829 | 0 | { |
830 | 0 | if (obj.contains(oh)) { |
831 | 0 | if (auto id = obj[oh].object_stream; id > 0) { |
832 | 0 | return oh.null() ? oh : qpdf.getObject(id, 0); |
833 | 0 | } |
834 | 0 | } |
835 | 0 | return oh; |
836 | 0 | } |
837 | | |
838 | | int |
839 | | Lin::lengthNextN(int first_object, int n) |
840 | 0 | { |
841 | 0 | int length = 0; |
842 | 0 | for (int i = 0; i < n; ++i) { |
843 | 0 | QPDFObjGen og(first_object + i, 0); |
844 | 0 | if (m->xref_table.contains(og)) { |
845 | 0 | no_ci_stop_if( |
846 | 0 | !m->obj_cache.contains(og), |
847 | 0 | "found unknown object while calculating length for linearization data" // |
848 | 0 | ); |
849 | |
|
850 | 0 | length += toI(m->obj_cache[og].end_after_space - getLinearizationOffset(og)); |
851 | 0 | } else { |
852 | 0 | linearizationWarning( |
853 | 0 | "no xref table entry for " + std::to_string(first_object + i) + " 0"); |
854 | 0 | } |
855 | 0 | } |
856 | 0 | return length; |
857 | 0 | } |
858 | | |
859 | | void |
860 | | Lin::checkHPageOffset( |
861 | | std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& shared_idx_to_obj) |
862 | 0 | { |
863 | | // Implementation note 126 says Acrobat always sets delta_content_offset and |
864 | | // delta_content_length in the page offset header dictionary to 0. It also states that |
865 | | // min_content_offset in the per-page information is always 0, which is an incorrect value. |
866 | | |
867 | | // Implementation note 127 explains that Acrobat always sets item 8 (min_content_length) to |
868 | | // zero, item 9 (nbits_delta_content_length) to the value of item 5 (nbits_delta_page_length), |
869 | | // and item 7 of each per-page hint table (delta_content_length) to item 2 (delta_page_length) |
870 | | // of that entry. Acrobat ignores these values when reading files. |
871 | | |
872 | | // Empirically, it also seems that Acrobat sometimes puts items under a page's /Resources |
873 | | // dictionary in with shared objects even when they are private. |
874 | |
|
875 | 0 | size_t npages = pages.size(); |
876 | 0 | qpdf_offset_t table_offset = adjusted_offset(page_offset_hints_.first_page_offset); |
877 | 0 | QPDFObjGen first_page_og(pages.at(0).getObjGen()); |
878 | 0 | if (!m->xref_table.contains(first_page_og)) { |
879 | 0 | stopOnError("supposed first page object is not known"); |
880 | 0 | } |
881 | 0 | qpdf_offset_t offset = getLinearizationOffset(first_page_og); |
882 | 0 | if (table_offset != offset) { |
883 | 0 | linearizationWarning("first page object offset mismatch"); |
884 | 0 | } |
885 | |
|
886 | 0 | for (size_t pageno = 0; pageno < npages; ++pageno) { |
887 | 0 | QPDFObjGen page_og(pages.at(pageno).getObjGen()); |
888 | 0 | int first_object = page_og.getObj(); |
889 | 0 | if (!m->xref_table.contains(page_og)) { |
890 | 0 | stopOnError("unknown object in page offset hint table"); |
891 | 0 | } |
892 | 0 | offset = getLinearizationOffset(page_og); |
893 | |
|
894 | 0 | HPageOffsetEntry& he = page_offset_hints_.entries.at(pageno); |
895 | 0 | CHPageOffsetEntry& ce = c_page_offset_data_.entries.at(pageno); |
896 | 0 | int h_nobjects = he.delta_nobjects + page_offset_hints_.min_nobjects; |
897 | 0 | if (h_nobjects != ce.nobjects) { |
898 | | // This happens with pdlin when there are thumbnails. |
899 | 0 | linearizationWarning( |
900 | 0 | "object count mismatch for page " + std::to_string(pageno) + ": hint table = " + |
901 | 0 | std::to_string(h_nobjects) + "; computed = " + std::to_string(ce.nobjects)); |
902 | 0 | } |
903 | | |
904 | | // Use value for number of objects in hint table rather than computed value if there is a |
905 | | // discrepancy. |
906 | 0 | int length = lengthNextN(first_object, h_nobjects); |
907 | 0 | int h_length = toI(he.delta_page_length + page_offset_hints_.min_page_length); |
908 | 0 | if (length != h_length) { |
909 | | // This condition almost certainly indicates a bad hint table or a bug in this code. |
910 | 0 | linearizationWarning( |
911 | 0 | "page length mismatch for page " + std::to_string(pageno) + ": hint table = " + |
912 | 0 | std::to_string(h_length) + "; computed length = " + std::to_string(length) + |
913 | 0 | " (offset = " + std::to_string(offset) + ")"); |
914 | 0 | } |
915 | |
|
916 | 0 | offset += h_length; |
917 | | |
918 | | // Translate shared object indexes to object numbers. |
919 | 0 | std::set<int> hint_shared; |
920 | 0 | std::set<int> computed_shared; |
921 | |
|
922 | 0 | if (pageno == 0 && he.nshared_objects > 0) { |
923 | | // pdlin and Acrobat both do this even though the spec states clearly and unambiguously |
924 | | // that they should not. |
925 | 0 | linearizationWarning("page 0 has shared identifier entries"); |
926 | 0 | } |
927 | |
|
928 | 0 | for (size_t i = 0; i < toS(he.nshared_objects); ++i) { |
929 | 0 | int idx = he.shared_identifiers.at(i); |
930 | 0 | no_ci_stop_if( |
931 | 0 | !shared_idx_to_obj.contains(idx), |
932 | 0 | "unable to get object for item in shared objects hint table"); |
933 | |
|
934 | 0 | hint_shared.insert(shared_idx_to_obj[idx]); |
935 | 0 | } |
936 | |
|
937 | 0 | for (size_t i = 0; i < toS(ce.nshared_objects); ++i) { |
938 | 0 | int idx = ce.shared_identifiers.at(i); |
939 | 0 | no_ci_stop_if( |
940 | 0 | idx >= c_shared_object_data_.nshared_total, |
941 | 0 | "index out of bounds for shared object hint table" // |
942 | 0 | ); |
943 | |
|
944 | 0 | int obj = c_shared_object_data_.entries.at(toS(idx)).object; |
945 | 0 | computed_shared.insert(obj); |
946 | 0 | } |
947 | |
|
948 | 0 | for (int iter: hint_shared) { |
949 | 0 | if (!computed_shared.contains(iter)) { |
950 | | // pdlin puts thumbnails here even though it shouldn't |
951 | 0 | linearizationWarning( |
952 | 0 | "page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) + |
953 | 0 | ": in hint table but not computed list"); |
954 | 0 | } |
955 | 0 | } |
956 | |
|
957 | 0 | for (int iter: computed_shared) { |
958 | 0 | if (!hint_shared.contains(iter)) { |
959 | | // Acrobat does not put some things including at least built-in fonts and procsets |
960 | | // here, at least in some cases. |
961 | 0 | linearizationWarning( |
962 | 0 | ("page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) + |
963 | 0 | ": in computed list but not hint table")); |
964 | 0 | } |
965 | 0 | } |
966 | 0 | } |
967 | 0 | } |
968 | | |
969 | | void |
970 | | Lin::checkHSharedObject(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj) |
971 | 0 | { |
972 | | // Implementation note 125 says shared object groups always contain only one object. |
973 | | // Implementation note 128 says that Acrobat always nbits_nobjects to zero. Implementation note |
974 | | // 130 says that Acrobat does not support more than one shared object per group. These are all |
975 | | // consistent. |
976 | | |
977 | | // Implementation note 129 states that MD5 signatures are not implemented in Acrobat, so |
978 | | // signature_present must always be zero. |
979 | | |
980 | | // Implementation note 131 states that first_shared_obj and first_shared_offset have meaningless |
981 | | // values for single-page files. |
982 | | |
983 | | // Empirically, Acrobat and pdlin generate incorrect values for these whenever there are no |
984 | | // shared objects not referenced by the first page (i.e., nshared_total == nshared_first_page). |
985 | |
|
986 | 0 | HSharedObject& so = shared_object_hints_; |
987 | 0 | if (so.nshared_total < so.nshared_first_page) { |
988 | 0 | linearizationWarning("shared object hint table: ntotal < nfirst_page"); |
989 | 0 | } else { |
990 | | // The first nshared_first_page objects are consecutive objects starting with the first page |
991 | | // object. The rest are consecutive starting from the first_shared_obj object. |
992 | 0 | int cur_object = pages.at(0).getObjectID(); |
993 | 0 | for (int i = 0; i < so.nshared_total; ++i) { |
994 | 0 | if (i == so.nshared_first_page) { |
995 | 0 | QTC::TC("qpdf", "QPDF lin check shared past first page"); |
996 | 0 | if (part8_.empty()) { |
997 | 0 | linearizationWarning("part 8 is empty but nshared_total > nshared_first_page"); |
998 | 0 | } else { |
999 | 0 | int obj = part8_.at(0).getObjectID(); |
1000 | 0 | if (obj != so.first_shared_obj) { |
1001 | 0 | linearizationWarning( |
1002 | 0 | "first shared object number mismatch: hint table = " + |
1003 | 0 | std::to_string(so.first_shared_obj) + |
1004 | 0 | "; computed = " + std::to_string(obj)); |
1005 | 0 | } |
1006 | 0 | } |
1007 | |
|
1008 | 0 | cur_object = so.first_shared_obj; |
1009 | |
|
1010 | 0 | QPDFObjGen og(cur_object, 0); |
1011 | 0 | if (!m->xref_table.contains(og)) { |
1012 | 0 | stopOnError("unknown object in shared object hint table"); |
1013 | 0 | } |
1014 | 0 | qpdf_offset_t offset = getLinearizationOffset(og); |
1015 | 0 | qpdf_offset_t h_offset = adjusted_offset(so.first_shared_offset); |
1016 | 0 | if (offset != h_offset) { |
1017 | 0 | linearizationWarning( |
1018 | 0 | "first shared object offset mismatch: hint table = " + |
1019 | 0 | std::to_string(h_offset) + "; computed = " + std::to_string(offset)); |
1020 | 0 | } |
1021 | 0 | } |
1022 | |
|
1023 | 0 | idx_to_obj[i] = cur_object; |
1024 | 0 | HSharedObjectEntry& se = so.entries.at(toS(i)); |
1025 | 0 | int nobjects = se.nobjects_minus_one + 1; |
1026 | 0 | int length = lengthNextN(cur_object, nobjects); |
1027 | 0 | int h_length = so.min_group_length + se.delta_group_length; |
1028 | 0 | if (length != h_length) { |
1029 | 0 | linearizationWarning( |
1030 | 0 | "shared object " + std::to_string(i) + " length mismatch: hint table = " + |
1031 | 0 | std::to_string(h_length) + "; computed = " + std::to_string(length)); |
1032 | 0 | } |
1033 | 0 | cur_object += nobjects; |
1034 | 0 | } |
1035 | 0 | } |
1036 | 0 | } |
1037 | | |
1038 | | void |
1039 | | Lin::checkHOutlines() |
1040 | 0 | { |
1041 | | // Empirically, Acrobat generates the correct value for the object number but incorrectly stores |
1042 | | // the next object number's offset as the offset, at least when outlines appear in part 6. It |
1043 | | // also generates an incorrect value for length (specifically, the length that would cover the |
1044 | | // correct number of objects from the wrong starting place). pdlin appears to generate correct |
1045 | | // values in those cases. |
1046 | |
|
1047 | 0 | if (c_outline_data_.nobjects == outline_hints_.nobjects) { |
1048 | 0 | if (c_outline_data_.nobjects == 0) { |
1049 | 0 | return; |
1050 | 0 | } |
1051 | | |
1052 | 0 | if (c_outline_data_.first_object == outline_hints_.first_object) { |
1053 | | // Check length and offset. Acrobat gets these wrong. |
1054 | 0 | QPDFObjectHandle outlines = qpdf.getRoot().getKey("/Outlines"); |
1055 | 0 | if (!outlines.isIndirect()) { |
1056 | | // This case is not exercised in test suite since not permitted by the spec, but if |
1057 | | // this does occur, the code below would fail. |
1058 | 0 | linearizationWarning("/Outlines key of root dictionary is not indirect"); |
1059 | 0 | return; |
1060 | 0 | } |
1061 | 0 | QPDFObjGen og(outlines.getObjGen()); |
1062 | 0 | no_ci_stop_if( |
1063 | 0 | !m->xref_table.contains(og), "unknown object in outlines hint table" // |
1064 | 0 | ); |
1065 | 0 | qpdf_offset_t offset = getLinearizationOffset(og); |
1066 | 0 | ObjUser ou(ObjUser::ou_root_key, "/Outlines"); |
1067 | 0 | int length = toI(maxEnd(ou) - offset); |
1068 | 0 | qpdf_offset_t table_offset = adjusted_offset(outline_hints_.first_object_offset); |
1069 | 0 | if (offset != table_offset) { |
1070 | 0 | linearizationWarning( |
1071 | 0 | "incorrect offset in outlines table: hint table = " + |
1072 | 0 | std::to_string(table_offset) + "; computed = " + std::to_string(offset)); |
1073 | 0 | } |
1074 | 0 | int table_length = outline_hints_.group_length; |
1075 | 0 | if (length != table_length) { |
1076 | 0 | linearizationWarning( |
1077 | 0 | "incorrect length in outlines table: hint table = " + |
1078 | 0 | std::to_string(table_length) + "; computed = " + std::to_string(length)); |
1079 | 0 | } |
1080 | 0 | } else { |
1081 | 0 | linearizationWarning("incorrect first object number in outline hints table."); |
1082 | 0 | } |
1083 | 0 | } else { |
1084 | 0 | linearizationWarning("incorrect object count in outline hint table"); |
1085 | 0 | } |
1086 | 0 | } |
1087 | | |
1088 | | void |
1089 | | QPDF::showLinearizationData() |
1090 | 0 | { |
1091 | 0 | m->lin.show_data(); |
1092 | 0 | } |
1093 | | |
1094 | | void |
1095 | | Lin::show_data() |
1096 | 0 | { |
1097 | 0 | try { |
1098 | 0 | readLinearizationData(); |
1099 | 0 | checkLinearizationInternal(); |
1100 | 0 | dumpLinearizationDataInternal(); |
1101 | 0 | } catch (QPDFExc& e) { |
1102 | 0 | linearizationWarning(e.what()); |
1103 | 0 | } |
1104 | 0 | } |
1105 | | |
1106 | | void |
1107 | | Lin::dumpLinearizationDataInternal() |
1108 | 0 | { |
1109 | 0 | auto& info = *cf.log()->getInfo(); |
1110 | |
|
1111 | 0 | info << m->file->getName() << ": linearization data:\n\n"; |
1112 | |
|
1113 | 0 | info << "file_size: " << linp_.file_size << "\n" |
1114 | 0 | << "first_page_object: " << linp_.first_page_object << "\n" |
1115 | 0 | << "first_page_end: " << linp_.first_page_end << "\n" |
1116 | 0 | << "npages: " << linp_.npages << "\n" |
1117 | 0 | << "xref_zero_offset: " << linp_.xref_zero_offset << "\n" |
1118 | 0 | << "first_page: " << linp_.first_page << "\n" |
1119 | 0 | << "H_offset: " << linp_.H_offset << "\n" |
1120 | 0 | << "H_length: " << linp_.H_length << "\n" |
1121 | 0 | << "\n"; |
1122 | |
|
1123 | 0 | info << "Page Offsets Hint Table\n\n"; |
1124 | 0 | dumpHPageOffset(); |
1125 | 0 | info << "\nShared Objects Hint Table\n\n"; |
1126 | 0 | dumpHSharedObject(); |
1127 | |
|
1128 | 0 | if (outline_hints_.nobjects > 0) { |
1129 | 0 | info << "\nOutlines Hint Table\n\n"; |
1130 | 0 | dumpHGeneric(outline_hints_); |
1131 | 0 | } |
1132 | 0 | } |
1133 | | |
1134 | | qpdf_offset_t |
1135 | | Lin::adjusted_offset(qpdf_offset_t offset) |
1136 | 0 | { |
1137 | | // All offsets >= H_offset have to be increased by H_length since all hint table location values |
1138 | | // disregard the hint table itself. |
1139 | 0 | if (offset >= linp_.H_offset) { |
1140 | 0 | return offset + linp_.H_length; |
1141 | 0 | } |
1142 | 0 | return offset; |
1143 | 0 | } |
1144 | | |
1145 | | void |
1146 | | Lin::dumpHPageOffset() |
1147 | 0 | { |
1148 | 0 | auto& info = *cf.log()->getInfo(); |
1149 | 0 | HPageOffset& t = page_offset_hints_; |
1150 | 0 | info << "min_nobjects: " << t.min_nobjects << "\n" |
1151 | 0 | << "first_page_offset: " << adjusted_offset(t.first_page_offset) << "\n" |
1152 | 0 | << "nbits_delta_nobjects: " << t.nbits_delta_nobjects << "\n" |
1153 | 0 | << "min_page_length: " << t.min_page_length << "\n" |
1154 | 0 | << "nbits_delta_page_length: " << t.nbits_delta_page_length << "\n" |
1155 | 0 | << "min_content_offset: " << t.min_content_offset << "\n" |
1156 | 0 | << "nbits_delta_content_offset: " << t.nbits_delta_content_offset << "\n" |
1157 | 0 | << "min_content_length: " << t.min_content_length << "\n" |
1158 | 0 | << "nbits_delta_content_length: " << t.nbits_delta_content_length << "\n" |
1159 | 0 | << "nbits_nshared_objects: " << t.nbits_nshared_objects << "\n" |
1160 | 0 | << "nbits_shared_identifier: " << t.nbits_shared_identifier << "\n" |
1161 | 0 | << "nbits_shared_numerator: " << t.nbits_shared_numerator << "\n" |
1162 | 0 | << "shared_denominator: " << t.shared_denominator << "\n"; |
1163 | |
|
1164 | 0 | for (size_t i1 = 0; i1 < linp_.npages; ++i1) { |
1165 | 0 | HPageOffsetEntry& pe = t.entries.at(i1); |
1166 | 0 | info << "Page " << i1 << ":\n" |
1167 | 0 | << " nobjects: " << pe.delta_nobjects + t.min_nobjects << "\n" |
1168 | 0 | << " length: " << pe.delta_page_length + t.min_page_length |
1169 | 0 | << "\n" |
1170 | | // content offset is relative to page, not file |
1171 | 0 | << " content_offset: " << pe.delta_content_offset + t.min_content_offset << "\n" |
1172 | 0 | << " content_length: " << pe.delta_content_length + t.min_content_length << "\n" |
1173 | 0 | << " nshared_objects: " << pe.nshared_objects << "\n"; |
1174 | 0 | for (size_t i2 = 0; i2 < toS(pe.nshared_objects); ++i2) { |
1175 | 0 | info << " identifier " << i2 << ": " << pe.shared_identifiers.at(i2) << "\n"; |
1176 | 0 | info << " numerator " << i2 << ": " << pe.shared_numerators.at(i2) << "\n"; |
1177 | 0 | } |
1178 | 0 | } |
1179 | 0 | } |
1180 | | |
1181 | | void |
1182 | | Lin::dumpHSharedObject() |
1183 | 0 | { |
1184 | 0 | auto& info = *cf.log()->getInfo(); |
1185 | 0 | HSharedObject& t = shared_object_hints_; |
1186 | 0 | info << "first_shared_obj: " << t.first_shared_obj << "\n" |
1187 | 0 | << "first_shared_offset: " << adjusted_offset(t.first_shared_offset) << "\n" |
1188 | 0 | << "nshared_first_page: " << t.nshared_first_page << "\n" |
1189 | 0 | << "nshared_total: " << t.nshared_total << "\n" |
1190 | 0 | << "nbits_nobjects: " << t.nbits_nobjects << "\n" |
1191 | 0 | << "min_group_length: " << t.min_group_length << "\n" |
1192 | 0 | << "nbits_delta_group_length: " << t.nbits_delta_group_length << "\n"; |
1193 | |
|
1194 | 0 | for (size_t i = 0; i < toS(t.nshared_total); ++i) { |
1195 | 0 | HSharedObjectEntry& se = t.entries.at(i); |
1196 | 0 | info << "Shared Object " << i << ":\n" |
1197 | 0 | << " group length: " << se.delta_group_length + t.min_group_length << "\n"; |
1198 | | // PDF spec says signature present nobjects_minus_one are always 0, so print them only if |
1199 | | // they have a non-zero value. |
1200 | 0 | if (se.signature_present) { |
1201 | 0 | info << " signature present\n"; |
1202 | 0 | } |
1203 | 0 | if (se.nobjects_minus_one != 0) { |
1204 | 0 | info << " nobjects: " << se.nobjects_minus_one + 1 << "\n"; |
1205 | 0 | } |
1206 | 0 | } |
1207 | 0 | } |
1208 | | |
1209 | | void |
1210 | | Lin::dumpHGeneric(HGeneric& t) |
1211 | 0 | { |
1212 | 0 | *cf.log()->getInfo() << "first_object: " << t.first_object << "\n" |
1213 | 0 | << "first_object_offset: " << adjusted_offset(t.first_object_offset) |
1214 | 0 | << "\n" |
1215 | 0 | << "nobjects: " << t.nobjects << "\n" |
1216 | 0 | << "group_length: " << t.group_length << "\n"; |
1217 | 0 | } |
1218 | | |
1219 | | template <typename T> |
1220 | | void |
1221 | | Lin::calculateLinearizationData(T const& object_stream_data) |
1222 | 0 | { |
1223 | | // This function calculates the ordering of objects, divides them into the appropriate parts, |
1224 | | // and computes some values for the linearization parameter dictionary and hint tables. The |
1225 | | // file must be optimized (via calling optimize()) prior to calling this function. Note that |
1226 | | // actual offsets and lengths are not computed here, but anything related to object ordering is. |
1227 | |
|
1228 | 0 | util::assertion( |
1229 | 0 | !object_to_obj_users_.empty(), |
1230 | 0 | "INTERNAL ERROR: QPDF::calculateLinearizationData called before optimize()" // |
1231 | 0 | ); |
1232 | | // Note that we can't call optimize here because we don't know whether it should be called |
1233 | | // with or without allow changes. |
1234 | | |
1235 | | // Separate objects into the categories sufficient for us to determine which part of the |
1236 | | // linearized file should contain the object. This categorization is useful for other purposes |
1237 | | // as well. Part numbers refer to version 1.4 of the PDF spec. |
1238 | | |
1239 | | // Parts 1, 3, 5, 10, and 11 don't contain any objects from the original file (except the |
1240 | | // trailer dictionary in part 11). |
1241 | | |
1242 | | // Part 4 is the document catalog (root) and the following root keys: /ViewerPreferences, |
1243 | | // /PageMode, /Threads, /OpenAction, /AcroForm, /Encrypt. Note that Thread information |
1244 | | // dictionaries are supposed to appear in part 9, but we are disregarding that recommendation |
1245 | | // for now. |
1246 | | |
1247 | | // Part 6 is the first page section. It includes all remaining objects referenced by the first |
1248 | | // page including shared objects but not including thumbnails. Additionally, if /PageMode is |
1249 | | // /Outlines, then information from /Outlines also appears here. |
1250 | | |
1251 | | // Part 7 contains remaining objects private to pages other than the first page. |
1252 | | |
1253 | | // Part 8 contains all remaining shared objects except those that are shared only within |
1254 | | // thumbnails. |
1255 | | |
1256 | | // Part 9 contains all remaining objects. |
1257 | | |
1258 | | // We sort objects into the following categories: |
1259 | | |
1260 | | // * open_document: part 4 |
1261 | | |
1262 | | // * first_page_private: part 6 |
1263 | | |
1264 | | // * first_page_shared: part 6 |
1265 | | |
1266 | | // * other_page_private: part 7 |
1267 | | |
1268 | | // * other_page_shared: part 8 |
1269 | | |
1270 | | // * thumbnail_private: part 9 |
1271 | | |
1272 | | // * thumbnail_shared: part 9 |
1273 | | |
1274 | | // * other: part 9 |
1275 | | |
1276 | | // * outlines: part 6 or 9 |
1277 | |
|
1278 | 0 | part4_.clear(); |
1279 | 0 | part6_.clear(); |
1280 | 0 | part7_.clear(); |
1281 | 0 | part8_.clear(); |
1282 | 0 | part9_.clear(); |
1283 | 0 | c_linp_ = LinParameters(); |
1284 | 0 | c_page_offset_data_ = CHPageOffset(); |
1285 | 0 | c_shared_object_data_ = CHSharedObject(); |
1286 | 0 | c_outline_data_ = HGeneric(); |
1287 | |
|
1288 | 0 | QPDFObjectHandle root = qpdf.getRoot(); |
1289 | 0 | bool outlines_in_first_page = false; |
1290 | 0 | QPDFObjectHandle pagemode = root.getKey("/PageMode"); |
1291 | 0 | QTC::TC("qpdf", "QPDF categorize pagemode present", pagemode.isName() ? 1 : 0); |
1292 | 0 | if (pagemode.isName()) { |
1293 | 0 | if (pagemode.getName() == "/UseOutlines") { |
1294 | 0 | if (root.hasKey("/Outlines")) { |
1295 | 0 | outlines_in_first_page = true; |
1296 | 0 | } else { |
1297 | 0 | QTC::TC("qpdf", "QPDF UseOutlines but no Outlines"); |
1298 | 0 | } |
1299 | 0 | } |
1300 | 0 | QTC::TC("qpdf", "QPDF categorize pagemode outlines", outlines_in_first_page ? 1 : 0); |
1301 | 0 | } |
1302 | |
|
1303 | 0 | std::set<std::string> open_document_keys; |
1304 | 0 | open_document_keys.insert("/ViewerPreferences"); |
1305 | 0 | open_document_keys.insert("/PageMode"); |
1306 | 0 | open_document_keys.insert("/Threads"); |
1307 | 0 | open_document_keys.insert("/OpenAction"); |
1308 | 0 | open_document_keys.insert("/AcroForm"); |
1309 | |
|
1310 | 0 | std::set<QPDFObjGen> lc_open_document; |
1311 | 0 | std::set<QPDFObjGen> lc_first_page_private; |
1312 | 0 | std::set<QPDFObjGen> lc_first_page_shared; |
1313 | 0 | std::set<QPDFObjGen> lc_other_page_private; |
1314 | 0 | std::set<QPDFObjGen> lc_other_page_shared; |
1315 | 0 | std::set<QPDFObjGen> lc_thumbnail_private; |
1316 | 0 | std::set<QPDFObjGen> lc_thumbnail_shared; |
1317 | 0 | std::set<QPDFObjGen> lc_other; |
1318 | 0 | std::set<QPDFObjGen> lc_outlines; |
1319 | 0 | std::set<QPDFObjGen> lc_root; |
1320 | |
|
1321 | 0 | for (auto& [og, ous]: object_to_obj_users_) { |
1322 | 0 | bool in_open_document = false; |
1323 | 0 | bool in_first_page = false; |
1324 | 0 | int other_pages = 0; |
1325 | 0 | int thumbs = 0; |
1326 | 0 | int others = 0; |
1327 | 0 | bool in_outlines = false; |
1328 | 0 | bool is_root = false; |
1329 | |
|
1330 | 0 | for (auto const& ou: ous) { |
1331 | 0 | switch (ou.ou_type) { |
1332 | 0 | case ObjUser::ou_trailer_key: |
1333 | 0 | if (ou.key == "/Encrypt") { |
1334 | 0 | in_open_document = true; |
1335 | 0 | } else { |
1336 | 0 | ++others; |
1337 | 0 | } |
1338 | 0 | break; |
1339 | | |
1340 | 0 | case ObjUser::ou_thumb: |
1341 | 0 | ++thumbs; |
1342 | 0 | break; |
1343 | | |
1344 | 0 | case ObjUser::ou_root_key: |
1345 | 0 | if (open_document_keys.contains(ou.key)) { |
1346 | 0 | in_open_document = true; |
1347 | 0 | } else if (ou.key == "/Outlines") { |
1348 | 0 | in_outlines = true; |
1349 | 0 | } else { |
1350 | 0 | ++others; |
1351 | 0 | } |
1352 | 0 | break; |
1353 | | |
1354 | 0 | case ObjUser::ou_page: |
1355 | 0 | if (ou.pageno == 0) { |
1356 | 0 | in_first_page = true; |
1357 | 0 | } else { |
1358 | 0 | ++other_pages; |
1359 | 0 | } |
1360 | 0 | break; |
1361 | | |
1362 | 0 | case ObjUser::ou_root: |
1363 | 0 | is_root = true; |
1364 | 0 | break; |
1365 | 0 | } |
1366 | 0 | } |
1367 | | |
1368 | 0 | if (is_root) { |
1369 | 0 | lc_root.insert(og); |
1370 | 0 | } else if (in_outlines) { |
1371 | 0 | lc_outlines.insert(og); |
1372 | 0 | } else if (in_open_document) { |
1373 | 0 | lc_open_document.insert(og); |
1374 | 0 | } else if ((in_first_page) && (others == 0) && (other_pages == 0) && (thumbs == 0)) { |
1375 | 0 | lc_first_page_private.insert(og); |
1376 | 0 | } else if (in_first_page) { |
1377 | 0 | lc_first_page_shared.insert(og); |
1378 | 0 | } else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) { |
1379 | 0 | lc_other_page_private.insert(og); |
1380 | 0 | } else if (other_pages > 1) { |
1381 | 0 | lc_other_page_shared.insert(og); |
1382 | 0 | } else if ((thumbs == 1) && (others == 0)) { |
1383 | 0 | lc_thumbnail_private.insert(og); |
1384 | 0 | } else if (thumbs > 1) { |
1385 | 0 | lc_thumbnail_shared.insert(og); |
1386 | 0 | } else { |
1387 | 0 | lc_other.insert(og); |
1388 | 0 | } |
1389 | 0 | } |
1390 | | |
1391 | | // Generate ordering for objects in the output file. Sometimes we just dump right from a set |
1392 | | // into a vector. Rather than optimizing this by going straight into the vector, we'll leave |
1393 | | // these phases separate for now. That way, this section can be concerned only with ordering, |
1394 | | // and the above section can be considered only with categorization. Note that sets of |
1395 | | // QPDFObjGens are sorted by QPDFObjGen. In a linearized file, objects appear in sequence with |
1396 | | // the possible exception of hints tables which we won't see here anyway. That means that |
1397 | | // running calculateLinearizationData() on a linearized file should give results identical to |
1398 | | // the original file ordering. |
1399 | | |
1400 | | // We seem to traverse the page tree a lot in this code, but we can address this for a future |
1401 | | // code optimization if necessary. Premature optimization is the root of all evil. |
1402 | 0 | std::vector<QPDFObjectHandle> uc_pages; |
1403 | 0 | { // local scope |
1404 | | // Map all page objects to the containing object stream. This should be a no-op in a |
1405 | | // properly linearized file. |
1406 | 0 | for (auto oh: pages) { |
1407 | 0 | uc_pages.emplace_back(getUncompressedObject(oh, object_stream_data)); |
1408 | 0 | } |
1409 | 0 | } |
1410 | 0 | size_t npages = pages.size(); |
1411 | | |
1412 | | // We will be initializing some values of the computed hint tables. Specifically, we can |
1413 | | // initialize any items that deal with object numbers or counts but not any items that deal with |
1414 | | // lengths or offsets. The code that writes linearized files will have to fill in these values |
1415 | | // during the first pass. The validation code can compute them relatively easily given the rest |
1416 | | // of the information. |
1417 | | |
1418 | | // npages is the size of the existing pages vector, which has been created by traversing the |
1419 | | // pages tree, and as such is a reasonable size. |
1420 | 0 | c_linp_.npages = npages; |
1421 | 0 | c_page_offset_data_.entries = std::vector<CHPageOffsetEntry>(npages); |
1422 | | |
1423 | | // Part 4: open document objects. We don't care about the order. |
1424 | |
|
1425 | 0 | no_ci_stop_if( |
1426 | 0 | lc_root.size() != 1, "found other than one root while calculating linearization data" // |
1427 | 0 | ); |
1428 | |
|
1429 | 0 | part4_.emplace_back(qpdf.getObject(*(lc_root.begin()))); |
1430 | 0 | for (auto const& og: lc_open_document) { |
1431 | 0 | part4_.emplace_back(qpdf.getObject(og)); |
1432 | 0 | } |
1433 | | |
1434 | | // Part 6: first page objects. Note: implementation note 124 states that Acrobat always treats |
1435 | | // page 0 as the first page for linearization regardless of /OpenAction. pdlin doesn't provide |
1436 | | // any option to set this and also disregards /OpenAction. We will do the same. |
1437 | | |
1438 | | // First, place the actual first page object itself. |
1439 | 0 | no_ci_stop_if( |
1440 | 0 | pages.empty(), "no pages found while calculating linearization data" // |
1441 | 0 | ); |
1442 | 0 | QPDFObjGen first_page_og(uc_pages.at(0).getObjGen()); |
1443 | 0 | no_ci_stop_if( |
1444 | 0 | !lc_first_page_private.erase(first_page_og), "unable to linearize first page" // |
1445 | 0 | ); |
1446 | 0 | c_linp_.first_page_object = uc_pages.at(0).getObjectID(); |
1447 | 0 | part6_.emplace_back(uc_pages.at(0)); |
1448 | | |
1449 | | // The PDF spec "recommends" an order for the rest of the objects, but we are going to disregard |
1450 | | // it except to the extent that it groups private and shared objects contiguously for the sake |
1451 | | // of hint tables. |
1452 | |
|
1453 | 0 | for (auto const& og: lc_first_page_private) { |
1454 | 0 | part6_.emplace_back(qpdf.getObject(og)); |
1455 | 0 | } |
1456 | |
|
1457 | 0 | for (auto const& og: lc_first_page_shared) { |
1458 | 0 | part6_.emplace_back(qpdf.getObject(og)); |
1459 | 0 | } |
1460 | | |
1461 | | // Place the outline dictionary if it goes in the first page section. |
1462 | 0 | if (outlines_in_first_page) { |
1463 | 0 | pushOutlinesToPart(part6_, lc_outlines, object_stream_data); |
1464 | 0 | } |
1465 | | |
1466 | | // Fill in page offset hint table information for the first page. The PDF spec says that |
1467 | | // nshared_objects should be zero for the first page. pdlin does not appear to obey this, but |
1468 | | // it fills in garbage values for all the shared object identifiers on the first page. |
1469 | |
|
1470 | 0 | c_page_offset_data_.entries.at(0).nobjects = toI(part6_.size()); |
1471 | | |
1472 | | // Part 7: other pages' private objects |
1473 | | |
1474 | | // For each page in order: |
1475 | 0 | for (size_t i = 1; i < npages; ++i) { |
1476 | | // Place this page's page object |
1477 | |
|
1478 | 0 | QPDFObjGen page_og(uc_pages.at(i).getObjGen()); |
1479 | 0 | no_ci_stop_if( |
1480 | 0 | !lc_other_page_private.erase(page_og), |
1481 | 0 | "unable to linearize page " + std::to_string(i) // |
1482 | 0 | ); |
1483 | |
|
1484 | 0 | part7_.emplace_back(uc_pages.at(i)); |
1485 | | |
1486 | | // Place all non-shared objects referenced by this page, updating the page object count for |
1487 | | // the hint table. |
1488 | |
|
1489 | 0 | c_page_offset_data_.entries.at(i).nobjects = 1; |
1490 | |
|
1491 | 0 | ObjUser ou(ObjUser::ou_page, i); |
1492 | 0 | no_ci_stop_if( |
1493 | 0 | !obj_user_to_objects_.contains(ou), |
1494 | 0 | "found unreferenced page while calculating linearization data" // |
1495 | 0 | ); |
1496 | |
|
1497 | 0 | for (auto const& og: obj_user_to_objects_[ou]) { |
1498 | 0 | if (lc_other_page_private.erase(og)) { |
1499 | 0 | part7_.emplace_back(qpdf.getObject(og)); |
1500 | 0 | ++c_page_offset_data_.entries.at(i).nobjects; |
1501 | 0 | } |
1502 | 0 | } |
1503 | 0 | } |
1504 | | // That should have covered all part7 objects. |
1505 | 0 | util::assertion( |
1506 | 0 | lc_other_page_private.empty(), |
1507 | 0 | "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_other_page_private is not empty " |
1508 | 0 | "after generation of part7" // |
1509 | 0 | ); |
1510 | | |
1511 | | // Part 8: other pages' shared objects |
1512 | | |
1513 | | // Order is unimportant. |
1514 | 0 | for (auto const& og: lc_other_page_shared) { |
1515 | 0 | part8_.emplace_back(qpdf.getObject(og)); |
1516 | 0 | } |
1517 | | |
1518 | | // Part 9: other objects |
1519 | | |
1520 | | // The PDF specification makes recommendations on ordering here. We follow them only to a |
1521 | | // limited extent. Specifically, we put the pages tree first, then private thumbnail objects in |
1522 | | // page order, then shared thumbnail objects, and then outlines (unless in part 6). After that, |
1523 | | // we throw all remaining objects in arbitrary order. |
1524 | | |
1525 | | // Place the pages tree. |
1526 | 0 | auto& pages_ogs = obj_user_to_objects_[{ObjUser::ou_root_key, "/Pages"}]; |
1527 | 0 | no_ci_stop_if( |
1528 | 0 | pages_ogs.empty(), "found empty pages tree while calculating linearization data" // |
1529 | 0 | ); |
1530 | 0 | for (auto const& og: pages_ogs) { |
1531 | 0 | if (lc_other.erase(og)) { |
1532 | 0 | part9_.emplace_back(qpdf.getObject(og)); |
1533 | 0 | } |
1534 | 0 | } |
1535 | | |
1536 | | // Place private thumbnail images in page order. Slightly more information would be required if |
1537 | | // we were going to bother with thumbnail hint tables. |
1538 | 0 | for (size_t i = 0; i < npages; ++i) { |
1539 | 0 | QPDFObjectHandle thumb = uc_pages.at(i).getKey("/Thumb"); |
1540 | 0 | thumb = getUncompressedObject(thumb, object_stream_data); |
1541 | 0 | QPDFObjGen thumb_og(thumb.getObjGen()); |
1542 | | // Output the thumbnail itself |
1543 | 0 | if (lc_thumbnail_private.erase(thumb_og) && !thumb.null()) { |
1544 | 0 | part9_.emplace_back(thumb); |
1545 | 0 | } else { |
1546 | | // No internal error this time...there's nothing to stop this object from having |
1547 | | // been referred to somewhere else outside of a page's /Thumb, and if it had been, |
1548 | | // there's nothing to prevent it from having been in some set other than |
1549 | | // lc_thumbnail_private. |
1550 | 0 | } |
1551 | 0 | for (auto const& og: obj_user_to_objects_[{ObjUser::ou_thumb, i}]) { |
1552 | 0 | if (lc_thumbnail_private.erase(og)) { |
1553 | 0 | part9_.emplace_back(qpdf.getObject(og)); |
1554 | 0 | } |
1555 | 0 | } |
1556 | 0 | } |
1557 | 0 | util::assertion( |
1558 | 0 | lc_thumbnail_private.empty(), |
1559 | 0 | "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_thumbnail_private not " |
1560 | 0 | "empty after placing thumbnails" // |
1561 | 0 | ); |
1562 | | |
1563 | | // Place shared thumbnail objects |
1564 | 0 | for (auto const& og: lc_thumbnail_shared) { |
1565 | 0 | part9_.emplace_back(qpdf.getObject(og)); |
1566 | 0 | } |
1567 | | |
1568 | | // Place outlines unless in first page |
1569 | 0 | if (!outlines_in_first_page) { |
1570 | 0 | pushOutlinesToPart(part9_, lc_outlines, object_stream_data); |
1571 | 0 | } |
1572 | | |
1573 | | // Place all remaining objects |
1574 | 0 | for (auto const& og: lc_other) { |
1575 | 0 | part9_.emplace_back(qpdf.getObject(og)); |
1576 | 0 | } |
1577 | | |
1578 | | // Make sure we got everything exactly once. |
1579 | |
|
1580 | 0 | size_t num_placed = |
1581 | 0 | part4_.size() + part6_.size() + part7_.size() + part8_.size() + part9_.size(); |
1582 | 0 | size_t num_wanted = object_to_obj_users_.size(); |
1583 | 0 | no_ci_stop_if( |
1584 | | // This can happen with damaged files, e.g. if the root is part of the pages tree. |
1585 | 0 | num_placed != num_wanted, |
1586 | 0 | "QPDF::calculateLinearizationData: wrong number of objects placed (num_placed = " + |
1587 | 0 | std::to_string(num_placed) + "; number of objects: " + std::to_string(num_wanted) + |
1588 | 0 | "\nIf the file did not generate any other warnings please report this as a bug." // |
1589 | 0 | ); |
1590 | | |
1591 | | // Calculate shared object hint table information including references to shared objects from |
1592 | | // page offset hint data. |
1593 | | |
1594 | | // The shared object hint table consists of all part 6 (whether shared or not) in order followed |
1595 | | // by all part 8 objects in order. Add the objects to shared object data keeping a map of |
1596 | | // object number to index. Then populate the shared object information for the pages. |
1597 | | |
1598 | | // Note that two objects never have the same object number, so we can map from object number |
1599 | | // only without regards to generation. |
1600 | 0 | std::map<int, int> obj_to_index; |
1601 | |
|
1602 | 0 | c_shared_object_data_.nshared_first_page = toI(part6_.size()); |
1603 | 0 | c_shared_object_data_.nshared_total = |
1604 | 0 | c_shared_object_data_.nshared_first_page + toI(part8_.size()); |
1605 | |
|
1606 | 0 | std::vector<CHSharedObjectEntry>& shared = c_shared_object_data_.entries; |
1607 | 0 | for (auto& oh: part6_) { |
1608 | 0 | int obj = oh.getObjectID(); |
1609 | 0 | obj_to_index[obj] = toI(shared.size()); |
1610 | 0 | shared.emplace_back(obj); |
1611 | 0 | } |
1612 | 0 | QTC::TC("qpdf", "QPDF lin part 8 empty", part8_.empty() ? 1 : 0); |
1613 | 0 | if (!part8_.empty()) { |
1614 | 0 | c_shared_object_data_.first_shared_obj = part8_.at(0).getObjectID(); |
1615 | 0 | for (auto& oh: part8_) { |
1616 | 0 | int obj = oh.getObjectID(); |
1617 | 0 | obj_to_index[obj] = toI(shared.size()); |
1618 | 0 | shared.emplace_back(obj); |
1619 | 0 | } |
1620 | 0 | } |
1621 | 0 | no_ci_stop_if( |
1622 | 0 | std::cmp_not_equal( |
1623 | 0 | c_shared_object_data_.nshared_total, c_shared_object_data_.entries.size()), |
1624 | 0 | "shared object hint table has wrong number of entries" // |
1625 | 0 | ); |
1626 | | |
1627 | | // Now compute the list of shared objects for each page after the first page. |
1628 | |
|
1629 | 0 | for (size_t i = 1; i < npages; ++i) { |
1630 | 0 | CHPageOffsetEntry& pe = c_page_offset_data_.entries.at(i); |
1631 | 0 | ObjUser ou(ObjUser::ou_page, i); |
1632 | 0 | no_ci_stop_if( |
1633 | 0 | !obj_user_to_objects_.contains(ou), |
1634 | 0 | "found unreferenced page while calculating linearization data" // |
1635 | 0 | ); |
1636 | |
|
1637 | 0 | for (auto const& og: obj_user_to_objects_[ou]) { |
1638 | 0 | if (object_to_obj_users_[og].size() > 1 && obj_to_index.contains(og.getObj())) { |
1639 | 0 | int idx = obj_to_index[og.getObj()]; |
1640 | 0 | ++pe.nshared_objects; |
1641 | 0 | pe.shared_identifiers.push_back(idx); |
1642 | 0 | } |
1643 | 0 | } |
1644 | 0 | } |
1645 | 0 | } Unexecuted instantiation: void QPDF::Doc::Linearization::calculateLinearizationData<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&) Unexecuted instantiation: void QPDF::Doc::Linearization::calculateLinearizationData<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&) |
1646 | | |
1647 | | template <typename T> |
1648 | | void |
1649 | | Lin::pushOutlinesToPart( |
1650 | | std::vector<QPDFObjectHandle>& part, |
1651 | | std::set<QPDFObjGen>& lc_outlines, |
1652 | | T const& object_stream_data) |
1653 | 0 | { |
1654 | 0 | QPDFObjectHandle root = qpdf.getRoot(); |
1655 | 0 | QPDFObjectHandle outlines = root.getKey("/Outlines"); |
1656 | 0 | if (outlines.null()) { |
1657 | 0 | return; |
1658 | 0 | } |
1659 | 0 | outlines = getUncompressedObject(outlines, object_stream_data); |
1660 | 0 | QPDFObjGen outlines_og(outlines.getObjGen()); |
1661 | 0 | QTC::TC( |
1662 | 0 | "qpdf", |
1663 | 0 | "QPDF lin outlines in part", |
1664 | 0 | &part == &part6_ ? 0 |
1665 | 0 | : (&part == &part9_) ? 1 |
1666 | 0 | : 9999); // can't happen |
1667 | 0 | if (lc_outlines.erase(outlines_og)) { |
1668 | | // Make sure outlines is in lc_outlines in case the file is damaged. in which case it may be |
1669 | | // included in an earlier part. |
1670 | 0 | part.emplace_back(outlines); |
1671 | 0 | c_outline_data_.first_object = outlines_og.getObj(); |
1672 | 0 | c_outline_data_.nobjects = 1; |
1673 | 0 | } |
1674 | 0 | for (auto const& og: lc_outlines) { |
1675 | 0 | if (!c_outline_data_.first_object) { |
1676 | 0 | c_outline_data_.first_object = og.getObj(); |
1677 | 0 | } |
1678 | 0 | part.emplace_back(qpdf.getObject(og)); |
1679 | 0 | ++c_outline_data_.nobjects; |
1680 | 0 | } |
1681 | 0 | } Unexecuted instantiation: void QPDF::Doc::Linearization::pushOutlinesToPart<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&) Unexecuted instantiation: void QPDF::Doc::Linearization::pushOutlinesToPart<QPDFWriter::ObjTable>(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, QPDFWriter::ObjTable const&) |
1682 | | |
1683 | | void |
1684 | | Lin::parts( |
1685 | | QPDFWriter::ObjTable const& obj, |
1686 | | std::vector<QPDFObjectHandle>& part4, |
1687 | | std::vector<QPDFObjectHandle>& part6, |
1688 | | std::vector<QPDFObjectHandle>& part7, |
1689 | | std::vector<QPDFObjectHandle>& part8, |
1690 | | std::vector<QPDFObjectHandle>& part9) |
1691 | 0 | { |
1692 | 0 | calculateLinearizationData(obj); |
1693 | 0 | part4 = part4_; |
1694 | 0 | part6 = part6_; |
1695 | 0 | part7 = part7_; |
1696 | 0 | part8 = part8_; |
1697 | 0 | part9 = part9_; |
1698 | 0 | } |
1699 | | |
// Return the number of bits needed to hold val: the 1-based position of its highest set bit, or 0
// when val is 0 (e.g. nbits(1) == 1, nbits(255) == 8, nbits(256) == 9).
//
// The count is taken over the unsigned bit pattern of val. A signed right shift of a negative
// value never reaches zero, so counting on the signed value would not terminate; with the
// unsigned pattern a negative val simply reports the full width of int.
static inline int
nbits(int val)
{
    int count = 0;
    for (auto pattern = static_cast<unsigned int>(val); pattern != 0; pattern >>= 1) {
        ++count;
    }
    return count;
}
1705 | | |
1706 | | int |
1707 | | Lin::outputLengthNextN( |
1708 | | int in_object, int n, QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj) |
1709 | 0 | { |
1710 | | // Figure out the length of a series of n consecutive objects in the output file starting with |
1711 | | // whatever object in_object from the input file mapped to. |
1712 | |
|
1713 | 0 | int first = obj[in_object].renumber; |
1714 | 0 | int last = first + n; |
1715 | 0 | no_ci_stop_if( |
1716 | 0 | first <= 0, "found object that is not renumbered while writing linearization data"); |
1717 | 0 | qpdf_offset_t length = 0; |
1718 | 0 | for (int i = first; i < last; ++i) { |
1719 | 0 | auto l = new_obj[i].length; |
1720 | 0 | no_ci_stop_if( |
1721 | 0 | l == 0, "found item with unknown length while writing linearization data" // |
1722 | 0 | ); |
1723 | 0 | length += l; |
1724 | 0 | } |
1725 | 0 | return toI(length); |
1726 | 0 | } |
1727 | | |
1728 | | void |
1729 | | Lin::calculateHPageOffset(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj) |
1730 | 0 | { |
1731 | | // Page Offset Hint Table |
1732 | | |
1733 | | // We are purposely leaving some values set to their initial zero values. |
1734 | |
|
1735 | 0 | auto const& all_pages = pages.all(); |
1736 | 0 | size_t npages = all_pages.size(); |
1737 | 0 | CHPageOffset& cph = c_page_offset_data_; |
1738 | 0 | std::vector<CHPageOffsetEntry>& cphe = cph.entries; |
1739 | | |
1740 | | // Calculate minimum and maximum values for number of objects per page and page length. |
1741 | |
|
1742 | 0 | int min_nobjects = std::numeric_limits<int>::max(); |
1743 | 0 | int max_nobjects = 0; |
1744 | 0 | int min_length = std::numeric_limits<int>::max(); |
1745 | 0 | int max_length = 0; |
1746 | 0 | int max_shared = 0; |
1747 | |
|
1748 | 0 | HPageOffset& ph = page_offset_hints_; |
1749 | 0 | std::vector<HPageOffsetEntry>& phe = ph.entries; |
1750 | | // npages is the size of the existing pages array. |
1751 | 0 | phe = std::vector<HPageOffsetEntry>(npages); |
1752 | |
|
1753 | 0 | size_t i = 0; |
1754 | 0 | for (auto& phe_i: phe) { |
1755 | | // Calculate values for each page, assigning full values to the delta items. They will be |
1756 | | // adjusted later. |
1757 | | |
1758 | | // Repeat calculations for page 0 so we can assign to phe[i] without duplicating those |
1759 | | // assignments. |
1760 | |
|
1761 | 0 | int nobjects = cphe.at(i).nobjects; |
1762 | 0 | int length = outputLengthNextN(all_pages.at(i).getObjectID(), nobjects, new_obj, obj); |
1763 | 0 | int nshared = cphe.at(i).nshared_objects; |
1764 | |
|
1765 | 0 | min_nobjects = std::min(min_nobjects, nobjects); |
1766 | 0 | max_nobjects = std::max(max_nobjects, nobjects); |
1767 | 0 | min_length = std::min(min_length, length); |
1768 | 0 | max_length = std::max(max_length, length); |
1769 | 0 | max_shared = std::max(max_shared, nshared); |
1770 | |
|
1771 | 0 | phe_i.delta_nobjects = nobjects; |
1772 | 0 | phe_i.delta_page_length = length; |
1773 | 0 | phe_i.nshared_objects = nshared; |
1774 | 0 | ++i; |
1775 | 0 | } |
1776 | |
|
1777 | 0 | ph.min_nobjects = min_nobjects; |
1778 | 0 | ph.first_page_offset = new_obj[obj[all_pages.at(0)].renumber].xref.getOffset(); |
1779 | 0 | ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects); |
1780 | 0 | ph.min_page_length = min_length; |
1781 | 0 | ph.nbits_delta_page_length = nbits(max_length - min_length); |
1782 | 0 | ph.nbits_nshared_objects = nbits(max_shared); |
1783 | 0 | ph.nbits_shared_identifier = nbits(c_shared_object_data_.nshared_total); |
1784 | 0 | ph.shared_denominator = 4; // doesn't matter |
1785 | | |
1786 | | // It isn't clear how to compute content offset and content length. Since we are not |
1787 | | // interleaving page objects with the content stream, we'll use the same values for content |
1788 | | // length as page length. We will use 0 as content offset because this is what Adobe does |
1789 | | // (implementation note 127) and pdlin as well. |
1790 | 0 | ph.nbits_delta_content_length = ph.nbits_delta_page_length; |
1791 | 0 | ph.min_content_length = ph.min_page_length; |
1792 | |
|
1793 | 0 | i = 0; |
1794 | 0 | for (auto& phe_i: phe) { |
1795 | | // Adjust delta entries |
1796 | 0 | if (phe_i.delta_nobjects < min_nobjects || phe_i.delta_page_length < min_length) { |
1797 | 0 | stopOnError( |
1798 | 0 | "found too small delta nobjects or delta page length while writing " |
1799 | 0 | "linearization data"); |
1800 | 0 | } |
1801 | 0 | phe_i.delta_nobjects -= min_nobjects; |
1802 | 0 | phe_i.delta_page_length -= min_length; |
1803 | 0 | phe_i.delta_content_length = phe_i.delta_page_length; |
1804 | |
|
1805 | 0 | auto& si = cphe.at(i).shared_identifiers; |
1806 | 0 | phe_i.shared_identifiers.insert(phe_i.shared_identifiers.end(), si.begin(), si.end()); |
1807 | 0 | phe_i.shared_numerators.insert(phe_i.shared_numerators.end(), si.size(), 0); |
1808 | 0 | ++i; |
1809 | 0 | } |
1810 | 0 | } |
1811 | | |
1812 | | void |
1813 | | Lin::calculateHSharedObject(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj) |
1814 | 0 | { |
1815 | 0 | CHSharedObject& cso = c_shared_object_data_; |
1816 | 0 | std::vector<CHSharedObjectEntry>& csoe = cso.entries; |
1817 | 0 | HSharedObject& so = shared_object_hints_; |
1818 | 0 | std::vector<HSharedObjectEntry>& soe = so.entries; |
1819 | 0 | soe.clear(); |
1820 | |
|
1821 | 0 | int min_length = outputLengthNextN(csoe.at(0).object, 1, new_obj, obj); |
1822 | 0 | int max_length = min_length; |
1823 | |
|
1824 | 0 | for (size_t i = 0; i < toS(cso.nshared_total); ++i) { |
1825 | | // Assign absolute numbers to deltas; adjust later |
1826 | 0 | int length = outputLengthNextN(csoe.at(i).object, 1, new_obj, obj); |
1827 | 0 | min_length = std::min(min_length, length); |
1828 | 0 | max_length = std::max(max_length, length); |
1829 | 0 | soe.emplace_back(); |
1830 | 0 | soe.at(i).delta_group_length = length; |
1831 | 0 | } |
1832 | 0 | no_ci_stop_if( |
1833 | 0 | soe.size() != toS(cso.nshared_total), "soe has wrong size after initialization" // |
1834 | 0 | ); |
1835 | |
|
1836 | 0 | so.nshared_total = cso.nshared_total; |
1837 | 0 | so.nshared_first_page = cso.nshared_first_page; |
1838 | 0 | if (so.nshared_total > so.nshared_first_page) { |
1839 | 0 | so.first_shared_obj = obj[cso.first_shared_obj].renumber; |
1840 | 0 | so.min_group_length = min_length; |
1841 | 0 | so.first_shared_offset = new_obj[so.first_shared_obj].xref.getOffset(); |
1842 | 0 | } |
1843 | 0 | so.min_group_length = min_length; |
1844 | 0 | so.nbits_delta_group_length = nbits(max_length - min_length); |
1845 | |
|
1846 | 0 | for (size_t i = 0; i < toS(cso.nshared_total); ++i) { |
1847 | | // Adjust deltas |
1848 | 0 | no_ci_stop_if( |
1849 | 0 | soe.at(i).delta_group_length < min_length, |
1850 | 0 | "found too small group length while writing linearization data" // |
1851 | 0 | ); |
1852 | |
|
1853 | 0 | soe.at(i).delta_group_length -= min_length; |
1854 | 0 | } |
1855 | 0 | } |
1856 | | |
1857 | | void |
1858 | | Lin::calculateHOutline(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj) |
1859 | 0 | { |
1860 | 0 | HGeneric& cho = c_outline_data_; |
1861 | |
|
1862 | 0 | if (cho.nobjects == 0) { |
1863 | 0 | return; |
1864 | 0 | } |
1865 | | |
1866 | 0 | HGeneric& ho = outline_hints_; |
1867 | |
|
1868 | 0 | ho.first_object = obj[cho.first_object].renumber; |
1869 | 0 | ho.first_object_offset = new_obj[ho.first_object].xref.getOffset(); |
1870 | 0 | ho.nobjects = cho.nobjects; |
1871 | 0 | ho.group_length = outputLengthNextN(cho.first_object, ho.nobjects, new_obj, obj); |
1872 | 0 | } |
1873 | | |
1874 | | template <class T, class int_type> |
1875 | | static void |
1876 | | write_vector_int(BitWriter& w, int nitems, std::vector<T>& vec, int bits, int_type T::* field) |
1877 | 0 | { |
1878 | | // nitems times, write bits bits from the given field of the ith vector to the given bit writer. |
1879 | |
|
1880 | 0 | for (size_t i = 0; i < QIntC::to_size(nitems); ++i) { |
1881 | 0 | w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits)); |
1882 | 0 | } |
1883 | | // The PDF spec says that each hint table starts at a byte boundary. Each "row" actually must |
1884 | | // start on a byte boundary. |
1885 | 0 | w.flush(); |
1886 | 0 | } Unexecuted instantiation: QPDF_linearization.cc:void write_vector_int<QPDF::Doc::Linearization::HPageOffsetEntry, int>(BitWriter&, int, std::__1::vector<QPDF::Doc::Linearization::HPageOffsetEntry, std::__1::allocator<QPDF::Doc::Linearization::HPageOffsetEntry> >&, int, int QPDF::Doc::Linearization::HPageOffsetEntry::*) Unexecuted instantiation: QPDF_linearization.cc:void write_vector_int<QPDF::Doc::Linearization::HPageOffsetEntry, long long>(BitWriter&, int, std::__1::vector<QPDF::Doc::Linearization::HPageOffsetEntry, std::__1::allocator<QPDF::Doc::Linearization::HPageOffsetEntry> >&, int, long long QPDF::Doc::Linearization::HPageOffsetEntry::*) Unexecuted instantiation: QPDF_linearization.cc:void write_vector_int<QPDF::Doc::Linearization::HSharedObjectEntry, int>(BitWriter&, int, std::__1::vector<QPDF::Doc::Linearization::HSharedObjectEntry, std::__1::allocator<QPDF::Doc::Linearization::HSharedObjectEntry> >&, int, int QPDF::Doc::Linearization::HSharedObjectEntry::*) |
1887 | | |
1888 | | template <class T> |
1889 | | static void |
1890 | | write_vector_vector( |
1891 | | BitWriter& w, |
1892 | | int nitems1, |
1893 | | std::vector<T>& vec1, |
1894 | | int T::* nitems2, |
1895 | | int bits, |
1896 | | std::vector<int> T::* vec2) |
1897 | 0 | { |
1898 | | // nitems1 times, write nitems2 (from the ith element of vec1) items from the vec2 vector field |
1899 | | // of the ith item of vec1. |
1900 | 0 | for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) { |
1901 | 0 | for (size_t i2 = 0; i2 < QIntC::to_size(vec1.at(i1).*nitems2); ++i2) { |
1902 | 0 | w.writeBits(QIntC::to_ulonglong((vec1.at(i1).*vec2).at(i2)), QIntC::to_size(bits)); |
1903 | 0 | } |
1904 | 0 | } |
1905 | 0 | w.flush(); |
1906 | 0 | } |
1907 | | |
1908 | | void |
1909 | | Lin::writeHPageOffset(BitWriter& w) |
1910 | 0 | { |
1911 | 0 | HPageOffset& t = page_offset_hints_; |
1912 | |
|
1913 | 0 | w.writeBitsInt(t.min_nobjects, 32); // 1 |
1914 | 0 | w.writeBits(toULL(t.first_page_offset), 32); // 2 |
1915 | 0 | w.writeBitsInt(t.nbits_delta_nobjects, 16); // 3 |
1916 | 0 | w.writeBitsInt(t.min_page_length, 32); // 4 |
1917 | 0 | w.writeBitsInt(t.nbits_delta_page_length, 16); // 5 |
1918 | 0 | w.writeBits(toULL(t.min_content_offset), 32); // 6 |
1919 | 0 | w.writeBitsInt(t.nbits_delta_content_offset, 16); // 7 |
1920 | 0 | w.writeBitsInt(t.min_content_length, 32); // 8 |
1921 | 0 | w.writeBitsInt(t.nbits_delta_content_length, 16); // 9 |
1922 | 0 | w.writeBitsInt(t.nbits_nshared_objects, 16); // 10 |
1923 | 0 | w.writeBitsInt(t.nbits_shared_identifier, 16); // 11 |
1924 | 0 | w.writeBitsInt(t.nbits_shared_numerator, 16); // 12 |
1925 | 0 | w.writeBitsInt(t.shared_denominator, 16); // 13 |
1926 | |
|
1927 | 0 | int nitems = toI(pages.size()); |
1928 | 0 | std::vector<HPageOffsetEntry>& entries = t.entries; |
1929 | |
|
1930 | 0 | write_vector_int(w, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects); |
1931 | 0 | write_vector_int( |
1932 | 0 | w, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length); |
1933 | 0 | write_vector_int( |
1934 | 0 | w, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects); |
1935 | 0 | write_vector_vector( |
1936 | 0 | w, |
1937 | 0 | nitems, |
1938 | 0 | entries, |
1939 | 0 | &HPageOffsetEntry::nshared_objects, |
1940 | 0 | t.nbits_shared_identifier, |
1941 | 0 | &HPageOffsetEntry::shared_identifiers); |
1942 | 0 | write_vector_vector( |
1943 | 0 | w, |
1944 | 0 | nitems, |
1945 | 0 | entries, |
1946 | 0 | &HPageOffsetEntry::nshared_objects, |
1947 | 0 | t.nbits_shared_numerator, |
1948 | 0 | &HPageOffsetEntry::shared_numerators); |
1949 | 0 | write_vector_int( |
1950 | 0 | w, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset); |
1951 | 0 | write_vector_int( |
1952 | 0 | w, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length); |
1953 | 0 | } |
1954 | | |
1955 | | void |
1956 | | Lin::writeHSharedObject(BitWriter& w) |
1957 | 0 | { |
1958 | 0 | HSharedObject& t = shared_object_hints_; |
1959 | |
|
1960 | 0 | w.writeBitsInt(t.first_shared_obj, 32); // 1 |
1961 | 0 | w.writeBits(toULL(t.first_shared_offset), 32); // 2 |
1962 | 0 | w.writeBitsInt(t.nshared_first_page, 32); // 3 |
1963 | 0 | w.writeBitsInt(t.nshared_total, 32); // 4 |
1964 | 0 | w.writeBitsInt(t.nbits_nobjects, 16); // 5 |
1965 | 0 | w.writeBitsInt(t.min_group_length, 32); // 6 |
1966 | 0 | w.writeBitsInt(t.nbits_delta_group_length, 16); // 7 |
1967 | |
|
1968 | 0 | QTC::TC( |
1969 | 0 | "qpdf", |
1970 | 0 | "QPDF lin write nshared_total > nshared_first_page", |
1971 | 0 | (t.nshared_total > t.nshared_first_page) ? 1 : 0); |
1972 | |
|
1973 | 0 | int nitems = t.nshared_total; |
1974 | 0 | std::vector<HSharedObjectEntry>& entries = t.entries; |
1975 | |
|
1976 | 0 | write_vector_int( |
1977 | 0 | w, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length); |
1978 | 0 | write_vector_int(w, nitems, entries, 1, &HSharedObjectEntry::signature_present); |
1979 | 0 | for (size_t i = 0; i < toS(nitems); ++i) { |
1980 | | // If signature were present, we'd have to write a 128-bit hash. |
1981 | 0 | if (entries.at(i).signature_present != 0) { |
1982 | 0 | stopOnError("found unexpected signature present while writing linearization data"); |
1983 | 0 | } |
1984 | 0 | } |
1985 | 0 | write_vector_int(w, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one); |
1986 | 0 | } |
1987 | | |
1988 | | void |
1989 | | Lin::writeHGeneric(BitWriter& w, HGeneric& t) |
1990 | 0 | { |
1991 | 0 | w.writeBitsInt(t.first_object, 32); // 1 |
1992 | 0 | w.writeBits(toULL(t.first_object_offset), 32); // 2 |
1993 | 0 | w.writeBitsInt(t.nobjects, 32); // 3 |
1994 | 0 | w.writeBitsInt(t.group_length, 32); // 4 |
1995 | 0 | } |
1996 | | |
1997 | | void |
1998 | | Lin::generateHintStream( |
1999 | | QPDFWriter::NewObjTable const& new_obj, |
2000 | | QPDFWriter::ObjTable const& obj, |
2001 | | std::string& hint_buffer, |
2002 | | int& S, |
2003 | | int& O, |
2004 | | bool compressed) |
2005 | 0 | { |
2006 | | // Populate actual hint table values |
2007 | 0 | calculateHPageOffset(new_obj, obj); |
2008 | 0 | calculateHSharedObject(new_obj, obj); |
2009 | 0 | calculateHOutline(new_obj, obj); |
2010 | | |
2011 | | // Write the hint stream itself into a compressed memory buffer. Write through a counter so we |
2012 | | // can get offsets. |
2013 | 0 | pl::Count c(0, hint_buffer); |
2014 | 0 | BitWriter w(&c); |
2015 | |
|
2016 | 0 | writeHPageOffset(w); |
2017 | 0 | S = toI(c.getCount()); |
2018 | 0 | writeHSharedObject(w); |
2019 | 0 | O = 0; |
2020 | 0 | if (outline_hints_.nobjects > 0) { |
2021 | 0 | O = toI(c.getCount()); |
2022 | 0 | writeHGeneric(w, outline_hints_); |
2023 | 0 | } |
2024 | 0 | if (compressed) { |
2025 | 0 | hint_buffer = pl::pipe<Pl_Flate>(hint_buffer, Pl_Flate::a_deflate); |
2026 | 0 | } |
2027 | 0 | } |