/src/mupdf/source/pdf/pdf-unicode.c
Line | Count | Source |
1 | | // Copyright (C) 2004-2021 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "mupdf/pdf.h" |
25 | | |
26 | | #include <string.h> |
27 | | |
28 | | /* Load or synthesize ToUnicode map for fonts */ |
29 | | |
30 | | static void |
31 | | pdf_remap_cmap_range(fz_context *ctx, pdf_cmap *ucs_from_gid, |
32 | | unsigned int cpt, unsigned int gid, unsigned int n, pdf_cmap *ucs_from_cpt) |
33 | 2 | { |
34 | 2 | unsigned int k; |
35 | 2 | int ucsbuf[PDF_MRANGE_CAP]; |
36 | 2 | int ucslen; |
37 | | |
38 | 131k | for (k = 0; k <= n; ++k) |
39 | 131k | { |
40 | 131k | ucslen = pdf_lookup_cmap_full(ucs_from_cpt, cpt + k, ucsbuf); |
41 | 131k | if (ucslen == 1) |
42 | 14.4k | pdf_map_range_to_range(ctx, ucs_from_gid, gid + k, gid + k, ucsbuf[0]); |
43 | 116k | else if (ucslen > 1) |
44 | 3.20k | pdf_map_one_to_many(ctx, ucs_from_gid, gid + k, ucsbuf, ucslen); |
45 | 131k | } |
46 | 2 | } |
47 | | |
48 | | static pdf_cmap * |
49 | | pdf_remap_cmap(fz_context *ctx, pdf_cmap *gid_from_cpt, pdf_cmap *ucs_from_cpt) |
50 | 2 | { |
51 | 2 | pdf_cmap *ucs_from_gid; |
52 | 2 | unsigned int a, b, x; |
53 | 2 | int i; |
54 | | |
55 | 2 | ucs_from_gid = pdf_new_cmap(ctx); |
56 | | |
57 | 4 | fz_try(ctx) |
58 | 4 | { |
59 | 2 | if (gid_from_cpt->usecmap) |
60 | 0 | ucs_from_gid->usecmap = pdf_remap_cmap(ctx, gid_from_cpt->usecmap, ucs_from_cpt); |
61 | | |
62 | 2 | pdf_add_codespace(ctx, ucs_from_gid, 0, 0x7fffffff, 4); |
63 | | |
64 | 4 | for (i = 0; i < gid_from_cpt->rlen; ++i) |
65 | 2 | { |
66 | 2 | a = gid_from_cpt->ranges[i].low; |
67 | 2 | b = gid_from_cpt->ranges[i].high; |
68 | 2 | x = gid_from_cpt->ranges[i].out; |
69 | 2 | pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt); |
70 | 2 | } |
71 | | |
72 | 2 | for (i = 0; i < gid_from_cpt->xlen; ++i) |
73 | 0 | { |
74 | 0 | a = gid_from_cpt->xranges[i].low; |
75 | 0 | b = gid_from_cpt->xranges[i].high; |
76 | 0 | x = gid_from_cpt->xranges[i].out; |
77 | 0 | pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt); |
78 | 0 | } |
79 | | |
80 | | /* Font encoding CMaps don't have one-to-many mappings, so we can ignore the mranges. */ |
81 | | |
82 | 2 | pdf_sort_cmap(ctx, ucs_from_gid); |
83 | 2 | } |
84 | 4 | fz_catch(ctx) |
85 | 0 | { |
86 | 0 | pdf_drop_cmap(ctx, ucs_from_gid); |
87 | 0 | fz_rethrow(ctx); |
88 | 0 | } |
89 | | |
90 | 2 | return ucs_from_gid; |
91 | 2 | } |
92 | | |
93 | | void |
94 | | pdf_load_to_unicode(fz_context *ctx, pdf_document *doc, pdf_font_desc *font, |
95 | | const char **strings, char *collection, pdf_obj *cmapstm) |
96 | 4 | { |
97 | 4 | unsigned int cpt; |
98 | | |
99 | 4 | if (pdf_is_stream(ctx, cmapstm)) |
100 | 2 | { |
101 | 2 | pdf_cmap *ucs_from_cpt = pdf_load_embedded_cmap(ctx, doc, cmapstm); |
102 | 4 | fz_try(ctx) |
103 | 4 | font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt); |
104 | 4 | fz_always(ctx) |
105 | 2 | pdf_drop_cmap(ctx, ucs_from_cpt); |
106 | 2 | fz_catch(ctx) |
107 | 0 | fz_rethrow(ctx); |
108 | 2 | font->size += pdf_cmap_size(ctx, font->to_unicode); |
109 | 2 | } |
110 | | |
111 | 2 | else if (pdf_is_name(ctx, cmapstm)) |
112 | 0 | { |
113 | 0 | pdf_cmap *ucs_from_cpt = pdf_load_system_cmap(ctx, pdf_to_name(ctx, cmapstm)); |
114 | 0 | fz_try(ctx) |
115 | 0 | font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt); |
116 | 0 | fz_always(ctx) |
117 | 0 | pdf_drop_cmap(ctx, ucs_from_cpt); |
118 | 0 | fz_catch(ctx) |
119 | 0 | fz_rethrow(ctx); |
120 | 0 | font->size += pdf_cmap_size(ctx, font->to_unicode); |
121 | 0 | } |
122 | | |
123 | 2 | else if (collection) |
124 | 0 | { |
125 | 0 | if (!strcmp(collection, "Adobe-CNS1")) |
126 | 0 | font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-CNS1-UCS2"); |
127 | 0 | else if (!strcmp(collection, "Adobe-GB1")) |
128 | 0 | font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-GB1-UCS2"); |
129 | 0 | else if (!strcmp(collection, "Adobe-Japan1")) |
130 | 0 | font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Japan1-UCS2"); |
131 | 0 | else if (!strcmp(collection, "Adobe-Korea1")) |
132 | 0 | font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Korea1-UCS2"); |
133 | 0 | } |
134 | | |
135 | 4 | if (strings) |
136 | 2 | { |
137 | | /* TODO one-to-many mappings */ |
138 | | |
139 | 2 | font->cid_to_ucs = Memento_label(fz_malloc_array(ctx, 256, unsigned short), "cid_to_ucs"); |
140 | 2 | font->cid_to_ucs_len = 256; |
141 | 2 | font->size += 256 * sizeof *font->cid_to_ucs; |
142 | | |
143 | 514 | for (cpt = 0; cpt < 256; cpt++) |
144 | 512 | { |
145 | 512 | if (strings[cpt]) |
146 | 448 | font->cid_to_ucs[cpt] = fz_unicode_from_glyph_name(strings[cpt]); |
147 | 64 | else |
148 | 64 | font->cid_to_ucs[cpt] = FZ_REPLACEMENT_CHARACTER; |
149 | 512 | } |
150 | 2 | } |
151 | | |
152 | 4 | if (!font->to_unicode && !font->cid_to_ucs) |
153 | 0 | { |
154 | | /* TODO: synthesize a ToUnicode if it's a freetype font with |
155 | | * cmap and/or post tables or if it has glyph names. */ |
156 | 0 | } |
157 | 4 | } |