/work/workdir/UnpackedTarball/hyphen/hyphen.c
Line | Count | Source |
1 | | /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both |
2 | | * licenses follows. |
3 | | */ |
4 | | |
5 | | /* LibHnj - a library for high quality hyphenation and justification |
6 | | * Copyright (C) 1998 Raph Levien, |
7 | | * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), |
8 | | * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) |
9 | | * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) |
10 | | * |
11 | | * This library is free software; you can redistribute it and/or |
12 | | * modify it under the terms of the GNU Library General Public |
13 | | * License as published by the Free Software Foundation; either |
14 | | * version 2 of the License, or (at your option) any later version. |
15 | | * |
16 | | * This library is distributed in the hope that it will be useful, |
17 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | | * Library General Public License for more details. |
20 | | * |
21 | | * You should have received a copy of the GNU Library General Public |
22 | | * License along with this library; if not, write to the |
23 | | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
24 | | * Boston, MA 02111-1307 USA. |
25 | | */ |
26 | | |
27 | | /* |
28 | | * The contents of this file are subject to the Mozilla Public License |
29 | | * Version 1.0 (the "MPL"); you may not use this file except in |
30 | | * compliance with the MPL. You may obtain a copy of the MPL at |
31 | | * http://www.mozilla.org/MPL/ |
32 | | * |
33 | | * Software distributed under the MPL is distributed on an "AS IS" basis, |
34 | | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL |
35 | | * for the specific language governing rights and limitations under the |
36 | | * MPL. |
37 | | * |
38 | | */ |
39 | | #include <stdlib.h> /* for NULL, malloc */ |
40 | | #include <stdio.h> /* for fprintf */ |
41 | | #include <string.h> /* for strdup */ |
42 | | #include <limits.h> /* for INT_MAX */ |
43 | | |
44 | | #ifdef UNX |
45 | | #include <unistd.h> /* for exit */ |
46 | | #endif |
47 | | |
48 | | #ifdef _WIN32 |
49 | | #include <windows.h> |
50 | | #include <wchar.h> |
51 | | #endif |
52 | | |
53 | | #define noVERBOSE |
54 | | |
55 | | /* calculate hyphenmin values with long ligature length (2 or 3 characters |
56 | | * instead of 1 or 2) for comparison with hyphenation without ligatures */ |
57 | | #define noLONG_LIGATURE |
58 | | |
59 | | #ifdef LONG_LIGATURE |
60 | | #define LIG_xx 1 |
61 | | #define LIG_xxx 2 |
62 | | #else |
63 | 0 | #define LIG_xx 0 |
64 | 0 | #define LIG_xxx 1 |
65 | | #endif |
66 | | |
67 | | #include "hnjalloc.h" |
68 | | #include "hyphen.h" |
69 | | |
/* Return a copy of the NUL-terminated string s in storage obtained from
 * hnj_malloc.  The copy belongs to the caller (the hash table releases its
 * keys with hnj_free). */
static char *
hnj_strdup (const char *s)
{
  int len = strlen (s);
  char *copy = (char *) hnj_malloc (len + 1);

  memcpy (copy, s, len);
  copy[len] = 0;
  return copy;
}
82 | | |
/* Remove one trailing line terminator from s, in place.
 *
 * Recognised terminators are "\n", "\r" and "\r\n" (after a trailing '\r'
 * or '\n' has been removed, a directly preceding '\r' is removed as well).
 * A string that does not end in '\r' or '\n' is left untouched; in
 * particular a '\r' in the second-to-last position is no longer cut off
 * when the last character is an ordinary one.
 */
void hnj_strchomp(char * s)
{
  size_t len = strlen(s);

  if (len == 0) return;
  if (s[len - 1] != '\r' && s[len - 1] != '\n') return;

  s[--len] = '\0';
  /* the '\r' half of a "\r\n" pair */
  if (len > 0 && s[len - 1] == '\r') s[len - 1] = '\0';
}
90 | | |
/* A small chained hash table.  It only maps strings to state numbers. */

/* Number of buckets.  A cheap, but effective, hack. */
#define HASH_SIZE 31627

typedef struct _HashEntry HashEntry;
typedef struct _HashTab HashTab;

/* One key/value pair; entries of the same bucket are chained through next. */
struct _HashEntry {
  HashEntry *next;
  char *key;
  int val;
};

/* The table itself: one chain head per bucket. */
struct _HashTab {
  HashEntry *entries[HASH_SIZE];
};
109 | | |
/* String hash function from ASU, adapted from Gtk+.
 * Returns the raw hash; callers reduce it modulo the table size. */
static unsigned int
hnj_string_hash (const char *s)
{
  unsigned int hash = 0;

  while (*s != '\0') {
    unsigned int top;

    hash = (hash << 4) + *s++;
    /* fold the four highest bits back into the low part and clear them */
    top = hash & 0xf0000000;
    if (top != 0) {
      hash ^= top >> 24;
      hash ^= top;
    }
  }
  return hash /* % M */;
}
125 | | |
126 | | static HashTab * |
127 | | hnj_hash_new (void) |
128 | 0 | { |
129 | 0 | HashTab *hashtab; |
130 | 0 | int i; |
131 | |
|
132 | 0 | hashtab = (HashTab *) hnj_malloc (sizeof(HashTab)); |
133 | 0 | for (i = 0; i < HASH_SIZE; i++) |
134 | 0 | hashtab->entries[i] = NULL; |
135 | |
|
136 | 0 | return hashtab; |
137 | 0 | } |
138 | | |
139 | | static void |
140 | | hnj_hash_free (HashTab *hashtab) |
141 | 0 | { |
142 | 0 | int i; |
143 | 0 | HashEntry *e, *next; |
144 | |
|
145 | 0 | for (i = 0; i < HASH_SIZE; i++) |
146 | 0 | for (e = hashtab->entries[i]; e; e = next) |
147 | 0 | { |
148 | 0 | next = e->next; |
149 | 0 | hnj_free (e->key); |
150 | 0 | hnj_free (e); |
151 | 0 | } |
152 | |
|
153 | 0 | hnj_free (hashtab); |
154 | 0 | } |
155 | | |
156 | | /* assumes that key is not already present! */ |
157 | | static void |
158 | | hnj_hash_insert (HashTab *hashtab, const char *key, int val) |
159 | 0 | { |
160 | 0 | int i; |
161 | 0 | HashEntry *e; |
162 | |
|
163 | 0 | i = hnj_string_hash (key) % HASH_SIZE; |
164 | 0 | e = (HashEntry *) hnj_malloc (sizeof(HashEntry)); |
165 | 0 | e->next = hashtab->entries[i]; |
166 | 0 | e->key = hnj_strdup (key); |
167 | 0 | e->val = val; |
168 | 0 | hashtab->entries[i] = e; |
169 | 0 | } |
170 | | |
171 | | /* return val if found, otherwise -1 */ |
172 | | static int |
173 | | hnj_hash_lookup (HashTab *hashtab, const char *key) |
174 | 0 | { |
175 | 0 | int i; |
176 | 0 | HashEntry *e; |
177 | 0 | i = hnj_string_hash (key) % HASH_SIZE; |
178 | 0 | for (e = hashtab->entries[i]; e; e = e->next) |
179 | 0 | if (!strcmp (key, e->key)) |
180 | 0 | return e->val; |
181 | 0 | return -1; |
182 | 0 | } |
183 | | |
184 | | /* Get the state number, allocating a new state if necessary. */ |
185 | | static int |
186 | | hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) |
187 | 0 | { |
188 | 0 | int state_num; |
189 | |
|
190 | 0 | state_num = hnj_hash_lookup (hashtab, string); |
191 | |
|
192 | 0 | if (state_num >= 0) |
193 | 0 | return state_num; |
194 | | |
195 | 0 | hnj_hash_insert (hashtab, string, dict->num_states); |
196 | | /* predicate is true if dict->num_states is a power of two */ |
197 | 0 | if (!(dict->num_states & (dict->num_states - 1))) |
198 | 0 | { |
199 | 0 | dict->states = (HyphenState *) hnj_realloc (dict->states, |
200 | 0 | (dict->num_states << 1) * |
201 | 0 | sizeof(HyphenState)); |
202 | 0 | } |
203 | 0 | dict->states[dict->num_states].match = NULL; |
204 | 0 | dict->states[dict->num_states].repl = NULL; |
205 | 0 | dict->states[dict->num_states].fallback_state = -1; |
206 | 0 | dict->states[dict->num_states].num_trans = 0; |
207 | 0 | dict->states[dict->num_states].trans = NULL; |
208 | 0 | return dict->num_states++; |
209 | 0 | } |
210 | | |
211 | | /* add a transition from state1 to state2 through ch - assumes that the |
212 | | transition does not already exist */ |
213 | | static void |
214 | | hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) |
215 | 0 | { |
216 | 0 | int num_trans; |
217 | |
|
218 | 0 | num_trans = dict->states[state1].num_trans; |
219 | 0 | if (num_trans == 0) |
220 | 0 | { |
221 | 0 | dict->states[state1].trans = (HyphenTrans *) hnj_malloc (sizeof(HyphenTrans)); |
222 | 0 | } |
223 | 0 | else if (!(num_trans & (num_trans - 1))) |
224 | 0 | { |
225 | 0 | dict->states[state1].trans = (HyphenTrans *) hnj_realloc (dict->states[state1].trans, |
226 | 0 | (num_trans << 1) * |
227 | 0 | sizeof(HyphenTrans)); |
228 | 0 | } |
229 | 0 | dict->states[state1].trans[num_trans].ch = ch; |
230 | 0 | dict->states[state1].trans[num_trans].new_state = state2; |
231 | 0 | dict->states[state1].num_trans++; |
232 | 0 | } |
233 | | |
#ifdef VERBOSE
/* Debug only: the hash tables of both dictionary levels, kept so that state
 * numbers can be translated back to their strings.  The loader stores one
 * table per level (indices 0 and 1), so the array needs two elements. */
HashTab *global[2];

/* Debug only: return the key string of the given state number in the hash
 * table of the given level, or NULL when no entry has that value.  The
 * returned pointer refers to the table's own key and must not be freed. */
static char *
get_state_str (int state, int level)
{
  int i;
  HashEntry *e;

  for (i = 0; i < HASH_SIZE; i++)
    for (e = global[level]->entries[i]; e; e = e->next)
      if (e->val == state)
        return e->key;
  return NULL;
}
#endif
250 | | |
251 | 0 | void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) { |
252 | 0 | int i, j; |
253 | 0 | char word[MAX_CHARS]; |
254 | 0 | char pattern[MAX_CHARS]; |
255 | 0 | char * repl; |
256 | 0 | signed char replindex; |
257 | 0 | signed char replcut; |
258 | 0 | int state_num = 0; |
259 | 0 | int last_state; |
260 | 0 | char ch; |
261 | 0 | int found; |
262 | |
|
263 | 0 | if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { |
264 | 0 | dict->lhmin = atoi(buf + 13); |
265 | 0 | return; |
266 | 0 | } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { |
267 | 0 | dict->rhmin = atoi(buf + 14); |
268 | 0 | return; |
269 | 0 | } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { |
270 | 0 | dict->clhmin = atoi(buf + 21); |
271 | 0 | return; |
272 | 0 | } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { |
273 | 0 | dict->crhmin = atoi(buf + 22); |
274 | 0 | return; |
275 | 0 | } else if (strncmp(buf, "NOHYPHEN", 8) == 0) { |
276 | 0 | char * space = buf + 8; |
277 | 0 | while (*space != '\0' && (*space == ' ' || *space == '\t')) space++; |
278 | 0 | if (dict->nohyphen) hnj_free(dict->nohyphen); |
279 | 0 | dict->nohyphen = NULL; |
280 | 0 | dict->nohyphenl = 0; |
281 | 0 | if (*space != '\0') dict->nohyphen = hnj_strdup(space); |
282 | 0 | if (dict->nohyphen) { |
283 | 0 | char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1; |
284 | 0 | *nhe = 0; |
285 | 0 | for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) { |
286 | 0 | if (*nhe == ',') { |
287 | 0 | dict->nohyphenl++; |
288 | 0 | *nhe = 0; |
289 | 0 | } |
290 | 0 | } |
291 | 0 | } |
292 | 0 | return; |
293 | 0 | } |
294 | 0 | j = 0; |
295 | 0 | pattern[j] = '0'; |
296 | 0 | repl = strchr(buf, '/'); |
297 | 0 | replindex = 0; |
298 | 0 | replcut = 0; |
299 | 0 | if (repl) { |
300 | 0 | char * index = strchr(repl + 1, ','); |
301 | 0 | *repl = '\0'; |
302 | 0 | if (index) { |
303 | 0 | char * index2 = strchr(index + 1, ','); |
304 | 0 | *index = '\0'; |
305 | 0 | if (index2) { |
306 | 0 | *index2 = '\0'; |
307 | 0 | replindex = (signed char) atoi(index + 1) - 1; |
308 | 0 | replcut = (signed char) atoi(index2 + 1); |
309 | 0 | } |
310 | 0 | } else { |
311 | 0 | hnj_strchomp(repl + 1); |
312 | 0 | replindex = 0; |
313 | 0 | replcut = (signed char) strlen(buf); |
314 | 0 | } |
315 | 0 | repl = hnj_strdup(repl + 1); |
316 | 0 | } |
317 | 0 | for (i = 0; (unsigned char)buf[i] > (unsigned char)' ' && j < MAX_CHARS - 2; i++) |
318 | 0 | { |
319 | 0 | if (buf[i] >= '0' && buf[i] <= '9') |
320 | 0 | pattern[j] = buf[i]; |
321 | 0 | else |
322 | 0 | { |
323 | 0 | word[j] = buf[i]; |
324 | 0 | pattern[++j] = '0'; |
325 | 0 | } |
326 | 0 | } |
327 | 0 | word[j] = '\0'; |
328 | 0 | pattern[j + 1] = '\0'; |
329 | |
|
330 | 0 | i = 0; |
331 | 0 | if (!repl) { |
332 | | /* Optimize away leading zeroes */ |
333 | 0 | for (; pattern[i] == '0'; i++); |
334 | 0 | } else { |
335 | 0 | if (*word == '.') i++; |
336 | | /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ |
337 | 0 | if (dict->utf8) { |
338 | 0 | int pu = -1; /* unicode character position */ |
339 | 0 | int ps = -1; /* unicode start position (original replindex) */ |
340 | 0 | size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */ |
341 | 0 | for (; pc < (strlen(word) + 1); pc++) { |
342 | | /* beginning of an UTF-8 character (not '10' start bits) */ |
343 | 0 | if ((((unsigned char) word[pc]) >> 6) != 2) pu++; |
344 | 0 | if ((ps < 0) && (replindex == pu)) { |
345 | 0 | ps = replindex; |
346 | 0 | replindex = (signed char) pc; |
347 | 0 | } |
348 | 0 | if ((ps >= 0) && ((pu - ps) == replcut)) { |
349 | 0 | replcut = (signed char) (pc - replindex); |
350 | 0 | break; |
351 | 0 | } |
352 | 0 | } |
353 | 0 | if (*word == '.') replindex--; |
354 | 0 | } |
355 | 0 | } |
356 | |
|
357 | | #ifdef VERBOSE |
358 | | printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); |
359 | | #endif |
360 | 0 | found = hnj_hash_lookup (hashtab, word); |
361 | 0 | state_num = hnj_get_state (dict, hashtab, word); |
362 | 0 | if (dict->states[state_num].match) hnj_free (dict->states[state_num].match); |
363 | 0 | if (dict->states[state_num].repl) hnj_free (dict->states[state_num].repl); |
364 | 0 | dict->states[state_num].match = hnj_strdup (pattern + i); |
365 | 0 | dict->states[state_num].repl = repl; |
366 | 0 | dict->states[state_num].replindex = replindex; |
367 | 0 | if (!replcut) { |
368 | 0 | dict->states[state_num].replcut = (signed char) strlen(word); |
369 | 0 | } else { |
370 | 0 | dict->states[state_num].replcut = replcut; |
371 | 0 | } |
372 | | |
373 | | /* now, put in the prefix transitions */ |
374 | 0 | for (; found < 0 && j > 0; --j) |
375 | 0 | { |
376 | 0 | last_state = state_num; |
377 | 0 | ch = word[j - 1]; |
378 | 0 | word[j - 1] = '\0'; |
379 | 0 | found = hnj_hash_lookup (hashtab, word); |
380 | 0 | state_num = hnj_get_state (dict, hashtab, word); |
381 | 0 | hnj_add_trans (dict, state_num, last_state, ch); |
382 | 0 | } |
383 | 0 | } |
384 | | |
/* Open the file at path with the given fopen() mode.
 *
 * On Windows a UTF-8 path that starts with the long path prefix "\\?\" is
 * converted to UTF-16, made absolute and opened with _wfopen(); the mode
 * "r" is kept and any other mode is opened as "rb".  Every other path (and
 * every path on other platforms) goes straight to fopen().
 *
 * Returns the stream, or NULL when the file cannot be opened or, in the
 * Windows branch, when the path cannot be converted or memory runs out.
 */
FILE * hnj_fopen(const char * path, const char * mode) {
#ifdef _WIN32
#define WIN32_LONG_PATH_PREFIX "\\\\?\\"
    if (strncmp(path, WIN32_LONG_PATH_PREFIX, 4) == 0) {
        FILE * f = NULL;
        wchar_t *buff = NULL;
        wchar_t *buff2 = NULL;
        /* size of the converted path in wide characters, terminator included */
        int len = MultiByteToWideChar(CP_UTF8, 0, path, -1, NULL, 0);
        if (len <= 0)
            return NULL;
        buff = (wchar_t *) malloc(len * sizeof(wchar_t));
        buff2 = (wchar_t *) malloc(len * sizeof(wchar_t));
        if (buff != NULL && buff2 != NULL &&
            MultiByteToWideChar(CP_UTF8, 0, path, -1, buff, len) != 0 &&
            _wfullpath( buff2, buff, len ) != NULL) {
            f = _wfopen(buff2, (strcmp(mode, "r") == 0) ? L"r" : L"rb");
        }
        free(buff);
        free(buff2);
        return f;
    }
#endif
    return fopen(path, mode);
}
404 | | |
405 | | HyphenDict * |
406 | | hnj_hyphen_load (const char *fn) |
407 | 0 | { |
408 | 0 | HyphenDict *result; |
409 | 0 | FILE *f; |
410 | 0 | f = hnj_fopen (fn, "r"); |
411 | 0 | if (f == NULL) |
412 | 0 | return NULL; |
413 | | |
414 | 0 | result = hnj_hyphen_load_file(f); |
415 | |
|
416 | 0 | fclose(f); |
417 | 0 | return result; |
418 | 0 | } |
419 | | |
/* Signature of the line readers handed to hnj_hyphen_load_impl.  A reader
 * stores the next line in buf (at most n bytes including the terminating
 * NUL; the '\n' is kept when there is one) and returns buf, or returns NULL
 * at the end of the input.  Lines that do not fit are skipped with a
 * warning. */
typedef char *(*hnj_get_line_fn)(char *buf, int n, void *ctx);

/* Read position of the in-memory line reader. */
typedef struct {
  const char *p;  /* next unread byte */
  size_t n;       /* number of unread bytes */
} hnj_mem_ctx;
426 | | |
/* Line reader for a stdio stream; ctx is the FILE * to read from.
 * Returns buf holding the next line that fits into n bytes, or NULL when
 * the stream is exhausted.  A line that does not fit is discarded up to its
 * newline; a warning is printed unless the line starts with '%'. */
static char *hnj_get_line_file(char *buf, int n, void *ctx) {
  FILE *stream = (FILE *)ctx;

  for (;;) {
    int c;

    if (fgets(buf, n, stream) == NULL)
      return NULL;
    /* a newline in buf, or having hit end of file, means the line is whole */
    if (feof(stream) || strchr(buf, '\n') != NULL)
      return buf;

    /* too long: swallow the remainder of the line and try the next one */
    do {
      c = fgetc(stream);
    } while (c != '\n' && c != EOF);
    if (buf[0] != '%')
      fprintf(stderr, "Warning: skipping too long pattern (more than %d chars)\n", n);
  }
}
441 | | |
442 | 0 | static char *hnj_get_line_mem(char *buf, int n, void *ctx) { |
443 | 0 | hnj_mem_ctx *m = (hnj_mem_ctx *)ctx; |
444 | 0 | while (m->n > 0) { |
445 | 0 | size_t cap = (size_t)(n - 1); |
446 | 0 | size_t i = 0; |
447 | 0 | while (i < m->n && i < cap && m->p[i] != '\n') i++; |
448 | 0 | int has_nl = (i < cap && i < m->n && m->p[i] == '\n'); |
449 | 0 | size_t copy = has_nl ? i + 1 : i; |
450 | 0 | memcpy(buf, m->p, copy); |
451 | 0 | buf[copy] = '\0'; |
452 | 0 | m->p += copy; |
453 | 0 | m->n -= copy; |
454 | 0 | if (!has_nl && m->n > 0) { |
455 | 0 | while (m->n > 0 && *m->p != '\n') { m->p++; m->n--; } |
456 | 0 | if (m->n > 0) { m->p++; m->n--; } |
457 | 0 | if (buf[0] != '%') |
458 | 0 | fprintf(stderr, "Warning: skipping too long pattern (more than %d chars)\n", n); |
459 | 0 | continue; |
460 | 0 | } |
461 | 0 | return buf; |
462 | 0 | } |
463 | 0 | return NULL; |
464 | 0 | } |
465 | | |
466 | | static HyphenDict * |
467 | | hnj_hyphen_load_impl (hnj_get_line_fn get_line, void *ctx) |
468 | 0 | { |
469 | 0 | HyphenDict *dict[2]; |
470 | 0 | HashTab *hashtab; |
471 | 0 | char buf[MAX_CHARS]; |
472 | 0 | int nextlevel = 0; |
473 | 0 | int i, j, k; |
474 | 0 | HashEntry *e; |
475 | 0 | int state_num = 0; |
476 | | /* loading one or two dictionaries (separated by NEXTLEVEL keyword) */ |
477 | 0 | for (k = 0; k < 2; k++) { |
478 | 0 | hashtab = hnj_hash_new (); |
479 | | #ifdef VERBOSE |
480 | | global[k] = hashtab; |
481 | | #endif |
482 | 0 | hnj_hash_insert (hashtab, "", 0); |
483 | 0 | dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict)); |
484 | 0 | dict[k]->num_states = 1; |
485 | 0 | dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState)); |
486 | 0 | dict[k]->states[0].match = NULL; |
487 | 0 | dict[k]->states[0].repl = NULL; |
488 | 0 | dict[k]->states[0].fallback_state = -1; |
489 | 0 | dict[k]->states[0].num_trans = 0; |
490 | 0 | dict[k]->states[0].trans = NULL; |
491 | 0 | dict[k]->nextlevel = NULL; |
492 | 0 | dict[k]->lhmin = 0; |
493 | 0 | dict[k]->rhmin = 0; |
494 | 0 | dict[k]->clhmin = 0; |
495 | 0 | dict[k]->crhmin = 0; |
496 | 0 | dict[k]->nohyphen = NULL; |
497 | 0 | dict[k]->nohyphenl = 0; |
498 | | |
499 | | /* read in character set info */ |
500 | 0 | if (k == 0) { |
501 | 0 | for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; |
502 | 0 | if (get_line(dict[k]->cset, sizeof(dict[k]->cset), ctx) != NULL) { |
503 | 0 | for (i=0;i<MAX_NAME;i++) |
504 | 0 | if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) |
505 | 0 | dict[k]->cset[i] = 0; |
506 | 0 | } else { |
507 | 0 | dict[k]->cset[0] = 0; |
508 | 0 | } |
509 | 0 | dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); |
510 | 0 | } else { |
511 | 0 | strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1); |
512 | 0 | dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0'; |
513 | 0 | dict[k]->utf8 = dict[0]->utf8; |
514 | 0 | } |
515 | |
|
516 | 0 | if (k == 0 || nextlevel) { |
517 | 0 | while (get_line(buf, sizeof(buf), ctx) != NULL) { |
518 | 0 | if (strncmp(buf, "NEXTLEVEL", 9) == 0) { |
519 | 0 | nextlevel = 1; |
520 | 0 | break; |
521 | 0 | } else if (buf[0] != '%') { |
522 | 0 | hnj_hyphen_load_line(buf, dict[k], hashtab); |
523 | 0 | } |
524 | 0 | } |
525 | 0 | } else if (k == 1) { |
526 | | /* default first level: hyphen and ASCII apostrophe */ |
527 | 0 | if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab); |
528 | 0 | else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab); |
529 | 0 | strncpy(buf, "1-1\n", MAX_CHARS-1); /* buf rewritten by hnj_hyphen_load here */ |
530 | 0 | buf[MAX_CHARS-1] = '\0'; |
531 | 0 | hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */ |
532 | 0 | hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */ |
533 | 0 | if (dict[0]->utf8) { |
534 | 0 | hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */ |
535 | 0 | hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */ |
536 | 0 | } |
537 | 0 | } |
538 | | |
539 | | /* Could do unioning of matches here (instead of the preprocessor script). |
540 | | If we did, the pseudocode would look something like this: |
541 | | |
542 | | foreach state in the hash table |
543 | | foreach i = [1..length(state) - 1] |
544 | | state to check is substr (state, i) |
545 | | look it up |
546 | | if found, and if there is a match, union the match in. |
547 | | |
548 | | It's also possible to avoid the quadratic blowup by doing the |
549 | | search in order of increasing state string sizes - then you |
550 | | can break the loop after finding the first match. |
551 | | |
552 | | This step should be optional in any case - if there is a |
553 | | preprocessed rule table, it's always faster to use that. |
554 | | |
555 | | */ |
556 | | |
557 | | /* put in the fallback states */ |
558 | 0 | for (i = 0; i < HASH_SIZE; i++) |
559 | 0 | for (e = hashtab->entries[i]; e; e = e->next) |
560 | 0 | { |
561 | 0 | if (*(e->key)) for (j = 1; 1; j++) |
562 | 0 | { |
563 | 0 | state_num = hnj_hash_lookup (hashtab, e->key + j); |
564 | 0 | if (state_num >= 0) |
565 | 0 | break; |
566 | 0 | } |
567 | | /* KBH: FIXME state 0 fallback_state should always be -1? */ |
568 | 0 | if (e->val) |
569 | 0 | dict[k]->states[e->val].fallback_state = state_num; |
570 | 0 | } |
571 | | #ifdef VERBOSE |
572 | | for (i = 0; i < HASH_SIZE; i++) |
573 | | for (e = hashtab->entries[i]; e; e = e->next) |
574 | | { |
575 | | printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, |
576 | | dict[k]->states[e->val].fallback_state); |
577 | | for (j = 0; j < dict[k]->states[e->val].num_trans; j++) |
578 | | printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, |
579 | | dict[k]->states[e->val].trans[j].new_state); |
580 | | } |
581 | | #endif |
582 | |
|
583 | 0 | #ifndef VERBOSE |
584 | 0 | hnj_hash_free (hashtab); |
585 | 0 | #endif |
586 | 0 | state_num = 0; |
587 | 0 | } |
588 | 0 | if (nextlevel) dict[0]->nextlevel = dict[1]; |
589 | 0 | else { |
590 | 0 | dict[1] -> nextlevel = dict[0]; |
591 | 0 | dict[1]->lhmin = dict[0]->lhmin; |
592 | 0 | dict[1]->rhmin = dict[0]->rhmin; |
593 | 0 | dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3); |
594 | 0 | dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3); |
595 | | #ifdef VERBOSE |
596 | | HashTab *r = global[0]; |
597 | | global[0] = global[1]; |
598 | | global[1] = r; |
599 | | #endif |
600 | 0 | return dict[1]; |
601 | 0 | } |
602 | 0 | return dict[0]; |
603 | 0 | } |
604 | | |
605 | | HyphenDict * |
606 | | hnj_hyphen_load_file (FILE *f) |
607 | 0 | { |
608 | 0 | return hnj_hyphen_load_impl(hnj_get_line_file, f); |
609 | 0 | } |
610 | | |
611 | | HyphenDict * |
612 | | hnj_hyphen_load_data (const char *fdata, size_t flen) |
613 | 0 | { |
614 | 0 | hnj_mem_ctx ctx = { fdata, flen }; |
615 | 0 | return hnj_hyphen_load_impl(hnj_get_line_mem, &ctx); |
616 | 0 | } |
617 | | |
618 | | void hnj_hyphen_free (HyphenDict *dict) |
619 | 0 | { |
620 | 0 | int state_num; |
621 | 0 | HyphenState *hstate; |
622 | |
|
623 | 0 | for (state_num = 0; state_num < dict->num_states; state_num++) |
624 | 0 | { |
625 | 0 | hstate = &dict->states[state_num]; |
626 | 0 | if (hstate->match) |
627 | 0 | hnj_free (hstate->match); |
628 | 0 | if (hstate->repl) |
629 | 0 | hnj_free (hstate->repl); |
630 | 0 | if (hstate->trans) |
631 | 0 | hnj_free (hstate->trans); |
632 | 0 | } |
633 | 0 | if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); |
634 | |
|
635 | 0 | if (dict->nohyphen) hnj_free(dict->nohyphen); |
636 | |
|
637 | 0 | hnj_free (dict->states); |
638 | |
|
639 | 0 | hnj_free (dict); |
640 | 0 | } |
641 | | |
642 | | #define MAX_WORD 256 |
643 | | |
644 | | int hnj_hyphen_hyphenate (HyphenDict *dict, |
645 | | const char *word, int word_size, |
646 | | char *hyphens) |
647 | 0 | { |
648 | 0 | char *prep_word; |
649 | 0 | int i, j, k; |
650 | 0 | int state; |
651 | 0 | char ch; |
652 | 0 | HyphenState *hstate; |
653 | 0 | char *match; |
654 | 0 | int offset; |
655 | |
|
656 | 0 | prep_word = (char*) hnj_malloc (word_size + 3); |
657 | |
|
658 | 0 | j = 0; |
659 | 0 | prep_word[j++] = '.'; |
660 | |
|
661 | 0 | for (i = 0; i < word_size; i++) { |
662 | 0 | if (word[i] <= '9' && word[i] >= '0') { |
663 | 0 | prep_word[j++] = '.'; |
664 | 0 | } else { |
665 | 0 | prep_word[j++] = word[i]; |
666 | 0 | } |
667 | 0 | } |
668 | |
|
669 | 0 | prep_word[j++] = '.'; |
670 | 0 | prep_word[j] = '\0'; |
671 | |
|
672 | 0 | for (i = 0; i < word_size + 5; i++) |
673 | 0 | hyphens[i] = '0'; |
674 | |
|
675 | | #ifdef VERBOSE |
676 | | printf ("prep_word = %s\n", prep_word); |
677 | | #endif |
678 | | |
679 | | /* now, run the finite state machine */ |
680 | 0 | state = 0; |
681 | 0 | for (i = 0; i < j; i++) |
682 | 0 | { |
683 | 0 | ch = prep_word[i]; |
684 | 0 | for (;;) |
685 | 0 | { |
686 | |
|
687 | 0 | if (state == -1) { |
688 | | /* return 1; */ |
689 | | /* KBH: FIXME shouldn't this be as follows? */ |
690 | 0 | state = 0; |
691 | 0 | goto try_next_letter; |
692 | 0 | } |
693 | | |
694 | | #ifdef VERBOSE |
695 | | char *state_str; |
696 | | state_str = get_state_str (state, 0); |
697 | | |
698 | | for (k = 0; k < i - strlen (state_str); k++) |
699 | | putchar (' '); |
700 | | printf ("%s", state_str); |
701 | | #endif |
702 | | |
703 | 0 | hstate = &dict->states[state]; |
704 | 0 | for (k = 0; k < hstate->num_trans; k++) |
705 | 0 | if (hstate->trans[k].ch == ch) |
706 | 0 | { |
707 | 0 | state = hstate->trans[k].new_state; |
708 | 0 | goto found_state; |
709 | 0 | } |
710 | 0 | state = hstate->fallback_state; |
711 | | #ifdef VERBOSE |
712 | | printf (" falling back, fallback_state %d\n", state); |
713 | | #endif |
714 | 0 | } |
715 | 0 | found_state: |
716 | | #ifdef VERBOSE |
717 | | printf ("found state %d\n",state); |
718 | | #endif |
719 | | /* Additional optimization is possible here - especially, |
720 | | elimination of trailing zeroes from the match. Leading zeroes |
721 | | have already been optimized. */ |
722 | 0 | match = dict->states[state].match; |
723 | | /* replacing rules not handled by hyphen_hyphenate() */ |
724 | 0 | if (match && !dict->states[state].repl) |
725 | 0 | { |
726 | 0 | offset = i + 1 - strlen (match); |
727 | | #ifdef VERBOSE |
728 | | for (k = 0; k < offset; k++) |
729 | | putchar (' '); |
730 | | printf ("%s\n", match); |
731 | | #endif |
732 | | /* This is a linear search because I tried a binary search and |
733 | | found it to be just a teeny bit slower. */ |
734 | 0 | for (k = (offset < 0 ? -offset : 0); match[k]; k++) |
735 | 0 | if (hyphens[offset + k] < match[k]) |
736 | 0 | hyphens[offset + k] = match[k]; |
737 | 0 | } |
738 | | |
739 | | /* KBH: we need this to make sure we keep looking in a word */ |
740 | | /* for patterns even if the current character is not known in state 0 */ |
741 | | /* since patterns for hyphenation may occur anywhere in the word */ |
742 | 0 | try_next_letter: ; |
743 | |
|
744 | 0 | } |
745 | | #ifdef VERBOSE |
746 | | for (i = 0; i < j; i++) |
747 | | putchar (hyphens[i]); |
748 | | putchar ('\n'); |
749 | | #endif |
750 | | |
751 | 0 | for (i = 0; i < j - 4; i++) |
752 | | #if 0 |
753 | | if (hyphens[i + 1] & 1) |
754 | | hyphens[i] = '-'; |
755 | | #else |
756 | 0 | hyphens[i] = hyphens[i + 1]; |
757 | 0 | #endif |
758 | 0 | hyphens[0] = '0'; |
759 | 0 | for (; i < word_size; i++) |
760 | 0 | hyphens[i] = '0'; |
761 | 0 | hyphens[word_size] = '\0'; |
762 | |
|
763 | 0 | hnj_free (prep_word); |
764 | |
|
765 | 0 | return 0; |
766 | 0 | } |
767 | | |
768 | | /* Unicode ligature length */ |
769 | 0 | int hnj_ligature(unsigned char c) { |
770 | 0 | switch (c) { |
771 | 0 | case 0x80: /* ff */ |
772 | 0 | case 0x81: /* fi */ |
773 | 0 | case 0x82: return LIG_xx; /* fl */ |
774 | 0 | case 0x83: /* ffi */ |
775 | 0 | case 0x84: return LIG_xxx; /* ffl */ |
776 | 0 | case 0x85: /* long st */ |
777 | 0 | case 0x86: return LIG_xx; /* st */ |
778 | 0 | } |
779 | 0 | return 0; |
780 | 0 | } |
781 | | |
/* Character length of the first n bytes of word (counting stops early at a
 * NUL).  With utf8 set, continuation bytes do not count and ligature
 * characters add hnj_ligature() to the count. */
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
{
  int chars = 0;
  int at = 0;

  while (at < n && word[at] != '\0') {
    chars++;
    /* Unicode ligature support */
    if (utf8 && ((unsigned char) word[at] == 0xEF) && ((unsigned char) word[at + 1] == 0xAC)) {
      chars += hnj_ligature(word[at + 2]);
    }
    /* step over the lead byte and, in UTF-8, its continuation bytes */
    at++;
    while (utf8 && (word[at] & 0xc0) == 0x80)
      at++;
  }
  return chars;
}
797 | | |
/* Enforce the left hyphenation minimum: clear the hyphenation points at
 * the start of word until lhmin characters have been passed.
 *
 * Leading digits are not counted as characters.  Where a position has a
 * non-standard hyphenation (an entry in *rep), the entry is freed and the
 * point cleared only if the part to the left of the break, as it would be
 * written with the replacement, is shorter than lhmin.  Always returns 0.
 */
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
        char *** rep, int ** pos, int ** cut, int lhmin)
{
    int seen = 1;
    int at;

    /* Unicode ligature support */
    if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
      seen += hnj_ligature(word[2]);
    }

    /* ignore numbers */
    for (at = 0; word[at] <= '9' && word[at] >= '0'; at++)
      seen--;

    at = 0;
    while (seen < lhmin && at < word_size && word[at] != '\0') {
      /* one pass of the inner loop per byte of the current character */
      do {
        char * entry = (*rep && *pos && *cut) ? (*rep)[at] : NULL;

        if (entry) {
          /* check length of the non-standard part */
          char * rh = strchr(entry, '=');
          if (rh && (hnj_hyphen_strnlen(word, at - (*pos)[at] + 1, utf8) +
                     hnj_hyphen_strnlen(entry, rh - entry, utf8)) < lhmin) {
            free(entry);
            (*rep)[at] = NULL;
            hyphens[at] = '0';
          }
        } else {
          hyphens[at] = '0';
        }
        at++;

        /* Unicode ligature support */
        if (utf8 && ((unsigned char) word[at] == 0xEF) && ((unsigned char) word[at + 1] == 0xAC)) {
          seen += hnj_ligature(word[at + 2]);
        }
      } while (at < word_size && utf8 && (word[at] & 0xc0) == 0x80);
      seen++;
    }
    return 0;
}
833 | | |
/* Enforce the right hyphenation minimum: walking backwards from the end of
 * word, clear hyphenation points until rhmin characters have been passed.
 *
 * Trailing digits are not counted as characters.  Where a position has a
 * non-standard hyphenation (an entry in *rep), the entry is freed and the
 * point cleared only if the part to the right of the break, as it would be
 * written with the replacement, is shorter than rhmin.  Always returns 0.
 */
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
        char *** rep, int ** pos, int ** cut, int rhmin)
{
    int seen = 0;
    int at;

    /* ignore numbers */
    for (at = word_size - 1; at > 0 && word[at] <= '9' && word[at] >= '0'; at--)
      seen--;

    at = word_size - 1;
    while (seen < rhmin && at > 0) {
      char * entry = (*rep && *pos && *cut) ? (*rep)[at] : NULL;

      if (entry) {
        /* check length of the non-standard part */
        char * rh = strchr(entry, '=');
        /* first byte of the word behind the replaced span */
        int start = at - (*pos)[at] + (*cut)[at] + 1;
        int word_len = 0;

        if (start >= 0 && start <= word_size)
          word_len = hnj_hyphen_strnlen(word + start, word_size - start, utf8);
        if (rh && (word_len +
                   hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
          free(entry);
          (*rep)[at] = NULL;
          hyphens[at] = '0';
        }
      } else {
        hyphens[at] = '0';
      }

      /* count the byte unless it is a UTF-8 continuation byte */
      if (!utf8 || (word[at] & 0xc0) != 0x80)
        seen++;
      at--;
    }
    return 0;
}
863 | | |
864 | | /* recursive function for compound level hyphenation */ |
865 | | int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, |
866 | | char * hyphens, char *** rep, int ** pos, int ** cut, |
867 | | int clhmin, int crhmin, int lend, int rend) |
868 | 0 | { |
869 | 0 | char *prep_word; |
870 | 0 | int i, j, k; |
871 | 0 | int state; |
872 | 0 | char ch; |
873 | 0 | HyphenState *hstate; |
874 | 0 | char *match; |
875 | 0 | char *repl; |
876 | 0 | signed char replindex; |
877 | 0 | signed char replcut; |
878 | 0 | int offset; |
879 | 0 | int * matchlen; |
880 | 0 | int * matchindex; |
881 | 0 | char ** matchrepl; |
882 | 0 | int isrepl = 0; |
883 | 0 | int nHyphCount; |
884 | |
|
885 | 0 | size_t prep_word_size = word_size + 3; |
886 | 0 | prep_word = (char*) hnj_malloc (prep_word_size); |
887 | 0 | matchlen = (int*) hnj_malloc ((word_size + 3) * sizeof(int)); |
888 | 0 | memset(matchlen, 0, (word_size + 3) * sizeof(int)); |
889 | 0 | matchindex = (int*) hnj_malloc ((word_size + 3) * sizeof(int)); |
890 | 0 | matchrepl = (char**) hnj_malloc ((word_size + 3) * sizeof(char *)); |
891 | |
|
892 | 0 | j = 0; |
893 | 0 | prep_word[j++] = '.'; |
894 | |
|
895 | 0 | for (i = 0; i < word_size; i++) { |
896 | 0 | if (word[i] <= '9' && word[i] >= '0') { |
897 | 0 | prep_word[j++] = '.'; |
898 | 0 | } else { |
899 | 0 | prep_word[j++] = word[i]; |
900 | 0 | } |
901 | 0 | } |
902 | | |
903 | | |
904 | |
|
905 | 0 | prep_word[j++] = '.'; |
906 | 0 | prep_word[j] = '\0'; |
907 | |
|
908 | 0 | for (i = 0; i < j; i++) |
909 | 0 | hyphens[i] = '0'; |
910 | |
|
911 | | #ifdef VERBOSE |
912 | | printf ("prep_word = %s\n", prep_word); |
913 | | #endif |
914 | | |
915 | | /* now, run the finite state machine */ |
916 | 0 | state = 0; |
917 | 0 | for (i = 0; i < j; i++) |
918 | 0 | { |
919 | 0 | ch = prep_word[i]; |
920 | 0 | for (;;) |
921 | 0 | { |
922 | |
|
923 | 0 | if (state == -1) { |
924 | | /* return 1; */ |
925 | | /* KBH: FIXME shouldn't this be as follows? */ |
926 | 0 | state = 0; |
927 | 0 | goto try_next_letter; |
928 | 0 | } |
929 | | |
930 | | #ifdef VERBOSE |
931 | | char *state_str; |
932 | | state_str = get_state_str (state, 1); |
933 | | |
934 | | for (k = 0; k < i - strlen (state_str); k++) |
935 | | putchar (' '); |
936 | | printf ("%s", state_str); |
937 | | #endif |
938 | | |
939 | 0 | hstate = &dict->states[state]; |
940 | 0 | for (k = 0; k < hstate->num_trans; k++) |
941 | 0 | if (hstate->trans[k].ch == ch) |
942 | 0 | { |
943 | 0 | state = hstate->trans[k].new_state; |
944 | 0 | goto found_state; |
945 | 0 | } |
946 | 0 | state = hstate->fallback_state; |
947 | | #ifdef VERBOSE |
948 | | printf (" falling back, fallback_state %d\n", state); |
949 | | #endif |
950 | 0 | } |
951 | 0 | found_state: |
952 | | #ifdef VERBOSE |
953 | | printf ("found state %d\n",state); |
954 | | #endif |
955 | | /* Additional optimization is possible here - especially, |
956 | | elimination of trailing zeroes from the match. Leading zeroes |
957 | | have already been optimized. */ |
958 | 0 | match = dict->states[state].match; |
959 | 0 | repl = dict->states[state].repl; |
960 | 0 | replindex = dict->states[state].replindex; |
961 | 0 | replcut = dict->states[state].replcut; |
962 | | /* replacing rules not handled by hyphen_hyphenate() */ |
963 | 0 | if (match) |
964 | 0 | { |
965 | 0 | offset = i + 1 - strlen (match); |
966 | | #ifdef VERBOSE |
967 | | for (k = 0; k < offset; k++) |
968 | | putchar (' '); |
969 | | printf ("%s (%s)\n", match, repl); |
970 | | #endif |
971 | 0 | if (repl) { |
972 | 0 | if (!isrepl) for(; isrepl < word_size; isrepl++) { |
973 | 0 | matchrepl[isrepl] = NULL; |
974 | 0 | matchindex[isrepl] = -1; |
975 | 0 | } |
976 | 0 | if (offset + replindex >= 0 && offset + replindex < word_size + 3) |
977 | 0 | matchlen[offset + replindex] = replcut; |
978 | 0 | } |
979 | | /* This is a linear search because I tried a binary search and |
980 | | found it to be just a teeny bit slower. */ |
981 | 0 | for (k = (offset < 0 ? -offset : 0); match[k]; k++) { |
982 | 0 | if ((hyphens[offset + k] < match[k])) { |
983 | 0 | hyphens[offset + k] = match[k]; |
984 | 0 | if (match[k]&1) { |
985 | 0 | matchrepl[offset + k] = repl; |
986 | 0 | if (repl && (k >= replindex) && (k <= replindex + replcut) |
987 | 0 | && offset + replindex >= 0) { |
988 | 0 | matchindex[offset + replindex] = offset + k; |
989 | 0 | } |
990 | 0 | } |
991 | 0 | } |
992 | 0 | } |
993 | |
|
994 | 0 | } |
995 | | |
996 | | /* KBH: we need this to make sure we keep looking in a word */ |
997 | | /* for patterns even if the current character is not known in state 0 */ |
998 | | /* since patterns for hyphenation may occur anywhere in the word */ |
999 | 0 | try_next_letter: ; |
1000 | |
|
1001 | 0 | } |
1002 | | #ifdef VERBOSE |
1003 | | for (i = 0; i < j; i++) |
1004 | | putchar (hyphens[i]); |
1005 | | putchar ('\n'); |
1006 | | #endif |
1007 | | |
1008 | 0 | for (i = 0; i < j - 3; i++) |
1009 | | #if 0 |
1010 | | if (hyphens[i + 1] & 1) |
1011 | | hyphens[i] = '-'; |
1012 | | #else |
1013 | 0 | hyphens[i] = hyphens[i + 1]; |
1014 | 0 | #endif |
1015 | 0 | for (; i < word_size; i++) |
1016 | 0 | hyphens[i] = '0'; |
1017 | 0 | hyphens[word_size] = '\0'; |
1018 | | |
1019 | | /* now create a new char string showing hyphenation positions */ |
1020 | | /* count the hyphens and allocate space for the new hyphenated string */ |
1021 | 0 | nHyphCount = 0; |
1022 | 0 | for (i = 0; i < word_size; i++) |
1023 | 0 | if (hyphens[i]&1) |
1024 | 0 | nHyphCount++; |
1025 | 0 | j = 0; |
1026 | 0 | for (i = 0; i < word_size; i++) { |
1027 | 0 | if (isrepl && matchlen[i] >= 1 && matchindex[i] >= 1 && matchindex[i] <= word_size && matchrepl[matchindex[i]]) { |
1028 | 0 | if (rep && pos && cut) { |
1029 | 0 | if (!*rep) |
1030 | 0 | *rep = (char **) calloc(word_size, sizeof(char *)); |
1031 | 0 | if (!*pos) |
1032 | 0 | *pos = (int *) calloc(word_size, sizeof(int)); |
1033 | 0 | if (!*cut) { |
1034 | 0 | *cut = (int *) calloc(word_size, sizeof(int)); |
1035 | 0 | } |
1036 | 0 | hnj_free((*rep)[matchindex[i] - 1]); |
1037 | 0 | (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); |
1038 | 0 | (*pos)[matchindex[i] - 1] = matchindex[i] - i; |
1039 | 0 | (*cut)[matchindex[i] - 1] = matchlen[i]; |
1040 | 0 | } |
1041 | 0 | j += strlen(matchrepl[matchindex[i]]); |
1042 | 0 | i += matchlen[i] - 1; |
1043 | 0 | } |
1044 | 0 | } |
1045 | |
|
1046 | 0 | hnj_free (matchrepl); |
1047 | 0 | hnj_free (matchlen); |
1048 | 0 | hnj_free (matchindex); |
1049 | | |
1050 | | /* recursive hyphenation of the first (compound) level segments */ |
1051 | 0 | if (dict->nextlevel) { |
1052 | 0 | char ** rep2; |
1053 | 0 | int * pos2; |
1054 | 0 | int * cut2; |
1055 | 0 | char * hyphens2; |
1056 | 0 | int begin = 0; |
1057 | |
|
1058 | 0 | rep2 = (char**) hnj_malloc (word_size * sizeof(char *)); |
1059 | 0 | pos2 = (int*) hnj_malloc (word_size * sizeof(int)); |
1060 | 0 | cut2 = (int*) hnj_malloc (word_size * sizeof(int)); |
1061 | 0 | hyphens2 = (char*) hnj_malloc (word_size + 3); |
1062 | 0 | for (i = 0; i < word_size; i++) rep2[i] = NULL; |
1063 | 0 | for (i = 0; i < word_size; i++) if |
1064 | 0 | (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { |
1065 | 0 | if (i - begin > 0) { |
1066 | 0 | int hyph = 0; |
1067 | 0 | prep_word[i + 2] = '\0'; |
1068 | | /* non-standard hyphenation at compound boundary (Schiffahrt) */ |
1069 | 0 | if (rep && *rep && *pos && *cut && (*rep)[i]) { |
1070 | 0 | char * l = strchr((*rep)[i], '='); |
1071 | 0 | size_t offset = 2 + i - (*pos)[i]; |
1072 | 0 | strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1); |
1073 | 0 | prep_word[prep_word_size - 1] = '\0'; |
1074 | 0 | if (l) { |
1075 | 0 | hyph = (l - (*rep)[i]) - (*pos)[i]; |
1076 | 0 | if (2 + i + hyph < prep_word_size) |
1077 | 0 | prep_word[2 + i + hyph] = '\0'; |
1078 | 0 | } |
1079 | 0 | } |
1080 | 0 | int sub_size = i - begin + 1 + hyph; |
1081 | 0 | if (sub_size >= word_size) sub_size = word_size - 1; |
1082 | 0 | if ((size_t)sub_size + begin + 2 > prep_word_size) |
1083 | 0 | sub_size = (int)(prep_word_size - begin - 2); |
1084 | 0 | if (sub_size < 1) sub_size = 1; |
1085 | 0 | hnj_hyphen_hyph_(dict, prep_word + begin + 1, sub_size, |
1086 | 0 | hyphens2, &rep2, &pos2, &cut2, clhmin, |
1087 | 0 | crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend)); |
1088 | 0 | for (j = 0; j < i - begin; j++) { |
1089 | 0 | hyphens[begin + j] = hyphens2[j]; |
1090 | 0 | if (rep2[j] && rep && pos && cut) { |
1091 | 0 | if (!*rep && !*pos && !*cut) { |
1092 | 0 | int k; |
1093 | 0 | *rep = (char **) malloc(sizeof(char *) * word_size); |
1094 | 0 | *pos = (int *) malloc(sizeof(int) * word_size); |
1095 | 0 | *cut = (int *) malloc(sizeof(int) * word_size); |
1096 | 0 | for (k = 0; k < word_size; k++) { |
1097 | 0 | (*rep)[k] = NULL; |
1098 | 0 | (*pos)[k] = 0; |
1099 | 0 | (*cut)[k] = 0; |
1100 | 0 | } |
1101 | 0 | } |
1102 | 0 | hnj_free((*rep)[begin + j]); |
1103 | 0 | (*rep)[begin + j] = rep2[j]; |
1104 | 0 | (*pos)[begin + j] = pos2[j]; |
1105 | 0 | (*cut)[begin + j] = cut2[j]; |
1106 | 0 | rep2[j] = NULL; |
1107 | 0 | } |
1108 | 0 | } |
1109 | 0 | prep_word[i + 2] = word[i + 1]; |
1110 | 0 | if (*rep && *pos && *cut && (*rep)[i]) { |
1111 | 0 | size_t offset = 1; |
1112 | 0 | strncpy(prep_word + offset, word, prep_word_size - offset - 1); |
1113 | 0 | prep_word[prep_word_size - 1] = '\0'; |
1114 | 0 | } |
1115 | 0 | } |
1116 | 0 | begin = i + 1; |
1117 | 0 | for (j = 0; j < word_size; j++) { |
1118 | 0 | hnj_free(rep2[j]); |
1119 | 0 | rep2[j] = NULL; |
1120 | 0 | } |
1121 | 0 | } |
1122 | | |
1123 | | /* non-compound */ |
1124 | 0 | if (begin == 0) { |
1125 | 0 | hnj_hyphen_hyph_(dict->nextlevel, word, word_size, |
1126 | 0 | hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); |
1127 | 0 | if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, |
1128 | 0 | rep, pos, cut, clhmin); |
1129 | 0 | if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, |
1130 | 0 | rep, pos, cut, crhmin); |
1131 | 0 | } |
1132 | |
|
1133 | 0 | free(rep2); |
1134 | 0 | free(cut2); |
1135 | 0 | free(pos2); |
1136 | 0 | free(hyphens2); |
1137 | 0 | } |
1138 | |
|
1139 | 0 | hnj_free (prep_word); |
1140 | 0 | return 0; |
1141 | 0 | } |
1142 | | |
/* UTF-8 normalization of hyphen and non-standard positions.
 *
 * Converts the byte-indexed `hyphens` string, and the optional
 * `*rep`/`*pos`/`*cut` arrays, to character-indexed form: entry j ends up
 * describing the j-th UTF-8 character of `word`.  Bytes of the form
 * 10xxxxxx are treated as continuation bytes.
 *
 * Returns 1 (after printing a message to stderr) when `word` starts with
 * a continuation byte, 0 otherwise.
 */
int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut)
{
  int i, j, k;
  if ((((unsigned char) word[0]) >> 6) == 2) {
    fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
    return 1;
  }

  /* calculate UTF-8 character positions */
  for (i = 0, j = -1; i < word_size; i++) {
    /* beginning of an UTF-8 character (not '10' start bits) */
    if ((((unsigned char) word[i]) >> 6) != 2) j++;
    hyphens[j] = hyphens[i];
    if (rep && pos && cut && *rep && *pos && *cut) {
      int l = (*pos)[i];
      /* recount pos in characters; never look before word[0] */
      (*pos)[j] = 0;
      for (k = 0; k < l && k <= i; k++) {
        if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
      }
      /* recount cut in characters over bytes [i - pos + 1, +cut) */
      k = i - l + 1;
      l = k + (*cut)[i];
      if (k < 0) k = 0; /* an oversized pos must not index before word */
      (*cut)[j] = 0;
      for (; k < l && k < word_size; k++) {
        if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
      }
      /* move the replacement to its character slot, clear the byte slot */
      if (j != i) {
        hnj_free((*rep)[j]);
        (*rep)[j] = (*rep)[i];
        (*rep)[i] = NULL;
        (*pos)[i] = 0;
        (*cut)[i] = 0;
      }
    }
  }
  hyphens[j + 1] = '\0';
#ifdef VERBOSE
  printf ("nums: %s\n", hyphens);
#endif
  return 0;
}
1185 | | |
/* Get the word with all possible hyphenations (output: hyphword).
 *
 * Copies `word` into `hyphword`, inserting '=' after every byte whose
 * `hyphens` entry is odd.  Where a non-standard replacement exists for
 * such a position ((*rep)[i] set), the replacement text is written
 * instead, starting (*pos)[i] bytes back in the output, and the input
 * index advances by (*cut)[i] - (*pos)[i].
 *
 * At most 2 * word_size - 1 bytes plus the terminating NUL are written,
 * so `hyphword` must hold at least 2 * word_size bytes.  A word_size that
 * is not positive, or exceeds INT_MAX / 2, yields an empty string.
 * `rep`, `pos` and `cut` may be NULL (or point to NULL arrays); only
 * standard hyphenation is produced then.
 */
void hnj_hyphen_hyphword(const char * word, int word_size, const char * hyphens,
    char * hyphword, char *** rep, int ** pos, int ** cut)
{

  if (word_size <= 0 || word_size > INT_MAX / 2) {
    hyphword[0] = '\0';
    return;
  }

  /* hyphword buffer size must be at least 2 * l */
  int hyphword_size = 2 * word_size - 1;

  /* replacements are usable only when all three arrays are present */
  int nonstandard = 0;
  if (rep && pos && cut && *rep && *pos && *cut) {
    nonstandard = 1;
  }

  int i;
  int j = 0;
  for (i = 0; i < word_size && j < hyphword_size; i++) {
    hyphword[j++] = word[i];
    if (hyphens[i]&1 && j < hyphword_size) {
      if (nonstandard && (*rep)[i] && j >= (*pos)[i]) {
        /* non-standard: rewind the output, then emit the replacement */
        j -= (*pos)[i];
        char *s = (*rep)[i];
        while (*s && j < hyphword_size) {
          hyphword[j++] = *s++;
        }
        /* NOTE(review): a cut smaller than pos moves i backwards and
           nothing here bounds that - confirm the arrays are sane */
        i += (*cut)[i] - (*pos)[i];
      } else {
        /* standard */
        hyphword[j++] = '=';
      }
    }
  }
  hyphword[j] = '\0';
}
1225 | | |
1226 | | |
1227 | | /* main api function with default hyphenmin parameters */ |
1228 | | int hnj_hyphen_hyphenate2 (HyphenDict *dict, |
1229 | | const char *word, int word_size, char * hyphens, |
1230 | | char *hyphword, char *** rep, int ** pos, int ** cut) |
1231 | 0 | { |
1232 | 0 | hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, |
1233 | 0 | dict->clhmin, dict->crhmin, 1, 1); |
1234 | 0 | hnj_hyphen_lhmin(dict->utf8, word, word_size, |
1235 | 0 | hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); |
1236 | 0 | hnj_hyphen_rhmin(dict->utf8, word, word_size, |
1237 | 0 | hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2)); |
1238 | | |
1239 | | /* nohyphen */ |
1240 | 0 | if (dict->nohyphen) { |
1241 | 0 | char * nh = dict->nohyphen; |
1242 | 0 | int nhi; |
1243 | 0 | for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { |
1244 | 0 | char * nhy = *nh ? (char *) strstr(word, nh) : NULL; |
1245 | 0 | while (nhy) { |
1246 | 0 | hyphens[nhy - word + strlen(nh) - 1] = '0'; |
1247 | 0 | if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0'; |
1248 | 0 | nhy = (char *) strstr(nhy + 1, nh); |
1249 | 0 | } |
1250 | 0 | nh = nh + strlen(nh) + 1; |
1251 | 0 | } |
1252 | 0 | } |
1253 | |
|
1254 | 0 | if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); |
1255 | 0 | if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); |
1256 | | #ifdef VERBOSE |
1257 | | printf ("nums: %s\n", hyphens); |
1258 | | #endif |
1259 | 0 | return 0; |
1260 | 0 | } |
1261 | | |
1262 | | /* previous main api function with hyphenmin parameters */ |
1263 | | int hnj_hyphen_hyphenate3 (HyphenDict *dict, |
1264 | | const char *word, int word_size, char * hyphens, |
1265 | | char *hyphword, char *** rep, int ** pos, int ** cut, |
1266 | | int lhmin, int rhmin, int clhmin, int crhmin) |
1267 | 0 | { |
1268 | 0 | lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin; |
1269 | 0 | rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin; |
1270 | 0 | clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin; |
1271 | 0 | crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin; |
1272 | 0 | hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, |
1273 | 0 | clhmin, crhmin, 1, 1); |
1274 | 0 | hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, |
1275 | 0 | rep, pos, cut, (lhmin > 0 ? lhmin : 2)); |
1276 | 0 | hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, |
1277 | 0 | rep, pos, cut, (rhmin > 0 ? rhmin : 2)); |
1278 | | |
1279 | | /* nohyphen */ |
1280 | 0 | if (dict->nohyphen) { |
1281 | 0 | char * nh = dict->nohyphen; |
1282 | 0 | int nhi; |
1283 | 0 | for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { |
1284 | 0 | char * nhy = *nh ? (char *) strstr(word, nh) : NULL; |
1285 | 0 | while (nhy) { |
1286 | 0 | hyphens[nhy - word + strlen(nh) - 1] = '0'; |
1287 | 0 | if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0'; |
1288 | 0 | nhy = (char *) strstr(nhy + 1, nh); |
1289 | 0 | } |
1290 | 0 | nh = nh + strlen(nh) + 1; |
1291 | 0 | } |
1292 | 0 | } |
1293 | |
|
1294 | 0 | if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); |
1295 | 0 | if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); |
1296 | 0 | return 0; |
1297 | 0 | } |