Coverage Report

Created: 2026-06-30 11:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/workdir/UnpackedTarball/hyphen/hyphen.c
Line
Count
Source
1
/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
2
 * licenses follows.
3
 */
4
5
/* LibHnj - a library for high quality hyphenation and justification
6
 * Copyright (C) 1998 Raph Levien,
7
 *           (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
8
 *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
9
 *           (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
10
 *
11
 * This library is free software; you can redistribute it and/or
12
 * modify it under the terms of the GNU Library General Public
13
 * License as published by the Free Software Foundation; either
14
 * version 2 of the License, or (at your option) any later version.
15
 *
16
 * This library is distributed in the hope that it will be useful,
17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
 * Library General Public License for more details.
20
 *
21
 * You should have received a copy of the GNU Library General Public
22
 * License along with this library; if not, write to the
23
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24
 * Boston, MA  02111-1307  USA.
25
*/
26
27
/*
28
 * The contents of this file are subject to the Mozilla Public License
29
 * Version 1.0 (the "MPL"); you may not use this file except in
30
 * compliance with the MPL.  You may obtain a copy of the MPL at
31
 * http://www.mozilla.org/MPL/
32
 *
33
 * Software distributed under the MPL is distributed on an "AS IS" basis,
34
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
35
 * for the specific language governing rights and limitations under the
36
 * MPL.
37
 *
38
 */
39
#include <stdlib.h> /* for NULL, malloc */
40
#include <stdio.h>  /* for fprintf */
41
#include <string.h> /* for strdup */
42
#include <limits.h> /* for INT_MAX */
43
44
#ifdef UNX
45
#include <unistd.h> /* for exit */
46
#endif
47
48
#ifdef _WIN32
49
#include <windows.h>
50
#include <wchar.h>
51
#endif
52
53
/* Debug tracing is compiled in only when VERBOSE is defined; the
 * "no" prefix below leaves it undefined. */
#define noVERBOSE

/* When LONG_LIGATURE is defined, ligatures are counted as 2 or 3
 * characters (instead of 1 or 2) when computing hyphenmin values, for
 * comparison with hyphenation without ligatures.  The "no" prefix below
 * leaves it undefined. */
#define noLONG_LIGATURE

/* extra character count added for two-letter (LIG_xx) and three-letter
 * (LIG_xxx) ligatures */
#ifndef LONG_LIGATURE
#define LIG_xx  0
#define LIG_xxx 1
#else
#define LIG_xx  1
#define LIG_xxx 2
#endif
66
67
#include "hnjalloc.h"
68
#include "hyphen.h"
69
70
/* Duplicate the NUL-terminated string s into a buffer obtained from
 * hnj_malloc() and return the copy. */
static char *
hnj_strdup (const char *s)
{
  int len = strlen (s);
  char *copy = (char *) hnj_malloc (len + 1);

  /* s[len] is the terminator, so copying len + 1 bytes terminates copy */
  memcpy (copy, s, len + 1);
  return copy;
}
82
83
/* Remove one cross-platform line ending from the end of s, in place.
 *
 * A trailing '\n' or '\r' is removed; if another '\r' then ends the
 * string (the "\r\n" case) it is removed as well.  A '\r' that is
 * followed by ordinary text is left alone: the string is only ever
 * shortened at a real line end.
 */
void hnj_strchomp(char * s)
{
  size_t k = strlen(s);

  if (k == 0 || (s[k - 1] != '\r' && s[k - 1] != '\n'))
    return;                       /* no line end to strip */

  s[--k] = '\0';
  /* second half of a two-character line end */
  if (k > 0 && s[k - 1] == '\r')
    s[k - 1] = '\0';
}
90
91
/* A small chained hash table that maps strings to state numbers. */

/* Number of buckets.  A cheap, but effective, hack. */
#define HASH_SIZE 31627

typedef struct _HashEntry HashEntry;
typedef struct _HashTab HashTab;

/* One key/value pair; entries sharing a bucket are linked through next. */
struct _HashEntry {
  HashEntry *next;
  char *key;
  int val;
};

/* The table itself: one chain head per bucket. */
struct _HashTab {
  HashEntry *entries[HASH_SIZE];
};
109
110
/* String hash function from ASU, adapted from Gtk+.
 * Returns the raw hash; callers reduce it modulo the table size. */
static unsigned int
hnj_string_hash (const char *s)
{
  unsigned int hash = 0;
  const char *cursor = s;

  while (*cursor != '\0') {
    unsigned int top;

    hash = (hash << 4) + *cursor++;
    /* fold the top four bits back into the low byte, then clear them */
    top = hash & 0xf0000000;
    if (top != 0)
      hash ^= (top >> 24) ^ top;
  }
  return hash /* % M */;
}
125
126
static HashTab *
127
hnj_hash_new (void)
128
0
{
129
0
  HashTab *hashtab;
130
0
  int i;
131
132
0
  hashtab = (HashTab *) hnj_malloc (sizeof(HashTab));
133
0
  for (i = 0; i < HASH_SIZE; i++)
134
0
    hashtab->entries[i] = NULL;
135
136
0
  return hashtab;
137
0
}
138
139
static void
140
hnj_hash_free (HashTab *hashtab)
141
0
{
142
0
  int i;
143
0
  HashEntry *e, *next;
144
145
0
  for (i = 0; i < HASH_SIZE; i++)
146
0
    for (e = hashtab->entries[i]; e; e = next)
147
0
      {
148
0
  next = e->next;
149
0
  hnj_free (e->key);
150
0
  hnj_free (e);
151
0
      }
152
153
0
  hnj_free (hashtab);
154
0
}
155
156
/* assumes that key is not already present! */
157
static void
158
hnj_hash_insert (HashTab *hashtab, const char *key, int val)
159
0
{
160
0
  int i;
161
0
  HashEntry *e;
162
163
0
  i = hnj_string_hash (key) % HASH_SIZE;
164
0
  e = (HashEntry *) hnj_malloc (sizeof(HashEntry));
165
0
  e->next = hashtab->entries[i];
166
0
  e->key = hnj_strdup (key);
167
0
  e->val = val;
168
0
  hashtab->entries[i] = e;
169
0
}
170
171
/* return val if found, otherwise -1 */
172
static int
173
hnj_hash_lookup (HashTab *hashtab, const char *key)
174
0
{
175
0
  int i;
176
0
  HashEntry *e;
177
0
  i = hnj_string_hash (key) % HASH_SIZE;
178
0
  for (e = hashtab->entries[i]; e; e = e->next)
179
0
    if (!strcmp (key, e->key))
180
0
      return e->val;
181
0
  return -1;
182
0
}
183
184
/* Get the state number, allocating a new state if necessary. */
185
static int
186
hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
187
0
{
188
0
  int state_num;
189
190
0
  state_num = hnj_hash_lookup (hashtab, string);
191
192
0
  if (state_num >= 0)
193
0
    return state_num;
194
195
0
  hnj_hash_insert (hashtab, string, dict->num_states);
196
  /* predicate is true if dict->num_states is a power of two */
197
0
  if (!(dict->num_states & (dict->num_states - 1)))
198
0
    {
199
0
      dict->states = (HyphenState *) hnj_realloc (dict->states,
200
0
          (dict->num_states << 1) *
201
0
          sizeof(HyphenState));
202
0
    }
203
0
  dict->states[dict->num_states].match = NULL;
204
0
  dict->states[dict->num_states].repl = NULL;
205
0
  dict->states[dict->num_states].fallback_state = -1;
206
0
  dict->states[dict->num_states].num_trans = 0;
207
0
  dict->states[dict->num_states].trans = NULL;
208
0
  return dict->num_states++;
209
0
}
210
211
/* add a transition from state1 to state2 through ch - assumes that the
212
   transition does not already exist */
213
static void
214
hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
215
0
{
216
0
  int num_trans;
217
218
0
  num_trans = dict->states[state1].num_trans;
219
0
  if (num_trans == 0)
220
0
    {
221
0
      dict->states[state1].trans = (HyphenTrans *) hnj_malloc (sizeof(HyphenTrans));
222
0
    }
223
0
  else if (!(num_trans & (num_trans - 1)))
224
0
    {
225
0
      dict->states[state1].trans = (HyphenTrans *) hnj_realloc (dict->states[state1].trans,
226
0
            (num_trans << 1) *
227
0
            sizeof(HyphenTrans));
228
0
    }
229
0
  dict->states[state1].trans[num_trans].ch = ch;
230
0
  dict->states[state1].trans[num_trans].new_state = state2;
231
0
  dict->states[state1].num_trans++;
232
0
}
233
234
#ifdef VERBOSE
/* Hash tables kept alive for debug output, one per dictionary level.
 * The loader stores a table for each of its two levels (and may swap
 * the two), so the array needs two slots. */
HashTab *global[2];

/* Debug helper: return the key string whose value is `state` in the
 * hash table of the given level, or NULL if no entry has that value. */
static char *
get_state_str (int state, int level)
{
  int i;
  HashEntry *e;

  for (i = 0; i < HASH_SIZE; i++)
    for (e = global[level]->entries[i]; e; e = e->next)
      if (e->val == state)
        return e->key;
  return NULL;
}
#endif
250
251
0
/* Parse one dictionary line and apply it to dict.
 *
 * buf     - the line; it is modified in place when it contains a '/'
 *           replacement part
 * dict    - dictionary being built
 * hashtab - map from pattern strings to state numbers for dict
 *
 * Keyword lines (LEFTHYPHENMIN, RIGHTHYPHENMIN, COMPOUNDLEFTHYPHENMIN,
 * COMPOUNDRIGHTHYPHENMIN, NOHYPHEN) set the matching dict field and
 * return.  Any other line is a pattern, optionally followed by
 * "/replacement,index,cut": its digits form the match string, its other
 * characters form the state key, and a transition is added for every
 * prefix of the key that had no state yet.
 */
void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
  int i, j;
  char word[MAX_CHARS];
  char pattern[MAX_CHARS];
  char * repl;
  signed char replindex;
  signed char replcut;
  int state_num = 0;
  int last_state;
  char ch;
  int found;

    if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
      dict->lhmin = atoi(buf + 13);
      return;
    } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
      dict->rhmin = atoi(buf + 14);
      return;
    } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
      dict->clhmin = atoi(buf + 21);
      return;
    } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
      dict->crhmin = atoi(buf + 22);
      return;
    } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
      char * space = buf + 8;
      while (*space == ' ' || *space == '\t') space++;
      if (dict->nohyphen) hnj_free(dict->nohyphen);
      dict->nohyphen = NULL;
      dict->nohyphenl = 0;
      if (*space != '\0') {
          char * list = hnj_strdup(space);
          size_t len = strlen(list);
          size_t at;
          /* strip only real line-end characters: a final line without
             a newline keeps its last character, and a CRLF line does
             not leave a '\r' in the last item */
          while (len > 0 && (list[len - 1] == '\n' || list[len - 1] == '\r'))
              list[--len] = '\0';
          if (len == 0) {
              /* nothing but a line end: same as no NOHYPHEN value */
              hnj_free(list);
              return;
          }
          /* split the comma separated items into consecutive strings;
             nohyphenl counts the separators.  Index 0 is never treated
             as a separator. */
          for (at = len - 1; at > 0; at--) {
              if (list[at] == ',') {
                  dict->nohyphenl++;
                  list[at] = '\0';
              }
          }
          dict->nohyphen = list;
      }
      return;
    }
    j = 0;
    pattern[j] = '0';
    repl = strchr(buf, '/');
    replindex = 0;
    replcut = 0;
    if (repl) {
      /* "pattern/replacement[,index,cut]" */
      char * index = strchr(repl + 1, ',');
      *repl = '\0';
      if (index) {
          char * index2 = strchr(index + 1, ',');
          *index = '\0';
          if (index2) {
              *index2 = '\0';
              replindex = (signed char) atoi(index + 1) - 1;
              replcut = (signed char) atoi(index2 + 1);
          }
      } else {
          hnj_strchomp(repl + 1);
          replindex = 0;
          replcut = (signed char) strlen(buf);
      }
      repl = hnj_strdup(repl + 1);
    }
    /* split the pattern: digits go to pattern[], everything else to
       word[]; pattern[] keeps one digit slot between adjacent letters */
    for (i = 0; (unsigned char)buf[i] > (unsigned char)' ' && j < MAX_CHARS - 2; i++)
      {
        if (buf[i] >= '0' && buf[i] <= '9')
          pattern[j] = buf[i];
        else
          {
            word[j] = buf[i];
            pattern[++j] = '0';
          }
      }
    word[j] = '\0';
    pattern[j + 1] = '\0';

    i = 0;
    if (!repl) {
      /* Optimize away leading zeroes */
      for (; pattern[i] == '0'; i++);
    } else {
      if (*word == '.') i++;
      /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
      if (dict->utf8) {
          int pu = -1;        /* unicode character position */
          int ps = -1;        /* unicode start position (original replindex) */
          size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */
          /* word is not modified inside the loop, so measure it once */
          size_t word_end = strlen(word) + 1;
          for (; pc < word_end; pc++) {
              /* beginning of an UTF-8 character (not '10' start bits) */
              if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
              if ((ps < 0) && (replindex == pu)) {
                  ps = replindex;
                  replindex = (signed char) pc;
              }
              if ((ps >= 0) && ((pu - ps) == replcut)) {
                  replcut = (signed char) (pc - replindex);
                  break;
              }
          }
          if (*word == '.') replindex--;
      }
    }

#ifdef VERBOSE
    printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
#endif
    found = hnj_hash_lookup (hashtab, word);
    state_num = hnj_get_state (dict, hashtab, word);
    /* a repeated pattern replaces the data stored earlier for its state */
    if (dict->states[state_num].match) hnj_free (dict->states[state_num].match);
    if (dict->states[state_num].repl) hnj_free (dict->states[state_num].repl);
    dict->states[state_num].match = hnj_strdup (pattern + i);
    dict->states[state_num].repl = repl;
    dict->states[state_num].replindex = replindex;
    if (!replcut) {
      dict->states[state_num].replcut = (signed char) strlen(word);
    } else {
      dict->states[state_num].replcut = replcut;
    }

    /* now, put in the prefix transitions */
    for (; found < 0 && j > 0; --j)
      {
        last_state = state_num;
        ch = word[j - 1];
        word[j - 1] = '\0';
        found = hnj_hash_lookup (hashtab, word);
        state_num = hnj_get_state (dict, hashtab, word);
        hnj_add_trans (dict, state_num, last_state, ch);
      }
}
384
385
0
/* Open path with the given mode and return the stream, or NULL on
 * failure.
 *
 * On Windows, a path starting with the long-path prefix "\\?\" is
 * converted from UTF-8 to wide characters, resolved with _wfullpath()
 * and opened with _wfopen(); the file is opened in text mode when mode
 * is "r" and in binary read mode otherwise.  Every other path goes
 * straight to fopen().
 */
FILE * hnj_fopen(const char * path, const char * mode) {
#ifdef _WIN32
#define WIN32_LONG_PATH_PREFIX "\\\\?\\"
    if (strncmp(path, WIN32_LONG_PATH_PREFIX, 4) == 0) {
        FILE * f = NULL;
        wchar_t *buff = NULL;
        wchar_t *buff2 = NULL;
        /* the conversion reports failure by returning 0 */
        int len = MultiByteToWideChar(CP_UTF8, 0, path, -1, NULL, 0);
        if (len <= 0)
          return NULL;
        buff = (wchar_t *) malloc(len * sizeof(wchar_t));
        buff2 = (wchar_t *) malloc(len * sizeof(wchar_t));
        /* use the buffers only if both allocations and the conversion
           succeeded */
        if (buff != NULL && buff2 != NULL &&
            MultiByteToWideChar(CP_UTF8, 0, path, -1, buff, len) > 0 &&
            _wfullpath( buff2, buff, len ) != NULL) {
          f = _wfopen(buff2, (strcmp(mode, "r") == 0) ? L"r" : L"rb");
        }
        free(buff);
        free(buff2);
        return f;
    }
#endif
    return fopen(path, mode);
}
404
405
/* Load a hyphenation dictionary from the file named fn.
 * Returns NULL if the file cannot be opened, otherwise whatever
 * hnj_hyphen_load_file() returns for the opened stream. */
HyphenDict *
hnj_hyphen_load (const char *fn)
{
  HyphenDict *loaded = NULL;
  FILE *stream = hnj_fopen (fn, "r");

  if (stream != NULL)
    {
      loaded = hnj_hyphen_load_file(stream);
      fclose(stream);
    }
  return loaded;
}
419
420
/* Line-reader callback used by hnj_hyphen_load_impl.  It copies the next
 * line into buf (capacity n), keeping the '\n' if there is one and
 * NUL-terminating the result, and returns buf; at end of input it
 * returns NULL.  Overly long lines are skipped with a warning. */
typedef char *(*hnj_get_line_fn)(char *buf, int n, void *ctx);

/* Reader state for in-memory dictionary data. */
typedef struct {
  const char *p;   /* next unread byte */
  size_t n;        /* number of unread bytes */
} hnj_mem_ctx;
426
427
0
/* Line reader over a stdio stream; ctx is the FILE * to read from.
 * Returns buf holding the next line that fits into n bytes, or NULL at
 * end of input.  A line that does not fit is consumed up to its newline
 * and dropped, with a warning on stderr unless it starts with '%'. */
static char *hnj_get_line_file(char *buf, int n, void *ctx) {
  FILE *in = (FILE *)ctx;

  for (;;) {
    int c;

    if (fgets(buf, n, in) == NULL)
      return NULL;
    /* complete line, or the last line of the stream */
    if (feof(in) || strchr(buf, '\n') != NULL)
      return buf;

    /* line was cut short: discard its remainder */
    do {
      c = fgetc(in);
    } while (c != '\n' && c != EOF);
    if (buf[0] != '%')
      fprintf(stderr, "Warning: skipping too long pattern (more than %d chars)\n", n);
  }
}
441
442
0
static char *hnj_get_line_mem(char *buf, int n, void *ctx) {
443
0
  hnj_mem_ctx *m = (hnj_mem_ctx *)ctx;
444
0
  while (m->n > 0) {
445
0
    size_t cap = (size_t)(n - 1);
446
0
    size_t i = 0;
447
0
    while (i < m->n && i < cap && m->p[i] != '\n') i++;
448
0
    int has_nl = (i < cap && i < m->n && m->p[i] == '\n');
449
0
    size_t copy = has_nl ? i + 1 : i;
450
0
    memcpy(buf, m->p, copy);
451
0
    buf[copy] = '\0';
452
0
    m->p += copy;
453
0
    m->n -= copy;
454
0
    if (!has_nl && m->n > 0) {
455
0
      while (m->n > 0 && *m->p != '\n') { m->p++; m->n--; }
456
0
      if (m->n > 0) { m->p++; m->n--; }
457
0
      if (buf[0] != '%')
458
0
        fprintf(stderr, "Warning: skipping too long pattern (more than %d chars)\n", n);
459
0
      continue;
460
0
    }
461
0
    return buf;
462
0
  }
463
0
  return NULL;
464
0
}
465
466
static HyphenDict *
467
hnj_hyphen_load_impl (hnj_get_line_fn get_line, void *ctx)
468
0
{
469
0
  HyphenDict *dict[2];
470
0
  HashTab *hashtab;
471
0
  char buf[MAX_CHARS];
472
0
  int nextlevel = 0;
473
0
  int i, j, k;
474
0
  HashEntry *e;
475
0
  int state_num = 0;
476
/* loading one or two dictionaries (separated by NEXTLEVEL keyword) */
477
0
for (k = 0; k < 2; k++) {
478
0
  hashtab = hnj_hash_new ();
479
#ifdef VERBOSE
480
  global[k] = hashtab;
481
#endif
482
0
  hnj_hash_insert (hashtab, "", 0);
483
0
  dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
484
0
  dict[k]->num_states = 1;
485
0
  dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState));
486
0
  dict[k]->states[0].match = NULL;
487
0
  dict[k]->states[0].repl = NULL;
488
0
  dict[k]->states[0].fallback_state = -1;
489
0
  dict[k]->states[0].num_trans = 0;
490
0
  dict[k]->states[0].trans = NULL;
491
0
  dict[k]->nextlevel = NULL;
492
0
  dict[k]->lhmin = 0;
493
0
  dict[k]->rhmin = 0;
494
0
  dict[k]->clhmin = 0;
495
0
  dict[k]->crhmin = 0;
496
0
  dict[k]->nohyphen = NULL;
497
0
  dict[k]->nohyphenl = 0;
498
499
  /* read in character set info */
500
0
  if (k == 0) {
501
0
    for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
502
0
    if (get_line(dict[k]->cset, sizeof(dict[k]->cset), ctx) != NULL) {
503
0
      for (i=0;i<MAX_NAME;i++)
504
0
        if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
505
0
          dict[k]->cset[i] = 0;
506
0
    } else {
507
0
      dict[k]->cset[0] = 0;
508
0
    }
509
0
    dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
510
0
  } else {
511
0
    strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
512
0
    dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
513
0
    dict[k]->utf8 = dict[0]->utf8;
514
0
  }
515
516
0
  if (k == 0 || nextlevel) {
517
0
    while (get_line(buf, sizeof(buf), ctx) != NULL) {
518
0
      if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
519
0
        nextlevel = 1;
520
0
        break;
521
0
      } else if (buf[0] != '%') {
522
0
        hnj_hyphen_load_line(buf, dict[k], hashtab);
523
0
      }
524
0
    }
525
0
  } else if (k == 1) {
526
    /* default first level: hyphen and ASCII apostrophe */
527
0
    if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
528
0
    else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
529
0
    strncpy(buf, "1-1\n", MAX_CHARS-1); /* buf rewritten by hnj_hyphen_load here */
530
0
    buf[MAX_CHARS-1] = '\0';
531
0
    hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
532
0
    hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
533
0
    if (dict[0]->utf8) {
534
0
      hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
535
0
      hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
536
0
    }
537
0
  }
538
539
  /* Could do unioning of matches here (instead of the preprocessor script).
540
     If we did, the pseudocode would look something like this:
541
542
     foreach state in the hash table
543
        foreach i = [1..length(state) - 1]
544
           state to check is substr (state, i)
545
           look it up
546
           if found, and if there is a match, union the match in.
547
548
     It's also possible to avoid the quadratic blowup by doing the
549
     search in order of increasing state string sizes - then you
550
     can break the loop after finding the first match.
551
552
     This step should be optional in any case - if there is a
553
     preprocessed rule table, it's always faster to use that.
554
555
*/
556
557
  /* put in the fallback states */
558
0
  for (i = 0; i < HASH_SIZE; i++)
559
0
    for (e = hashtab->entries[i]; e; e = e->next)
560
0
      {
561
0
  if (*(e->key)) for (j = 1; 1; j++)
562
0
    {
563
0
      state_num = hnj_hash_lookup (hashtab, e->key + j);
564
0
      if (state_num >= 0)
565
0
        break;
566
0
    }
567
        /* KBH: FIXME state 0 fallback_state should always be -1? */
568
0
  if (e->val)
569
0
    dict[k]->states[e->val].fallback_state = state_num;
570
0
      }
571
#ifdef VERBOSE
572
  for (i = 0; i < HASH_SIZE; i++)
573
    for (e = hashtab->entries[i]; e; e = e->next)
574
      {
575
  printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
576
    dict[k]->states[e->val].fallback_state);
577
  for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
578
    printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
579
      dict[k]->states[e->val].trans[j].new_state);
580
      }
581
#endif
582
583
0
#ifndef VERBOSE
584
0
  hnj_hash_free (hashtab);
585
0
#endif
586
0
  state_num = 0;
587
0
}
588
0
  if (nextlevel) dict[0]->nextlevel = dict[1];
589
0
  else {
590
0
    dict[1] -> nextlevel = dict[0];
591
0
    dict[1]->lhmin = dict[0]->lhmin;
592
0
    dict[1]->rhmin = dict[0]->rhmin;
593
0
    dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
594
0
    dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
595
#ifdef VERBOSE
596
    HashTab *r = global[0];
597
    global[0] = global[1];
598
    global[1] = r;
599
#endif
600
0
    return dict[1];
601
0
  }
602
0
  return dict[0];
603
0
}
604
605
HyphenDict *
606
hnj_hyphen_load_file (FILE *f)
607
0
{
608
0
  return hnj_hyphen_load_impl(hnj_get_line_file, f);
609
0
}
610
611
HyphenDict *
612
hnj_hyphen_load_data (const char *fdata, size_t flen)
613
0
{
614
0
  hnj_mem_ctx ctx = { fdata, flen };
615
0
  return hnj_hyphen_load_impl(hnj_get_line_mem, &ctx);
616
0
}
617
618
/* Release dict: the strings and transition arrays of every state, the
 * chained nextlevel dictionary (recursively), the NOHYPHEN list, the
 * state array and the dictionary itself. */
void hnj_hyphen_free (HyphenDict *dict)
{
  int idx = 0;

  while (idx < dict->num_states)
    {
      HyphenState *st = &dict->states[idx++];

      if (st->match)
        hnj_free (st->match);
      if (st->repl)
        hnj_free (st->repl);
      if (st->trans)
        hnj_free (st->trans);
    }

  if (dict->nextlevel)
    hnj_hyphen_free(dict->nextlevel);

  if (dict->nohyphen)
    hnj_free(dict->nohyphen);

  hnj_free (dict->states);

  hnj_free (dict);
}
641
642
#define MAX_WORD 256
643
644
int hnj_hyphen_hyphenate (HyphenDict *dict,
645
         const char *word, int word_size,
646
         char *hyphens)
647
0
{
648
0
  char *prep_word;
649
0
  int i, j, k;
650
0
  int state;
651
0
  char ch;
652
0
  HyphenState *hstate;
653
0
  char *match;
654
0
  int offset;
655
656
0
  prep_word = (char*) hnj_malloc (word_size + 3);
657
658
0
  j = 0;
659
0
  prep_word[j++] = '.';
660
661
0
  for (i = 0; i < word_size; i++) {
662
0
    if (word[i] <= '9' && word[i] >= '0') {
663
0
      prep_word[j++] = '.';
664
0
    } else {
665
0
      prep_word[j++] = word[i];
666
0
    }
667
0
  }
668
669
0
  prep_word[j++] = '.';
670
0
  prep_word[j] = '\0';
671
672
0
  for (i = 0; i < word_size + 5; i++)
673
0
    hyphens[i] = '0';
674
675
#ifdef VERBOSE
676
  printf ("prep_word = %s\n", prep_word);
677
#endif
678
679
  /* now, run the finite state machine */
680
0
  state = 0;
681
0
  for (i = 0; i < j; i++)
682
0
    {
683
0
      ch = prep_word[i];
684
0
      for (;;)
685
0
  {
686
687
0
    if (state == -1) {
688
            /* return 1; */
689
      /*  KBH: FIXME shouldn't this be as follows? */
690
0
            state = 0;
691
0
            goto try_next_letter;
692
0
          }
693
694
#ifdef VERBOSE
695
    char *state_str;
696
    state_str = get_state_str (state, 0);
697
698
    for (k = 0; k < i - strlen (state_str); k++)
699
      putchar (' ');
700
    printf ("%s", state_str);
701
#endif
702
703
0
    hstate = &dict->states[state];
704
0
    for (k = 0; k < hstate->num_trans; k++)
705
0
      if (hstate->trans[k].ch == ch)
706
0
        {
707
0
    state = hstate->trans[k].new_state;
708
0
    goto found_state;
709
0
        }
710
0
    state = hstate->fallback_state;
711
#ifdef VERBOSE
712
    printf (" falling back, fallback_state %d\n", state);
713
#endif
714
0
  }
715
0
    found_state:
716
#ifdef VERBOSE
717
      printf ("found state %d\n",state);
718
#endif
719
      /* Additional optimization is possible here - especially,
720
   elimination of trailing zeroes from the match. Leading zeroes
721
   have already been optimized. */
722
0
      match = dict->states[state].match;
723
      /* replacing rules not handled by hyphen_hyphenate() */
724
0
      if (match && !dict->states[state].repl)
725
0
  {
726
0
    offset = i + 1 - strlen (match);
727
#ifdef VERBOSE
728
    for (k = 0; k < offset; k++)
729
      putchar (' ');
730
    printf ("%s\n", match);
731
#endif
732
    /* This is a linear search because I tried a binary search and
733
       found it to be just a teeny bit slower. */
734
0
    for (k = (offset < 0 ? -offset : 0); match[k]; k++)
735
0
      if (hyphens[offset + k] < match[k])
736
0
        hyphens[offset + k] = match[k];
737
0
  }
738
739
      /* KBH: we need this to make sure we keep looking in a word */
740
      /* for patterns even if the current character is not known in state 0 */
741
      /* since patterns for hyphenation may occur anywhere in the word */
742
0
      try_next_letter: ;
743
744
0
    }
745
#ifdef VERBOSE
746
  for (i = 0; i < j; i++)
747
    putchar (hyphens[i]);
748
  putchar ('\n');
749
#endif
750
751
0
  for (i = 0; i < j - 4; i++)
752
#if 0
753
    if (hyphens[i + 1] & 1)
754
      hyphens[i] = '-';
755
#else
756
0
    hyphens[i] = hyphens[i + 1];
757
0
#endif
758
0
  hyphens[0] = '0';
759
0
  for (; i < word_size; i++)
760
0
    hyphens[i] = '0';
761
0
  hyphens[word_size] = '\0';
762
763
0
  hnj_free (prep_word);
764
765
0
  return 0;
766
0
}
767
768
/* Unicode ligature length */
769
0
int hnj_ligature(unsigned char c) {
770
0
    switch (c) {
771
0
        case 0x80:      /* ff */
772
0
        case 0x81:      /* fi */
773
0
        case 0x82: return LIG_xx; /* fl */
774
0
        case 0x83:      /* ffi */
775
0
        case 0x84: return LIG_xxx; /* ffl */
776
0
        case 0x85:      /* long st */
777
0
        case 0x86: return LIG_xx; /* st */
778
0
    }
779
0
    return 0;
780
0
}
781
782
/* Character length of the first n bytes of the input word.
 *
 * Counting stops at n bytes or at a NUL, whichever comes first.  With
 * utf8 set, UTF-8 continuation bytes are not counted, and a ligature
 * (0xEF 0xAC followed by a third byte) adds hnj_ligature() of that
 * byte to the count.  No byte at index n or beyond is read: a ligature
 * counts only when all three of its bytes lie inside the first n bytes.
 */
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
{
    int i = 0;
    int j = 0;
    while (j < n && word[j] != '\0') {
      i++;
      /* Unicode ligature support */
      if (utf8 && n - j > 2 &&
          ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
        i += hnj_ligature((unsigned char) word[j + 2]);
      }
      /* skip the continuation bytes of this character, staying below n */
      for (j++; utf8 && j < n && (word[j] & 0xc0) == 0x80; j++);
    }
    return i;
}
797
798
/* Clear hyphenation points that would leave fewer than lhmin characters
 * at the start of the word.
 *
 * utf8          - non-zero when word is UTF-8 encoded
 * word          - the word, word_size bytes long
 * hyphens       - hyphenation values, modified in place ('0' = cleared)
 * rep, pos, cut - optional per-position replacement data; a replacement
 *                 that would leave too few characters is freed and set
 *                 to NULL
 * lhmin         - minimum number of characters before the first hyphen
 *
 * Leading digits do not count towards the minimum.  All reads of word
 * stay within word_size bytes.  Always returns 0.
 */
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
  char *** rep, int ** pos, int ** cut, int lhmin)
{
    int i = 1, j;

    /* Unicode ligature support */
    if (utf8 && word_size > 2 &&
        ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC))  {
      i += hnj_ligature((unsigned char) word[2]);
    }

    /* ignore numbers */
    for (j = 0; j < word_size && word[j] <= '9' && word[j] >= '0'; j++) i--;

    /* the outer loop advances one character per pass; the inner do/while
       also walks over that character's UTF-8 continuation bytes */
    for (j = 0; i < lhmin && j < word_size && word[j] != '\0'; i++) do {
      /* check length of the non-standard part */
      if (*rep && *pos && *cut && (*rep)[j]) {
        char * rh = strchr((*rep)[j], '=');
        if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
          hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
            free((*rep)[j]);
            (*rep)[j] = NULL;
            hyphens[j] = '0';
          }
       } else {
         hyphens[j] = '0';
       }
       j++;

       /* Unicode ligature support */
       if (utf8 && word_size - j > 2 &&
           ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
         i += hnj_ligature((unsigned char) word[j + 2]);
       }
    } while (j < word_size && utf8 && (word[j] & 0xc0) == 0x80);
    return 0;
}
833
834
/* Clear hyphenation points that would leave fewer than rhmin characters
 * at the end of the word.
 *
 * utf8          - non-zero when word is UTF-8 encoded
 * word          - the word, word_size bytes long
 * hyphens       - hyphenation values, modified in place ('0' = cleared)
 * rep, pos, cut - optional per-position replacement data; a replacement
 *                 that would leave too few characters is freed and set
 *                 to NULL
 * rhmin         - minimum number of characters after the last hyphen
 *
 * Trailing digits do not count towards the minimum.  Always returns 0.
 */
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
  char *** rep, int ** pos, int ** cut, int rhmin)
{
    int counted = 0;
    int at = word_size - 1;

    /* ignore numbers */
    while (at > 0 && word[at] <= '9' && word[at] >= '0') {
      counted--;
      at--;
    }

    for (at = word_size - 1; counted < rhmin && at > 0; at--) {
      if (*rep && *pos && *cut && (*rep)[at]) {
        /* check length of the non-standard part */
        char * rh = strchr((*rep)[at], '=');
        int start = at - (*pos)[at] + (*cut)[at] + 1;
        int word_len = 0;

        /* only measure the tail when it starts inside the word */
        if (start >= 0 && start <= word_size)
          word_len = hnj_hyphen_strnlen(word + start, word_size - start, utf8);

        if (rh && (word_len +
          hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
            free((*rep)[at]);
            (*rep)[at] = NULL;
            hyphens[at] = '0';
        }
      } else {
        hyphens[at] = '0';
      }
      /* a byte starts a character unless it is a UTF-8 continuation byte */
      if (!utf8 || (word[at] & 0xc0) != 0x80)
        counted++;
    }
    return 0;
}
863
864
/* recursive function for compound level hyphenation */
865
int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
866
    char * hyphens, char *** rep, int ** pos, int ** cut,
867
    int clhmin, int crhmin, int lend, int rend)
868
0
{
869
0
  char *prep_word;
870
0
  int i, j, k;
871
0
  int state;
872
0
  char ch;
873
0
  HyphenState *hstate;
874
0
  char *match;
875
0
  char *repl;
876
0
  signed char replindex;
877
0
  signed char replcut;
878
0
  int offset;
879
0
  int * matchlen;
880
0
  int * matchindex;
881
0
  char ** matchrepl;
882
0
  int isrepl = 0;
883
0
  int nHyphCount;
884
885
0
  size_t prep_word_size = word_size + 3;
886
0
  prep_word = (char*) hnj_malloc (prep_word_size);
887
0
  matchlen = (int*) hnj_malloc ((word_size + 3) * sizeof(int));
888
0
  memset(matchlen, 0, (word_size + 3) * sizeof(int));
889
0
  matchindex = (int*) hnj_malloc ((word_size + 3) * sizeof(int));
890
0
  matchrepl = (char**) hnj_malloc ((word_size + 3) * sizeof(char *));
891
892
0
  j = 0;
893
0
  prep_word[j++] = '.';
894
895
0
  for (i = 0; i < word_size; i++) {
896
0
    if (word[i] <= '9' && word[i] >= '0') {
897
0
      prep_word[j++] = '.';
898
0
    } else {
899
0
      prep_word[j++] = word[i];
900
0
    }
901
0
  }
902
903
904
905
0
  prep_word[j++] = '.';
906
0
  prep_word[j] = '\0';
907
908
0
  for (i = 0; i < j; i++)
909
0
    hyphens[i] = '0';
910
911
#ifdef VERBOSE
912
  printf ("prep_word = %s\n", prep_word);
913
#endif
914
915
  /* now, run the finite state machine */
916
0
  state = 0;
917
0
  for (i = 0; i < j; i++)
918
0
    {
919
0
      ch = prep_word[i];
920
0
      for (;;)
921
0
  {
922
923
0
    if (state == -1) {
924
            /* return 1; */
925
      /*  KBH: FIXME shouldn't this be as follows? */
926
0
            state = 0;
927
0
            goto try_next_letter;
928
0
          }
929
930
#ifdef VERBOSE
931
    char *state_str;
932
    state_str = get_state_str (state, 1);
933
934
    for (k = 0; k < i - strlen (state_str); k++)
935
      putchar (' ');
936
    printf ("%s", state_str);
937
#endif
938
939
0
    hstate = &dict->states[state];
940
0
    for (k = 0; k < hstate->num_trans; k++)
941
0
      if (hstate->trans[k].ch == ch)
942
0
        {
943
0
    state = hstate->trans[k].new_state;
944
0
    goto found_state;
945
0
        }
946
0
    state = hstate->fallback_state;
947
#ifdef VERBOSE
948
    printf (" falling back, fallback_state %d\n", state);
949
#endif
950
0
  }
951
0
    found_state:
952
#ifdef VERBOSE
953
      printf ("found state %d\n",state);
954
#endif
955
      /* Additional optimization is possible here - especially,
956
   elimination of trailing zeroes from the match. Leading zeroes
957
   have already been optimized. */
958
0
      match = dict->states[state].match;
959
0
      repl = dict->states[state].repl;
960
0
      replindex = dict->states[state].replindex;
961
0
      replcut = dict->states[state].replcut;
962
      /* replacing rules not handled by hyphen_hyphenate() */
963
0
      if (match)
964
0
  {
965
0
    offset = i + 1 - strlen (match);
966
#ifdef VERBOSE
967
    for (k = 0; k < offset; k++)
968
      putchar (' ');
969
    printf ("%s (%s)\n", match, repl);
970
#endif
971
0
          if (repl) {
972
0
            if (!isrepl) for(; isrepl < word_size; isrepl++) {
973
0
                matchrepl[isrepl] = NULL;
974
0
                matchindex[isrepl] = -1;
975
0
            }
976
0
            if (offset + replindex >= 0 && offset + replindex < word_size + 3)
977
0
                matchlen[offset + replindex] = replcut;
978
0
          }
979
    /* This is a linear search because I tried a binary search and
980
       found it to be just a teeny bit slower. */
981
0
    for (k = (offset < 0 ? -offset : 0); match[k]; k++) {
982
0
      if ((hyphens[offset + k] < match[k])) {
983
0
        hyphens[offset + k] = match[k];
984
0
              if (match[k]&1) {
985
0
                matchrepl[offset + k] = repl;
986
0
                if (repl && (k >= replindex) && (k <= replindex + replcut)
987
0
                    && offset + replindex >= 0) {
988
0
                    matchindex[offset + replindex] = offset + k;
989
0
                }
990
0
              }
991
0
            }
992
0
          }
993
994
0
  }
995
996
      /* KBH: we need this to make sure we keep looking in a word */
997
      /* for patterns even if the current character is not known in state 0 */
998
      /* since patterns for hyphenation may occur anywhere in the word */
999
0
      try_next_letter: ;
1000
1001
0
    }
1002
#ifdef VERBOSE
1003
  for (i = 0; i < j; i++)
1004
    putchar (hyphens[i]);
1005
  putchar ('\n');
1006
#endif
1007
1008
0
  for (i = 0; i < j - 3; i++)
1009
#if 0
1010
    if (hyphens[i + 1] & 1)
1011
      hyphens[i] = '-';
1012
#else
1013
0
    hyphens[i] = hyphens[i + 1];
1014
0
#endif
1015
0
  for (; i < word_size; i++)
1016
0
    hyphens[i] = '0';
1017
0
  hyphens[word_size] = '\0';
1018
1019
       /* now create a new char string showing hyphenation positions */
1020
       /* count the hyphens and allocate space for the new hyphenated string */
1021
0
       nHyphCount = 0;
1022
0
       for (i = 0; i < word_size; i++)
1023
0
          if (hyphens[i]&1)
1024
0
             nHyphCount++;
1025
0
       j = 0;
1026
0
       for (i = 0; i < word_size; i++) {
1027
0
           if (isrepl && matchlen[i] >= 1 && matchindex[i] >= 1 && matchindex[i] <= word_size && matchrepl[matchindex[i]]) {
1028
0
                if (rep && pos && cut) {
1029
0
                    if (!*rep)
1030
0
                        *rep = (char **) calloc(word_size, sizeof(char *));
1031
0
                    if (!*pos)
1032
0
                        *pos = (int *) calloc(word_size, sizeof(int));
1033
0
                    if (!*cut) {
1034
0
                        *cut = (int *) calloc(word_size, sizeof(int));
1035
0
                    }
1036
0
                    hnj_free((*rep)[matchindex[i] - 1]);
1037
0
                    (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
1038
0
                    (*pos)[matchindex[i] - 1] = matchindex[i] - i;
1039
0
                    (*cut)[matchindex[i] - 1] = matchlen[i];
1040
0
                }
1041
0
                j += strlen(matchrepl[matchindex[i]]);
1042
0
                i += matchlen[i] - 1;
1043
0
          }
1044
0
       }
1045
1046
0
  hnj_free (matchrepl);
1047
0
  hnj_free (matchlen);
1048
0
  hnj_free (matchindex);
1049
1050
  /* recursive hyphenation of the first (compound) level segments */
1051
0
  if (dict->nextlevel) {
1052
0
     char ** rep2;
1053
0
     int * pos2;
1054
0
     int * cut2;
1055
0
     char * hyphens2;
1056
0
     int begin = 0;
1057
1058
0
     rep2 = (char**) hnj_malloc (word_size * sizeof(char *));
1059
0
     pos2 = (int*) hnj_malloc (word_size * sizeof(int));
1060
0
     cut2 = (int*) hnj_malloc (word_size * sizeof(int));
1061
0
     hyphens2 = (char*) hnj_malloc (word_size + 3);
1062
0
     for (i = 0; i < word_size; i++) rep2[i] = NULL;
1063
0
     for (i = 0; i < word_size; i++) if
1064
0
        (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
1065
0
        if (i - begin > 0) {
1066
0
            int hyph = 0;
1067
0
            prep_word[i + 2] = '\0';
1068
            /* non-standard hyphenation at compound boundary (Schiffahrt) */
1069
0
            if (rep && *rep && *pos && *cut && (*rep)[i]) {
1070
0
                char * l = strchr((*rep)[i], '=');
1071
0
                size_t offset = 2 + i - (*pos)[i];
1072
0
                strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
1073
0
                prep_word[prep_word_size - 1] = '\0';
1074
0
                if (l) {
1075
0
                    hyph = (l - (*rep)[i]) - (*pos)[i];
1076
0
                    if (2 + i + hyph < prep_word_size)
1077
0
                        prep_word[2 + i + hyph] = '\0';
1078
0
                }
1079
0
            }
1080
0
            int sub_size = i - begin + 1 + hyph;
1081
0
            if (sub_size >= word_size) sub_size = word_size - 1;
1082
0
            if ((size_t)sub_size + begin + 2 > prep_word_size)
1083
0
                sub_size = (int)(prep_word_size - begin - 2);
1084
0
            if (sub_size < 1) sub_size = 1;
1085
0
            hnj_hyphen_hyph_(dict, prep_word + begin + 1, sub_size,
1086
0
                hyphens2, &rep2, &pos2, &cut2, clhmin,
1087
0
                crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
1088
0
            for (j = 0; j < i - begin; j++) {
1089
0
                hyphens[begin + j] = hyphens2[j];
1090
0
                if (rep2[j] && rep && pos && cut) {
1091
0
                    if (!*rep && !*pos && !*cut) {
1092
0
                        int k;
1093
0
                        *rep = (char **) malloc(sizeof(char *) * word_size);
1094
0
                        *pos = (int *) malloc(sizeof(int) * word_size);
1095
0
                        *cut = (int *) malloc(sizeof(int) * word_size);
1096
0
                        for (k = 0; k < word_size; k++) {
1097
0
                            (*rep)[k] = NULL;
1098
0
                            (*pos)[k] = 0;
1099
0
                            (*cut)[k] = 0;
1100
0
                        }
1101
0
                    }
1102
0
                    hnj_free((*rep)[begin + j]);
1103
0
                    (*rep)[begin + j] = rep2[j];
1104
0
                    (*pos)[begin + j] = pos2[j];
1105
0
                    (*cut)[begin + j] = cut2[j];
1106
0
                    rep2[j] = NULL;
1107
0
                }
1108
0
            }
1109
0
            prep_word[i + 2] = word[i + 1];
1110
0
            if (*rep && *pos && *cut && (*rep)[i]) {
1111
0
                size_t offset = 1;
1112
0
                strncpy(prep_word + offset, word, prep_word_size - offset - 1);
1113
0
                prep_word[prep_word_size - 1] = '\0';
1114
0
            }
1115
0
        }
1116
0
        begin = i + 1;
1117
0
        for (j = 0; j < word_size; j++) {
1118
0
            hnj_free(rep2[j]);
1119
0
            rep2[j] = NULL;
1120
0
        }
1121
0
     }
1122
1123
     /* non-compound */
1124
0
     if (begin == 0) {
1125
0
        hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
1126
0
            hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
1127
0
        if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1128
0
            rep, pos, cut, clhmin);
1129
0
        if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1130
0
            rep, pos, cut, crhmin);
1131
0
     }
1132
1133
0
     free(rep2);
1134
0
     free(cut2);
1135
0
     free(pos2);
1136
0
     free(hyphens2);
1137
0
  }
1138
1139
0
  hnj_free (prep_word);
1140
0
  return 0;
1141
0
}
1142
1143
/* UTF-8 normalization of hyphen and non-standard positions
 *
 * Rewrites hyphens[] in place so that it is indexed by UTF-8 character
 * instead of by byte, and terminates it after the last character.  When
 * rep, pos and cut are all non-NULL and point to non-NULL arrays, the
 * pos/cut values are converted from byte counts to character counts and
 * the three arrays are re-indexed the same way (an entry moved from byte
 * index i to character index j leaves NULL/0/0 behind at i).
 *
 * word:      input word
 * word_size: number of bytes of word to process
 * hyphens:   hyphenation vector, updated in place
 * rep, pos, cut: optional non-standard hyphenation arrays, updated in place
 *
 * Returns 1 (after printing a message to stderr) when the first byte of
 * word is a UTF-8 continuation byte, 0 otherwise.
 */
int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
  char *** rep, int ** pos, int ** cut)
{
  int i, j, k;
  if ((((unsigned char) word[0]) >> 6) == 2) {
    fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
    return 1;
  }

  /* calculate UTF-8 character positions */
  for (i = 0, j = -1; i < word_size; i++) {
    /* beginning of an UTF-8 character (not '10' start bits) */
    if ((((unsigned char) word[i]) >> 6) != 2) j++;
    hyphens[j] = hyphens[i];
    if (rep && pos && cut && *rep && *pos && *cut) {
        /* read the byte-based value before (*pos)[j] is reset: j may equal i */
        int l = (*pos)[i];
        (*pos)[j] = 0;
        /* count the characters among the l bytes ending at i; "k <= i"
           keeps word[i - k] inside the word when l is larger than i + 1 */
        for (k = 0; k < l && k <= i; k++) {
            if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
        }
        /* byte range [k, l) covered by the cut, derived from the
           unclamped start so that the end position is unchanged */
        k = i - l + 1;
        l = k + (*cut)[i];
        /* never start reading before the beginning of the word */
        if (k < 0) k = 0;
        (*cut)[j] = 0;
        for (; k < l && k < word_size; k++) {
            if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
        }
        if (j != i) {
            /* move the entry to its character index and clear the old slot */
            hnj_free((*rep)[j]);
            (*rep)[j] = (*rep)[i];
            (*rep)[i] = NULL;
            (*pos)[i] = 0;
            (*cut)[i] = 0;
        }
    }
  }
  hyphens[j + 1] = '\0';
#ifdef VERBOSE
  printf ("nums: %s\n", hyphens);
#endif
  return 0;
}
1185
1186
/* get the word with all possible hyphenations (output: hyphword)
 *
 * Copies word into hyphword, inserting '=' after every position i whose
 * hyphens[i] has its lowest bit set.  When non-standard hyphenation data
 * is supplied and (*rep)[i] is set, the last (*pos)[i] bytes already
 * written are replaced by (*rep)[i] and the input index is advanced by
 * (*cut)[i] - (*pos)[i] instead.
 *
 * word, word_size: input word and its length in bytes
 * hyphens:         hyphenation vector indexed by byte position
 * hyphword:        output buffer, at least 2 * word_size bytes; always
 *                  NUL-terminated on return
 * rep, pos, cut:   optional non-standard hyphenation arrays; any of the
 *                  pointers (or the arrays they point to) may be NULL, in
 *                  which case only standard hyphenation is produced
 */
void hnj_hyphen_hyphword(const char * word, int word_size, const char * hyphens,
    char * hyphword, char *** rep, int ** pos, int ** cut)
{

  if (word_size <= 0 || word_size > INT_MAX / 2) {
    hyphword[0] = '\0';
    return;
  }

  /* hyphword buffer size must be at least 2 * l */
  int hyphword_size = 2 * word_size - 1;

  /* check the outer pointers before dereferencing them, as
     hnj_hyphen_norm() does */
  int nonstandard = 0;
  if (rep && pos && cut && *rep && *pos && *cut) {
    nonstandard = 1;
  }

  int i;
  int j = 0;
  for (i = 0; i < word_size && j < hyphword_size; i++) {
    hyphword[j++] = word[i];
    if (hyphens[i]&1 && j < hyphword_size) {
      /* a negative pos would move j forward past the end of the buffer,
         so such entries are treated as standard hyphenation points */
      if (nonstandard && (*rep)[i] && (*pos)[i] >= 0 && j >= (*pos)[i]) {
        /* non-standard */
        j -= (*pos)[i];
        char *s = (*rep)[i];
        while (*s && j < hyphword_size) {
          hyphword[j++] = *s++;
        }
        i += (*cut)[i] - (*pos)[i];
      } else {
        /* standard */
        hyphword[j++] = '=';
      }
    }
  }
  hyphword[j] = '\0';
}
1225
1226
1227
/* main api function with default hyphenmin parameters */
1228
int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1229
         const char *word, int word_size, char * hyphens,
1230
         char *hyphword, char *** rep, int ** pos, int ** cut)
1231
0
{
1232
0
  hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1233
0
    dict->clhmin, dict->crhmin, 1, 1);
1234
0
  hnj_hyphen_lhmin(dict->utf8, word, word_size,
1235
0
    hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1236
0
  hnj_hyphen_rhmin(dict->utf8, word, word_size,
1237
0
    hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1238
1239
  /* nohyphen */
1240
0
  if (dict->nohyphen) {
1241
0
    char * nh = dict->nohyphen;
1242
0
    int nhi;
1243
0
    for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1244
0
        char * nhy = *nh ? (char *) strstr(word, nh) : NULL;
1245
0
        while (nhy) {
1246
0
            hyphens[nhy - word + strlen(nh) - 1] = '0';
1247
0
            if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';
1248
0
            nhy = (char *) strstr(nhy + 1, nh);
1249
0
        }
1250
0
        nh = nh + strlen(nh) + 1;
1251
0
    }
1252
0
  }
1253
1254
0
  if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1255
0
  if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1256
#ifdef VERBOSE
1257
  printf ("nums: %s\n", hyphens);
1258
#endif
1259
0
  return 0;
1260
0
}
1261
1262
/* previous main api function with hyphenmin parameters */
1263
int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1264
  const char *word, int word_size, char * hyphens,
1265
  char *hyphword, char *** rep, int ** pos, int ** cut,
1266
  int lhmin, int rhmin, int clhmin, int crhmin)
1267
0
{
1268
0
  lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
1269
0
  rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
1270
0
  clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
1271
0
  crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
1272
0
  hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1273
0
    clhmin, crhmin, 1, 1);
1274
0
  hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1275
0
    rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1276
0
  hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1277
0
    rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1278
1279
  /* nohyphen */
1280
0
  if (dict->nohyphen) {
1281
0
    char * nh = dict->nohyphen;
1282
0
    int nhi;
1283
0
    for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1284
0
        char * nhy = *nh ? (char *) strstr(word, nh) : NULL;
1285
0
        while (nhy) {
1286
0
            hyphens[nhy - word + strlen(nh) - 1] = '0';
1287
0
            if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';
1288
0
            nhy = (char *) strstr(nhy + 1, nh);
1289
0
        }
1290
0
        nh = nh + strlen(nh) + 1;
1291
0
    }
1292
0
  }
1293
1294
0
  if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1295
0
  if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1296
0
  return 0;
1297
0
}