Coverage Report

Created: 2026-06-30 11:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/workdir/UnpackedTarball/libexttextcat/src/textcat.c
Line
Count
Source
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/**
3
 * textcat.c -- routines for categorizing text
4
 *
5
 * Copyright (C) 2003 WiseGuys Internet B.V.
6
 *
7
 * THE BSD LICENSE
8
 *
9
 * Redistribution and use in source and binary forms, with or without
10
 * modification, are permitted provided that the following conditions
11
 * are met:
12
 *
13
 * - Redistributions of source code must retain the above copyright
14
 * notice, this list of conditions and the following disclaimer.
15
 *
16
 * - Redistributions in binary form must reproduce the above copyright
17
 * notice, this list of conditions and the following disclaimer in the
18
 * documentation and/or other materials provided with the
19
 * distribution.
20
 *
21
 * - Neither the name of the WiseGuys Internet B.V. nor the names of
22
 * its contributors may be used to endorse or promote products derived
23
 * from this software without specific prior written permission.
24
 *
25
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36
 *
37
 * DESCRIPTION
38
 *
39
 * These routines use the N-gram fingerprinting technique as described
40
 * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
41
 * (cf. http://www.nonlineardynamics.com/trenkle/)
42
 *
43
 * REVISION HISTORY
44
 *
45
 * Mar 27, 2003 frank@wise-guys.nl -- created
46
 *
47
 * IMPROVEMENTS:
48
 * - If two n-grams have the same frequency count, choose the shortest
49
 * - Use a better similarity measure (the article suggests Wilcoxon rank test)
50
 * - The profiles are matched one by one, which results in redundant lookups.
51
 * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
52
 *   by the use of the output buffer in textcat_t.)
53
 */
54
#ifdef HAVE_CONFIG_H
55
#include "config.h"
56
#endif
57
58
#include <stdlib.h>
59
#include <string.h>
60
61
#include "common_impl.h"
62
#include "fingerprint.h"
63
#include "textcat.h"
64
#include "constants.h"
65
66
67
typedef struct
68
{
69
70
    void **fprint;
71
    unsigned char *fprint_disable;
72
    uint4 size;
73
    uint4 maxsize;
74
    uint4 mindocsize;
75
76
    char output[MAXOUTPUTSIZE];
77
    candidate_t *tmp_candidates;
78
    boole utfaware;
79
} textcat_t;
80
81
82
static int cmpcandidates(const void *a, const void *b)
83
0
{
84
0
    const candidate_t *x = (const candidate_t *)a;
85
0
    const candidate_t *y = (const candidate_t *)b;
86
0
    return (x->score - y->score);
87
0
}
88
89
90
extern void textcat_Done(void *handle)
91
0
{
92
0
    textcat_t *h = (textcat_t *) handle;
93
0
    uint4 i;
94
95
0
    for (i = 0; i < h->size; i++)
96
0
    {
97
0
        fp_Done(h->fprint[i]);
98
0
    }
99
0
    if (h->tmp_candidates != NULL)
100
0
    {
101
0
        textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
102
0
    }
103
0
    free(h->fprint);
104
0
    free(h->fprint_disable);
105
0
    free(h);
106
107
0
}
108
109
extern int textcat_SetProperty(void *handle, textcat_Property property,
110
                               sint4 value)
111
0
{
112
0
    textcat_t *h = (textcat_t *) handle;
113
0
    switch (property)
114
0
    {
115
0
    case TCPROP_UTF8AWARE:
116
0
        if ((value == TC_TRUE) || (value == TC_FALSE))
117
0
        {
118
0
            h->utfaware = value;
119
0
            return 0;
120
0
        }
121
0
        return -2;
122
0
        break;
123
0
    case TCPROP_MINIMUM_DOCUMENT_SIZE:
124
0
        if (value > 0)
125
0
        {
126
0
            h->mindocsize = value;
127
0
            return 0;
128
0
        }
129
0
        return -2;
130
0
        break;
131
0
    default:
132
0
        break;
133
0
    }
134
0
    return -1;
135
0
}
136
137
/** Replaces older function */
138
extern void *textcat_Init(const char *conffile)
139
0
{
140
0
    return special_textcat_Init(conffile, DEFAULT_FINGERPRINTS_PATH);
141
0
}
142
143
/**
144
 * Originaly this function had only one parameter (conffile) it has been modified since OOo use
145
 * Basicaly prefix is the directory path where fingerprints are stored
146
 */
147
extern void *special_textcat_Init(const char *conffile, const char *prefix)
148
0
{
149
0
    textcat_t *h;
150
0
    char *finger_print_file_name;
151
0
    size_t finger_print_file_name_size;
152
0
    size_t prefix_size;
153
0
    char line[1024];
154
0
    FILE *fp;
155
156
0
    fp = fopen(conffile, "r");
157
0
    if (!fp)
158
0
    {
159
0
#ifdef VERBOSE
160
0
        fprintf(stderr, "Failed to open config file '%s'\n", conffile);
161
0
#endif
162
0
        return NULL;
163
0
    }
164
165
0
    h = (textcat_t *) malloc(sizeof(textcat_t));
166
0
    h->size = 0;
167
0
    h->maxsize = 16;
168
0
    h->mindocsize = MINDOCSIZE;
169
0
    h->fprint = (void **)malloc(sizeof(void *) * h->maxsize);
170
0
    h->fprint_disable =
171
0
        (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
172
    /* added to store the state of languages */
173
0
    h->tmp_candidates = NULL;
174
0
    h->utfaware = TC_TRUE;
175
176
0
    prefix_size = strlen(prefix);
177
0
    finger_print_file_name_size = prefix_size + 1;
178
0
    finger_print_file_name =
179
0
        (char *)malloc(sizeof(char) * (finger_print_file_name_size + 1024));
180
0
    finger_print_file_name[0] = '\0';
181
0
    strcat(finger_print_file_name, prefix);
182
183
0
    while (wg_getline(line, 1024, fp))
184
0
    {
185
0
        char *p;
186
0
        char *segment[4];
187
188
        /*** Skip comments ***/
189
0
        if ((p = strchr(line, '#')))
190
0
        {
191
0
            *p = '\0';
192
0
        }
193
194
0
        if (wg_split(segment, line, line, 4) < 2)
195
0
        {
196
0
            continue;
197
0
        }
198
199
        /*** Ensure enough space ***/
200
0
        if (h->size == h->maxsize)
201
0
        {
202
0
            h->maxsize *= 2;
203
0
            h->fprint =
204
0
                (void **)realloc(h->fprint, sizeof(void *) * h->maxsize);
205
0
            h->fprint_disable =
206
0
                (unsigned char *)realloc(h->fprint_disable,
207
0
                                         sizeof(unsigned char) * h->maxsize);
208
0
        }
209
210
        /*** Load data ***/
211
0
        if ((h->fprint[h->size] = fp_Init(segment[1])) == NULL)
212
0
        {
213
0
            goto BAILOUT;
214
0
        }
215
216
0
        while (prefix_size + strlen(segment[0]) > finger_print_file_name_size)
217
0
        {
218
0
            char *tmp;
219
0
            size_t tmp_size = finger_print_file_name_size * 2;
220
0
            tmp =
221
0
                (char *)realloc(finger_print_file_name,
222
0
                                sizeof(char) * (tmp_size + 1));
223
0
            if (tmp == NULL)
224
0
            {
225
0
                fp_Done(h->fprint[h->size]);
226
0
                goto BAILOUT;
227
0
            }
228
0
            else
229
0
            {
230
0
                finger_print_file_name = tmp;
231
0
                finger_print_file_name_size = tmp_size;
232
0
            }
233
0
        }
234
0
        finger_print_file_name[prefix_size] = '\0';
235
0
        strcat(finger_print_file_name, segment[0]);
236
237
0
        if (fp_Read(h->fprint[h->size], finger_print_file_name, 400) == 0)
238
0
        {
239
0
            fp_Done(h->fprint[h->size]);
240
0
            goto BAILOUT;
241
0
        }
242
0
        h->fprint_disable[h->size] = 0xF0;  /* 0xF0 is the code for enabled
243
                                               languages, 0x0F is for disabled 
244
                                             */
245
0
        h->size++;
246
0
    }
247
248
0
    free(finger_print_file_name);
249
250
0
    fclose(fp);
251
0
    return h;
252
253
0
  BAILOUT:
254
0
    free(finger_print_file_name);
255
0
    fclose(fp);
256
0
    textcat_Done(h);
257
0
    return NULL;
258
0
}
259
260
extern candidate_t *textcat_GetClassifyFullOutput(void *handle)
261
0
{
262
0
    textcat_t *h = (textcat_t *) handle;
263
0
    return (candidate_t *) malloc(sizeof(candidate_t) * h->size);
264
0
}
265
266
extern void textcat_ReleaseClassifyFullOutput(void *handle,
267
                                              candidate_t * candidates)
268
0
{
269
0
    if (candidates != NULL)
270
0
    {
271
0
        free(candidates);
272
0
    }
273
0
}
274
275
extern char *textcat_Classify(void *handle, const char *buffer, size_t size)
276
0
{
277
0
    textcat_t *h = (textcat_t *) handle;
278
0
    char *result = h->output;
279
0
    uint4 i, cnt;
280
281
0
    if (h->tmp_candidates == NULL)
282
0
    {
283
0
        h->tmp_candidates = textcat_GetClassifyFullOutput(h);
284
0
    }
285
286
0
    cnt = textcat_ClassifyFull(h, buffer, size, h->tmp_candidates);
287
288
0
    switch (cnt)
289
0
    {
290
0
    case TEXTCAT_RESULT_UNKNOWN:
291
0
        result = TEXTCAT_RESULT_UNKNOWN_STR;
292
0
        break;
293
0
    case TEXTCAT_RESULT_SHORT:
294
0
        result = TEXTCAT_RESULT_SHORT_STR;
295
0
        break;
296
0
    default:
297
0
        {
298
0
            const char *plimit = result + MAXOUTPUTSIZE;
299
0
            char *p = result;
300
301
0
            *p = '\0';
302
0
            for (i = 0; i < cnt; i++)
303
0
            {
304
0
                p = wg_strgmov(p, "[", plimit);
305
0
                p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
306
0
                p = wg_strgmov(p, "]", plimit);
307
0
            }
308
0
        }
309
0
    }
310
311
0
    return result;
312
0
}
313
314
315
extern int textcat_ClassifyFull(void *handle, const char *buffer, size_t size,
316
                                candidate_t * candidates)
317
0
{
318
0
    textcat_t *h = (textcat_t *) handle;
319
0
    uint4 i, cnt = 0;
320
0
    int minscore = MAXSCORE;
321
0
    int threshold = minscore;
322
323
0
    void *unknown;
324
325
0
    unknown = fp_Init(NULL);
326
0
    fp_SetProperty(unknown, TCPROP_UTF8AWARE, h->utfaware);
327
0
    fp_SetProperty(unknown, TCPROP_MINIMUM_DOCUMENT_SIZE, h->mindocsize);
328
0
    if (fp_Create(unknown, buffer, size, MAXNGRAMS) == 0)
329
0
    {
330
        /*** Too little information ***/
331
0
        fp_Done(unknown);
332
0
        return TEXTCAT_RESULT_SHORT;
333
0
    }
334
335
    /*** Calculate the score for each category. ***/
336
0
    for (i = 0; i < h->size; i++)
337
0
    {
338
0
        int score;
339
0
        if (h->fprint_disable[i] & 0x0F)
340
0
        {                       /* if this language is disabled */
341
0
            score = MAXSCORE;
342
0
        }
343
0
        else
344
0
        {
345
0
            score = fp_Compare(h->fprint[i], unknown, threshold);
346
            /* printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score); */
347
0
        }
348
0
        candidates[i].score = score;
349
0
        candidates[i].name = fp_Name(h->fprint[i]);
350
0
        if (score < minscore)
351
0
        {
352
0
            minscore = score;
353
0
            threshold = (int)((double)score * THRESHOLDVALUE);
354
0
        }
355
0
    }
356
357
    /*** Find the best performers ***/
358
0
    for (i = 0, cnt = 0; i < h->size; i++)
359
0
    {
360
0
        if (candidates[i].score < threshold)
361
0
        {
362
0
            if (++cnt == MAXCANDIDATES + 1)
363
0
            {
364
0
                break;
365
0
            }
366
367
0
            memcpy(&candidates[cnt - 1], &candidates[i], sizeof(candidate_t));
368
369
0
        }
370
0
    }
371
372
0
    fp_Done(unknown);
373
    /*** The verdict ***/
374
0
    if (cnt == MAXCANDIDATES + 1)
375
0
    {
376
0
        return TEXTCAT_RESULT_UNKNOWN;
377
0
    }
378
0
    else
379
0
    {
380
0
        qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates);
381
0
        return cnt;
382
0
    }
383
0
}
384
385
extern const char *textcat_Version(void)
386
0
{
387
0
    return EXTTEXTCAT_VERSION;
388
0
}
389
390
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */