Coverage Report

Created: 2025-07-07 10:01

/work/workdir/UnpackedTarball/libexttextcat/src/textcat.c
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/**
3
 * textcat.c -- routines for categorizing text
4
 *
5
 * Copyright (C) 2003 WiseGuys Internet B.V.
6
 *
7
 * THE BSD LICENSE
8
 *
9
 * Redistribution and use in source and binary forms, with or without
10
 * modification, are permitted provided that the following conditions
11
 * are met:
12
 *
13
 * - Redistributions of source code must retain the above copyright
14
 * notice, this list of conditions and the following disclaimer.
15
 *
16
 * - Redistributions in binary form must reproduce the above copyright
17
 * notice, this list of conditions and the following disclaimer in the
18
 * documentation and/or other materials provided with the
19
 * distribution.
20
 *
21
 * - Neither the name of the WiseGuys Internet B.V. nor the names of
22
 * its contributors may be used to endorse or promote products derived
23
 * from this software without specific prior written permission.
24
 *
25
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36
 *
37
 * DESCRIPTION
38
 *
39
 * These routines use the N-gram fingerprinting technique as described
40
 * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
41
 * (cf. http://www.nonlineardynamics.com/trenkle/)
42
 *
43
 * REVISION HISTORY
44
 *
45
 * Mar 27, 2003 frank@wise-guys.nl -- created
46
 *
47
 * IMPROVEMENTS:
48
 * - If two n-grams have the same frequency count, choose the shortest
49
 * - Use a better similarity measure (the article suggests Wilcoxon rank test)
50
 * - The profiles are matched one by one, which results in redundant lookups.
51
 * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
52
 *   by the use of the output buffer in textcat_t.)
53
 */
54
#ifdef HAVE_CONFIG_H
55
#include "config.h"
56
#endif
57
58
#include <stdlib.h>
59
#include <string.h>
60
61
#include "common_impl.h"
62
#include "fingerprint.h"
63
#include "textcat.h"
64
#include "constants.h"
65
66
67
typedef struct
68
{
69
70
    void **fprint;
71
    unsigned char *fprint_disable;
72
    uint4 size;
73
    uint4 maxsize;
74
    uint4 mindocsize;
75
76
    char output[MAXOUTPUTSIZE];
77
    candidate_t *tmp_candidates;
78
    boole utfaware;
79
} textcat_t;
80
81
82
static int cmpcandidates(const void *a, const void *b)
83
0
{
84
0
    const candidate_t *x = (const candidate_t *)a;
85
0
    const candidate_t *y = (const candidate_t *)b;
86
0
    return (x->score - y->score);
87
0
}
88
89
90
extern void textcat_Done(void *handle)
91
0
{
92
0
    textcat_t *h = (textcat_t *) handle;
93
0
    uint4 i;
94
95
0
    for (i = 0; i < h->size; i++)
96
0
    {
97
0
        fp_Done(h->fprint[i]);
98
0
    }
99
0
    if (h->tmp_candidates != NULL)
100
0
    {
101
0
        textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
102
0
    }
103
0
    free(h->fprint);
104
0
    free(h->fprint_disable);
105
0
    free(h);
106
107
0
}
108
109
extern int textcat_SetProperty(void *handle, textcat_Property property,
110
                               sint4 value)
111
0
{
112
0
    textcat_t *h = (textcat_t *) handle;
113
0
    switch (property)
114
0
    {
115
0
    case TCPROP_UTF8AWARE:
116
0
        if ((value == TC_TRUE) || (value == TC_FALSE))
117
0
        {
118
0
            h->utfaware = value;
119
0
            return 0;
120
0
        }
121
0
        return -2;
122
0
        break;
123
0
    case TCPROP_MINIMUM_DOCUMENT_SIZE:
124
0
        if (value > 0)
125
0
        {
126
0
            h->mindocsize = value;
127
0
            return 0;
128
0
        }
129
0
        return -2;
130
0
        break;
131
0
    default:
132
0
        break;
133
0
    }
134
0
    return -1;
135
0
}
136
137
/** Replaces older function */
138
extern void *textcat_Init(const char *conffile)
139
0
{
140
0
    return special_textcat_Init(conffile, DEFAULT_FINGERPRINTS_PATH);
141
0
}
142
143
/**
144
 * Originaly this function had only one parameter (conffile) it has been modified since OOo use
145
 * Basicaly prefix is the directory path where fingerprints are stored
146
 */
147
extern void *special_textcat_Init(const char *conffile, const char *prefix)
148
0
{
149
0
    textcat_t *h;
150
0
    char *finger_print_file_name;
151
0
    size_t finger_print_file_name_size;
152
0
    size_t prefix_size;
153
0
    char line[1024];
154
0
    FILE *fp;
155
156
0
    fp = fopen(conffile, "r");
157
0
    if (!fp)
158
0
    {
159
0
#ifdef VERBOSE
160
0
        fprintf(stderr, "Failed to open config file '%s'\n", conffile);
161
0
#endif
162
0
        return NULL;
163
0
    }
164
165
0
    h = (textcat_t *) malloc(sizeof(textcat_t));
166
0
    h->size = 0;
167
0
    h->maxsize = 16;
168
0
    h->mindocsize = MINDOCSIZE;
169
0
    h->fprint = (void **)malloc(sizeof(void *) * h->maxsize);
170
0
    h->fprint_disable =
171
0
        (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
172
    /* added to store the state of languages */
173
0
    h->tmp_candidates = NULL;
174
0
    h->utfaware = TC_TRUE;
175
176
0
    prefix_size = strlen(prefix);
177
0
    finger_print_file_name_size = prefix_size + 1;
178
0
    finger_print_file_name =
179
0
        (char *)malloc(sizeof(char) * (finger_print_file_name_size + 1024));
180
0
    finger_print_file_name[0] = '\0';
181
0
    strcat(finger_print_file_name, prefix);
182
183
0
    while (wg_getline(line, 1024, fp))
184
0
    {
185
0
        char *p;
186
0
        char *segment[4];
187
188
        /*** Skip comments ***/
189
0
        if ((p = strchr(line, '#')))
190
0
        {
191
0
            *p = '\0';
192
0
        }
193
194
0
        if (wg_split(segment, line, line, 4) < 2)
195
0
        {
196
0
            continue;
197
0
        }
198
199
        /*** Ensure enough space ***/
200
0
        if (h->size == h->maxsize)
201
0
        {
202
0
            h->maxsize *= 2;
203
0
            h->fprint =
204
0
                (void **)realloc(h->fprint, sizeof(void *) * h->maxsize);
205
0
            h->fprint_disable =
206
0
                (unsigned char *)realloc(h->fprint_disable,
207
0
                                         sizeof(unsigned char) * h->maxsize);
208
0
        }
209
210
        /*** Load data ***/
211
0
        if ((h->fprint[h->size] = fp_Init(segment[1])) == NULL)
212
0
        {
213
0
            goto BAILOUT;
214
0
        }
215
216
0
        while (prefix_size + strlen(segment[0]) > finger_print_file_name_size)
217
0
        {
218
0
            char *tmp;
219
0
            size_t tmp_size = finger_print_file_name_size * 2;
220
0
            tmp =
221
0
                (char *)realloc(finger_print_file_name,
222
0
                                sizeof(char) * (tmp_size + 1));
223
0
            if (tmp == NULL)
224
0
            {
225
0
                goto BAILOUT;
226
0
            }
227
0
            else
228
0
            {
229
0
                finger_print_file_name = tmp;
230
0
                finger_print_file_name_size = tmp_size;
231
0
            }
232
0
        }
233
0
        finger_print_file_name[prefix_size] = '\0';
234
0
        strcat(finger_print_file_name, segment[0]);
235
236
0
        if (fp_Read(h->fprint[h->size], finger_print_file_name, 400) == 0)
237
0
            goto BAILOUT;
238
0
        h->fprint_disable[h->size] = 0xF0;  /* 0xF0 is the code for enabled
239
                                               languages, 0x0F is for disabled 
240
                                             */
241
0
        h->size++;
242
0
    }
243
244
0
    free(finger_print_file_name);
245
246
0
    fclose(fp);
247
0
    return h;
248
249
0
  BAILOUT:
250
0
    free(finger_print_file_name);
251
0
    fclose(fp);
252
0
    textcat_Done(h);
253
0
    return NULL;
254
0
}
255
256
extern candidate_t *textcat_GetClassifyFullOutput(void *handle)
257
0
{
258
0
    textcat_t *h = (textcat_t *) handle;
259
0
    return (candidate_t *) malloc(sizeof(candidate_t) * h->size);
260
0
}
261
262
extern void textcat_ReleaseClassifyFullOutput(void *handle,
263
                                              candidate_t * candidates)
264
0
{
265
0
    if (candidates != NULL)
266
0
    {
267
0
        free(candidates);
268
0
    }
269
0
}
270
271
extern char *textcat_Classify(void *handle, const char *buffer, size_t size)
272
0
{
273
0
    textcat_t *h = (textcat_t *) handle;
274
0
    char *result = h->output;
275
0
    uint4 i, cnt;
276
277
0
    if (h->tmp_candidates == NULL)
278
0
    {
279
0
        h->tmp_candidates = textcat_GetClassifyFullOutput(h);
280
0
    }
281
282
0
    cnt = textcat_ClassifyFull(h, buffer, size, h->tmp_candidates);
283
284
0
    switch (cnt)
285
0
    {
286
0
    case TEXTCAT_RESULT_UNKNOWN:
287
0
        result = TEXTCAT_RESULT_UNKNOWN_STR;
288
0
        break;
289
0
    case TEXTCAT_RESULT_SHORT:
290
0
        result = TEXTCAT_RESULT_SHORT_STR;
291
0
        break;
292
0
    default:
293
0
        {
294
0
            const char *plimit = result + MAXOUTPUTSIZE;
295
0
            char *p = result;
296
297
0
            *p = '\0';
298
0
            for (i = 0; i < cnt; i++)
299
0
            {
300
0
                p = wg_strgmov(p, "[", plimit);
301
0
                p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
302
0
                p = wg_strgmov(p, "]", plimit);
303
0
            }
304
0
        }
305
0
    }
306
307
0
    return result;
308
0
}
309
310
311
extern int textcat_ClassifyFull(void *handle, const char *buffer, size_t size,
312
                                candidate_t * candidates)
313
0
{
314
0
    textcat_t *h = (textcat_t *) handle;
315
0
    uint4 i, cnt = 0;
316
0
    int minscore = MAXSCORE;
317
0
    int threshold = minscore;
318
319
0
    void *unknown;
320
321
0
    unknown = fp_Init(NULL);
322
0
    fp_SetProperty(unknown, TCPROP_UTF8AWARE, h->utfaware);
323
0
    fp_SetProperty(unknown, TCPROP_MINIMUM_DOCUMENT_SIZE, h->mindocsize);
324
0
    if (fp_Create(unknown, buffer, size, MAXNGRAMS) == 0)
325
0
    {
326
        /*** Too little information ***/
327
0
        fp_Done(unknown);
328
0
        return TEXTCAT_RESULT_SHORT;
329
0
    }
330
331
    /*** Calculate the score for each category. ***/
332
0
    for (i = 0; i < h->size; i++)
333
0
    {
334
0
        int score;
335
0
        if (h->fprint_disable[i] & 0x0F)
336
0
        {                       /* if this language is disabled */
337
0
            score = MAXSCORE;
338
0
        }
339
0
        else
340
0
        {
341
0
            score = fp_Compare(h->fprint[i], unknown, threshold);
342
            /* printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score); */
343
0
        }
344
0
        candidates[i].score = score;
345
0
        candidates[i].name = fp_Name(h->fprint[i]);
346
0
        if (score < minscore)
347
0
        {
348
0
            minscore = score;
349
0
            threshold = (int)((double)score * THRESHOLDVALUE);
350
0
        }
351
0
    }
352
353
    /*** Find the best performers ***/
354
0
    for (i = 0, cnt = 0; i < h->size; i++)
355
0
    {
356
0
        if (candidates[i].score < threshold)
357
0
        {
358
0
            if (++cnt == MAXCANDIDATES + 1)
359
0
            {
360
0
                break;
361
0
            }
362
363
0
            memcpy(&candidates[cnt - 1], &candidates[i], sizeof(candidate_t));
364
365
0
        }
366
0
    }
367
368
0
    fp_Done(unknown);
369
    /*** The verdict ***/
370
0
    if (cnt == MAXCANDIDATES + 1)
371
0
    {
372
0
        return TEXTCAT_RESULT_UNKNOWN;
373
0
    }
374
0
    else
375
0
    {
376
0
        qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates);
377
0
        return cnt;
378
0
    }
379
0
}
380
381
extern const char *textcat_Version(void)
382
0
{
383
0
    return EXTTEXTCAT_VERSION;
384
0
}
385
386
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */