/work/workdir/UnpackedTarball/libexttextcat/src/textcat.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /** |
3 | | * textcat.c -- routines for categorizing text |
4 | | * |
5 | | * Copyright (C) 2003 WiseGuys Internet B.V. |
6 | | * |
7 | | * THE BSD LICENSE |
8 | | * |
9 | | * Redistribution and use in source and binary forms, with or without |
10 | | * modification, are permitted provided that the following conditions |
11 | | * are met: |
12 | | * |
13 | | * - Redistributions of source code must retain the above copyright |
14 | | * notice, this list of conditions and the following disclaimer. |
15 | | * |
16 | | * - Redistributions in binary form must reproduce the above copyright |
17 | | * notice, this list of conditions and the following disclaimer in the |
18 | | * documentation and/or other materials provided with the |
19 | | * distribution. |
20 | | * |
21 | | * - Neither the name of the WiseGuys Internet B.V. nor the names of |
22 | | * its contributors may be used to endorse or promote products derived |
23 | | * from this software without specific prior written permission. |
24 | | * |
25 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
26 | | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
27 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
28 | | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
29 | | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
30 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
31 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
32 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
33 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
34 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
35 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
36 | | * |
37 | | * DESCRIPTION |
38 | | * |
39 | | * These routines use the N-gram fingerprinting technique as described |
40 | | * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization. |
41 | | * (cf. http://www.nonlineardynamics.com/trenkle/) |
42 | | * |
43 | | * REVISION HISTORY |
44 | | * |
45 | | * Mar 27, 2003 frank@wise-guys.nl -- created |
46 | | * |
47 | | * IMPROVEMENTS: |
48 | | * - If two n-grams have the same frequency count, choose the shortest |
49 | | * - Use a better similarity measure (the article suggests Wilcoxon rank test) |
50 | | * - The profiles are matched one by one, which results in redundant lookups. |
51 | | * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned |
52 | | * by the use of the output buffer in textcat_t.) |
53 | | */ |
54 | | #ifdef HAVE_CONFIG_H |
55 | | #include "config.h" |
56 | | #endif |
57 | | |
58 | | #include <stdlib.h> |
59 | | #include <string.h> |
60 | | |
61 | | #include "common_impl.h" |
62 | | #include "fingerprint.h" |
63 | | #include "textcat.h" |
64 | | #include "constants.h" |
65 | | |
66 | | |
67 | | typedef struct |
68 | | { |
69 | | |
70 | | void **fprint; |
71 | | unsigned char *fprint_disable; |
72 | | uint4 size; |
73 | | uint4 maxsize; |
74 | | uint4 mindocsize; |
75 | | |
76 | | char output[MAXOUTPUTSIZE]; |
77 | | candidate_t *tmp_candidates; |
78 | | boole utfaware; |
79 | | } textcat_t; |
80 | | |
81 | | |
82 | | static int cmpcandidates(const void *a, const void *b) |
83 | 0 | { |
84 | 0 | const candidate_t *x = (const candidate_t *)a; |
85 | 0 | const candidate_t *y = (const candidate_t *)b; |
86 | 0 | return (x->score - y->score); |
87 | 0 | } |
88 | | |
89 | | |
90 | | extern void textcat_Done(void *handle) |
91 | 0 | { |
92 | 0 | textcat_t *h = (textcat_t *) handle; |
93 | 0 | uint4 i; |
94 | |
|
95 | 0 | for (i = 0; i < h->size; i++) |
96 | 0 | { |
97 | 0 | fp_Done(h->fprint[i]); |
98 | 0 | } |
99 | 0 | if (h->tmp_candidates != NULL) |
100 | 0 | { |
101 | 0 | textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates); |
102 | 0 | } |
103 | 0 | free(h->fprint); |
104 | 0 | free(h->fprint_disable); |
105 | 0 | free(h); |
106 | |
|
107 | 0 | } |
108 | | |
109 | | extern int textcat_SetProperty(void *handle, textcat_Property property, |
110 | | sint4 value) |
111 | 0 | { |
112 | 0 | textcat_t *h = (textcat_t *) handle; |
113 | 0 | switch (property) |
114 | 0 | { |
115 | 0 | case TCPROP_UTF8AWARE: |
116 | 0 | if ((value == TC_TRUE) || (value == TC_FALSE)) |
117 | 0 | { |
118 | 0 | h->utfaware = value; |
119 | 0 | return 0; |
120 | 0 | } |
121 | 0 | return -2; |
122 | 0 | break; |
123 | 0 | case TCPROP_MINIMUM_DOCUMENT_SIZE: |
124 | 0 | if (value > 0) |
125 | 0 | { |
126 | 0 | h->mindocsize = value; |
127 | 0 | return 0; |
128 | 0 | } |
129 | 0 | return -2; |
130 | 0 | break; |
131 | 0 | default: |
132 | 0 | break; |
133 | 0 | } |
134 | 0 | return -1; |
135 | 0 | } |
136 | | |
137 | | /** Replaces older function */ |
138 | | extern void *textcat_Init(const char *conffile) |
139 | 0 | { |
140 | 0 | return special_textcat_Init(conffile, DEFAULT_FINGERPRINTS_PATH); |
141 | 0 | } |
142 | | |
143 | | /** |
144 | | * Originaly this function had only one parameter (conffile) it has been modified since OOo use |
145 | | * Basicaly prefix is the directory path where fingerprints are stored |
146 | | */ |
147 | | extern void *special_textcat_Init(const char *conffile, const char *prefix) |
148 | 0 | { |
149 | 0 | textcat_t *h; |
150 | 0 | char *finger_print_file_name; |
151 | 0 | size_t finger_print_file_name_size; |
152 | 0 | size_t prefix_size; |
153 | 0 | char line[1024]; |
154 | 0 | FILE *fp; |
155 | |
|
156 | 0 | fp = fopen(conffile, "r"); |
157 | 0 | if (!fp) |
158 | 0 | { |
159 | 0 | #ifdef VERBOSE |
160 | 0 | fprintf(stderr, "Failed to open config file '%s'\n", conffile); |
161 | 0 | #endif |
162 | 0 | return NULL; |
163 | 0 | } |
164 | | |
165 | 0 | h = (textcat_t *) malloc(sizeof(textcat_t)); |
166 | 0 | h->size = 0; |
167 | 0 | h->maxsize = 16; |
168 | 0 | h->mindocsize = MINDOCSIZE; |
169 | 0 | h->fprint = (void **)malloc(sizeof(void *) * h->maxsize); |
170 | 0 | h->fprint_disable = |
171 | 0 | (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize); |
172 | | /* added to store the state of languages */ |
173 | 0 | h->tmp_candidates = NULL; |
174 | 0 | h->utfaware = TC_TRUE; |
175 | |
|
176 | 0 | prefix_size = strlen(prefix); |
177 | 0 | finger_print_file_name_size = prefix_size + 1; |
178 | 0 | finger_print_file_name = |
179 | 0 | (char *)malloc(sizeof(char) * (finger_print_file_name_size + 1024)); |
180 | 0 | finger_print_file_name[0] = '\0'; |
181 | 0 | strcat(finger_print_file_name, prefix); |
182 | |
|
183 | 0 | while (wg_getline(line, 1024, fp)) |
184 | 0 | { |
185 | 0 | char *p; |
186 | 0 | char *segment[4]; |
187 | | |
188 | | /*** Skip comments ***/ |
189 | 0 | if ((p = strchr(line, '#'))) |
190 | 0 | { |
191 | 0 | *p = '\0'; |
192 | 0 | } |
193 | |
|
194 | 0 | if (wg_split(segment, line, line, 4) < 2) |
195 | 0 | { |
196 | 0 | continue; |
197 | 0 | } |
198 | | |
199 | | /*** Ensure enough space ***/ |
200 | 0 | if (h->size == h->maxsize) |
201 | 0 | { |
202 | 0 | h->maxsize *= 2; |
203 | 0 | h->fprint = |
204 | 0 | (void **)realloc(h->fprint, sizeof(void *) * h->maxsize); |
205 | 0 | h->fprint_disable = |
206 | 0 | (unsigned char *)realloc(h->fprint_disable, |
207 | 0 | sizeof(unsigned char) * h->maxsize); |
208 | 0 | } |
209 | | |
210 | | /*** Load data ***/ |
211 | 0 | if ((h->fprint[h->size] = fp_Init(segment[1])) == NULL) |
212 | 0 | { |
213 | 0 | goto BAILOUT; |
214 | 0 | } |
215 | | |
216 | 0 | while (prefix_size + strlen(segment[0]) > finger_print_file_name_size) |
217 | 0 | { |
218 | 0 | char *tmp; |
219 | 0 | size_t tmp_size = finger_print_file_name_size * 2; |
220 | 0 | tmp = |
221 | 0 | (char *)realloc(finger_print_file_name, |
222 | 0 | sizeof(char) * (tmp_size + 1)); |
223 | 0 | if (tmp == NULL) |
224 | 0 | { |
225 | 0 | goto BAILOUT; |
226 | 0 | } |
227 | 0 | else |
228 | 0 | { |
229 | 0 | finger_print_file_name = tmp; |
230 | 0 | finger_print_file_name_size = tmp_size; |
231 | 0 | } |
232 | 0 | } |
233 | 0 | finger_print_file_name[prefix_size] = '\0'; |
234 | 0 | strcat(finger_print_file_name, segment[0]); |
235 | |
|
236 | 0 | if (fp_Read(h->fprint[h->size], finger_print_file_name, 400) == 0) |
237 | 0 | goto BAILOUT; |
238 | 0 | h->fprint_disable[h->size] = 0xF0; /* 0xF0 is the code for enabled |
239 | | languages, 0x0F is for disabled |
240 | | */ |
241 | 0 | h->size++; |
242 | 0 | } |
243 | | |
244 | 0 | free(finger_print_file_name); |
245 | |
|
246 | 0 | fclose(fp); |
247 | 0 | return h; |
248 | | |
249 | 0 | BAILOUT: |
250 | 0 | free(finger_print_file_name); |
251 | 0 | fclose(fp); |
252 | 0 | textcat_Done(h); |
253 | 0 | return NULL; |
254 | 0 | } |
255 | | |
256 | | extern candidate_t *textcat_GetClassifyFullOutput(void *handle) |
257 | 0 | { |
258 | 0 | textcat_t *h = (textcat_t *) handle; |
259 | 0 | return (candidate_t *) malloc(sizeof(candidate_t) * h->size); |
260 | 0 | } |
261 | | |
262 | | extern void textcat_ReleaseClassifyFullOutput(void *handle, |
263 | | candidate_t * candidates) |
264 | 0 | { |
265 | 0 | if (candidates != NULL) |
266 | 0 | { |
267 | 0 | free(candidates); |
268 | 0 | } |
269 | 0 | } |
270 | | |
271 | | extern char *textcat_Classify(void *handle, const char *buffer, size_t size) |
272 | 0 | { |
273 | 0 | textcat_t *h = (textcat_t *) handle; |
274 | 0 | char *result = h->output; |
275 | 0 | uint4 i, cnt; |
276 | |
|
277 | 0 | if (h->tmp_candidates == NULL) |
278 | 0 | { |
279 | 0 | h->tmp_candidates = textcat_GetClassifyFullOutput(h); |
280 | 0 | } |
281 | |
|
282 | 0 | cnt = textcat_ClassifyFull(h, buffer, size, h->tmp_candidates); |
283 | |
|
284 | 0 | switch (cnt) |
285 | 0 | { |
286 | 0 | case TEXTCAT_RESULT_UNKNOWN: |
287 | 0 | result = TEXTCAT_RESULT_UNKNOWN_STR; |
288 | 0 | break; |
289 | 0 | case TEXTCAT_RESULT_SHORT: |
290 | 0 | result = TEXTCAT_RESULT_SHORT_STR; |
291 | 0 | break; |
292 | 0 | default: |
293 | 0 | { |
294 | 0 | const char *plimit = result + MAXOUTPUTSIZE; |
295 | 0 | char *p = result; |
296 | |
|
297 | 0 | *p = '\0'; |
298 | 0 | for (i = 0; i < cnt; i++) |
299 | 0 | { |
300 | 0 | p = wg_strgmov(p, "[", plimit); |
301 | 0 | p = wg_strgmov(p, h->tmp_candidates[i].name, plimit); |
302 | 0 | p = wg_strgmov(p, "]", plimit); |
303 | 0 | } |
304 | 0 | } |
305 | 0 | } |
306 | | |
307 | 0 | return result; |
308 | 0 | } |
309 | | |
310 | | |
311 | | extern int textcat_ClassifyFull(void *handle, const char *buffer, size_t size, |
312 | | candidate_t * candidates) |
313 | 0 | { |
314 | 0 | textcat_t *h = (textcat_t *) handle; |
315 | 0 | uint4 i, cnt = 0; |
316 | 0 | int minscore = MAXSCORE; |
317 | 0 | int threshold = minscore; |
318 | |
|
319 | 0 | void *unknown; |
320 | |
|
321 | 0 | unknown = fp_Init(NULL); |
322 | 0 | fp_SetProperty(unknown, TCPROP_UTF8AWARE, h->utfaware); |
323 | 0 | fp_SetProperty(unknown, TCPROP_MINIMUM_DOCUMENT_SIZE, h->mindocsize); |
324 | 0 | if (fp_Create(unknown, buffer, size, MAXNGRAMS) == 0) |
325 | 0 | { |
326 | | /*** Too little information ***/ |
327 | 0 | fp_Done(unknown); |
328 | 0 | return TEXTCAT_RESULT_SHORT; |
329 | 0 | } |
330 | | |
331 | | /*** Calculate the score for each category. ***/ |
332 | 0 | for (i = 0; i < h->size; i++) |
333 | 0 | { |
334 | 0 | int score; |
335 | 0 | if (h->fprint_disable[i] & 0x0F) |
336 | 0 | { /* if this language is disabled */ |
337 | 0 | score = MAXSCORE; |
338 | 0 | } |
339 | 0 | else |
340 | 0 | { |
341 | 0 | score = fp_Compare(h->fprint[i], unknown, threshold); |
342 | | /* printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score); */ |
343 | 0 | } |
344 | 0 | candidates[i].score = score; |
345 | 0 | candidates[i].name = fp_Name(h->fprint[i]); |
346 | 0 | if (score < minscore) |
347 | 0 | { |
348 | 0 | minscore = score; |
349 | 0 | threshold = (int)((double)score * THRESHOLDVALUE); |
350 | 0 | } |
351 | 0 | } |
352 | | |
353 | | /*** Find the best performers ***/ |
354 | 0 | for (i = 0, cnt = 0; i < h->size; i++) |
355 | 0 | { |
356 | 0 | if (candidates[i].score < threshold) |
357 | 0 | { |
358 | 0 | if (++cnt == MAXCANDIDATES + 1) |
359 | 0 | { |
360 | 0 | break; |
361 | 0 | } |
362 | | |
363 | 0 | memcpy(&candidates[cnt - 1], &candidates[i], sizeof(candidate_t)); |
364 | |
|
365 | 0 | } |
366 | 0 | } |
367 | |
|
368 | 0 | fp_Done(unknown); |
369 | | /*** The verdict ***/ |
370 | 0 | if (cnt == MAXCANDIDATES + 1) |
371 | 0 | { |
372 | 0 | return TEXTCAT_RESULT_UNKNOWN; |
373 | 0 | } |
374 | 0 | else |
375 | 0 | { |
376 | 0 | qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates); |
377 | 0 | return cnt; |
378 | 0 | } |
379 | 0 | } |
380 | | |
381 | | extern const char *textcat_Version(void) |
382 | 0 | { |
383 | 0 | return EXTTEXTCAT_VERSION; |
384 | 0 | } |
385 | | |
386 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |