/work/workdir/UnpackedTarball/libexttextcat/src/fingerprint.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /** |
3 | | * fingerprint.c -- Routines for creating an n-gram fingerprint of a |
4 | | * buffer. |
5 | | * |
6 | | * Copyright (c) 2003, WiseGuys Internet B.V. |
7 | | * All rights reserved. |
8 | | * |
9 | | * THE BSD LICENSE |
10 | | * |
11 | | * Redistribution and use in source and binary forms, with or without |
12 | | * modification, are permitted provided that the following conditions |
13 | | * are met: |
14 | | * |
15 | | * - Redistributions of source code must retain the above copyright |
16 | | * notice, this list of conditions and the following disclaimer. |
17 | | * |
18 | | * - Redistributions in binary form must reproduce the above copyright |
19 | | * notice, this list of conditions and the following disclaimer in the |
20 | | * documentation and/or other materials provided with the |
21 | | * distribution. |
22 | | * |
23 | | * - Neither the name of the WiseGuys Internet B.V. nor the names of |
24 | | * its contributors may be used to endorse or promote products derived |
25 | | * from this software without specific prior written permission. |
26 | | * |
27 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
28 | | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
29 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
30 | | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
31 | | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
32 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
33 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
34 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
35 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
36 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
37 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
38 | | * |
39 | | * DESCRIPTION |
40 | | * |
41 | | * A fingerprint is a list of most common n-grams, ordered by |
42 | | * frequency. (Note that we can use other strings than n-grams, for |
43 | | * instance entire words.) |
44 | | * |
45 | | * HOW DOES IT WORK? |
46 | | * |
47 | | * - Buffer is sliced up into n-grams |
48 | | * - N-grams are inserted into a hash table that records their frequency |
49 | | * - The table entries are filtered through a N-sized heap to |
50 | | * get the N most frequent n-grams. |
51 | | * |
52 | | * The reason why we go through the trouble of doing a partial |
53 | | * (heap)sort is that a full quicksort behaves horribly on the data: |
54 | | * most n-grams have a very low count, resulting in a data set in |
55 | | * nearly-sorted order. This causes quicksort to behave very badly. |
56 | | * Heapsort, on the other hand, behaves handsomely: worst case is |
57 | | * Mlog(N) for M n-grams filtered through a N-sized heap. |
58 | | * |
59 | | * REVISION HISTORY |
60 | | * |
61 | | * Mar 28, 2003 frank@wise-guys.nl -- created |
62 | | * |
63 | | * TODO: |
64 | | * - put table/heap datastructure in a separate file. |
65 | | */ |
66 | | |
67 | | #ifdef HAVE_CONFIG_H |
68 | | #include "config.h" |
69 | | #endif |
70 | | #include <stdio.h> |
71 | | #include <stdlib.h> |
72 | | #include <string.h> |
73 | | #include <ctype.h> |
74 | | |
75 | | #include "common_impl.h" |
76 | | #include "wg_mempool.h" |
77 | | #include "constants.h" |
78 | | |
79 | | #include "utf8misc.h" |
80 | | #include "fingerprint.h" |
81 | | |
82 | 0 | #define TABLESIZE (1<<TABLEPOW) |
83 | 0 | #define TABLEMASK ((TABLESIZE)-1) |
84 | | |
85 | | typedef struct |
86 | | { |
87 | | |
88 | | sint2 rank; |
89 | | char str[MAXNGRAMSIZE + 1]; |
90 | | |
91 | | } ngram_t; |
92 | | |
93 | | typedef struct fp_s |
94 | | { |
95 | | |
96 | | const char *name; |
97 | | ngram_t *fprint; |
98 | | uint4 size; |
99 | | uint4 mindocsize; |
100 | | boole utfaware; |
101 | | |
102 | | } fp_t; |
103 | | |
104 | | typedef struct entry_s |
105 | | { |
106 | | char str[MAXNGRAMSIZE + 1]; |
107 | | unsigned int cnt; |
108 | | struct entry_s *next; |
109 | | } entry_t; |
110 | | |
111 | | typedef struct table_s |
112 | | { |
113 | | void *pool; |
114 | | entry_t **table; |
115 | | entry_t *heap; |
116 | | |
117 | | struct table_s *next; |
118 | | |
119 | | uint4 heapsize; |
120 | | uint4 size; |
121 | | } table_t; |
122 | | |
123 | | /* |
124 | | * fast and furious little hash function |
125 | | * |
126 | | * (Note that we could use some kind of rolling checksum, and update it |
127 | | * during n-gram construction) |
128 | | */ |
129 | | static uint4 simplehash(const char *p, int len) |
130 | 0 | { |
131 | 0 | uint4 h = len * 13; |
132 | 0 | while (*p) |
133 | 0 | { |
134 | 0 | h = (h << 5) - h + *p++; |
135 | 0 | } |
136 | 0 | return h; |
137 | 0 | } |
138 | | |
139 | | /* increases frequency of ngram(p,len) */ |
140 | | static int increasefreq(table_t * t, char *p, int len, |
141 | | int issameimpl(char *, char *, int)) |
142 | 0 | { |
143 | 0 | uint4 hash = simplehash(p, len) & TABLEMASK; |
144 | 0 | entry_t *entry = t->table[hash]; |
145 | |
|
146 | 0 | while (entry) |
147 | 0 | { |
148 | 0 | if (issameimpl(entry->str, p, len)) |
149 | 0 | { |
150 | | /*** Found it! ***/ |
151 | 0 | entry->cnt++; |
152 | 0 | return 1; |
153 | 0 | } |
154 | 0 | else |
155 | 0 | { |
156 | 0 | entry = entry->next; |
157 | 0 | } |
158 | 0 | } |
159 | | |
160 | | /*** Not found, so create ***/ |
161 | 0 | entry = (entry_t *) (wgmempool_alloc(t->pool, sizeof(entry_t))); |
162 | 0 | strncpy(entry->str, p, MAXNGRAMSIZE); |
163 | 0 | entry->str[MAXNGRAMSIZE] = 0; |
164 | 0 | entry->cnt = 1; |
165 | |
|
166 | 0 | entry->next = t->table[hash]; |
167 | 0 | t->table[hash] = entry; |
168 | |
|
169 | 0 | return 1; |
170 | 0 | } |
171 | | |
172 | 0 | #define GREATER(x,y) ((x).cnt > (y).cnt) |
173 | 0 | #define LESS(x,y) ((x).cnt < (y).cnt) |
174 | | |
175 | | static void siftup(table_t * t, unsigned int child) |
176 | 0 | { |
177 | 0 | entry_t *heap = t->heap; |
178 | 0 | unsigned int parent = (child - 1) >> 1; |
179 | 0 | entry_t tmp; |
180 | |
|
181 | 0 | while (child > 0) |
182 | 0 | { |
183 | 0 | if (GREATER(heap[parent], heap[child])) |
184 | 0 | { |
185 | 0 | memcpy(&tmp, &heap[parent], sizeof(entry_t)); |
186 | 0 | memcpy(&heap[parent], &heap[child], sizeof(entry_t)); |
187 | 0 | memcpy(&heap[child], &tmp, sizeof(entry_t)); |
188 | 0 | } |
189 | 0 | else |
190 | 0 | return; |
191 | | |
192 | 0 | child = parent; |
193 | 0 | parent = (child - 1) >> 1; |
194 | 0 | } |
195 | 0 | } |
196 | | |
197 | | static void siftdown(table_t * t, unsigned int heapsize, uint4 parent) |
198 | 0 | { |
199 | 0 | entry_t *heap = t->heap; |
200 | 0 | unsigned int child = parent * 2 + 1; |
201 | 0 | entry_t tmp; |
202 | |
|
203 | 0 | while (child < heapsize) |
204 | 0 | { |
205 | 0 | if (child + 1 < heapsize && GREATER(heap[child], heap[child + 1])) |
206 | 0 | { |
207 | 0 | child++; |
208 | 0 | } |
209 | 0 | if (GREATER(heap[parent], heap[child])) |
210 | 0 | { |
211 | 0 | memcpy(&tmp, &heap[parent], sizeof(entry_t)); |
212 | 0 | memcpy(&heap[parent], &heap[child], sizeof(entry_t)); |
213 | 0 | memcpy(&heap[child], &tmp, sizeof(entry_t)); |
214 | 0 | } |
215 | 0 | else |
216 | 0 | return; |
217 | 0 | parent = child; |
218 | 0 | child = (parent * 2) + 1; |
219 | 0 | } |
220 | 0 | } |
221 | | |
222 | | static int heapinsert(table_t * t, entry_t * item) |
223 | 0 | { |
224 | 0 | entry_t *heap = t->heap; |
225 | | |
226 | | /*** Still room for an entry? ***/ |
227 | 0 | if (t->size < t->heapsize) |
228 | 0 | { |
229 | 0 | memcpy(&(heap[t->size]), item, sizeof(entry_t)); |
230 | 0 | siftup(t, t->size); |
231 | 0 | t->size++; |
232 | 0 | return 0; |
233 | 0 | } |
234 | | |
235 | | /*** Worse than the worst performer? ***/ |
236 | 0 | if (LESS(*item, heap[0])) |
237 | 0 | { |
238 | 0 | return 0; |
239 | 0 | } |
240 | | |
241 | | /*** Insert into heap and reheap ***/ |
242 | 0 | memcpy(&(heap[0]), item, sizeof(entry_t)); |
243 | 0 | siftdown(t, t->size, 0); |
244 | 0 | return 0; |
245 | 0 | } |
246 | | |
247 | | static int heapextract(table_t * t, entry_t * item) |
248 | 0 | { |
249 | 0 | entry_t *p; |
250 | |
|
251 | 0 | if (t->size == 0) |
252 | 0 | return 0; |
253 | | |
254 | 0 | p = &(t->heap[0]); |
255 | |
|
256 | 0 | memcpy(item, p, sizeof(entry_t)); |
257 | 0 | if (t->size > 1) |
258 | 0 | memcpy(&(t->heap[0]), &(t->heap[t->size - 1]), sizeof(entry_t)); |
259 | |
|
260 | 0 | siftdown(t, t->size, 0); |
261 | 0 | t->size--; |
262 | |
|
263 | 0 | return 1; |
264 | 0 | } |
265 | | |
266 | | /*** Makes a heap of all table entries ***/ |
267 | | static int table2heap(table_t * t) |
268 | 0 | { |
269 | 0 | int i; |
270 | | |
271 | | /*** Fill result heap ***/ |
272 | 0 | for (i = 0; i < TABLESIZE; i++) |
273 | 0 | { |
274 | 0 | entry_t *p = t->table[i]; |
275 | 0 | while (p) |
276 | 0 | { |
277 | 0 | heapinsert(t, p); |
278 | 0 | p = p->next; |
279 | 0 | } |
280 | 0 | } |
281 | |
|
282 | 0 | return 1; |
283 | 0 | } |
284 | | |
285 | | static table_t *inittable(uint4 maxngrams) |
286 | 0 | { |
287 | 0 | table_t *result = (table_t *) calloc(1, sizeof(table_t)); |
288 | 0 | result->table = (entry_t **) calloc(1, sizeof(entry_t *) * TABLESIZE); |
289 | 0 | result->pool = wgmempool_Init(10000, 10); |
290 | |
|
291 | 0 | result->heap = (entry_t *) malloc(sizeof(entry_t) * maxngrams); |
292 | 0 | result->heapsize = maxngrams; |
293 | 0 | result->size = 0; |
294 | |
|
295 | 0 | return result; |
296 | 0 | } |
297 | | |
298 | | static void tabledone(table_t * t) |
299 | 0 | { |
300 | 0 | if (!t) |
301 | 0 | return; |
302 | | |
303 | 0 | wgmempool_Done(t->pool); |
304 | 0 | free(t->table); |
305 | 0 | free(t->heap); |
306 | 0 | free(t); |
307 | 0 | } |
308 | | |
309 | | extern void *fp_Init(const char *name) |
310 | 0 | { |
311 | 0 | fp_t *h = (fp_t *) calloc(1, sizeof(fp_t)); |
312 | |
|
313 | 0 | h->utfaware = TC_TRUE; |
314 | 0 | h->mindocsize = MINDOCSIZE; |
315 | |
|
316 | 0 | if (name) |
317 | 0 | h->name = strdup(name); |
318 | |
|
319 | 0 | return (void *)h; |
320 | 0 | } |
321 | | |
322 | | extern void fp_Done(void *handle) |
323 | 0 | { |
324 | 0 | fp_t *h = (fp_t *) handle; |
325 | |
|
326 | 0 | if (h->name) |
327 | 0 | { |
328 | 0 | free((void *)h->name); |
329 | 0 | } |
330 | 0 | if (h->fprint) |
331 | 0 | { |
332 | 0 | free(h->fprint); |
333 | 0 | } |
334 | |
|
335 | 0 | free(h); |
336 | 0 | } |
337 | | |
338 | | extern const char *fp_Name(void *handle) |
339 | 0 | { |
340 | 0 | fp_t *h = (fp_t *) handle; |
341 | 0 | return h->name; |
342 | 0 | } |
343 | | |
344 | | /** |
345 | | * Function that prepares buffer for n-grammification: |
346 | | * runs of invalid characters are collapsed to a single |
347 | | * underscore. |
348 | | * |
349 | | * Function is implemented as a finite state machine. |
350 | | */ |
351 | | static char *prepbuffer(const char *src, size_t bufsize, uint4 mindocsize) |
352 | 0 | { |
353 | 0 | const char *p = src; |
354 | 0 | char *dest = (char *)malloc(bufsize + 3); |
355 | 0 | char *w = dest; |
356 | 0 | char *wlimit = dest + bufsize + 1; |
357 | |
|
358 | 0 | if (INVALID(*p)) |
359 | 0 | { |
360 | 0 | goto SPACE; |
361 | 0 | } |
362 | 0 | else if (*p == '\0') |
363 | 0 | { |
364 | 0 | goto END; |
365 | 0 | } |
366 | | |
367 | 0 | *w++ = '_'; |
368 | 0 | if (w == wlimit) |
369 | 0 | { |
370 | 0 | goto STOP; |
371 | 0 | } |
372 | | |
373 | 0 | goto WORD; |
374 | | |
375 | | |
376 | 0 | SPACE: |
377 | | /*** Inside string of invalid characters ***/ |
378 | 0 | p++; |
379 | 0 | if (INVALID(*p)) |
380 | 0 | { |
381 | 0 | goto SPACE; |
382 | 0 | } |
383 | 0 | else if (*p == '\0') |
384 | 0 | { |
385 | 0 | goto END; |
386 | 0 | } |
387 | | |
388 | 0 | *w++ = '_'; |
389 | 0 | if (w == wlimit) |
390 | 0 | { |
391 | 0 | goto STOP; |
392 | 0 | } |
393 | | |
394 | 0 | goto WORD; |
395 | | |
396 | 0 | WORD: |
397 | | /*** Inside string of valid characters ***/ |
398 | 0 | *w++ = *p++; |
399 | 0 | if (w == wlimit) |
400 | 0 | { |
401 | 0 | goto END; |
402 | 0 | } |
403 | 0 | else if (INVALID(*p)) |
404 | 0 | { |
405 | 0 | goto SPACE; |
406 | 0 | } |
407 | 0 | else if (*p == '\0') |
408 | 0 | { |
409 | 0 | goto STOP; |
410 | 0 | } |
411 | 0 | goto WORD; |
412 | | |
413 | 0 | END: |
414 | 0 | *w++ = '_'; |
415 | |
|
416 | 0 | STOP: |
417 | 0 | *w++ = '\0'; |
418 | | |
419 | | /*** Docs that are too small for a fingerprint, are refused ***/ |
420 | 0 | if (w - dest < mindocsize) |
421 | 0 | { |
422 | 0 | free(dest); |
423 | 0 | return NULL; |
424 | 0 | } |
425 | | |
426 | 0 | return dest; |
427 | 0 | } |
428 | | |
429 | | /** |
430 | | * this function extract all n-gram from past buffer and put them into the table "t" |
431 | | * [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice |
432 | | */ |
433 | | static void utfcreatengramtable(table_t * t, const char *buf) |
434 | 0 | { |
435 | 0 | char n[MAXNGRAMSIZE + 1]; |
436 | 0 | const char *p = buf; |
437 | 0 | int i, decay; |
438 | | |
439 | | /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ |
440 | 0 | while (1) |
441 | 0 | { |
442 | |
|
443 | 0 | const char *q = p; |
444 | 0 | char *m = n; |
445 | | |
446 | | /*** First char may be an underscore ***/ |
447 | 0 | decay = utf8_charcopy(q, m); /* [modified] previously *q++ = *m++ */ |
448 | |
|
449 | 0 | q += decay; /* [modified] */ |
450 | 0 | m += decay; /* [modified] */ |
451 | 0 | *m = '\0'; |
452 | |
|
453 | 0 | increasefreq(t, n, 1, utf8_issame); |
454 | | |
455 | |
|
456 | 0 | if (*q == '\0') |
457 | 0 | return; |
458 | | |
459 | | /*** Let the compiler unroll this ***/ |
460 | 0 | for (i = 2; i <= MAXNGRAMSYMBOL; i++) |
461 | 0 | { |
462 | 0 | decay = utf8_charcopy(q, m); /* [modified] like above */ |
463 | 0 | m += decay; |
464 | 0 | *m = '\0'; |
465 | |
|
466 | 0 | increasefreq(t, n, i, utf8_issame); |
467 | |
|
468 | 0 | if (*q == '_') |
469 | 0 | break; |
470 | 0 | q += decay; |
471 | 0 | if (*q == '\0') |
472 | 0 | return; |
473 | 0 | } |
474 | | |
475 | 0 | p = utf8_next_char(p); /* [modified] */ |
476 | 0 | } |
477 | 0 | return; |
478 | 0 | } |
479 | | |
480 | | /* checks if n-gram lex is a prefix of key and of length len */ |
481 | | static int issame(char *lex, char *key, int len) |
482 | 0 | { |
483 | 0 | int i; |
484 | 0 | for (i = 0; i < len; i++) |
485 | 0 | { |
486 | 0 | if (key[i] != lex[i]) |
487 | 0 | { |
488 | 0 | return 0; |
489 | 0 | } |
490 | 0 | } |
491 | 0 | if (lex[i] != 0) |
492 | 0 | { |
493 | 0 | return 0; |
494 | 0 | } |
495 | 0 | return 1; |
496 | 0 | } |
497 | | |
498 | | static void createngramtable(table_t * t, const char *buf) |
499 | 0 | { |
500 | 0 | char n[MAXNGRAMSIZE + 1]; |
501 | 0 | const char *p = buf; |
502 | 0 | int i; |
503 | | |
504 | | /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ |
505 | 0 | for (;; p++) |
506 | 0 | { |
507 | |
|
508 | 0 | const char *q = p; |
509 | 0 | char *m = n; |
510 | | |
511 | | /*** First char may be an underscore ***/ |
512 | 0 | *m++ = *q++; |
513 | 0 | *m = '\0'; |
514 | |
|
515 | 0 | increasefreq(t, n, 1, issame); |
516 | |
|
517 | 0 | if (*q == '\0') |
518 | 0 | { |
519 | 0 | return; |
520 | 0 | } |
521 | | |
522 | | /*** Let the compiler unroll this ***/ |
523 | 0 | for (i = 2; i <= MAXNGRAMSIZE; i++) |
524 | 0 | { |
525 | |
|
526 | 0 | *m++ = *q; |
527 | 0 | *m = '\0'; |
528 | |
|
529 | 0 | increasefreq(t, n, i, issame); |
530 | |
|
531 | 0 | if (*q == '_') |
532 | 0 | break; |
533 | 0 | q++; |
534 | 0 | if (*q == '\0') |
535 | 0 | { |
536 | 0 | return; |
537 | 0 | } |
538 | 0 | } |
539 | 0 | } |
540 | 0 | return; |
541 | 0 | } |
542 | | |
543 | | |
544 | | static int mystrcmp(const char *a, const char *b) |
545 | 0 | { |
546 | 0 | while (*a && *a == *b) |
547 | 0 | { |
548 | 0 | a++; |
549 | 0 | b++; |
550 | 0 | } |
551 | 0 | return (*a - *b); |
552 | 0 | } |
553 | | |
554 | | |
555 | | static int ngramcmp_str(const void *a, const void *b) |
556 | 0 | { |
557 | 0 | ngram_t *x = (ngram_t *) a; |
558 | 0 | ngram_t *y = (ngram_t *) b; |
559 | |
|
560 | 0 | return mystrcmp(x->str, y->str); |
561 | 0 | } |
562 | | |
563 | | static int ngramcmp_rank(const void *a, const void *b) |
564 | 0 | { |
565 | 0 | ngram_t *x = (ngram_t *) a; |
566 | 0 | ngram_t *y = (ngram_t *) b; |
567 | |
|
568 | 0 | return x->rank - y->rank; |
569 | 0 | } |
570 | | |
571 | | extern int fp_SetProperty(void *handle, textcat_Property property, sint4 value) |
572 | 0 | { |
573 | 0 | fp_t *h = (fp_t *) handle; |
574 | 0 | switch (property) |
575 | 0 | { |
576 | 0 | case TCPROP_UTF8AWARE: |
577 | 0 | if ((value == TC_TRUE) || (value == TC_FALSE)) |
578 | 0 | { |
579 | 0 | h->utfaware = value; |
580 | 0 | return 0; |
581 | 0 | } |
582 | 0 | return -2; |
583 | 0 | break; |
584 | 0 | case TCPROP_MINIMUM_DOCUMENT_SIZE: |
585 | 0 | h->mindocsize = (uint4) value; |
586 | 0 | return 0; |
587 | 0 | break; |
588 | 0 | default: |
589 | 0 | break; |
590 | 0 | } |
591 | 0 | return -1; |
592 | 0 | } |
593 | | |
594 | | /** |
595 | | * Create a fingerprint: |
596 | | * - record the frequency of each unique n-gram in a hash table |
597 | | * - take the most frequent n-grams |
598 | | * - sort them alphabetically, recording their relative rank |
599 | | */ |
600 | | extern int fp_Create(void *handle, const char *buffer, uint4 bufsize, |
601 | | uint4 maxngrams) |
602 | 0 | { |
603 | 0 | sint4 i = 0; |
604 | 0 | table_t *t = NULL; |
605 | 0 | char *tmp = NULL; |
606 | |
|
607 | 0 | fp_t *h = (fp_t *) handle; |
608 | |
|
609 | 0 | if (bufsize < h->mindocsize) |
610 | 0 | return 0; |
611 | | |
612 | | /*** Throw out all invalid chars ***/ |
613 | 0 | tmp = prepbuffer(buffer, bufsize, h->mindocsize); |
614 | | /* printf("Cleaned buffer : %s\n",tmp); */ |
615 | 0 | if (tmp == NULL) |
616 | 0 | return 0; |
617 | 0 | t = inittable(maxngrams); |
618 | | /* printf("Table initialized\n"); */ |
619 | | |
620 | | /*** Create a hash table containing n-gram counts ***/ |
621 | 0 | if (h->utfaware) |
622 | 0 | { |
623 | 0 | utfcreatengramtable(t, tmp); |
624 | 0 | } |
625 | 0 | else |
626 | 0 | { |
627 | 0 | createngramtable(t, tmp); |
628 | 0 | } |
629 | | /* printf("Table created\n"); */ |
630 | | /*** Take the top N n-grams and add them to the profile ***/ |
631 | 0 | table2heap(t); |
632 | 0 | maxngrams = WGMIN(maxngrams, t->size); |
633 | |
|
634 | 0 | h->fprint = (ngram_t *) malloc(sizeof(ngram_t) * maxngrams); |
635 | 0 | h->size = maxngrams; |
636 | | |
637 | | /*** Pull n-grams out of heap (backwards) ***/ |
638 | 0 | for (i = maxngrams - 1; i >= 0; i--) |
639 | 0 | { |
640 | 0 | entry_t tmp2; |
641 | |
|
642 | 0 | heapextract(t, &tmp2); |
643 | | |
644 | | /*** the string and its rank is all we need ***/ |
645 | 0 | strcpy(h->fprint[i].str, tmp2.str); |
646 | 0 | h->fprint[i].rank = i; |
647 | 0 | } |
648 | |
|
649 | 0 | tabledone(t); |
650 | 0 | free(tmp); |
651 | | |
652 | | /*** Sort n-grams alphabetically, for easy comparison ***/ |
653 | 0 | qsort(h->fprint, h->size, sizeof(ngram_t), ngramcmp_str); |
654 | 0 | return 1; |
655 | 0 | } |
656 | | |
657 | | extern int fp_Read(void *handle, const char *fname, int maxngrams) |
658 | 0 | { |
659 | 0 | fp_t *h = (fp_t *) handle; |
660 | 0 | FILE *fp; |
661 | 0 | char line[1024]; |
662 | 0 | int cnt = 0; |
663 | |
|
664 | 0 | fp = fopen(fname, "r"); |
665 | 0 | if (!fp) |
666 | 0 | { |
667 | 0 | #ifdef VERBOSE |
668 | 0 | fprintf(stderr, "Failed to open fingerprint file '%s'\n", fname); |
669 | 0 | #endif |
670 | 0 | return 0; |
671 | 0 | } |
672 | | |
673 | 0 | h->fprint = (ngram_t *) malloc(maxngrams * sizeof(ngram_t)); |
674 | |
|
675 | 0 | while (cnt < maxngrams && wg_getline(line, 1024, fp)) |
676 | 0 | { |
677 | |
|
678 | 0 | char *p; |
679 | |
|
680 | 0 | wg_trim(line, line); |
681 | |
|
682 | 0 | p = strpbrk(line, " \t"); |
683 | 0 | if (p) |
684 | 0 | { |
685 | 0 | *p = '\0'; |
686 | 0 | } |
687 | |
|
688 | 0 | if (strlen(line) > MAXNGRAMSIZE) |
689 | 0 | { |
690 | 0 | continue; |
691 | 0 | } |
692 | | |
693 | 0 | strcpy(h->fprint[cnt].str, line); |
694 | 0 | h->fprint[cnt].rank = cnt; |
695 | |
|
696 | 0 | cnt++; |
697 | 0 | } |
698 | |
|
699 | 0 | h->size = cnt; |
700 | | |
701 | | /*** Sort n-grams, for easy comparison later on ***/ |
702 | 0 | qsort(h->fprint, h->size, sizeof(ngram_t), ngramcmp_str); |
703 | |
|
704 | 0 | fclose(fp); |
705 | |
|
706 | 0 | return 1; |
707 | 0 | } |
708 | | |
709 | | extern void fp_Print(void *handle, FILE * fp) |
710 | 0 | { |
711 | 0 | uint4 i; |
712 | 0 | fp_t *h = (fp_t *) handle; |
713 | 0 | ngram_t *tmp = (ngram_t *) malloc(sizeof(ngram_t) * h->size); |
714 | | |
715 | | /*** Make a temporary and sort it on rank ***/ |
716 | 0 | memcpy(tmp, h->fprint, h->size * sizeof(ngram_t)); |
717 | 0 | qsort(tmp, h->size, sizeof(ngram_t), ngramcmp_rank); |
718 | |
|
719 | 0 | for (i = 0; i < h->size; i++) |
720 | 0 | { |
721 | | /* fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank ); */ |
722 | 0 | fprintf(fp, "%s\n", tmp[i].str); |
723 | 0 | } |
724 | 0 | free(tmp); |
725 | 0 | } |
726 | | |
727 | | |
728 | | |
729 | | extern sint4 fp_Compare(void *cat, void *unknown, int cutoff) |
730 | 0 | { |
731 | 0 | fp_t *c = (fp_t *) cat; |
732 | 0 | fp_t *u = (fp_t *) unknown; |
733 | 0 | uint4 i = 0; |
734 | 0 | uint4 j = 0; |
735 | 0 | sint4 sum = 0; |
736 | | |
737 | | /*** Compare the profiles in mergesort fashion ***/ |
738 | 0 | while (i < c->size && j < u->size) |
739 | 0 | { |
740 | |
|
741 | 0 | int cmp = mystrcmp(c->fprint[i].str, u->fprint[j].str); |
742 | |
|
743 | 0 | if (cmp < 0) |
744 | 0 | { |
745 | 0 | i++; |
746 | 0 | } |
747 | 0 | else if (cmp == 0) |
748 | 0 | { |
749 | 0 | sum += abs(c->fprint[i].rank - u->fprint[j].rank); |
750 | 0 | if (sum > cutoff) |
751 | 0 | return MAXSCORE; |
752 | 0 | i++; |
753 | 0 | j++; |
754 | 0 | } |
755 | 0 | else |
756 | 0 | { |
757 | 0 | sum += MAXOUTOFPLACE; |
758 | 0 | if (sum > cutoff) |
759 | 0 | return MAXSCORE; |
760 | 0 | j++; |
761 | 0 | } |
762 | 0 | } |
763 | | |
764 | | /*** Process tail of unknown, if any ***/ |
765 | 0 | while (j < u->size) |
766 | 0 | { |
767 | 0 | sum += MAXOUTOFPLACE; |
768 | 0 | if (sum > cutoff) |
769 | 0 | return MAXSCORE; |
770 | 0 | j++; |
771 | 0 | } |
772 | | |
773 | 0 | return sum; |
774 | 0 | } |
775 | | |
776 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |