Coverage Report

Created: 2026-02-14 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tinysparql/src/common/tracker-parser-libunistring.c
Line
Count
Source
1
/*
2
 * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
3
 * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2.1 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18
 * 02110-1301  USA
19
 */
20
21
#include "config.h"
22
23
#include <stdio.h>
24
#include <string.h>
25
26
/* libunistring versions prior to 9.1.2 need this hack */
27
#define _UNUSED_PARAMETER_
28
#include <unistr.h>
29
#include <uniwbrk.h>
30
#include <unictype.h>
31
#include <unicase.h>
32
33
#include "tracker-language.h"
34
#include "tracker-parser.h"
35
#include "tracker-parser-utils.h"
36
37
/* Type of words detected */
38
typedef enum {
  /* Every byte of the word is 7-bit ASCII */
  TRACKER_PARSER_WORD_TYPE_ASCII,
  /* Non-ASCII word which is eligible for accent stripping */
  TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
  /* Non-ASCII word starting with a CJK character; accent stripping
   * is not applied to it */
  TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
} TrackerParserWordType;
43
44
/* Strings shorter than this many bytes are allocated on the stack */
45
0
#define MAX_STACK_STR_SIZE 8192
46
47
/* Size in bytes of the stack buffer used to process one UTF-8 encoded word
 * (just a safety limit) */
48
0
#define WORD_BUFFER_LENGTH 512
49
50
struct TrackerParser {
51
  const gchar           *txt;
52
  gint                   txt_size;
53
54
  TrackerLanguage       *language;
55
  guint                  max_word_length;
56
  gboolean               enable_stemmer;
57
  gboolean               enable_unaccent;
58
  gboolean               ignore_numbers;
59
  gboolean               enable_forced_wordbreaks;
60
61
  /* Private members */
62
  gchar                 *word;
63
  gint                   word_length;
64
  guint                  word_position;
65
66
  /* Cursor, as index of the input array of bytes */
67
  gsize                  cursor;
68
  /* libunistring flags array */
69
  gchar                 *word_break_flags;
70
  /* general category of the  start character in words */
71
  uc_general_category_t  allowed_start;
72
};
73
74
static gboolean
75
get_word_info (TrackerParser         *parser,
76
               gsize                 *p_word_length,
77
               gboolean              *p_is_allowed_word_start,
78
               TrackerParserWordType *p_word_type)
79
0
{
80
0
  ucs4_t first_unichar;
81
0
  gint first_unichar_len;
82
0
  gboolean ascii_only;
83
84
  /* Defaults */
85
0
  *p_is_allowed_word_start = TRUE;
86
87
  /* Get first character of the word as UCS4 */
88
0
  first_unichar_len = u8_strmbtouc (&first_unichar,
89
0
                                    (const guchar *) &(parser->txt[parser->cursor]));
90
0
  if (first_unichar_len <= 0) {
91
    /* This should only happen if NIL was passed to u8_strmbtouc,
92
     *  so better just force stop here */
93
0
    return FALSE;
94
0
  } else  {
95
    /* If first character has length 1, it's ASCII-7 */
96
0
    ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
97
0
  }
98
99
  /* Consider word starts with a forced wordbreak */
100
0
  if (parser->enable_forced_wordbreaks &&
101
0
      IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) {
102
0
    *p_word_length = first_unichar_len;
103
0
  } else {
104
0
    gsize i;
105
106
    /* Find next word break, and in the same loop checking if only ASCII
107
     *  characters */
108
0
    i = parser->cursor + first_unichar_len;
109
0
    while (1) {
110
      /* Text bounds reached? */
111
0
      if (i >= (gsize) parser->txt_size)
112
0
        break;
113
      /* Proper unicode word break detected? */
114
0
      if (parser->word_break_flags[i])
115
0
        break;
116
      /* Forced word break detected? */
117
0
      if (parser->enable_forced_wordbreaks &&
118
0
          IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i]))
119
0
        break;
120
121
0
      if (ascii_only &&
122
0
          !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
123
0
        ascii_only = FALSE;
124
0
      }
125
126
0
      i++;
127
0
    }
128
129
    /* Word end is the first byte after the word, which is either the
130
     *  start of next word or the end of the string */
131
0
    *p_word_length = i - parser->cursor;
132
0
  }
133
134
  /* We only want the words where the first character
135
   *  in the word is either a letter, a number or a symbol.
136
   * This is needed because the word break algorithm also
137
   *  considers word breaks after for example commas or other
138
   *  punctuation marks.
139
   * Note that looking at the first character in the string
140
   *  should be compatible with all Unicode normalization
141
   *  methods.
142
   */
143
0
  if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
144
0
      !uc_is_general_category (first_unichar,
145
0
                               parser->allowed_start)) {
146
0
    *p_is_allowed_word_start = FALSE;
147
0
    return TRUE;
148
0
  }
149
150
  /* Decide word type */
151
0
  if (ascii_only) {
152
0
    *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
153
0
  } else if (IS_CJK_UCS4 (first_unichar)) {
154
0
    *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
155
0
  } else {
156
0
    *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
157
0
  }
158
0
  return TRUE;
159
0
}
160
161
/* The input word in this method MUST be normalized in NFKD form,
162
 * and given in UTF-8, where str_length is the byte-length
163
 * (note: there is no trailing NUL character!) */
164
static gboolean
165
tracker_parser_unaccent_nfkd_string (gpointer  str,
166
                                     gsize    *str_length)
167
0
{
168
0
  gchar *word;
169
0
  gsize word_length;
170
0
  gsize i;
171
0
  gsize j;
172
173
0
  g_return_val_if_fail (str != NULL, FALSE);
174
0
  g_return_val_if_fail (str_length != NULL, FALSE);
175
176
0
  word = (gchar *)str;
177
0
  word_length = *str_length;
178
179
0
  i = 0;
180
0
  j = 0;
181
0
  while (i < word_length) {
182
0
    ucs4_t unichar;
183
0
    gint utf8_len;
184
185
    /* Get next character of the word as UCS4 */
186
0
    utf8_len = u8_strmbtouc (&unichar, (const guchar *) &word[i]);
187
188
    /* Invalid UTF-8 character or end of original string. */
189
0
    if (utf8_len <= 0) {
190
0
      break;
191
0
    }
192
193
    /* If the given unichar is a combining diacritical mark,
194
     * just update the original index, not the output one */
195
0
    if (IS_CDM_UCS4 ((guint32) unichar)) {
196
0
      i += utf8_len;
197
0
      continue;
198
0
    }
199
200
    /* If already found a previous combining
201
     * diacritical mark, indexes are different so
202
     * need to copy characters. As output and input
203
     * buffers may overlap, need to use memmove
204
     * instead of memcpy */
205
0
    if (i != j) {
206
0
      memmove (&word[j], &word[i], utf8_len);
207
0
    }
208
209
    /* Update both indexes */
210
0
    i += utf8_len;
211
0
    j += utf8_len;
212
0
  }
213
214
  /* Set new output length */
215
0
  *str_length = j;
216
217
0
  return TRUE;
218
0
}
219
220
static gchar *
221
process_word_utf8 (TrackerParser         *parser,
222
                   const gchar           *word,
223
                   gint                   length,
224
                   TrackerParserWordType  type)
225
0
{
226
0
  gchar word_buffer [WORD_BUFFER_LENGTH];
227
0
  gchar *normalized = NULL;
228
0
  gchar *stemmed = NULL;
229
0
  size_t new_word_length;
230
231
0
  g_return_val_if_fail (parser != NULL, NULL);
232
0
  g_return_val_if_fail (word != NULL, NULL);
233
234
  /* If length is set as -1, the input word MUST be NIL-terminated.
235
   * Otherwise, this restriction is not needed as the length to process
236
   * is given as input argument */
237
0
  if (length < 0) {
238
0
    length = strlen (word);
239
0
  }
240
241
  /* Log original word */
242
0
  tracker_parser_message_hex ("ORIGINAL word",
243
0
                              word, length);
244
245
  /* Normalization and case-folding ONLY for non-ASCII */
246
0
  if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
247
    /* Leave space for last NIL */
248
0
    new_word_length = WORD_BUFFER_LENGTH - 1;
249
250
    /* Casefold and NFKD normalization in output.
251
     * NOTE: if the output buffer is not big enough, u8_casefold will
252
     * return a newly-allocated buffer. */
253
0
    normalized = (gchar*) u8_casefold ((const uint8_t *)word,
254
0
                                       length,
255
0
                                       uc_locale_language (),
256
0
                                       UNINORM_NFKD,
257
0
                                       (guchar *) word_buffer,
258
0
                                       &new_word_length);
259
260
    /* Case folding + Normalization failed, ignore this word */
261
0
    g_return_val_if_fail (normalized != NULL, NULL);
262
263
    /* If output buffer is not the same as the one passed to
264
     * u8_casefold, we know it was newly-allocated, so need
265
     * to resize it in 1 byte to add last NIL */
266
0
    if (normalized != word_buffer) {
267
0
      normalized = g_realloc (normalized, new_word_length + 1);
268
0
    }
269
270
    /* Log after Normalization */
271
0
    tracker_parser_message_hex (" After Casefolding and NFKD normalization",
272
0
                                normalized, new_word_length);
273
0
  } else {
274
    /* For ASCII-only, just tolower() each character */
275
0
    gsize i;
276
277
0
    normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
278
279
0
    for (i = 0; i < (gsize) length; i++) {
280
0
      normalized[i] = g_ascii_tolower (word[i]);
281
0
    }
282
283
0
    new_word_length = length;
284
285
    /* Log after tolower */
286
0
    tracker_parser_message_hex (" After Lowercasing",
287
0
                                normalized, new_word_length);
288
0
  }
289
290
  /* UNAC stripping needed? (for non-CJK and non-ASCII) */
291
0
  if (parser->enable_unaccent &&
292
0
      type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
293
0
      tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
294
    /* Log after UNAC stripping */
295
0
    tracker_parser_message_hex ("  After UNAC stripping",
296
0
                                normalized, new_word_length);
297
0
  }
298
299
  /* Set output NIL */
300
0
  normalized[new_word_length] = '\0';
301
302
  /* Stemming needed? */
303
0
  if (parser->enable_stemmer) {
304
0
    tracker_language_stem_word (parser->language,
305
0
                                normalized,
306
0
                                &new_word_length,
307
0
                                new_word_length);
308
309
    /* Log after stemming */
310
0
    tracker_parser_message_hex ("   After stemming",
311
0
                                normalized, new_word_length);
312
0
  }
313
314
  /* It may be the case that no stripping and no stemming was needed, and
315
   * that the output buffer in stack was enough for case-folding and
316
   * normalization. In this case, need to strdup() the string to return it */
317
0
  return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
318
0
}
319
320
static gboolean
321
parser_next (TrackerParser *parser,
322
             gint          *byte_offset_start,
323
             gint          *byte_offset_end)
324
0
{
325
0
  gsize word_length = 0;
326
0
  gchar *processed_word = NULL;
327
328
0
  *byte_offset_start = 0;
329
0
  *byte_offset_end = 0;
330
331
0
  g_return_val_if_fail (parser, FALSE);
332
333
  /* Loop to look for next valid word */
334
0
  while (!processed_word &&
335
0
         parser->cursor < (gsize) parser->txt_size) {
336
0
    TrackerParserWordType type;
337
0
    gsize truncated_length;
338
0
    gboolean is_allowed;
339
340
    /* Get word info */
341
0
    if (!get_word_info (parser,
342
0
                        &word_length,
343
0
                        &is_allowed,
344
0
                        &type)) {
345
      /* Quit loop just in case */
346
0
      parser->cursor = parser->txt_size;
347
0
      break;
348
0
    }
349
350
    /* Ignore the word if not an allowed word start */
351
0
    if (!is_allowed) {
352
      /* Ignore this word and keep on looping */
353
0
      parser->cursor += word_length;
354
0
      continue;
355
0
    }
356
357
    /* Ignore the word if longer than the maximum allowed */
358
0
    if (word_length >= parser->max_word_length) {
359
      /* Ignore this word and keep on looping */
360
0
      parser->cursor += word_length;
361
0
      continue;
362
0
    }
363
364
    /* compute truncated word length if needed (to avoid extremely
365
     *  long words)*/
366
0
    truncated_length = (word_length < WORD_BUFFER_LENGTH ?
367
0
                        word_length :
368
0
                        WORD_BUFFER_LENGTH - 1);
369
370
    /* Process the word here. If it fails, we can still go
371
     *  to the next one. Returns newly allocated string
372
     *  always */
373
0
    processed_word = process_word_utf8 (parser,
374
0
                                        &(parser->txt[parser->cursor]),
375
0
                                        truncated_length,
376
0
                                        type);
377
0
    if (!processed_word) {
378
      /* Ignore this word and keep on looping */
379
0
      parser->cursor += word_length;
380
0
      continue;
381
0
    }
382
0
  }
383
384
  /* If we got a word here, set output */
385
0
  if (processed_word) {
386
    /* Set outputs */
387
0
    *byte_offset_start = parser->cursor;
388
0
    *byte_offset_end = parser->cursor + word_length;
389
390
    /* Update cursor */
391
0
    parser->cursor += word_length;
392
393
0
    parser->word_length = strlen (processed_word);
394
0
    parser->word = processed_word;
395
396
0
    return TRUE;
397
0
  }
398
399
  /* No more words... */
400
0
  return FALSE;
401
0
}
402
403
TrackerParser *
404
tracker_parser_new (void)
405
0
{
406
0
  TrackerParser *parser;
407
408
0
  parser = g_new0 (TrackerParser, 1);
409
0
  parser->language = tracker_language_new (NULL);
410
411
0
  return parser;
412
0
}
413
414
void
415
tracker_parser_free (TrackerParser *parser)
416
0
{
417
0
  g_return_if_fail (parser != NULL);
418
419
0
  if (parser->language) {
420
0
    g_object_unref (parser->language);
421
0
  }
422
423
0
  g_free (parser->word_break_flags);
424
425
0
  g_free (parser->word);
426
427
0
  g_free (parser);
428
0
}
429
430
void
431
tracker_parser_reset (TrackerParser *parser,
432
                      const gchar   *txt,
433
                      gint           txt_size,
434
                      guint          max_word_length,
435
                      gboolean       enable_stemmer,
436
                      gboolean       enable_unaccent,
437
                      gboolean       ignore_numbers)
438
0
{
439
0
  g_return_if_fail (parser != NULL);
440
0
  g_return_if_fail (txt != NULL);
441
442
0
  parser->max_word_length = max_word_length;
443
0
  parser->enable_stemmer = enable_stemmer;
444
0
  parser->enable_unaccent = enable_unaccent;
445
0
  parser->ignore_numbers = ignore_numbers;
446
447
  /* Note: We're forcing some unicode characters to behave
448
   * as wordbreakers: e.g, the '.' The main reason for this
449
   * is to enable FTS searches matching file extension. */
450
0
  parser->enable_forced_wordbreaks = TRUE;
451
452
0
  parser->txt_size = txt_size;
453
0
  parser->txt = txt;
454
455
0
  g_free (parser->word);
456
0
  parser->word = NULL;
457
458
0
  parser->word_position = 0;
459
460
0
  parser->cursor = 0;
461
462
0
  g_free (parser->word_break_flags);
463
464
  /* Create array of flags, same size as original text. */
465
0
  parser->word_break_flags = g_malloc (txt_size);
466
467
  /* Get wordbreak flags in the whole string */
468
0
  u8_wordbreaks ((const uint8_t *)txt,
469
0
                 (size_t) txt_size,
470
0
                 (char *)parser->word_break_flags);
471
472
  /* Prepare a custom category which is a combination of the
473
   * desired ones */
474
0
  parser->allowed_start = UC_LETTER;
475
0
  if (!parser->ignore_numbers) {
476
0
    parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
477
0
  }
478
0
}
479
480
const gchar *
481
tracker_parser_next (TrackerParser *parser,
482
                     gint          *position,
483
                     gint          *byte_offset_start,
484
                     gint          *byte_offset_end,
485
                     gint          *word_length)
486
0
{
487
0
  const gchar  *str;
488
0
  gint byte_start = 0, byte_end = 0;
489
490
0
  str = NULL;
491
492
0
  g_free (parser->word);
493
0
  parser->word = NULL;
494
495
0
  if (parser_next (parser, &byte_start, &byte_end)) {
496
0
    str = parser->word;
497
0
  }
498
499
0
  parser->word_position++;
500
501
0
  *word_length = parser->word_length;
502
0
  *position = parser->word_position;
503
0
  *byte_offset_start = byte_start;
504
0
  *byte_offset_end = byte_end;
505
506
0
  return str;
507
0
}
508
509
gpointer
510
tracker_collation_init (void)
511
6
{
512
  /* Nothing to do */
513
6
  return NULL;
514
6
}
515
516
void
517
tracker_collation_shutdown (gpointer collator)
518
4
{
519
  /* Nothing to do */
520
4
}
521
522
gint
523
tracker_collation_utf8 (gpointer      collator,
524
                        gint          len1,
525
                        gconstpointer str1,
526
                        gint          len2,
527
                        gconstpointer str2)
528
0
{
529
0
  gint result;
530
0
  guchar *aux1;
531
0
  guchar *aux2;
532
533
  /* Note: str1 and str2 are NOT NUL-terminated */
534
0
  aux1 = (len1 < MAX_STACK_STR_SIZE) ? g_alloca (len1+1) : g_malloc (len1+1);
535
0
  aux2 = (len2 < MAX_STACK_STR_SIZE) ? g_alloca (len2+1) : g_malloc (len2+1);
536
537
0
  memcpy (aux1, str1, len1); aux1[len1] = '\0';
538
0
  memcpy (aux2, str2, len2); aux2[len2] = '\0';
539
540
0
  result = u8_strcoll (aux1, aux2);
541
542
0
  if (len1 >= MAX_STACK_STR_SIZE)
543
0
    g_free (aux1);
544
0
  if (len2 >= MAX_STACK_STR_SIZE)
545
0
    g_free (aux2);
546
0
  return result;
547
0
}
548
549
gunichar2 *
550
tracker_parser_tolower (const gunichar2 *input,
551
      gsize            len,
552
      gsize           *len_out)
553
0
{
554
0
  return u16_tolower (input, len / 2, NULL, NULL, NULL, len_out);
555
0
}
556
557
gunichar2 *
558
tracker_parser_toupper (const gunichar2 *input,
559
                        gsize            len,
560
                        gsize           *len_out)
561
0
{
562
0
  return u16_toupper (input, len / 2, NULL, NULL, NULL, len_out);
563
0
}
564
565
gunichar2 *
566
tracker_parser_casefold (const gunichar2 *input,
567
       gsize            len,
568
       gsize           *len_out)
569
0
{
570
0
  return u16_casefold (input, len / 2, NULL, NULL, NULL, len_out);
571
0
}
572
573
gunichar2 *
574
tracker_parser_normalize (const gunichar2 *input,
575
        GNormalizeMode   mode,
576
        gsize            len,
577
        gsize           *len_out)
578
0
{
579
0
  uninorm_t nf;
580
581
0
  if (mode == G_NORMALIZE_NFC)
582
0
    nf = UNINORM_NFC;
583
0
  else if (mode == G_NORMALIZE_NFD)
584
0
    nf = UNINORM_NFD;
585
0
  else if (mode == G_NORMALIZE_NFKC)
586
0
    nf = UNINORM_NFKC;
587
0
  else if (mode == G_NORMALIZE_NFKD)
588
0
    nf = UNINORM_NFKD;
589
0
  else
590
0
    g_assert_not_reached ();
591
592
0
  return u16_normalize (nf, input, len / 2, NULL, len_out);
593
0
}
594
595
gunichar2 *
596
tracker_parser_unaccent (const gunichar2 *input,
597
       gsize            len,
598
       gsize           *len_out)
599
0
{
600
0
  gunichar2 *zOutput;
601
0
  gsize written = 0;
602
603
0
  zOutput = u16_normalize (UNINORM_NFKD, input, len, NULL, &written);
604
605
  /* Unaccenting is done in place */
606
0
  tracker_parser_unaccent_nfkd_string (zOutput, &written);
607
608
0
  *len_out = written;
609
610
0
  return zOutput;
611
0
}