/src/tinysparql/src/common/tracker-parser-libunistring.c

Source
/*
 * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
 * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301  USA
 */

#include "config.h"

#include <stdio.h>
#include <string.h>

/* libunistring versions prior to 9.1.2 need this hack */
#define _UNUSED_PARAMETER_
#include <unistr.h>
#include <uniwbrk.h>
#include <unictype.h>
#include <unicase.h>

#include "tracker-language.h"
#include "tracker-parser.h"
#include "tracker-parser-utils.h"

/* Type of words detected */
typedef enum {
  /* Every byte of the word is 7-bit ASCII */
  TRACKER_PARSER_WORD_TYPE_ASCII         = 0,
  /* Non-ASCII word which may go through unaccenting */
  TRACKER_PARSER_WORD_TYPE_OTHER_UNAC    = 1,
  /* Non-ASCII word starting with a CJK character; never unaccented */
  TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC = 2,
} TrackerParserWordType;

/* If the string length is less than this value, allocate from the stack */
#define MAX_STACK_STR_SIZE 8192

/* Max possible length of a UTF-8 encoded string (just a safety limit) */
#define WORD_BUFFER_LENGTH 512

/* State of a word parser iterating over a buffer of UTF-8 text.
 * Created by tracker_parser_new(), (re)configured for a given text by
 * tracker_parser_reset() and released by tracker_parser_free(). */
struct TrackerParser {
  /* Text being parsed and its size in bytes. Only the pointer is
   * stored by tracker_parser_reset(); the parser never frees it. */
  const gchar           *txt;
  gint                   txt_size;

  /* Language object handed to the stemmer; unreferenced on free */
  TrackerLanguage       *language;
  /* Words whose byte length is >= this value are skipped */
  guint                  max_word_length;
  /* Run the stemmer on every returned word */
  gboolean               enable_stemmer;
  /* Strip combining diacritical marks from non-ASCII, non-CJK words */
  gboolean               enable_unaccent;
  /* When set, words may not start with a number */
  gboolean               ignore_numbers;
  /* Treat some characters (e.g. '.') as word breakers; always set to
   * TRUE by tracker_parser_reset() */
  gboolean               enable_forced_wordbreaks;

  /* Private members */
  /* Last processed word (owned, NUL-terminated) and its strlen() */
  gchar                 *word;
  gint                   word_length;
  /* Incremented on every tracker_parser_next() call */
  guint                  word_position;

  /* Cursor, as index of the input array of bytes */
  gsize                  cursor;
  /* libunistring flags array: one entry per input byte, filled in by
   * u8_wordbreaks(); non-zero marks a word boundary before that byte */
  gchar                 *word_break_flags;
  /* general category of the  start character in words */
  uc_general_category_t  allowed_start;
};

/* Inspects the word starting at the parser cursor.
 *
 * On success (TRUE) *p_word_length receives the byte length of the word
 * and *p_is_allowed_word_start tells whether its first character may
 * start a word (underscore, or a character in parser->allowed_start).
 * *p_word_type is only written when the word start is allowed.
 *
 * Returns FALSE when no character can be decoded at the cursor; the
 * caller is expected to stop parsing in that case. */
static gboolean
get_word_info (TrackerParser         *parser,
               gsize                 *p_word_length,
               gboolean              *p_is_allowed_word_start,
               TrackerParserWordType *p_word_type)
{
  const gchar *start = &(parser->txt[parser->cursor]);
  ucs4_t head;
  gint head_len;
  gboolean only_ascii;

  /* Optimistic default, cleared further down if needed */
  *p_is_allowed_word_start = TRUE;

  /* Decode the first character of the word */
  head_len = u8_strmbtouc (&head, (const guchar *) start);
  if (head_len <= 0) {
    /* Nothing decodable here (e.g. a NUL byte): force a stop */
    return FALSE;
  }

  /* A one-byte first character is 7-bit ASCII */
  only_ascii = (head_len == 1);

  if (parser->enable_forced_wordbreaks &&
      IS_FORCED_WORDBREAK_UCS4 ((guint32) head)) {
    /* A forced wordbreak character is a word on its own */
    *p_word_length = head_len;
  } else {
    gsize end;

    /* Walk byte by byte up to the next break (or the end of the
     * text), tracking on the way whether everything is still ASCII */
    for (end = parser->cursor + head_len;
         end < (gsize) parser->txt_size;
         end++) {
      /* Unicode word boundary reported by libunistring */
      if (parser->word_break_flags[end])
        break;

      /* Boundary forced by us */
      if (parser->enable_forced_wordbreaks &&
          IS_FORCED_WORDBREAK_UCS4 ((guint32) parser->txt[end]))
        break;

      if (only_ascii && !IS_ASCII_UCS4 ((guint32) parser->txt[end]))
        only_ascii = FALSE;
    }

    /* 'end' is one past the last byte of the word */
    *p_word_length = end - parser->cursor;
  }

  /* The word break algorithm also yields "words" made of punctuation
   * and the like. Keep only those whose first character is an
   * underscore or belongs to the allowed general categories. Checking
   * just the first character should be compatible with every Unicode
   * normalization form. */
  if (!(IS_UNDERSCORE_UCS4 ((guint32) head) ||
        uc_is_general_category (head, parser->allowed_start))) {
    *p_is_allowed_word_start = FALSE;
    return TRUE;
  }

  /* Classify the word */
  if (only_ascii) {
    *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
    return TRUE;
  }

  if (IS_CJK_UCS4 (head)) {
    *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
    return TRUE;
  }

  *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
  return TRUE;
}

/* Strips combining diacritical marks, in place, from a UTF-8 string.
 *
 * The input MUST already be NFKD-normalized. *str_length is the length
 * in bytes on input (no trailing NUL is required) and is updated to the
 * length of the stripped string on output. Processing stops early at
 * the first byte sequence u8_strmbtouc() cannot decode (including a NUL
 * byte); bytes after that point are dropped from the reported length.
 *
 * Returns FALSE only if one of the arguments is NULL. */
static gboolean
tracker_parser_unaccent_nfkd_string (gpointer  str,
                                     gsize    *str_length)
{
  gchar *buf;
  gsize total;
  gsize src;
  gsize dst;

  g_return_val_if_fail (str != NULL, FALSE);
  g_return_val_if_fail (str_length != NULL, FALSE);

  buf = (gchar *) str;
  total = *str_length;
  dst = 0;

  /* 'src' is the read index, 'dst' the write index; they drift apart
   * as soon as the first mark has been skipped */
  for (src = 0; src < total; ) {
    ucs4_t ch;
    gint ch_len;

    ch_len = u8_strmbtouc (&ch, (const guchar *) &buf[src]);

    /* Undecodable sequence or NUL: stop here */
    if (ch_len <= 0)
      break;

    if (!IS_CDM_UCS4 ((guint32) ch)) {
      /* Character is kept. Source and destination may overlap, hence
       * memmove() rather than memcpy() */
      if (src != dst)
        memmove (&buf[dst], &buf[src], ch_len);

      dst += ch_len;
    }

    src += ch_len;
  }

  /* Report the stripped length */
  *str_length = dst;

  return TRUE;
}

/* Builds the indexable form of a word.
 *
 * @parser: parser holding the processing options (unaccent, stemmer,
 *          language).
 * @word:   start of the word, UTF-8 encoded.
 * @length: length of @word in bytes, or a negative value if @word is
 *          NUL-terminated and its length must be computed.
 * @type:   word type as decided by get_word_info().
 *
 * ASCII words are lowercased byte by byte; any other word is casefolded
 * and NFKD-normalized with u8_casefold(). Combining diacritical marks
 * are then stripped (only for TRACKER_PARSER_WORD_TYPE_OTHER_UNAC and if
 * enabled) and the stemmer is run (if enabled).
 *
 * Returns a newly-allocated NUL-terminated string owned by the caller,
 * or NULL if an argument is NULL or casefolding/normalization failed. */
static gchar *
process_word_utf8 (TrackerParser         *parser,
                   const gchar           *word,
                   gint                   length,
                   TrackerParserWordType  type)
{
  gchar word_buffer [WORD_BUFFER_LENGTH];
  gchar *normalized = NULL;
  size_t new_word_length;

  g_return_val_if_fail (parser != NULL, NULL);
  g_return_val_if_fail (word != NULL, NULL);

  /* If length is negative, the input word MUST be NUL-terminated.
   * Otherwise, this restriction is not needed as the length to process
   * is given as input argument */
  if (length < 0) {
    length = strlen (word);
  }

  /* Log original word */
  tracker_parser_message_hex ("ORIGINAL word",
                              word, length);

  /* Normalization and case-folding ONLY for non-ASCII */
  if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
    /* Leave space for the trailing NUL */
    new_word_length = WORD_BUFFER_LENGTH - 1;

    /* Casefold and NFKD normalization in output.
     * NOTE: if the output buffer is not big enough, u8_casefold will
     * return a newly-allocated buffer. */
    normalized = (gchar*) u8_casefold ((const uint8_t *)word,
                                       length,
                                       uc_locale_language (),
                                       UNINORM_NFKD,
                                       (guchar *) word_buffer,
                                       &new_word_length);

    /* Case folding + Normalization failed, ignore this word */
    g_return_val_if_fail (normalized != NULL, NULL);

    /* If output buffer is not the same as the one passed to
     * u8_casefold, we know it was newly-allocated, so need
     * to grow it by 1 byte to fit the trailing NUL */
    if (normalized != word_buffer) {
      normalized = g_realloc (normalized, new_word_length + 1);
    }

    /* Log after Normalization */
    tracker_parser_message_hex (" After Casefolding and NFKD normalization",
                                normalized, new_word_length);
  } else {
    /* For ASCII-only, just tolower() each character */
    gsize i;

    /* The buffer must hold 'length' bytes plus the trailing NUL, so
     * the stack buffer is only usable for lengths strictly below
     * WORD_BUFFER_LENGTH */
    normalized = length >= WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;

    for (i = 0; i < (gsize) length; i++) {
      normalized[i] = g_ascii_tolower (word[i]);
    }

    new_word_length = length;

    /* Log after tolower */
    tracker_parser_message_hex (" After Lowercasing",
                                normalized, new_word_length);
  }

  /* UNAC stripping needed? (for non-CJK and non-ASCII) */
  if (parser->enable_unaccent &&
      type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
      tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
    /* Log after UNAC stripping */
    tracker_parser_message_hex ("  After UNAC stripping",
                                normalized, new_word_length);
  }

  /* Set output NUL */
  normalized[new_word_length] = '\0';

  /* Stemming needed? */
  if (parser->enable_stemmer) {
    /* NOTE(review): the stemmer is given the buffer and its current
     * length and presumably rewrites both in place - confirm against
     * tracker-language.h */
    tracker_language_stem_word (parser->language,
                                normalized,
                                &new_word_length,
                                new_word_length);

    /* Log after stemming */
    tracker_parser_message_hex ("   After stemming",
                                normalized, new_word_length);
  }

  /* If the result still lives in the stack buffer it must be
   * duplicated before being returned; otherwise it is already a heap
   * allocation that can be handed over as is */
  return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
}

/* Advances the parser to the next valid word.
 *
 * Skips words with a disallowed first character, words whose byte
 * length is >= parser->max_word_length and words that could not be
 * processed. On success stores the processed word and its length in
 * the parser, sets the byte offsets of the original word within the
 * text and returns TRUE. Returns FALSE (offsets set to 0) when the
 * text is exhausted. */
static gboolean
parser_next (TrackerParser *parser,
             gint          *byte_offset_start,
             gint          *byte_offset_end)
{
  gsize word_length = 0;

  *byte_offset_start = 0;
  *byte_offset_end = 0;

  g_return_val_if_fail (parser, FALSE);

  while (parser->cursor < (gsize) parser->txt_size) {
    TrackerParserWordType type;
    gboolean is_allowed;

    if (!get_word_info (parser, &word_length, &is_allowed, &type)) {
      /* Nothing decodable at the cursor: give up on the whole text */
      parser->cursor = parser->txt_size;
      return FALSE;
    }

    /* Only words with a valid start and an acceptable length are
     * processed; everything else is simply stepped over */
    if (is_allowed && word_length < parser->max_word_length) {
      gsize chunk_length;
      gchar *candidate;

      /* Never feed more than WORD_BUFFER_LENGTH - 1 bytes to the
       * word processor (protects against extremely long words) */
      chunk_length = (word_length >= WORD_BUFFER_LENGTH ?
                      WORD_BUFFER_LENGTH - 1 :
                      word_length);

      /* Returns a newly allocated string, or NULL on failure */
      candidate = process_word_utf8 (parser,
                                     &(parser->txt[parser->cursor]),
                                     chunk_length,
                                     type);
      if (candidate) {
        /* Offsets refer to the original (untruncated) word */
        *byte_offset_start = parser->cursor;
        *byte_offset_end = parser->cursor + word_length;

        parser->cursor += word_length;

        parser->word_length = strlen (candidate);
        parser->word = candidate;

        return TRUE;
      }
    }

    /* Word rejected: move on to the next one */
    parser->cursor += word_length;
  }

  /* No more words... */
  return FALSE;
}

/* Allocates a zero-initialized parser with a language object created
 * through tracker_language_new (NULL). The parser must be configured
 * with tracker_parser_reset() before use and released with
 * tracker_parser_free(). */
TrackerParser *
tracker_parser_new (void)
{
  TrackerParser *self = g_new0 (TrackerParser, 1);

  self->language = tracker_language_new (NULL);

  return self;
}

/* Releases a parser together with everything it owns: the last
 * processed word, the word break flags and its reference on the
 * language object. The parsed text itself is not freed. */
void
tracker_parser_free (TrackerParser *parser)
{
  g_return_if_fail (parser != NULL);

  /* Buffers owned by the parser */
  g_free (parser->word);
  g_free (parser->word_break_flags);

  /* Drop our reference on the language, if any */
  if (parser->language != NULL)
    g_object_unref (parser->language);

  g_free (parser);
}

/* Prepares the parser to iterate over a new text.
 *
 * @txt is not copied, only referenced. @txt_size is its length in
 * bytes. Any word and word break flags left from a previous run are
 * freed, the cursor and word position are rewound, and word break
 * flags are computed for the whole text. */
void
tracker_parser_reset (TrackerParser *parser,
                      const gchar   *txt,
                      gint           txt_size,
                      guint          max_word_length,
                      gboolean       enable_stemmer,
                      gboolean       enable_unaccent,
                      gboolean       ignore_numbers)
{
  g_return_if_fail (parser != NULL);
  g_return_if_fail (txt != NULL);

  /* Drop leftovers from the previous text */
  g_free (parser->word);
  parser->word = NULL;
  g_free (parser->word_break_flags);

  /* New input, iteration starts from scratch */
  parser->txt = txt;
  parser->txt_size = txt_size;
  parser->cursor = 0;
  parser->word_position = 0;

  /* Processing options */
  parser->max_word_length = max_word_length;
  parser->enable_stemmer = enable_stemmer;
  parser->enable_unaccent = enable_unaccent;
  parser->ignore_numbers = ignore_numbers;

  /* Some unicode characters are forced to act as word breakers
   * (e.g. the '.'), mainly so that FTS searches can match file
   * extensions. */
  parser->enable_forced_wordbreaks = TRUE;

  /* One flag per input byte, filled in for the whole string */
  parser->word_break_flags = g_malloc (txt_size);
  u8_wordbreaks ((const uint8_t *)txt,
                 (size_t) txt_size,
                 (char *)parser->word_break_flags);

  /* Categories allowed for the first character of a word: letters,
   * plus numbers unless they are to be ignored */
  if (ignore_numbers)
    parser->allowed_start = UC_LETTER;
  else
    parser->allowed_start = uc_general_category_or (UC_LETTER, UC_NUMBER);
}

/* Returns the next processed word of the text, or NULL when there are
 * no more words.
 *
 * The returned string is owned by the parser and is freed by the next
 * call to this function, to tracker_parser_reset() or to
 * tracker_parser_free(). All output arguments are always written:
 * @position and @word_length are copied from the parser state (the
 * word position is incremented on every call), and the byte offsets
 * are 0 when no word was found. */
const gchar *
tracker_parser_next (TrackerParser *parser,
                     gint          *position,
                     gint          *byte_offset_start,
                     gint          *byte_offset_end,
                     gint          *word_length)
{
  gint start = 0;
  gint end = 0;
  gboolean found;

  /* The previously returned word is no longer valid */
  g_free (parser->word);
  parser->word = NULL;

  found = parser_next (parser, &start, &end);

  parser->word_position++;

  *word_length = parser->word_length;
  *position = parser->word_position;
  *byte_offset_start = start;
  *byte_offset_end = end;

  return found ? parser->word : NULL;
}

/* This backend keeps no collator state, so there is nothing to set up
 * and the returned collator handle is always NULL. */
gpointer
tracker_collation_init (void)
{
  return NULL;
}

/* Counterpart of tracker_collation_init(): there is no collator state
 * in this backend, so @collator is ignored. */
void
tracker_collation_shutdown (gpointer collator)
{
  (void) collator;
}

/* Compares two UTF-8 strings with u8_strcoll() and returns its result.
 *
 * @str1 and @str2 are NOT NUL-terminated (their byte lengths are @len1
 * and @len2), so NUL-terminated copies are made first: on the stack
 * when shorter than MAX_STACK_STR_SIZE, on the heap otherwise.
 * @collator is unused. */
gint
tracker_collation_utf8 (gpointer      collator,
                        gint          len1,
                        gconstpointer str1,
                        gint          len2,
                        gconstpointer str2)
{
  const gboolean on_heap1 = (len1 >= MAX_STACK_STR_SIZE);
  const gboolean on_heap2 = (len2 >= MAX_STACK_STR_SIZE);
  guchar *copy1;
  guchar *copy2;
  gint order;

  if (on_heap1)
    copy1 = g_malloc (len1+1);
  else
    copy1 = g_alloca (len1+1);

  if (on_heap2)
    copy2 = g_malloc (len2+1);
  else
    copy2 = g_alloca (len2+1);

  memcpy (copy1, str1, len1);
  copy1[len1] = '\0';

  memcpy (copy2, str2, len2);
  copy2[len2] = '\0';

  order = u8_strcoll (copy1, copy2);

  /* Only heap copies need freeing; stack copies vanish on return */
  if (on_heap1)
    g_free (copy1);
  if (on_heap2)
    g_free (copy2);

  return order;
}

/* Lowercases a UTF-16 string with u16_tolower().
 *
 * @len is halved before being handed to libunistring, i.e. it is taken
 * as a size in bytes. The result buffer is allocated by libunistring
 * and its length, as reported by u16_tolower(), is stored in @len_out. */
gunichar2 *
tracker_parser_tolower (const gunichar2 *input,
                        gsize            len,
                        gsize           *len_out)
{
  gsize n_units = len / 2;

  return u16_tolower (input, n_units, NULL, NULL, NULL, len_out);
}

/* Uppercases a UTF-16 string with u16_toupper().
 *
 * @len is halved before being handed to libunistring, i.e. it is taken
 * as a size in bytes. The result buffer is allocated by libunistring
 * and its length, as reported by u16_toupper(), is stored in @len_out. */
gunichar2 *
tracker_parser_toupper (const gunichar2 *input,
                        gsize            len,
                        gsize           *len_out)
{
  gsize n_units = len / 2;

  return u16_toupper (input, n_units, NULL, NULL, NULL, len_out);
}

/* Casefolds a UTF-16 string with u16_casefold().
 *
 * @len is halved before being handed to libunistring, i.e. it is taken
 * as a size in bytes. The result buffer is allocated by libunistring
 * and its length, as reported by u16_casefold(), is stored in @len_out. */
gunichar2 *
tracker_parser_casefold (const gunichar2 *input,
                         gsize            len,
                         gsize           *len_out)
{
  gsize n_units = len / 2;

  return u16_casefold (input, n_units, NULL, NULL, NULL, len_out);
}

/* Normalizes a UTF-16 string with u16_normalize().
 *
 * @mode selects the normalization form (NFC, NFD, NFKC or NFKD); any
 * other value is a programming error. @len is halved before being
 * handed to libunistring, i.e. it is taken as a size in bytes. The
 * result buffer is allocated by libunistring and its length, as
 * reported by u16_normalize(), is stored in @len_out. */
gunichar2 *
tracker_parser_normalize (const gunichar2 *input,
                          GNormalizeMode   mode,
                          gsize            len,
                          gsize           *len_out)
{
  uninorm_t form;

  /* Map the GLib mode onto the libunistring normalization form */
  switch (mode) {
  case G_NORMALIZE_NFC:
    form = UNINORM_NFC;
    break;
  case G_NORMALIZE_NFD:
    form = UNINORM_NFD;
    break;
  case G_NORMALIZE_NFKC:
    form = UNINORM_NFKC;
    break;
  case G_NORMALIZE_NFKD:
    form = UNINORM_NFKD;
    break;
  default:
    g_assert_not_reached ();
  }

  return u16_normalize (form, input, len / 2, NULL, len_out);
}

/* Removes combining diacritical marks, in place, from an NFKD-normalized
 * UTF-16 string made of @n_units 16-bit units. Returns the length of the
 * stripped string, also in 16-bit units. */
static gsize
strip_combining_marks_utf16 (gunichar2 *str,
                             gsize      n_units)
{
  gsize src = 0;
  gsize dst = 0;

  while (src < n_units) {
    ucs4_t unichar;
    gint unit_len;

    /* Decode the next character (one unit, or two for a surrogate
     * pair), never looking beyond the end of the buffer */
    unit_len = u16_mbtouc (&unichar,
                           (const uint16_t *) &str[src],
                           n_units - src);
    if (unit_len <= 0) {
      break;
    }

    /* Combining diacritical marks are skipped: only the read index
     * moves forward */
    if (IS_CDM_UCS4 ((guint32) unichar)) {
      src += unit_len;
      continue;
    }

    /* Source and destination may overlap, hence memmove() */
    if (src != dst) {
      memmove (&str[dst], &str[src], unit_len * sizeof (gunichar2));
    }

    src += unit_len;
    dst += unit_len;
  }

  return dst;
}

/* Returns an unaccented copy of a UTF-16 string: the input is
 * NFKD-normalized and combining diacritical marks are then removed.
 *
 * @len is halved before being handed to libunistring, i.e. it is taken
 * as a size in bytes, like in the other tracker_parser_* UTF-16 helpers
 * of this file. The result buffer is allocated by libunistring; its
 * length in 16-bit units (the unit u16_normalize() reports) is stored
 * in @len_out. Returns NULL, with @len_out set to 0, if normalization
 * fails. */
gunichar2 *
tracker_parser_unaccent (const gunichar2 *input,
                         gsize            len,
                         gsize           *len_out)
{
  gunichar2 *zOutput;
  gsize written = 0;

  zOutput = u16_normalize (UNINORM_NFKD, input, len / 2, NULL, &written);
  if (zOutput == NULL) {
    *len_out = 0;
    return NULL;
  }

  /* Unaccenting is done in place. The buffer is UTF-16, so it cannot
   * go through the UTF-8 based tracker_parser_unaccent_nfkd_string() */
  written = strip_combining_marks_utf16 (zOutput, written);

  *len_out = written;

  return zOutput;
}

Coverage Report

Created: 2026-02-14 06:25

Line	Count	Source
1		/*
2		* Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
3		* Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
4		*
5		* This library is free software; you can redistribute it and/or
6		* modify it under the terms of the GNU Lesser General Public
7		* License as published by the Free Software Foundation; either
8		* version 2.1 of the License, or (at your option) any later version.
9		*
10		* This library is distributed in the hope that it will be useful,
11		* but WITHOUT ANY WARRANTY; without even the implied warranty of
12		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13		* Lesser General Public License for more details.
14		*
15		* You should have received a copy of the GNU Lesser General Public
16		* License along with this library; if not, write to the Free Software
17		* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18		* 02110-1301 USA
19		*/
20
21		#include "config.h"
22
23		#include <stdio.h>
24		#include <string.h>
25
26		/* libunistring versions prior to 9.1.2 need this hack */
27		#define _UNUSED_PARAMETER_
28		#include <unistr.h>
29		#include <uniwbrk.h>
30		#include <unictype.h>
31		#include <unicase.h>
32
33		#include "tracker-language.h"
34		#include "tracker-parser.h"
35		#include "tracker-parser-utils.h"
36
37		/* Type of words detected */
38		typedef enum {
39		TRACKER_PARSER_WORD_TYPE_ASCII,
40		TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
41		TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
42		} TrackerParserWordType;
43
44		/* If string lenth less than this value, allocating from the stack */
45	0	#define MAX_STACK_STR_SIZE 8192
46
47		/* Max possible length of a UTF-8 encoded string (just a safety limit) */
48	0	#define WORD_BUFFER_LENGTH 512
49
50		struct TrackerParser {
51		const gchar *txt;
52		gint txt_size;
53
54		TrackerLanguage *language;
55		guint max_word_length;
56		gboolean enable_stemmer;
57		gboolean enable_unaccent;
58		gboolean ignore_numbers;
59		gboolean enable_forced_wordbreaks;
60
61		/* Private members */
62		gchar *word;
63		gint word_length;
64		guint word_position;
65
66		/* Cursor, as index of the input array of bytes */
67		gsize cursor;
68		/* libunistring flags array */
69		gchar *word_break_flags;
70		/* general category of the start character in words */
71		uc_general_category_t allowed_start;
72		};
73
74		static gboolean
75		get_word_info (TrackerParser *parser,
76		gsize *p_word_length,
77		gboolean *p_is_allowed_word_start,
78		TrackerParserWordType *p_word_type)
79	0	{
80	0	ucs4_t first_unichar;
81	0	gint first_unichar_len;
82	0	gboolean ascii_only;
83
84		/* Defaults */
85	0	*p_is_allowed_word_start = TRUE;
86
87		/* Get first character of the word as UCS4 */
88	0	first_unichar_len = u8_strmbtouc (&first_unichar,
89	0	(const guchar *) &(parser->txt[parser->cursor]));
90	0	if (first_unichar_len <= 0) {
91		/* This should only happen if NIL was passed to u8_strmbtouc,
92		* so better just force stop here */
93	0	return FALSE;
94	0	} else {
95		/* If first character has length 1, it's ASCII-7 */
96	0	ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
97	0	}
98
99		/* Consider word starts with a forced wordbreak */
100	0	if (parser->enable_forced_wordbreaks &&
101	0	IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) {
102	0	*p_word_length = first_unichar_len;
103	0	} else {
104	0	gsize i;
105
106		/* Find next word break, and in the same loop checking if only ASCII
107		* characters */
108	0	i = parser->cursor + first_unichar_len;
109	0	while (1) {
110		/* Text bounds reached? */
111	0	if (i >= (gsize) parser->txt_size)
112	0	break;
113		/* Proper unicode word break detected? */
114	0	if (parser->word_break_flags[i])
115	0	break;
116		/* Forced word break detected? */
117	0	if (parser->enable_forced_wordbreaks &&
118	0	IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i]))
119	0	break;
120
121	0	if (ascii_only &&
122	0	!IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
123	0	ascii_only = FALSE;
124	0	}
125
126	0	i++;
127	0	}
128
129		/* Word end is the first byte after the word, which is either the
130		* start of next word or the end of the string */
131	0	*p_word_length = i - parser->cursor;
132	0	}
133
134		/* We only want the words where the first character
135		* in the word is either a letter, a number or a symbol.
136		* This is needed because the word break algorithm also
137		* considers word breaks after for example commas or other
138		* punctuation marks.
139		* Note that looking at the first character in the string
140		* should be compatible with all Unicode normalization
141		* methods.
142		*/
143	0	if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
144	0	!uc_is_general_category (first_unichar,
145	0	parser->allowed_start)) {
146	0	*p_is_allowed_word_start = FALSE;
147	0	return TRUE;
148	0	}
149
150		/* Decide word type */
151	0	if (ascii_only) {
152	0	*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
153	0	} else if (IS_CJK_UCS4 (first_unichar)) {
154	0	*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
155	0	} else {
156	0	*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
157	0	}
158	0	return TRUE;
159	0	}
160
161		/* The input word in this method MUST be normalized in NFKD form,
162		* and given in UTF-8, where str_length is the byte-length
163		* (note: there is no trailing NUL character!) */
164		static gboolean
165		tracker_parser_unaccent_nfkd_string (gpointer str,
166		gsize *str_length)
167	0	{
168	0	gchar *word;
169	0	gsize word_length;
170	0	gsize i;
171	0	gsize j;
172
173	0	g_return_val_if_fail (str != NULL, FALSE);
174	0	g_return_val_if_fail (str_length != NULL, FALSE);
175
176	0	word = (gchar *)str;
177	0	word_length = *str_length;
178
179	0	i = 0;
180	0	j = 0;
181	0	while (i < word_length) {
182	0	ucs4_t unichar;
183	0	gint utf8_len;
184
185		/* Get next character of the word as UCS4 */
186	0	utf8_len = u8_strmbtouc (&unichar, (const guchar *) &word[i]);
187
188		/* Invalid UTF-8 character or end of original string. */
189	0	if (utf8_len <= 0) {
190	0	break;
191	0	}
192
193		/* If the given unichar is a combining diacritical mark,
194		* just update the original index, not the output one */
195	0	if (IS_CDM_UCS4 ((guint32) unichar)) {
196	0	i += utf8_len;
197	0	continue;
198	0	}
199
200		/* If already found a previous combining
201		* diacritical mark, indexes are different so
202		* need to copy characters. As output and input
203		* buffers may overlap, need to use memmove
204		* instead of memcpy */
205	0	if (i != j) {
206	0	memmove (&word[j], &word[i], utf8_len);
207	0	}
208
209		/* Update both indexes */
210	0	i += utf8_len;
211	0	j += utf8_len;
212	0	}
213
214		/* Set new output length */
215	0	*str_length = j;
216
217	0	return TRUE;
218	0	}
219
220		static gchar *
221		process_word_utf8 (TrackerParser *parser,
222		const gchar *word,
223		gint length,
224		TrackerParserWordType type)
225	0	{
226	0	gchar word_buffer [WORD_BUFFER_LENGTH];
227	0	gchar *normalized = NULL;
228	0	gchar *stemmed = NULL;
229	0	size_t new_word_length;
230
231	0	g_return_val_if_fail (parser != NULL, NULL);
232	0	g_return_val_if_fail (word != NULL, NULL);
233
234		/* If length is set as -1, the input word MUST be NIL-terminated.
235		* Otherwise, this restriction is not needed as the length to process
236		* is given as input argument */
237	0	if (length < 0) {
238	0	length = strlen (word);
239	0	}
240
241		/* Log original word */
242	0	tracker_parser_message_hex ("ORIGINAL word",
243	0	word, length);
244
245		/* Normalization and case-folding ONLY for non-ASCII */
246	0	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
247		/* Leave space for last NIL */
248	0	new_word_length = WORD_BUFFER_LENGTH - 1;
249
250		/* Casefold and NFKD normalization in output.
251		* NOTE: if the output buffer is not big enough, u8_casefold will
252		* return a newly-allocated buffer. */
253	0	normalized = (gchar*) u8_casefold ((const uint8_t *)word,
254	0	length,
255	0	uc_locale_language (),
256	0	UNINORM_NFKD,
257	0	(guchar *) word_buffer,
258	0	&new_word_length);
259
260		/* Case folding + Normalization failed, ignore this word */
261	0	g_return_val_if_fail (normalized != NULL, NULL);
262
263		/* If output buffer is not the same as the one passed to
264		* u8_casefold, we know it was newly-allocated, so need
265		* to resize it in 1 byte to add last NIL */
266	0	if (normalized != word_buffer) {
267	0	normalized = g_realloc (normalized, new_word_length + 1);
268	0	}
269
270		/* Log after Normalization */
271	0	tracker_parser_message_hex (" After Casefolding and NFKD normalization",
272	0	normalized, new_word_length);
273	0	} else {
274		/* For ASCII-only, just tolower() each character */
275	0	gsize i;
276
277	0	normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
278
279	0	for (i = 0; i < (gsize) length; i++) {
280	0	normalized[i] = g_ascii_tolower (word[i]);
281	0	}
282
283	0	new_word_length = length;
284
285		/* Log after tolower */
286	0	tracker_parser_message_hex (" After Lowercasing",
287	0	normalized, new_word_length);
288	0	}
289
290		/* UNAC stripping needed? (for non-CJK and non-ASCII) */
291	0	if (parser->enable_unaccent &&
292	0	type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
293	0	tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
294		/* Log after UNAC stripping */
295	0	tracker_parser_message_hex (" After UNAC stripping",
296	0	normalized, new_word_length);
297	0	}
298
299		/* Set output NIL */
300	0	normalized[new_word_length] = '\0';
301
302		/* Stemming needed? */
303	0	if (parser->enable_stemmer) {
304	0	tracker_language_stem_word (parser->language,
305	0	normalized,
306	0	&new_word_length,
307	0	new_word_length);
308
309		/* Log after stemming */
310	0	tracker_parser_message_hex (" After stemming",
311	0	normalized, new_word_length);
312	0	}
313
314		/* It may be the case that no stripping and no stemming was needed, and
315		* that the output buffer in stack was enough for case-folding and
316		* normalization. In this case, need to strdup() the string to return it */
317	0	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
318	0	}
319
320		static gboolean
321		parser_next (TrackerParser *parser,
322		gint *byte_offset_start,
323		gint *byte_offset_end)
324	0	{
325	0	gsize word_length = 0;
326	0	gchar *processed_word = NULL;
327
328	0	*byte_offset_start = 0;
329	0	*byte_offset_end = 0;
330
331	0	g_return_val_if_fail (parser, FALSE);
332
333		/* Loop to look for next valid word */
334	0	while (!processed_word &&
335	0	parser->cursor < (gsize) parser->txt_size) {
336	0	TrackerParserWordType type;
337	0	gsize truncated_length;
338	0	gboolean is_allowed;
339
340		/* Get word info */
341	0	if (!get_word_info (parser,
342	0	&word_length,
343	0	&is_allowed,
344	0	&type)) {
345		/* Quit loop just in case */
346	0	parser->cursor = parser->txt_size;
347	0	break;
348	0	}
349
350		/* Ignore the word if not an allowed word start */
351	0	if (!is_allowed) {
352		/* Ignore this word and keep on looping */
353	0	parser->cursor += word_length;
354	0	continue;
355	0	}
356
357		/* Ignore the word if longer than the maximum allowed */
358	0	if (word_length >= parser->max_word_length) {
359		/* Ignore this word and keep on looping */
360	0	parser->cursor += word_length;
361	0	continue;
362	0	}
363
364		/* compute truncated word length if needed (to avoid extremely
365		* long words)*/
366	0	truncated_length = (word_length < WORD_BUFFER_LENGTH ?
367	0	word_length :
368	0	WORD_BUFFER_LENGTH - 1);
369
370		/* Process the word here. If it fails, we can still go
371		* to the next one. Returns newly allocated string
372		* always */
373	0	processed_word = process_word_utf8 (parser,
374	0	&(parser->txt[parser->cursor]),
375	0	truncated_length,
376	0	type);
377	0	if (!processed_word) {
378		/* Ignore this word and keep on looping */
379	0	parser->cursor += word_length;
380	0	continue;
381	0	}
382	0	}
383
384		/* If we got a word here, set output */
385	0	if (processed_word) {
386		/* Set outputs */
387	0	*byte_offset_start = parser->cursor;
388	0	*byte_offset_end = parser->cursor + word_length;
389
390		/* Update cursor */
391	0	parser->cursor += word_length;
392
393	0	parser->word_length = strlen (processed_word);
394	0	parser->word = processed_word;
395
396	0	return TRUE;
397	0	}
398
399		/* No more words... */
400	0	return FALSE;
401	0	}
402
403		TrackerParser *
404		tracker_parser_new (void)
405	0	{
406	0	TrackerParser *parser;
407
408	0	parser = g_new0 (TrackerParser, 1);
409	0	parser->language = tracker_language_new (NULL);
410
411	0	return parser;
412	0	}
413
414		void
415		tracker_parser_free (TrackerParser *parser)
416	0	{
417	0	g_return_if_fail (parser != NULL);
418
419	0	if (parser->language) {
420	0	g_object_unref (parser->language);
421	0	}
422
423	0	g_free (parser->word_break_flags);
424
425	0	g_free (parser->word);
426
427	0	g_free (parser);
428	0	}
429
430		void
431		tracker_parser_reset (TrackerParser *parser,
432		const gchar *txt,
433		gint txt_size,
434		guint max_word_length,
435		gboolean enable_stemmer,
436		gboolean enable_unaccent,
437		gboolean ignore_numbers)
438	0	{
439	0	g_return_if_fail (parser != NULL);
440	0	g_return_if_fail (txt != NULL);
441
442	0	parser->max_word_length = max_word_length;
443	0	parser->enable_stemmer = enable_stemmer;
444	0	parser->enable_unaccent = enable_unaccent;
445	0	parser->ignore_numbers = ignore_numbers;
446
447		/* Note: We're forcing some unicode characters to behave
448		* as wordbreakers: e.g, the '.' The main reason for this
449		* is to enable FTS searches matching file extension. */
450	0	parser->enable_forced_wordbreaks = TRUE;
451
452	0	parser->txt_size = txt_size;
453	0	parser->txt = txt;
454
455	0	g_free (parser->word);
456	0	parser->word = NULL;
457
458	0	parser->word_position = 0;
459
460	0	parser->cursor = 0;
461
462	0	g_free (parser->word_break_flags);
463
464		/* Create array of flags, same size as original text. */
465	0	parser->word_break_flags = g_malloc (txt_size);
466
467		/* Get wordbreak flags in the whole string */
468	0	u8_wordbreaks ((const uint8_t *)txt,
469	0	(size_t) txt_size,
470	0	(char *)parser->word_break_flags);
471
472		/* Prepare a custom category which is a combination of the
473		* desired ones */
474	0	parser->allowed_start = UC_LETTER;
475	0	if (!parser->ignore_numbers) {
476	0	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
477	0	}
478	0	}
479
480		const gchar *
481		tracker_parser_next (TrackerParser *parser,
482		gint *position,
483		gint *byte_offset_start,
484		gint *byte_offset_end,
485		gint *word_length)
486	0	{
487	0	const gchar *str;
488	0	gint byte_start = 0, byte_end = 0;
489
490	0	str = NULL;
491
492	0	g_free (parser->word);
493	0	parser->word = NULL;
494
495	0	if (parser_next (parser, &byte_start, &byte_end)) {
496	0	str = parser->word;
497	0	}
498
499	0	parser->word_position++;
500
501	0	*word_length = parser->word_length;
502	0	*position = parser->word_position;
503	0	*byte_offset_start = byte_start;
504	0	*byte_offset_end = byte_end;
505
506	0	return str;
507	0	}
508
509		gpointer
510		tracker_collation_init (void)
511	6	{
512		/* Nothing to do */
513	6	return NULL;
514	6	}
515
516		void
517		tracker_collation_shutdown (gpointer collator)
518	4	{
519		/* Nothing to do */
520	4	}
521
522		gint
523		tracker_collation_utf8 (gpointer collator,
524		gint len1,
525		gconstpointer str1,
526		gint len2,
527		gconstpointer str2)
528	0	{
529	0	gint result;
530	0	guchar *aux1;
531	0	guchar *aux2;
532
533		/* Note: str1 and str2 are NOT NUL-terminated */
534	0	aux1 = (len1 < MAX_STACK_STR_SIZE) ? g_alloca (len1+1) : g_malloc (len1+1);
535	0	aux2 = (len2 < MAX_STACK_STR_SIZE) ? g_alloca (len2+1) : g_malloc (len2+1);
536
537	0	memcpy (aux1, str1, len1); aux1[len1] = '\0';
538	0	memcpy (aux2, str2, len2); aux2[len2] = '\0';
539
540	0	result = u8_strcoll (aux1, aux2);
541
542	0	if (len1 >= MAX_STACK_STR_SIZE)
543	0	g_free (aux1);
544	0	if (len2 >= MAX_STACK_STR_SIZE)
545	0	g_free (aux2);
546	0	return result;
547	0	}
548
549		gunichar2 *
550		tracker_parser_tolower (const gunichar2 *input,
551		gsize len,
552		gsize *len_out)
553	0	{
554	0	return u16_tolower (input, len / 2, NULL, NULL, NULL, len_out);
555	0	}
556
557		gunichar2 *
558		tracker_parser_toupper (const gunichar2 *input,
559		gsize len,
560		gsize *len_out)
561	0	{
562	0	return u16_toupper (input, len / 2, NULL, NULL, NULL, len_out);
563	0	}
564
565		gunichar2 *
566		tracker_parser_casefold (const gunichar2 *input,
567		gsize len,
568		gsize *len_out)
569	0	{
570	0	return u16_casefold (input, len / 2, NULL, NULL, NULL, len_out);
571	0	}
572
573		gunichar2 *
574		tracker_parser_normalize (const gunichar2 *input,
575		GNormalizeMode mode,
576		gsize len,
577		gsize *len_out)
578	0	{
579	0	uninorm_t nf;
580
581	0	if (mode == G_NORMALIZE_NFC)
582	0	nf = UNINORM_NFC;
583	0	else if (mode == G_NORMALIZE_NFD)
584	0	nf = UNINORM_NFD;
585	0	else if (mode == G_NORMALIZE_NFKC)
586	0	nf = UNINORM_NFKC;
587	0	else if (mode == G_NORMALIZE_NFKD)
588	0	nf = UNINORM_NFKD;
589	0	else
590	0	g_assert_not_reached ();
591
592	0	return u16_normalize (nf, input, len / 2, NULL, len_out);
593	0	}
594
595		gunichar2 *
596		tracker_parser_unaccent (const gunichar2 *input,
597		gsize len,
598		gsize *len_out)
599	0	{
600	0	gunichar2 *zOutput;
601	0	gsize written = 0;
602
603	0	zOutput = u16_normalize (UNINORM_NFKD, input, len, NULL, &written);
604
605		/* Unaccenting is done in place */
606	0	tracker_parser_unaccent_nfkd_string (zOutput, &written);
607
608	0	*len_out = written;
609
610	0	return zOutput;
611	0	}