/src/tinysparql/src/common/tracker-parser-libunistring.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org> |
3 | | * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com> |
4 | | * |
5 | | * This library is free software; you can redistribute it and/or |
6 | | * modify it under the terms of the GNU Lesser General Public |
7 | | * License as published by the Free Software Foundation; either |
8 | | * version 2.1 of the License, or (at your option) any later version. |
9 | | * |
10 | | * This library is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | | * Lesser General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU Lesser General Public |
16 | | * License along with this library; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
18 | | * 02110-1301 USA |
19 | | */ |
20 | | |
21 | | #include "config.h" |
22 | | |
23 | | #include <stdio.h> |
24 | | #include <string.h> |
25 | | |
26 | | /* libunistring versions prior to 9.1.2 need this hack */ |
27 | | #define _UNUSED_PARAMETER_ |
28 | | #include <unistr.h> |
29 | | #include <uniwbrk.h> |
30 | | #include <unictype.h> |
31 | | #include <unicase.h> |
32 | | |
33 | | #include "tracker-language.h" |
34 | | #include "tracker-parser.h" |
35 | | #include "tracker-parser-utils.h" |
36 | | |
/* Classification of a detected word; it selects how the word is
 * normalized later on */
typedef enum {
	/* Every byte of the word is ASCII-7 */
	TRACKER_PARSER_WORD_TYPE_ASCII,
	/* Non-ASCII word that may go through unaccenting */
	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
	/* Non-ASCII word starting with a CJK character: never unaccented */
	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
} TrackerParserWordType;
43 | | |
44 | | /* If string length is less than this value, allocate from the stack */ |
45 | 0 | #define MAX_STACK_STR_SIZE 8192 |
46 | | |
47 | | /* Max possible length of a UTF-8 encoded string (just a safety limit) */ |
48 | 0 | #define WORD_BUFFER_LENGTH 512 |
49 | | |
50 | | struct TrackerParser { |
51 | | const gchar *txt; |
52 | | gint txt_size; |
53 | | |
54 | | TrackerLanguage *language; |
55 | | guint max_word_length; |
56 | | gboolean enable_stemmer; |
57 | | gboolean enable_unaccent; |
58 | | gboolean ignore_numbers; |
59 | | gboolean enable_forced_wordbreaks; |
60 | | |
61 | | /* Private members */ |
62 | | gchar *word; |
63 | | gint word_length; |
64 | | guint word_position; |
65 | | |
66 | | /* Cursor, as index of the input array of bytes */ |
67 | | gsize cursor; |
68 | | /* libunistring flags array */ |
69 | | gchar *word_break_flags; |
70 | | /* general category of the start character in words */ |
71 | | uc_general_category_t allowed_start; |
72 | | }; |
73 | | |
74 | | static gboolean |
75 | | get_word_info (TrackerParser *parser, |
76 | | gsize *p_word_length, |
77 | | gboolean *p_is_allowed_word_start, |
78 | | TrackerParserWordType *p_word_type) |
79 | 0 | { |
80 | 0 | ucs4_t first_unichar; |
81 | 0 | gint first_unichar_len; |
82 | 0 | gboolean ascii_only; |
83 | | |
84 | | /* Defaults */ |
85 | 0 | *p_is_allowed_word_start = TRUE; |
86 | | |
87 | | /* Get first character of the word as UCS4 */ |
88 | 0 | first_unichar_len = u8_strmbtouc (&first_unichar, |
89 | 0 | (const guchar *) &(parser->txt[parser->cursor])); |
90 | 0 | if (first_unichar_len <= 0) { |
91 | | /* This should only happen if NIL was passed to u8_strmbtouc, |
92 | | * so better just force stop here */ |
93 | 0 | return FALSE; |
94 | 0 | } else { |
95 | | /* If first character has length 1, it's ASCII-7 */ |
96 | 0 | ascii_only = first_unichar_len == 1 ? TRUE : FALSE; |
97 | 0 | } |
98 | | |
99 | | /* Consider word starts with a forced wordbreak */ |
100 | 0 | if (parser->enable_forced_wordbreaks && |
101 | 0 | IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) { |
102 | 0 | *p_word_length = first_unichar_len; |
103 | 0 | } else { |
104 | 0 | gsize i; |
105 | | |
106 | | /* Find next word break, and in the same loop checking if only ASCII |
107 | | * characters */ |
108 | 0 | i = parser->cursor + first_unichar_len; |
109 | 0 | while (1) { |
110 | | /* Text bounds reached? */ |
111 | 0 | if (i >= (gsize) parser->txt_size) |
112 | 0 | break; |
113 | | /* Proper unicode word break detected? */ |
114 | 0 | if (parser->word_break_flags[i]) |
115 | 0 | break; |
116 | | /* Forced word break detected? */ |
117 | 0 | if (parser->enable_forced_wordbreaks && |
118 | 0 | IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i])) |
119 | 0 | break; |
120 | | |
121 | 0 | if (ascii_only && |
122 | 0 | !IS_ASCII_UCS4 ((guint32)parser->txt[i])) { |
123 | 0 | ascii_only = FALSE; |
124 | 0 | } |
125 | |
|
126 | 0 | i++; |
127 | 0 | } |
128 | | |
129 | | /* Word end is the first byte after the word, which is either the |
130 | | * start of next word or the end of the string */ |
131 | 0 | *p_word_length = i - parser->cursor; |
132 | 0 | } |
133 | | |
134 | | /* We only want the words where the first character |
135 | | * in the word is either a letter, a number or a symbol. |
136 | | * This is needed because the word break algorithm also |
137 | | * considers word breaks after for example commas or other |
138 | | * punctuation marks. |
139 | | * Note that looking at the first character in the string |
140 | | * should be compatible with all Unicode normalization |
141 | | * methods. |
142 | | */ |
143 | 0 | if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) && |
144 | 0 | !uc_is_general_category (first_unichar, |
145 | 0 | parser->allowed_start)) { |
146 | 0 | *p_is_allowed_word_start = FALSE; |
147 | 0 | return TRUE; |
148 | 0 | } |
149 | | |
150 | | /* Decide word type */ |
151 | 0 | if (ascii_only) { |
152 | 0 | *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII; |
153 | 0 | } else if (IS_CJK_UCS4 (first_unichar)) { |
154 | 0 | *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC; |
155 | 0 | } else { |
156 | 0 | *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC; |
157 | 0 | } |
158 | 0 | return TRUE; |
159 | 0 | } |
160 | | |
161 | | /* The input word in this method MUST be normalized in NFKD form, |
162 | | * and given in UTF-8, where str_length is the byte-length |
163 | | * (note: there is no trailing NUL character!) */ |
164 | | static gboolean |
165 | | tracker_parser_unaccent_nfkd_string (gpointer str, |
166 | | gsize *str_length) |
167 | 0 | { |
168 | 0 | gchar *word; |
169 | 0 | gsize word_length; |
170 | 0 | gsize i; |
171 | 0 | gsize j; |
172 | |
|
173 | 0 | g_return_val_if_fail (str != NULL, FALSE); |
174 | 0 | g_return_val_if_fail (str_length != NULL, FALSE); |
175 | | |
176 | 0 | word = (gchar *)str; |
177 | 0 | word_length = *str_length; |
178 | |
|
179 | 0 | i = 0; |
180 | 0 | j = 0; |
181 | 0 | while (i < word_length) { |
182 | 0 | ucs4_t unichar; |
183 | 0 | gint utf8_len; |
184 | | |
185 | | /* Get next character of the word as UCS4 */ |
186 | 0 | utf8_len = u8_strmbtouc (&unichar, (const guchar *) &word[i]); |
187 | | |
188 | | /* Invalid UTF-8 character or end of original string. */ |
189 | 0 | if (utf8_len <= 0) { |
190 | 0 | break; |
191 | 0 | } |
192 | | |
193 | | /* If the given unichar is a combining diacritical mark, |
194 | | * just update the original index, not the output one */ |
195 | 0 | if (IS_CDM_UCS4 ((guint32) unichar)) { |
196 | 0 | i += utf8_len; |
197 | 0 | continue; |
198 | 0 | } |
199 | | |
200 | | /* If already found a previous combining |
201 | | * diacritical mark, indexes are different so |
202 | | * need to copy characters. As output and input |
203 | | * buffers may overlap, need to use memmove |
204 | | * instead of memcpy */ |
205 | 0 | if (i != j) { |
206 | 0 | memmove (&word[j], &word[i], utf8_len); |
207 | 0 | } |
208 | | |
209 | | /* Update both indexes */ |
210 | 0 | i += utf8_len; |
211 | 0 | j += utf8_len; |
212 | 0 | } |
213 | | |
214 | | /* Set new output length */ |
215 | 0 | *str_length = j; |
216 | |
|
217 | 0 | return TRUE; |
218 | 0 | } |
219 | | |
220 | | static gchar * |
221 | | process_word_utf8 (TrackerParser *parser, |
222 | | const gchar *word, |
223 | | gint length, |
224 | | TrackerParserWordType type) |
225 | 0 | { |
226 | 0 | gchar word_buffer [WORD_BUFFER_LENGTH]; |
227 | 0 | gchar *normalized = NULL; |
228 | 0 | gchar *stemmed = NULL; |
229 | 0 | size_t new_word_length; |
230 | |
|
231 | 0 | g_return_val_if_fail (parser != NULL, NULL); |
232 | 0 | g_return_val_if_fail (word != NULL, NULL); |
233 | | |
234 | | /* If length is set as -1, the input word MUST be NIL-terminated. |
235 | | * Otherwise, this restriction is not needed as the length to process |
236 | | * is given as input argument */ |
237 | 0 | if (length < 0) { |
238 | 0 | length = strlen (word); |
239 | 0 | } |
240 | | |
241 | | /* Log original word */ |
242 | 0 | tracker_parser_message_hex ("ORIGINAL word", |
243 | 0 | word, length); |
244 | | |
245 | | /* Normalization and case-folding ONLY for non-ASCII */ |
246 | 0 | if (type != TRACKER_PARSER_WORD_TYPE_ASCII) { |
247 | | /* Leave space for last NIL */ |
248 | 0 | new_word_length = WORD_BUFFER_LENGTH - 1; |
249 | | |
250 | | /* Casefold and NFKD normalization in output. |
251 | | * NOTE: if the output buffer is not big enough, u8_casefold will |
252 | | * return a newly-allocated buffer. */ |
253 | 0 | normalized = (gchar*) u8_casefold ((const uint8_t *)word, |
254 | 0 | length, |
255 | 0 | uc_locale_language (), |
256 | 0 | UNINORM_NFKD, |
257 | 0 | (guchar *) word_buffer, |
258 | 0 | &new_word_length); |
259 | | |
260 | | /* Case folding + Normalization failed, ignore this word */ |
261 | 0 | g_return_val_if_fail (normalized != NULL, NULL); |
262 | | |
263 | | /* If output buffer is not the same as the one passed to |
264 | | * u8_casefold, we know it was newly-allocated, so need |
265 | | * to resize it in 1 byte to add last NIL */ |
266 | 0 | if (normalized != word_buffer) { |
267 | 0 | normalized = g_realloc (normalized, new_word_length + 1); |
268 | 0 | } |
269 | | |
270 | | /* Log after Normalization */ |
271 | 0 | tracker_parser_message_hex (" After Casefolding and NFKD normalization", |
272 | 0 | normalized, new_word_length); |
273 | 0 | } else { |
274 | | /* For ASCII-only, just tolower() each character */ |
275 | 0 | gsize i; |
276 | |
|
277 | 0 | normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer; |
278 | |
|
279 | 0 | for (i = 0; i < (gsize) length; i++) { |
280 | 0 | normalized[i] = g_ascii_tolower (word[i]); |
281 | 0 | } |
282 | |
|
283 | 0 | new_word_length = length; |
284 | | |
285 | | /* Log after tolower */ |
286 | 0 | tracker_parser_message_hex (" After Lowercasing", |
287 | 0 | normalized, new_word_length); |
288 | 0 | } |
289 | | |
290 | | /* UNAC stripping needed? (for non-CJK and non-ASCII) */ |
291 | 0 | if (parser->enable_unaccent && |
292 | 0 | type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC && |
293 | 0 | tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) { |
294 | | /* Log after UNAC stripping */ |
295 | 0 | tracker_parser_message_hex (" After UNAC stripping", |
296 | 0 | normalized, new_word_length); |
297 | 0 | } |
298 | | |
299 | | /* Set output NIL */ |
300 | 0 | normalized[new_word_length] = '\0'; |
301 | | |
302 | | /* Stemming needed? */ |
303 | 0 | if (parser->enable_stemmer) { |
304 | 0 | tracker_language_stem_word (parser->language, |
305 | 0 | normalized, |
306 | 0 | &new_word_length, |
307 | 0 | new_word_length); |
308 | | |
309 | | /* Log after stemming */ |
310 | 0 | tracker_parser_message_hex (" After stemming", |
311 | 0 | normalized, new_word_length); |
312 | 0 | } |
313 | | |
314 | | /* It may be the case that no stripping and no stemming was needed, and |
315 | | * that the output buffer in stack was enough for case-folding and |
316 | | * normalization. In this case, need to strdup() the string to return it */ |
317 | 0 | return normalized == word_buffer ? g_strdup (word_buffer) : normalized; |
318 | 0 | } |
319 | | |
320 | | static gboolean |
321 | | parser_next (TrackerParser *parser, |
322 | | gint *byte_offset_start, |
323 | | gint *byte_offset_end) |
324 | 0 | { |
325 | 0 | gsize word_length = 0; |
326 | 0 | gchar *processed_word = NULL; |
327 | |
|
328 | 0 | *byte_offset_start = 0; |
329 | 0 | *byte_offset_end = 0; |
330 | |
|
331 | 0 | g_return_val_if_fail (parser, FALSE); |
332 | | |
333 | | /* Loop to look for next valid word */ |
334 | 0 | while (!processed_word && |
335 | 0 | parser->cursor < (gsize) parser->txt_size) { |
336 | 0 | TrackerParserWordType type; |
337 | 0 | gsize truncated_length; |
338 | 0 | gboolean is_allowed; |
339 | | |
340 | | /* Get word info */ |
341 | 0 | if (!get_word_info (parser, |
342 | 0 | &word_length, |
343 | 0 | &is_allowed, |
344 | 0 | &type)) { |
345 | | /* Quit loop just in case */ |
346 | 0 | parser->cursor = parser->txt_size; |
347 | 0 | break; |
348 | 0 | } |
349 | | |
350 | | /* Ignore the word if not an allowed word start */ |
351 | 0 | if (!is_allowed) { |
352 | | /* Ignore this word and keep on looping */ |
353 | 0 | parser->cursor += word_length; |
354 | 0 | continue; |
355 | 0 | } |
356 | | |
357 | | /* Ignore the word if longer than the maximum allowed */ |
358 | 0 | if (word_length >= parser->max_word_length) { |
359 | | /* Ignore this word and keep on looping */ |
360 | 0 | parser->cursor += word_length; |
361 | 0 | continue; |
362 | 0 | } |
363 | | |
364 | | /* compute truncated word length if needed (to avoid extremely |
365 | | * long words)*/ |
366 | 0 | truncated_length = (word_length < WORD_BUFFER_LENGTH ? |
367 | 0 | word_length : |
368 | 0 | WORD_BUFFER_LENGTH - 1); |
369 | | |
370 | | /* Process the word here. If it fails, we can still go |
371 | | * to the next one. Returns newly allocated string |
372 | | * always */ |
373 | 0 | processed_word = process_word_utf8 (parser, |
374 | 0 | &(parser->txt[parser->cursor]), |
375 | 0 | truncated_length, |
376 | 0 | type); |
377 | 0 | if (!processed_word) { |
378 | | /* Ignore this word and keep on looping */ |
379 | 0 | parser->cursor += word_length; |
380 | 0 | continue; |
381 | 0 | } |
382 | 0 | } |
383 | | |
384 | | /* If we got a word here, set output */ |
385 | 0 | if (processed_word) { |
386 | | /* Set outputs */ |
387 | 0 | *byte_offset_start = parser->cursor; |
388 | 0 | *byte_offset_end = parser->cursor + word_length; |
389 | | |
390 | | /* Update cursor */ |
391 | 0 | parser->cursor += word_length; |
392 | |
|
393 | 0 | parser->word_length = strlen (processed_word); |
394 | 0 | parser->word = processed_word; |
395 | |
|
396 | 0 | return TRUE; |
397 | 0 | } |
398 | | |
399 | | /* No more words... */ |
400 | 0 | return FALSE; |
401 | 0 | } |
402 | | |
403 | | TrackerParser * |
404 | | tracker_parser_new (void) |
405 | 0 | { |
406 | 0 | TrackerParser *parser; |
407 | |
|
408 | 0 | parser = g_new0 (TrackerParser, 1); |
409 | 0 | parser->language = tracker_language_new (NULL); |
410 | |
|
411 | 0 | return parser; |
412 | 0 | } |
413 | | |
414 | | void |
415 | | tracker_parser_free (TrackerParser *parser) |
416 | 0 | { |
417 | 0 | g_return_if_fail (parser != NULL); |
418 | | |
419 | 0 | if (parser->language) { |
420 | 0 | g_object_unref (parser->language); |
421 | 0 | } |
422 | |
|
423 | 0 | g_free (parser->word_break_flags); |
424 | |
|
425 | 0 | g_free (parser->word); |
426 | |
|
427 | 0 | g_free (parser); |
428 | 0 | } |
429 | | |
430 | | void |
431 | | tracker_parser_reset (TrackerParser *parser, |
432 | | const gchar *txt, |
433 | | gint txt_size, |
434 | | guint max_word_length, |
435 | | gboolean enable_stemmer, |
436 | | gboolean enable_unaccent, |
437 | | gboolean ignore_numbers) |
438 | 0 | { |
439 | 0 | g_return_if_fail (parser != NULL); |
440 | 0 | g_return_if_fail (txt != NULL); |
441 | | |
442 | 0 | parser->max_word_length = max_word_length; |
443 | 0 | parser->enable_stemmer = enable_stemmer; |
444 | 0 | parser->enable_unaccent = enable_unaccent; |
445 | 0 | parser->ignore_numbers = ignore_numbers; |
446 | | |
447 | | /* Note: We're forcing some unicode characters to behave |
448 | | * as wordbreakers: e.g, the '.' The main reason for this |
449 | | * is to enable FTS searches matching file extension. */ |
450 | 0 | parser->enable_forced_wordbreaks = TRUE; |
451 | |
|
452 | 0 | parser->txt_size = txt_size; |
453 | 0 | parser->txt = txt; |
454 | |
|
455 | 0 | g_free (parser->word); |
456 | 0 | parser->word = NULL; |
457 | |
|
458 | 0 | parser->word_position = 0; |
459 | |
|
460 | 0 | parser->cursor = 0; |
461 | |
|
462 | 0 | g_free (parser->word_break_flags); |
463 | | |
464 | | /* Create array of flags, same size as original text. */ |
465 | 0 | parser->word_break_flags = g_malloc (txt_size); |
466 | | |
467 | | /* Get wordbreak flags in the whole string */ |
468 | 0 | u8_wordbreaks ((const uint8_t *)txt, |
469 | 0 | (size_t) txt_size, |
470 | 0 | (char *)parser->word_break_flags); |
471 | | |
472 | | /* Prepare a custom category which is a combination of the |
473 | | * desired ones */ |
474 | 0 | parser->allowed_start = UC_LETTER; |
475 | 0 | if (!parser->ignore_numbers) { |
476 | 0 | parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER); |
477 | 0 | } |
478 | 0 | } |
479 | | |
480 | | const gchar * |
481 | | tracker_parser_next (TrackerParser *parser, |
482 | | gint *position, |
483 | | gint *byte_offset_start, |
484 | | gint *byte_offset_end, |
485 | | gint *word_length) |
486 | 0 | { |
487 | 0 | const gchar *str; |
488 | 0 | gint byte_start = 0, byte_end = 0; |
489 | |
|
490 | 0 | str = NULL; |
491 | |
|
492 | 0 | g_free (parser->word); |
493 | 0 | parser->word = NULL; |
494 | |
|
495 | 0 | if (parser_next (parser, &byte_start, &byte_end)) { |
496 | 0 | str = parser->word; |
497 | 0 | } |
498 | |
|
499 | 0 | parser->word_position++; |
500 | |
|
501 | 0 | *word_length = parser->word_length; |
502 | 0 | *position = parser->word_position; |
503 | 0 | *byte_offset_start = byte_start; |
504 | 0 | *byte_offset_end = byte_end; |
505 | |
|
506 | 0 | return str; |
507 | 0 | } |
508 | | |
509 | | gpointer |
510 | | tracker_collation_init (void) |
511 | 6 | { |
512 | | /* Nothing to do */ |
513 | 6 | return NULL; |
514 | 6 | } |
515 | | |
516 | | void |
517 | | tracker_collation_shutdown (gpointer collator) |
518 | 4 | { |
519 | | /* Nothing to do */ |
520 | 4 | } |
521 | | |
522 | | gint |
523 | | tracker_collation_utf8 (gpointer collator, |
524 | | gint len1, |
525 | | gconstpointer str1, |
526 | | gint len2, |
527 | | gconstpointer str2) |
528 | 0 | { |
529 | 0 | gint result; |
530 | 0 | guchar *aux1; |
531 | 0 | guchar *aux2; |
532 | | |
533 | | /* Note: str1 and str2 are NOT NUL-terminated */ |
534 | 0 | aux1 = (len1 < MAX_STACK_STR_SIZE) ? g_alloca (len1+1) : g_malloc (len1+1); |
535 | 0 | aux2 = (len2 < MAX_STACK_STR_SIZE) ? g_alloca (len2+1) : g_malloc (len2+1); |
536 | |
|
537 | 0 | memcpy (aux1, str1, len1); aux1[len1] = '\0'; |
538 | 0 | memcpy (aux2, str2, len2); aux2[len2] = '\0'; |
539 | |
|
540 | 0 | result = u8_strcoll (aux1, aux2); |
541 | |
|
542 | 0 | if (len1 >= MAX_STACK_STR_SIZE) |
543 | 0 | g_free (aux1); |
544 | 0 | if (len2 >= MAX_STACK_STR_SIZE) |
545 | 0 | g_free (aux2); |
546 | 0 | return result; |
547 | 0 | } |
548 | | |
549 | | gunichar2 * |
550 | | tracker_parser_tolower (const gunichar2 *input, |
551 | | gsize len, |
552 | | gsize *len_out) |
553 | 0 | { |
554 | 0 | return u16_tolower (input, len / 2, NULL, NULL, NULL, len_out); |
555 | 0 | } |
556 | | |
557 | | gunichar2 * |
558 | | tracker_parser_toupper (const gunichar2 *input, |
559 | | gsize len, |
560 | | gsize *len_out) |
561 | 0 | { |
562 | 0 | return u16_toupper (input, len / 2, NULL, NULL, NULL, len_out); |
563 | 0 | } |
564 | | |
565 | | gunichar2 * |
566 | | tracker_parser_casefold (const gunichar2 *input, |
567 | | gsize len, |
568 | | gsize *len_out) |
569 | 0 | { |
570 | 0 | return u16_casefold (input, len / 2, NULL, NULL, NULL, len_out); |
571 | 0 | } |
572 | | |
573 | | gunichar2 * |
574 | | tracker_parser_normalize (const gunichar2 *input, |
575 | | GNormalizeMode mode, |
576 | | gsize len, |
577 | | gsize *len_out) |
578 | 0 | { |
579 | 0 | uninorm_t nf; |
580 | |
|
581 | 0 | if (mode == G_NORMALIZE_NFC) |
582 | 0 | nf = UNINORM_NFC; |
583 | 0 | else if (mode == G_NORMALIZE_NFD) |
584 | 0 | nf = UNINORM_NFD; |
585 | 0 | else if (mode == G_NORMALIZE_NFKC) |
586 | 0 | nf = UNINORM_NFKC; |
587 | 0 | else if (mode == G_NORMALIZE_NFKD) |
588 | 0 | nf = UNINORM_NFKD; |
589 | 0 | else |
590 | 0 | g_assert_not_reached (); |
591 | | |
592 | 0 | return u16_normalize (nf, input, len / 2, NULL, len_out); |
593 | 0 | } |
594 | | |
595 | | gunichar2 * |
596 | | tracker_parser_unaccent (const gunichar2 *input, |
597 | | gsize len, |
598 | | gsize *len_out) |
599 | 0 | { |
600 | 0 | gunichar2 *zOutput; |
601 | 0 | gsize written = 0; |
602 | |
|
603 | 0 | zOutput = u16_normalize (UNINORM_NFKD, input, len, NULL, &written); |
604 | | |
605 | | /* Unaccenting is done in place */ |
606 | 0 | tracker_parser_unaccent_nfkd_string (zOutput, &written); |
607 | |
|
608 | 0 | *len_out = written; |
609 | |
|
610 | 0 | return zOutput; |
611 | 0 | } |