/src/leptonica/src/sarray2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*====================================================================* |
2 | | - Copyright (C) 2001 Leptonica. All rights reserved. |
3 | | - |
4 | | - Redistribution and use in source and binary forms, with or without |
5 | | - modification, are permitted provided that the following conditions |
6 | | - are met: |
7 | | - 1. Redistributions of source code must retain the above copyright |
8 | | - notice, this list of conditions and the following disclaimer. |
9 | | - 2. Redistributions in binary form must reproduce the above |
10 | | - copyright notice, this list of conditions and the following |
11 | | - disclaimer in the documentation and/or other materials |
12 | | - provided with the distribution. |
13 | | - |
14 | | - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
15 | | - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
16 | | - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
17 | | - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY |
18 | | - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | | - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | | - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | | - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | | - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
23 | | - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
24 | | - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | | *====================================================================*/ |
26 | | |
27 | | /*! |
28 | | * \file sarray2.c |
29 | | * <pre> |
30 | | * |
31 | | * Sort |
32 | | * SARRAY *sarraySort() |
33 | | * SARRAY *sarraySortByIndex() |
34 | | * l_int32 stringCompareLexical() |
35 | | * |
36 | | * Set operations using aset (rbtree) |
37 | | * L_ASET *l_asetCreateFromSarray() |
38 | | * l_int32 sarrayRemoveDupsByAset() |
39 | | * l_int32 sarrayUnionByAset() |
40 | | * l_int32 sarrayIntersectionByAset() |
41 | | * |
42 | | * Hashmap operations |
43 | | * L_HASHMAP *l_hmapCreateFromSarray() |
44 | | * l_int32 sarrayRemoveDupsByHmap() |
45 | | * l_int32 sarrayUnionByHmap() |
46 | | * l_int32 sarrayIntersectionByHmap() |
47 | | * |
48 | | * Miscellaneous operations |
49 | | * SARRAY *sarrayGenerateIntegers() |
50 | | * l_int32 sarrayLookupCSKV() |
51 | | * |
52 | | * |
53 | | * We have two implementations of set operations on an array of strings: |
54 | | * |
55 | | * (1) Using an underlying tree (rbtree). |
56 | | * This uses a good 64 bit hashing function for the key, |
57 | | * that is not expected to have hash collisions (and we do |
58 | | * not test for them). The tree is built up of the hash |
59 | | * values, and if the hash is found in the tree, it is |
60 | | * assumed that the string has already been found. |
61 | | * |
62 | | * (2) Building a hashmap from the keys (hashmap). |
63 | | * This uses a fast 64 bit hashing function for the key, which |
64 | | * is then hashed into a hashtable. Collisions of hashkeys are |
65 | | * very rare, but the hashtable is designed to allow more than one |
66 | | * hashitem in a table entry. The hashitems are put in a list at |
67 | | * each hashtable entry, which is traversed looking for the key. |
68 | | * </pre> |
69 | | */ |
70 | | |
71 | | #ifdef HAVE_CONFIG_H |
72 | | #include <config_auto.h> |
73 | | #endif /* HAVE_CONFIG_H */ |
74 | | |
75 | | #include <string.h> |
76 | | #include "allheaders.h" |
77 | | #include "array_internal.h" |
78 | | |
79 | | /*----------------------------------------------------------------------* |
80 | | * Sort * |
81 | | *----------------------------------------------------------------------*/ |
82 | | /*! |
83 | | * \brief sarraySort() |
84 | | * |
85 | | * \param[in] saout output sarray; can be NULL or equal to sain |
86 | | * \param[in] sain input sarray |
87 | | * \param[in] sortorder L_SORT_INCREASING or L_SORT_DECREASING |
88 | | * \return saout output sarray, sorted by ascii value, or NULL on error |
89 | | * |
90 | | * <pre> |
91 | | * Notes: |
92 | | * (1) Set saout = sain for in-place; otherwise, set naout = NULL. |
93 | | * (2) Shell sort, modified from K&R, 2nd edition, p.62. |
94 | | * Slow but simple O(n logn) sort. |
95 | | * </pre> |
96 | | */ |
97 | | SARRAY * |
98 | | sarraySort(SARRAY *saout, |
99 | | SARRAY *sain, |
100 | | l_int32 sortorder) |
101 | 0 | { |
102 | 0 | char **array; |
103 | 0 | char *tmp; |
104 | 0 | l_int32 n, i, j, gap; |
105 | |
|
106 | 0 | if (!sain) |
107 | 0 | return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL); |
108 | | |
109 | | /* Make saout if necessary; otherwise do in-place */ |
110 | 0 | if (!saout) |
111 | 0 | saout = sarrayCopy(sain); |
112 | 0 | else if (sain != saout) |
113 | 0 | return (SARRAY *)ERROR_PTR("invalid: not in-place", __func__, NULL); |
114 | 0 | array = saout->array; /* operate directly on the array */ |
115 | 0 | n = sarrayGetCount(saout); |
116 | | |
117 | | /* Shell sort */ |
118 | 0 | for (gap = n/2; gap > 0; gap = gap / 2) { |
119 | 0 | for (i = gap; i < n; i++) { |
120 | 0 | for (j = i - gap; j >= 0; j -= gap) { |
121 | 0 | if ((sortorder == L_SORT_INCREASING && |
122 | 0 | stringCompareLexical(array[j], array[j + gap])) || |
123 | 0 | (sortorder == L_SORT_DECREASING && |
124 | 0 | stringCompareLexical(array[j + gap], array[j]))) |
125 | 0 | { |
126 | 0 | tmp = array[j]; |
127 | 0 | array[j] = array[j + gap]; |
128 | 0 | array[j + gap] = tmp; |
129 | 0 | } |
130 | 0 | } |
131 | 0 | } |
132 | 0 | } |
133 | |
|
134 | 0 | return saout; |
135 | 0 | } |
136 | | |
137 | | |
138 | | /*! |
139 | | * \brief sarraySortByIndex() |
140 | | * |
141 | | * \param[in] sain |
142 | | * \param[in] naindex na that maps from the new sarray to the input sarray |
143 | | * \return saout sorted, or NULL on error |
144 | | */ |
145 | | SARRAY * |
146 | | sarraySortByIndex(SARRAY *sain, |
147 | | NUMA *naindex) |
148 | 0 | { |
149 | 0 | char *str; |
150 | 0 | l_int32 i, n, index; |
151 | 0 | SARRAY *saout; |
152 | |
|
153 | 0 | if (!sain) |
154 | 0 | return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL); |
155 | 0 | if (!naindex) |
156 | 0 | return (SARRAY *)ERROR_PTR("naindex not defined", __func__, NULL); |
157 | | |
158 | 0 | n = sarrayGetCount(sain); |
159 | 0 | saout = sarrayCreate(n); |
160 | 0 | for (i = 0; i < n; i++) { |
161 | 0 | numaGetIValue(naindex, i, &index); |
162 | 0 | str = sarrayGetString(sain, index, L_COPY); |
163 | 0 | sarrayAddString(saout, str, L_INSERT); |
164 | 0 | } |
165 | |
|
166 | 0 | return saout; |
167 | 0 | } |
168 | | |
169 | | |
170 | | /*! |
171 | | * \brief stringCompareLexical() |
172 | | * |
173 | | * \param[in] str1 |
174 | | * \param[in] str2 |
175 | | * \return 1 if str1 > str2 lexically; 0 otherwise |
176 | | * |
177 | | * <pre> |
178 | | * Notes: |
179 | | * (1) If the lexical values are identical, return a 0, to |
180 | | * indicate that no swapping is required to sort the strings. |
181 | | * </pre> |
182 | | */ |
183 | | l_int32 |
184 | | stringCompareLexical(const char *str1, |
185 | | const char *str2) |
186 | 0 | { |
187 | 0 | l_int32 i, len1, len2, len; |
188 | |
|
189 | 0 | if (!str1) |
190 | 0 | return ERROR_INT("str1 not defined", __func__, 1); |
191 | 0 | if (!str2) |
192 | 0 | return ERROR_INT("str2 not defined", __func__, 1); |
193 | | |
194 | 0 | len1 = strlen(str1); |
195 | 0 | len2 = strlen(str2); |
196 | 0 | len = L_MIN(len1, len2); |
197 | |
|
198 | 0 | for (i = 0; i < len; i++) { |
199 | 0 | if (str1[i] == str2[i]) |
200 | 0 | continue; |
201 | 0 | if (str1[i] > str2[i]) |
202 | 0 | return 1; |
203 | 0 | else |
204 | 0 | return 0; |
205 | 0 | } |
206 | | |
207 | 0 | if (len1 > len2) |
208 | 0 | return 1; |
209 | 0 | else |
210 | 0 | return 0; |
211 | 0 | } |
212 | | |
213 | | |
214 | | /*----------------------------------------------------------------------* |
215 | | * Set operations using aset (rbtree) * |
216 | | *----------------------------------------------------------------------*/ |
217 | | /*! |
218 | | * \brief l_asetCreateFromSarray() |
219 | | * |
220 | | * \param[in] sa |
221 | | * \return set using a string hash into a uint64 as the key |
222 | | */ |
223 | | L_ASET * |
224 | | l_asetCreateFromSarray(SARRAY *sa) |
225 | 0 | { |
226 | 0 | char *str; |
227 | 0 | l_int32 i, n; |
228 | 0 | l_uint64 hash; |
229 | 0 | L_ASET *set; |
230 | 0 | RB_TYPE key; |
231 | |
|
232 | 0 | if (!sa) |
233 | 0 | return (L_ASET *)ERROR_PTR("sa not defined", __func__, NULL); |
234 | | |
235 | 0 | set = l_asetCreate(L_UINT_TYPE); |
236 | 0 | n = sarrayGetCount(sa); |
237 | 0 | for (i = 0; i < n; i++) { |
238 | 0 | str = sarrayGetString(sa, i, L_NOCOPY); |
239 | 0 | l_hashStringToUint64Fast(str, &hash); |
240 | 0 | key.utype = hash; |
241 | 0 | l_asetInsert(set, key); |
242 | 0 | } |
243 | |
|
244 | 0 | return set; |
245 | 0 | } |
246 | | |
247 | | |
248 | | /*! |
249 | | * \brief sarrayRemoveDupsByAset() |
250 | | * |
251 | | * \param[in] sas |
252 | | * \param[out] psad with duplicates removed |
253 | | * \return 0 if OK; 1 on error |
254 | | * |
255 | | * <pre> |
256 | | * Notes: |
257 | | * (1) This is O(nlogn), considerably slower than |
258 | | * sarrayRemoveDupsByHmap() for large string arrays. |
259 | | * (2) The key for each string is a 64-bit hash. |
260 | | * (3) Build a set, using hashed strings as keys. As the set is |
261 | | * built, first do a find; if not found, add the key to the |
262 | | * set and add the string to the output sarray. |
263 | | * </pre> |
264 | | */ |
265 | | l_ok |
266 | | sarrayRemoveDupsByAset(SARRAY *sas, |
267 | | SARRAY **psad) |
268 | 0 | { |
269 | 0 | char *str; |
270 | 0 | l_int32 i, n; |
271 | 0 | l_uint64 hash; |
272 | 0 | L_ASET *set; |
273 | 0 | RB_TYPE key; |
274 | 0 | SARRAY *sad; |
275 | |
|
276 | 0 | if (!psad) |
277 | 0 | return ERROR_INT("&sad not defined", __func__, 1); |
278 | 0 | *psad = NULL; |
279 | 0 | if (!sas) |
280 | 0 | return ERROR_INT("sas not defined", __func__, 1); |
281 | | |
282 | 0 | set = l_asetCreate(L_UINT_TYPE); |
283 | 0 | sad = sarrayCreate(0); |
284 | 0 | *psad = sad; |
285 | 0 | n = sarrayGetCount(sas); |
286 | 0 | for (i = 0; i < n; i++) { |
287 | 0 | str = sarrayGetString(sas, i, L_NOCOPY); |
288 | 0 | l_hashStringToUint64Fast(str, &hash); |
289 | 0 | key.utype = hash; |
290 | 0 | if (!l_asetFind(set, key)) { |
291 | 0 | sarrayAddString(sad, str, L_COPY); |
292 | 0 | l_asetInsert(set, key); |
293 | 0 | } |
294 | 0 | } |
295 | |
|
296 | 0 | l_asetDestroy(&set); |
297 | 0 | return 0; |
298 | 0 | } |
299 | | |
300 | | |
301 | | /*! |
302 | | * \brief sarrayUnionByAset() |
303 | | * |
304 | | * \param[in] sa1 |
305 | | * \param[in] sa2 |
306 | | * \param[out] psad union of the two string arrays |
307 | | * \return 0 if OK; 1 on error |
308 | | * |
309 | | * <pre> |
310 | | * Notes: |
311 | | * (1) Duplicates are removed from the concatenation of the two arrays. |
312 | | * (2) The key for each string is a 64-bit hash. |
313 | | * (2) Algorithm: Concatenate the two sarrays. Then build a set, |
314 | | * using hashed strings as keys. As the set is built, first do |
315 | | * a find; if not found, add the key to the set and add the string |
316 | | * to the output sarray. This is O(nlogn). |
317 | | * </pre> |
318 | | */ |
319 | | l_ok |
320 | | sarrayUnionByAset(SARRAY *sa1, |
321 | | SARRAY *sa2, |
322 | | SARRAY **psad) |
323 | 0 | { |
324 | 0 | SARRAY *sa3; |
325 | |
|
326 | 0 | if (!psad) |
327 | 0 | return ERROR_INT("&sad not defined", __func__, 1); |
328 | 0 | *psad = NULL; |
329 | 0 | if (!sa1) |
330 | 0 | return ERROR_INT("sa1 not defined", __func__, 1); |
331 | 0 | if (!sa2) |
332 | 0 | return ERROR_INT("sa2 not defined", __func__, 1); |
333 | | |
334 | | /* Join */ |
335 | 0 | sa3 = sarrayCopy(sa1); |
336 | 0 | if (sarrayJoin(sa3, sa2) == 1) { |
337 | 0 | sarrayDestroy(&sa3); |
338 | 0 | return ERROR_INT("join failed for sa3", __func__, 1); |
339 | 0 | } |
340 | | |
341 | | /* Eliminate duplicates */ |
342 | 0 | sarrayRemoveDupsByAset(sa3, psad); |
343 | 0 | sarrayDestroy(&sa3); |
344 | 0 | return 0; |
345 | 0 | } |
346 | | |
347 | | |
348 | | /*! |
349 | | * \brief sarrayIntersectionByAset() |
350 | | * |
351 | | * \param[in] sa1 |
352 | | * \param[in] sa2 |
353 | | * \param[out] psad intersection of the two string arrays |
354 | | * \return 0 if OK; 1 on error |
355 | | * |
356 | | * <pre> |
357 | | * Notes: |
358 | | * (1) Algorithm: put the larger sarray into a set, using the string |
359 | | * hashes as the key values. Then run through the smaller sarray, |
360 | | * building an output sarray and a second set from the strings |
361 | | * in the larger array: if a string is in the first set but |
362 | | * not in the second, add the string to the output sarray and hash |
363 | | * it into the second set. The second set is required to make |
364 | | * sure only one instance of each string is put into the output sarray. |
365 | | * This is O(mlogn), {m,n} = sizes of {smaller,larger} input arrays. |
366 | | * </pre> |
367 | | */ |
368 | | l_ok |
369 | | sarrayIntersectionByAset(SARRAY *sa1, |
370 | | SARRAY *sa2, |
371 | | SARRAY **psad) |
372 | 0 | { |
373 | 0 | char *str; |
374 | 0 | l_int32 n1, n2, i, n; |
375 | 0 | l_uint64 hash; |
376 | 0 | L_ASET *set1, *set2; |
377 | 0 | RB_TYPE key; |
378 | 0 | SARRAY *sa_small, *sa_big, *sad; |
379 | |
|
380 | 0 | if (!psad) |
381 | 0 | return ERROR_INT("&sad not defined", __func__, 1); |
382 | 0 | *psad = NULL; |
383 | 0 | if (!sa1) |
384 | 0 | return ERROR_INT("sa1 not defined", __func__, 1); |
385 | 0 | if (!sa2) |
386 | 0 | return ERROR_INT("sa2 not defined", __func__, 1); |
387 | | |
388 | | /* Put the elements of the biggest array into a set */ |
389 | 0 | n1 = sarrayGetCount(sa1); |
390 | 0 | n2 = sarrayGetCount(sa2); |
391 | 0 | sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */ |
392 | 0 | sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */ |
393 | 0 | set1 = l_asetCreateFromSarray(sa_big); |
394 | | |
395 | | /* Build up the intersection of strings */ |
396 | 0 | sad = sarrayCreate(0); |
397 | 0 | *psad = sad; |
398 | 0 | n = sarrayGetCount(sa_small); |
399 | 0 | set2 = l_asetCreate(L_UINT_TYPE); |
400 | 0 | for (i = 0; i < n; i++) { |
401 | 0 | str = sarrayGetString(sa_small, i, L_NOCOPY); |
402 | 0 | l_hashStringToUint64Fast(str, &hash); |
403 | 0 | key.utype = hash; |
404 | 0 | if (l_asetFind(set1, key) && !l_asetFind(set2, key)) { |
405 | 0 | sarrayAddString(sad, str, L_COPY); |
406 | 0 | l_asetInsert(set2, key); |
407 | 0 | } |
408 | 0 | } |
409 | |
|
410 | 0 | l_asetDestroy(&set1); |
411 | 0 | l_asetDestroy(&set2); |
412 | 0 | return 0; |
413 | 0 | } |
414 | | |
415 | | |
416 | | /*----------------------------------------------------------------------* |
417 | | * Hashmap operations * |
418 | | *----------------------------------------------------------------------*/ |
419 | | /*! |
420 | | * \brief l_hmapCreateFromSarray() |
421 | | * |
422 | | * \param[in] sa input sarray |
423 | | * \return hmap hashmap, or NULL on error |
424 | | */ |
425 | | L_HASHMAP * |
426 | | l_hmapCreateFromSarray(SARRAY *sa) |
427 | 0 | { |
428 | 0 | l_int32 i, n; |
429 | 0 | l_uint64 key; |
430 | 0 | char *str; |
431 | 0 | L_HASHMAP *hmap; |
432 | |
|
433 | 0 | if (!sa) |
434 | 0 | return (L_HASHMAP *)ERROR_PTR("sa not defined", __func__, NULL); |
435 | | |
436 | 0 | n = sarrayGetCount(sa); |
437 | 0 | if ((hmap = l_hmapCreate(0.51 * n, 2)) == NULL) |
438 | 0 | return (L_HASHMAP *)ERROR_PTR("hmap not made", __func__, NULL); |
439 | 0 | for (i = 0; i < n; i++) { |
440 | 0 | str = sarrayGetString(sa, i, L_NOCOPY); |
441 | 0 | l_hashStringToUint64Fast(str, &key); |
442 | 0 | l_hmapLookup(hmap, key, i, L_HMAP_CREATE); |
443 | 0 | } |
444 | 0 | return hmap; |
445 | 0 | } |
446 | | |
447 | | |
448 | | /*! |
449 | | * \brief sarrayRemoveDupsByHmap() |
450 | | * |
451 | | * \param[in] sas |
452 | | * \param[out] psad hash set of unique values |
453 | | * \param[out] phmap [optional] hashmap used for lookup |
454 | | * \return 0 if OK; 1 on error |
455 | | */ |
456 | | l_ok |
457 | | sarrayRemoveDupsByHmap(SARRAY *sas, |
458 | | SARRAY **psad, |
459 | | L_HASHMAP **phmap) |
460 | 0 | { |
461 | 0 | l_int32 i, tabsize; |
462 | 0 | char *str; |
463 | 0 | SARRAY *sad; |
464 | 0 | L_HASHITEM *hitem; |
465 | 0 | L_HASHMAP *hmap; |
466 | |
|
467 | 0 | if (phmap) *phmap = NULL; |
468 | 0 | if (!psad) |
469 | 0 | return ERROR_INT("&sad not defined", __func__, 1); |
470 | 0 | *psad = NULL; |
471 | 0 | if (!sas) |
472 | 0 | return ERROR_INT("sas not defined", __func__, 1); |
473 | | |
474 | | /* Traverse the hashtable lists */ |
475 | 0 | if ((hmap = l_hmapCreateFromSarray(sas)) == NULL) |
476 | 0 | return ERROR_INT("hmap not made", __func__, 1); |
477 | 0 | sad = sarrayCreate(0); |
478 | 0 | *psad = sad; |
479 | 0 | tabsize = hmap->tabsize; |
480 | 0 | for (i = 0; i < tabsize; i++) { |
481 | 0 | hitem = hmap->hashtab[i]; |
482 | 0 | while (hitem) { |
483 | 0 | str = sarrayGetString(sas, hitem->val, L_COPY); |
484 | 0 | sarrayAddString(sad, str, L_INSERT); |
485 | 0 | hitem = hitem->next; |
486 | 0 | } |
487 | 0 | } |
488 | |
|
489 | 0 | if (phmap) |
490 | 0 | *phmap = hmap; |
491 | 0 | else |
492 | 0 | l_hmapDestroy(&hmap); |
493 | 0 | return 0; |
494 | 0 | } |
495 | | |
496 | | |
497 | | /*! |
498 | | * \brief sarrayUnionByHmap() |
499 | | * |
500 | | * \param[in] sa1 |
501 | | * \param[in] sa2 |
502 | | * \param[out] *psad union of the array values |
503 | | * \return 0 if OK; 1 on error |
504 | | */ |
505 | | l_ok |
506 | | sarrayUnionByHmap(SARRAY *sa1, |
507 | | SARRAY *sa2, |
508 | | SARRAY **psad) |
509 | 0 | { |
510 | 0 | SARRAY *sa3; |
511 | |
|
512 | 0 | if (!psad) |
513 | 0 | return ERROR_INT("&sad not defined", __func__, 1); |
514 | 0 | *psad = NULL; |
515 | 0 | if (!sa1) |
516 | 0 | return ERROR_INT("sa1 not defined", __func__, 1); |
517 | 0 | if (!sa2) |
518 | 0 | return ERROR_INT("sa2 not defined", __func__, 1); |
519 | | |
520 | 0 | sa3 = sarrayCopy(sa1); |
521 | 0 | if (sarrayJoin(sa3, sa2) == 1) { |
522 | 0 | sarrayDestroy(&sa3); |
523 | 0 | return ERROR_INT("sa3 join failed", __func__, 1); |
524 | 0 | } |
525 | 0 | sarrayRemoveDupsByHmap(sa3, psad, NULL); |
526 | 0 | sarrayDestroy(&sa3); |
527 | 0 | return 0; |
528 | 0 | } |
529 | | |
530 | | |
531 | | /*! |
532 | | * \brief sarrayIntersectionByHmap() |
533 | | * |
534 | | * \param[in] sa1 |
535 | | * \param[in] sa2 |
536 | | * \param[out] *psad intersection of the array values |
537 | | * \return 0 if OK; 1 on error |
538 | | */ |
539 | | l_ok |
540 | | sarrayIntersectionByHmap(SARRAY *sa1, |
541 | | SARRAY *sa2, |
542 | | SARRAY **psad) |
543 | 0 | { |
544 | 0 | l_int32 i, n1, n2, n; |
545 | 0 | l_uint64 key; |
546 | 0 | char *str; |
547 | 0 | SARRAY *sa_small, *sa_big, *sa3, *sad; |
548 | 0 | L_HASHITEM *hitem; |
549 | 0 | L_HASHMAP *hmap; |
550 | |
|
551 | 0 | if (!psad) |
552 | 0 | return ERROR_INT("&sad not defined", __func__, 1); |
553 | 0 | *psad = NULL; |
554 | 0 | if (!sa1) |
555 | 0 | return ERROR_INT("sa1 not defined", __func__, 1); |
556 | 0 | if (!sa2) |
557 | 0 | return ERROR_INT("sa2 not defined", __func__, 1); |
558 | | |
559 | | /* Make a hashmap for the elements of the biggest array */ |
560 | 0 | n1 = sarrayGetCount(sa1); |
561 | 0 | n2 = sarrayGetCount(sa2); |
562 | 0 | sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */ |
563 | 0 | sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */ |
564 | 0 | if ((hmap = l_hmapCreateFromSarray(sa_big)) == NULL) |
565 | 0 | return ERROR_INT("hmap not made", __func__, 1); |
566 | | |
567 | | /* Remove duplicates from the smallest array. Alternatively, |
568 | | * we can skip this step and avoid counting duplicates in |
569 | | * sa_small by modifying the count fields in the sa_big hashitems; |
570 | | * e.g., see l_hmapIntersectionDna(). */ |
571 | 0 | sarrayRemoveDupsByHmap(sa_small, &sa3, NULL); |
572 | | |
573 | | /* Go through sa3, the set of strings derived from the smallest array, |
574 | | * hashing into the big array table. Any string found belongs to both, |
575 | | * so add it to the output array. */ |
576 | 0 | sad = sarrayCreate(0); |
577 | 0 | *psad = sad; |
578 | 0 | n = sarrayGetCount(sa3); |
579 | 0 | for (i = 0; i < n; i++) { |
580 | 0 | str = sarrayGetString(sa3, i, L_NOCOPY); |
581 | 0 | l_hashStringToUint64Fast(str, &key); |
582 | 0 | hitem = l_hmapLookup(hmap, key, i, L_HMAP_CHECK); |
583 | 0 | if (hitem) |
584 | 0 | sarrayAddString(sad, str, L_COPY); |
585 | 0 | } |
586 | 0 | l_hmapDestroy(&hmap); |
587 | 0 | sarrayDestroy(&sa3); |
588 | 0 | return 0; |
589 | 0 | } |
590 | | |
591 | | |
592 | | /*----------------------------------------------------------------------* |
593 | | * Miscellaneous operations * |
594 | | *----------------------------------------------------------------------*/ |
595 | | /*! |
596 | | * \brief sarrayGenerateIntegers() |
597 | | * |
598 | | * \param[in] n |
599 | | * \return sa of printed numbers, 1 - n, or NULL on error |
600 | | */ |
601 | | SARRAY * |
602 | | sarrayGenerateIntegers(l_int32 n) |
603 | 0 | { |
604 | 0 | char buf[32]; |
605 | 0 | l_int32 i; |
606 | 0 | SARRAY *sa; |
607 | |
|
608 | 0 | if ((sa = sarrayCreate(n)) == NULL) |
609 | 0 | return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL); |
610 | 0 | for (i = 0; i < n; i++) { |
611 | 0 | snprintf(buf, sizeof(buf), "%d", i); |
612 | 0 | sarrayAddString(sa, buf, L_COPY); |
613 | 0 | } |
614 | 0 | return sa; |
615 | 0 | } |
616 | | |
617 | | |
618 | | /*! |
619 | | * \brief sarrayLookupCSKV() |
620 | | * |
621 | | * \param[in] sa of strings, each being a comma-separated pair |
622 | | * of strings, the first being a key and the |
623 | | * second a value |
624 | | * \param[in] keystring an input string to match with each key in %sa |
625 | | * \param[out] pvalstring the returned value string corresponding to the |
626 | | * input key string, if found; otherwise NULL |
627 | | * \return 0 if OK, 1 on error |
628 | | * |
629 | | * <pre> |
630 | | * Notes: |
631 | | * (1) The input %sa can have other strings that are not in |
632 | | * comma-separated key-value format. These will be ignored. |
633 | | * (2) This returns a copy of the first value string in %sa whose |
634 | | * key string matches the input %keystring. |
635 | | * (3) White space is not ignored; all white space before the ',' |
636 | | * is used for the keystring in matching. This allows the |
637 | | * key and val strings to have white space (e.g., multiple words). |
638 | | * </pre> |
639 | | */ |
640 | | l_ok |
641 | | sarrayLookupCSKV(SARRAY *sa, |
642 | | const char *keystring, |
643 | | char **pvalstring) |
644 | 0 | { |
645 | 0 | char *key, *val, *str; |
646 | 0 | l_int32 i, n; |
647 | 0 | SARRAY *sa1; |
648 | |
|
649 | 0 | if (!pvalstring) |
650 | 0 | return ERROR_INT("&valstring not defined", __func__, 1); |
651 | 0 | *pvalstring = NULL; |
652 | 0 | if (!sa) |
653 | 0 | return ERROR_INT("sa not defined", __func__, 1); |
654 | 0 | if (!keystring) |
655 | 0 | return ERROR_INT("keystring not defined", __func__, 1); |
656 | | |
657 | 0 | n = sarrayGetCount(sa); |
658 | 0 | for (i = 0; i < n; i++) { |
659 | 0 | str = sarrayGetString(sa, i, L_NOCOPY); |
660 | 0 | sa1 = sarrayCreate(2); |
661 | 0 | sarraySplitString(sa1, str, ","); |
662 | 0 | if (sarrayGetCount(sa1) != 2) { |
663 | 0 | sarrayDestroy(&sa1); |
664 | 0 | continue; |
665 | 0 | } |
666 | 0 | key = sarrayGetString(sa1, 0, L_NOCOPY); |
667 | 0 | val = sarrayGetString(sa1, 1, L_NOCOPY); |
668 | 0 | if (!strcmp(key, keystring)) { |
669 | 0 | *pvalstring = stringNew(val); |
670 | 0 | sarrayDestroy(&sa1); |
671 | 0 | return 0; |
672 | 0 | } |
673 | 0 | sarrayDestroy(&sa1); |
674 | 0 | } |
675 | | |
676 | 0 | return 0; |
677 | 0 | } |