/src/leptonica/src/sarray2.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*====================================================================*  | 
2  |  |  -  Copyright (C) 2001 Leptonica.  All rights reserved.  | 
3  |  |  -  | 
4  |  |  -  Redistribution and use in source and binary forms, with or without  | 
5  |  |  -  modification, are permitted provided that the following conditions  | 
6  |  |  -  are met:  | 
7  |  |  -  1. Redistributions of source code must retain the above copyright  | 
8  |  |  -     notice, this list of conditions and the following disclaimer.  | 
9  |  |  -  2. Redistributions in binary form must reproduce the above  | 
10  |  |  -     copyright notice, this list of conditions and the following  | 
11  |  |  -     disclaimer in the documentation and/or other materials  | 
12  |  |  -     provided with the distribution.  | 
13  |  |  -  | 
14  |  |  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS  | 
15  |  |  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT  | 
16  |  |  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR  | 
17  |  |  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY  | 
18  |  |  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,  | 
19  |  |  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,  | 
20  |  |  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR  | 
21  |  |  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY  | 
22  |  |  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING  | 
23  |  |  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS  | 
24  |  |  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
25  |  |  *====================================================================*/  | 
26  |  |  | 
27  |  | /*!  | 
28  |  |  * \file  sarray2.c  | 
29  |  |  * <pre>  | 
30  |  |  *  | 
31  |  |  *      Sort  | 
32  |  |  *          SARRAY     *sarraySort()  | 
33  |  |  *          SARRAY     *sarraySortByIndex()  | 
34  |  |  *          l_int32     stringCompareLexical()  | 
35  |  |  *  | 
36  |  |  *      Set operations using aset (rbtree)  | 
37  |  |  *          L_ASET     *l_asetCreateFromSarray()  | 
38  |  |  *          l_int32     sarrayRemoveDupsByAset()  | 
39  |  |  *          l_int32     sarrayUnionByAset()  | 
40  |  |  *          l_int32     sarrayIntersectionByAset()  | 
41  |  |  *  | 
42  |  |  *      Hashmap operations  | 
43  |  |  *          L_HASHMAP  *l_hmapCreateFromSarray()  | 
44  |  |  *          l_int32     sarrayRemoveDupsByHmap()  | 
45  |  |  *          l_int32     sarrayUnionByHmap()  | 
46  |  |  *          l_int32     sarrayIntersectionByHmap()  | 
47  |  |  *  | 
48  |  |  *      Miscellaneous operations  | 
49  |  |  *          SARRAY     *sarrayGenerateIntegers()  | 
50  |  |  *          l_int32     sarrayLookupCSKV()  | 
51  |  |  *  | 
52  |  |  *  | 
53  |  |  * We have two implementations of set operations on an array of strings:  | 
54  |  |  *  | 
55  |  |  *   (1) Using an underlying tree (rbtree).  | 
56  |  |  *       This uses a good 64 bit hashing function for the key,  | 
57  |  |  *       that is not expected to have hash collisions (and we do  | 
58  |  |  *       not test for them).  The tree is built up of the hash  | 
59  |  |  *       values, and if the hash is found in the tree, it is  | 
60  |  |  *       assumed that the string has already been found.  | 
61  |  |  *  | 
62  |  |  *   (2) Building a hashmap from the keys (hashmap).  | 
63  |  |  *       This uses a fast 64 bit hashing function for the key, which  | 
64  |  |  *       is then hashed into a hashtable.  Collisions of hashkeys are  | 
65  |  |  *       very rare, but the hashtable is designed to allow more than one  | 
66  |  |  *       hashitem in a table entry.  The hashitems are put in a list at  | 
67  |  |  *       each hashtable entry, which is traversed looking for the key.  | 
68  |  |  * </pre>  | 
69  |  |  */  | 
70  |  |  | 
71  |  | #ifdef HAVE_CONFIG_H  | 
72  |  | #include <config_auto.h>  | 
73  |  | #endif  /* HAVE_CONFIG_H */  | 
74  |  |  | 
75  |  | #include <string.h>  | 
76  |  | #include "allheaders.h"  | 
77  |  | #include "array_internal.h"  | 
78  |  |  | 
79  |  | /*----------------------------------------------------------------------*  | 
80  |  |  *                                   Sort                               *  | 
81  |  |  *----------------------------------------------------------------------*/  | 
82  |  | /*!  | 
83  |  |  * \brief   sarraySort()  | 
84  |  |  *  | 
85  |  |  * \param[in]    saout       output sarray; can be NULL or equal to sain  | 
86  |  |  * \param[in]    sain        input sarray  | 
87  |  |  * \param[in]    sortorder   L_SORT_INCREASING or L_SORT_DECREASING  | 
88  |  |  * \return  saout output sarray, sorted by ascii value, or NULL on error  | 
89  |  |  *  | 
90  |  |  * <pre>  | 
91  |  |  * Notes:  | 
92  |  |  *      (1) Set saout = sain for in-place; otherwise, set naout = NULL.  | 
93  |  |  *      (2) Shell sort, modified from K&R, 2nd edition, p.62.  | 
94  |  |  *          Slow but simple O(n logn) sort.  | 
95  |  |  * </pre>  | 
96  |  |  */  | 
97  |  | SARRAY *  | 
98  |  | sarraySort(SARRAY  *saout,  | 
99  |  |            SARRAY  *sain,  | 
100  |  |            l_int32  sortorder)  | 
101  | 0  | { | 
102  | 0  | char   **array;  | 
103  | 0  | char    *tmp;  | 
104  | 0  | l_int32  n, i, j, gap;  | 
105  |  | 
  | 
106  | 0  |     if (!sain)  | 
107  | 0  |         return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL); | 
108  |  |  | 
109  |  |         /* Make saout if necessary; otherwise do in-place */  | 
110  | 0  |     if (!saout)  | 
111  | 0  |         saout = sarrayCopy(sain);  | 
112  | 0  |     else if (sain != saout)  | 
113  | 0  |         return (SARRAY *)ERROR_PTR("invalid: not in-place", __func__, NULL); | 
114  | 0  |     array = saout->array;  /* operate directly on the array */  | 
115  | 0  |     n = sarrayGetCount(saout);  | 
116  |  |  | 
117  |  |         /* Shell sort */  | 
118  | 0  |     for (gap = n/2; gap > 0; gap = gap / 2) { | 
119  | 0  |         for (i = gap; i < n; i++) { | 
120  | 0  |             for (j = i - gap; j >= 0; j -= gap) { | 
121  | 0  |                 if ((sortorder == L_SORT_INCREASING &&  | 
122  | 0  |                      stringCompareLexical(array[j], array[j + gap])) ||  | 
123  | 0  |                     (sortorder == L_SORT_DECREASING &&  | 
124  | 0  |                      stringCompareLexical(array[j + gap], array[j])))  | 
125  | 0  |                 { | 
126  | 0  |                     tmp = array[j];  | 
127  | 0  |                     array[j] = array[j + gap];  | 
128  | 0  |                     array[j + gap] = tmp;  | 
129  | 0  |                 }  | 
130  | 0  |             }  | 
131  | 0  |         }  | 
132  | 0  |     }  | 
133  |  | 
  | 
134  | 0  |     return saout;  | 
135  | 0  | }  | 
136  |  |  | 
137  |  |  | 
138  |  | /*!  | 
139  |  |  * \brief   sarraySortByIndex()  | 
140  |  |  *  | 
141  |  |  * \param[in]    sain  | 
142  |  |  * \param[in]    naindex   na that maps from the new sarray to the input sarray  | 
143  |  |  * \return  saout sorted, or NULL on error  | 
144  |  |  */  | 
145  |  | SARRAY *  | 
146  |  | sarraySortByIndex(SARRAY  *sain,  | 
147  |  |                   NUMA    *naindex)  | 
148  | 0  | { | 
149  | 0  | char    *str;  | 
150  | 0  | l_int32  i, n, index;  | 
151  | 0  | SARRAY  *saout;  | 
152  |  | 
  | 
153  | 0  |     if (!sain)  | 
154  | 0  |         return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL); | 
155  | 0  |     if (!naindex)  | 
156  | 0  |         return (SARRAY *)ERROR_PTR("naindex not defined", __func__, NULL); | 
157  |  |  | 
158  | 0  |     n = sarrayGetCount(sain);  | 
159  | 0  |     saout = sarrayCreate(n);  | 
160  | 0  |     for (i = 0; i < n; i++) { | 
161  | 0  |         numaGetIValue(naindex, i, &index);  | 
162  | 0  |         str = sarrayGetString(sain, index, L_COPY);  | 
163  | 0  |         sarrayAddString(saout, str, L_INSERT);  | 
164  | 0  |     }  | 
165  |  | 
  | 
166  | 0  |     return saout;  | 
167  | 0  | }  | 
168  |  |  | 
169  |  |  | 
170  |  | /*!  | 
171  |  |  * \brief   stringCompareLexical()  | 
172  |  |  *  | 
173  |  |  * \param[in]    str1  | 
174  |  |  * \param[in]    str2  | 
175  |  |  * \return  1 if str1 > str2 lexically; 0 otherwise  | 
176  |  |  *  | 
177  |  |  * <pre>  | 
178  |  |  * Notes:  | 
179  |  |  *      (1) If the lexical values are identical, return a 0, to  | 
180  |  |  *          indicate that no swapping is required to sort the strings.  | 
181  |  |  * </pre>  | 
182  |  |  */  | 
183  |  | l_int32  | 
184  |  | stringCompareLexical(const char *str1,  | 
185  |  |                      const char *str2)  | 
186  | 0  | { | 
187  | 0  | l_int32  i, len1, len2, len;  | 
188  |  | 
  | 
189  | 0  |     if (!str1)  | 
190  | 0  |         return ERROR_INT("str1 not defined", __func__, 1); | 
191  | 0  |     if (!str2)  | 
192  | 0  |         return ERROR_INT("str2 not defined", __func__, 1); | 
193  |  |  | 
194  | 0  |     len1 = strlen(str1);  | 
195  | 0  |     len2 = strlen(str2);  | 
196  | 0  |     len = L_MIN(len1, len2);  | 
197  |  | 
  | 
198  | 0  |     for (i = 0; i < len; i++) { | 
199  | 0  |         if (str1[i] == str2[i])  | 
200  | 0  |             continue;  | 
201  | 0  |         if (str1[i] > str2[i])  | 
202  | 0  |             return 1;  | 
203  | 0  |         else  | 
204  | 0  |             return 0;  | 
205  | 0  |     }  | 
206  |  |  | 
207  | 0  |     if (len1 > len2)  | 
208  | 0  |         return 1;  | 
209  | 0  |     else  | 
210  | 0  |         return 0;  | 
211  | 0  | }  | 
212  |  |  | 
213  |  |  | 
214  |  | /*----------------------------------------------------------------------*  | 
215  |  |  *                   Set operations using aset (rbtree)                 *  | 
216  |  |  *----------------------------------------------------------------------*/  | 
217  |  | /*!  | 
218  |  |  * \brief   l_asetCreateFromSarray()  | 
219  |  |  *  | 
220  |  |  * \param[in]    sa  | 
221  |  |  * \return  set using a string hash into a uint64 as the key  | 
222  |  |  */  | 
223  |  | L_ASET *  | 
224  |  | l_asetCreateFromSarray(SARRAY  *sa)  | 
225  | 0  | { | 
226  | 0  | char     *str;  | 
227  | 0  | l_int32   i, n;  | 
228  | 0  | l_uint64  hash;  | 
229  | 0  | L_ASET   *set;  | 
230  | 0  | RB_TYPE   key;  | 
231  |  | 
  | 
232  | 0  |     if (!sa)  | 
233  | 0  |         return (L_ASET *)ERROR_PTR("sa not defined", __func__, NULL); | 
234  |  |  | 
235  | 0  |     set = l_asetCreate(L_UINT_TYPE);  | 
236  | 0  |     n = sarrayGetCount(sa);  | 
237  | 0  |     for (i = 0; i < n; i++) { | 
238  | 0  |         str = sarrayGetString(sa, i, L_NOCOPY);  | 
239  | 0  |         l_hashStringToUint64Fast(str, &hash);  | 
240  | 0  |         key.utype = hash;  | 
241  | 0  |         l_asetInsert(set, key);  | 
242  | 0  |     }  | 
243  |  | 
  | 
244  | 0  |     return set;  | 
245  | 0  | }  | 
246  |  |  | 
247  |  |  | 
248  |  | /*!  | 
249  |  |  * \brief   sarrayRemoveDupsByAset()  | 
250  |  |  *  | 
251  |  |  * \param[in]    sas  | 
252  |  |  * \param[out]   psad      with duplicates removed  | 
253  |  |  * \return  0 if OK; 1 on error  | 
254  |  |  *  | 
255  |  |  * <pre>  | 
256  |  |  * Notes:  | 
257  |  |  *      (1) This is O(nlogn), considerably slower than  | 
258  |  |  *          sarrayRemoveDupsByHmap() for large string arrays.  | 
259  |  |  *      (2) The key for each string is a 64-bit hash.  | 
260  |  |  *      (3) Build a set, using hashed strings as keys.  As the set is  | 
261  |  |  *          built, first do a find; if not found, add the key to the  | 
262  |  |  *          set and add the string to the output sarray.  | 
263  |  |  * </pre>  | 
264  |  |  */  | 
265  |  | l_ok  | 
266  |  | sarrayRemoveDupsByAset(SARRAY   *sas,  | 
267  |  |                        SARRAY  **psad)  | 
268  | 0  | { | 
269  | 0  | char     *str;  | 
270  | 0  | l_int32   i, n;  | 
271  | 0  | l_uint64  hash;  | 
272  | 0  | L_ASET   *set;  | 
273  | 0  | RB_TYPE   key;  | 
274  | 0  | SARRAY   *sad;  | 
275  |  | 
  | 
276  | 0  |     if (!psad)  | 
277  | 0  |         return ERROR_INT("&sad not defined", __func__, 1); | 
278  | 0  |     *psad = NULL;  | 
279  | 0  |     if (!sas)  | 
280  | 0  |         return ERROR_INT("sas not defined", __func__, 1); | 
281  |  |  | 
282  | 0  |     set = l_asetCreate(L_UINT_TYPE);  | 
283  | 0  |     sad = sarrayCreate(0);  | 
284  | 0  |     *psad = sad;  | 
285  | 0  |     n = sarrayGetCount(sas);  | 
286  | 0  |     for (i = 0; i < n; i++) { | 
287  | 0  |         str = sarrayGetString(sas, i, L_NOCOPY);  | 
288  | 0  |         l_hashStringToUint64Fast(str, &hash);  | 
289  | 0  |         key.utype = hash;  | 
290  | 0  |         if (!l_asetFind(set, key)) { | 
291  | 0  |             sarrayAddString(sad, str, L_COPY);  | 
292  | 0  |             l_asetInsert(set, key);  | 
293  | 0  |         }  | 
294  | 0  |     }  | 
295  |  | 
  | 
296  | 0  |     l_asetDestroy(&set);  | 
297  | 0  |     return 0;  | 
298  | 0  | }  | 
299  |  |  | 
300  |  |  | 
301  |  | /*!  | 
302  |  |  * \brief   sarrayUnionByAset()  | 
303  |  |  *  | 
304  |  |  * \param[in]    sa1  | 
305  |  |  * \param[in]    sa2  | 
306  |  |  * \param[out]   psad      union of the two string arrays  | 
307  |  |  * \return  0 if OK; 1 on error  | 
308  |  |  *  | 
309  |  |  * <pre>  | 
310  |  |  * Notes:  | 
311  |  |  *      (1) Duplicates are removed from the concatenation of the two arrays.  | 
312  |  |  *      (2) The key for each string is a 64-bit hash.  | 
313  |  |  *      (2) Algorithm: Concatenate the two sarrays.  Then build a set,  | 
314  |  |  *          using hashed strings as keys.  As the set is built, first do  | 
315  |  |  *          a find; if not found, add the key to the set and add the string  | 
316  |  |  *          to the output sarray.  This is O(nlogn).  | 
317  |  |  * </pre>  | 
318  |  |  */  | 
319  |  | l_ok  | 
320  |  | sarrayUnionByAset(SARRAY   *sa1,  | 
321  |  |                   SARRAY   *sa2,  | 
322  |  |                   SARRAY  **psad)  | 
323  | 0  | { | 
324  | 0  | SARRAY  *sa3;  | 
325  |  | 
  | 
326  | 0  |     if (!psad)  | 
327  | 0  |         return ERROR_INT("&sad not defined", __func__, 1); | 
328  | 0  |     *psad = NULL;  | 
329  | 0  |     if (!sa1)  | 
330  | 0  |         return ERROR_INT("sa1 not defined", __func__, 1); | 
331  | 0  |     if (!sa2)  | 
332  | 0  |         return ERROR_INT("sa2 not defined", __func__, 1); | 
333  |  |  | 
334  |  |         /* Join */  | 
335  | 0  |     sa3 = sarrayCopy(sa1);  | 
336  | 0  |     if (sarrayJoin(sa3, sa2) == 1) { | 
337  | 0  |         sarrayDestroy(&sa3);  | 
338  | 0  |         return ERROR_INT("join failed for sa3", __func__, 1); | 
339  | 0  |     }  | 
340  |  |  | 
341  |  |         /* Eliminate duplicates */  | 
342  | 0  |     sarrayRemoveDupsByAset(sa3, psad);  | 
343  | 0  |     sarrayDestroy(&sa3);  | 
344  | 0  |     return 0;  | 
345  | 0  | }  | 
346  |  |  | 
347  |  |  | 
348  |  | /*!  | 
349  |  |  * \brief   sarrayIntersectionByAset()  | 
350  |  |  *  | 
351  |  |  * \param[in]    sa1  | 
352  |  |  * \param[in]    sa2  | 
353  |  |  * \param[out]   psad      intersection of the two string arrays  | 
354  |  |  * \return  0 if OK; 1 on error  | 
355  |  |  *  | 
356  |  |  * <pre>  | 
357  |  |  * Notes:  | 
358  |  |  *      (1) Algorithm: put the larger sarray into a set, using the string  | 
359  |  |  *          hashes as the key values.  Then run through the smaller sarray,  | 
360  |  |  *          building an output sarray and a second set from the strings  | 
361  |  |  *          in the larger array: if a string is in the first set but  | 
362  |  |  *          not in the second, add the string to the output sarray and hash  | 
363  |  |  *          it into the second set.  The second set is required to make  | 
364  |  |  *          sure only one instance of each string is put into the output sarray.  | 
365  |  |  *          This is O(mlogn), {m,n} = sizes of {smaller,larger} input arrays. | 
366  |  |  * </pre>  | 
367  |  |  */  | 
368  |  | l_ok  | 
369  |  | sarrayIntersectionByAset(SARRAY   *sa1,  | 
370  |  |                          SARRAY   *sa2,  | 
371  |  |                          SARRAY  **psad)  | 
372  | 0  | { | 
373  | 0  | char     *str;  | 
374  | 0  | l_int32   n1, n2, i, n;  | 
375  | 0  | l_uint64  hash;  | 
376  | 0  | L_ASET   *set1, *set2;  | 
377  | 0  | RB_TYPE   key;  | 
378  | 0  | SARRAY   *sa_small, *sa_big, *sad;  | 
379  |  | 
  | 
380  | 0  |     if (!psad)  | 
381  | 0  |         return ERROR_INT("&sad not defined", __func__, 1); | 
382  | 0  |     *psad = NULL;  | 
383  | 0  |     if (!sa1)  | 
384  | 0  |         return ERROR_INT("sa1 not defined", __func__, 1); | 
385  | 0  |     if (!sa2)  | 
386  | 0  |         return ERROR_INT("sa2 not defined", __func__, 1); | 
387  |  |  | 
388  |  |         /* Put the elements of the biggest array into a set */  | 
389  | 0  |     n1 = sarrayGetCount(sa1);  | 
390  | 0  |     n2 = sarrayGetCount(sa2);  | 
391  | 0  |     sa_small = (n1 < n2) ? sa1 : sa2;   /* do not destroy sa_small */  | 
392  | 0  |     sa_big = (n1 < n2) ? sa2 : sa1;   /* do not destroy sa_big */  | 
393  | 0  |     set1 = l_asetCreateFromSarray(sa_big);  | 
394  |  |  | 
395  |  |         /* Build up the intersection of strings */  | 
396  | 0  |     sad = sarrayCreate(0);  | 
397  | 0  |     *psad = sad;  | 
398  | 0  |     n = sarrayGetCount(sa_small);  | 
399  | 0  |     set2 = l_asetCreate(L_UINT_TYPE);  | 
400  | 0  |     for (i = 0; i < n; i++) { | 
401  | 0  |         str = sarrayGetString(sa_small, i, L_NOCOPY);  | 
402  | 0  |         l_hashStringToUint64Fast(str, &hash);  | 
403  | 0  |         key.utype = hash;  | 
404  | 0  |         if (l_asetFind(set1, key) && !l_asetFind(set2, key)) { | 
405  | 0  |             sarrayAddString(sad, str, L_COPY);  | 
406  | 0  |             l_asetInsert(set2, key);  | 
407  | 0  |         }  | 
408  | 0  |     }  | 
409  |  | 
  | 
410  | 0  |     l_asetDestroy(&set1);  | 
411  | 0  |     l_asetDestroy(&set2);  | 
412  | 0  |     return 0;  | 
413  | 0  | }  | 
414  |  |  | 
415  |  |  | 
416  |  | /*----------------------------------------------------------------------*  | 
417  |  |  *                          Hashmap operations                          *  | 
418  |  |  *----------------------------------------------------------------------*/  | 
419  |  | /*!  | 
420  |  |  * \brief  l_hmapCreateFromSarray()  | 
421  |  |  *  | 
422  |  |  * \param[in]   sa     input sarray  | 
423  |  |  * \return      hmap   hashmap, or NULL on error  | 
424  |  |  */  | 
425  |  | L_HASHMAP *  | 
426  |  | l_hmapCreateFromSarray(SARRAY  *sa)  | 
427  | 0  | { | 
428  | 0  | l_int32      i, n;  | 
429  | 0  | l_uint64     key;  | 
430  | 0  | char        *str;  | 
431  | 0  | L_HASHMAP   *hmap;  | 
432  |  | 
  | 
433  | 0  |     if (!sa)  | 
434  | 0  |         return (L_HASHMAP *)ERROR_PTR("sa not defined", __func__, NULL); | 
435  |  |  | 
436  | 0  |     n = sarrayGetCount(sa);  | 
437  | 0  |     if ((hmap = l_hmapCreate(0.51 * n, 2)) == NULL)  | 
438  | 0  |         return (L_HASHMAP *)ERROR_PTR("hmap not made", __func__, NULL); | 
439  | 0  |     for (i = 0; i < n; i++) { | 
440  | 0  |         str = sarrayGetString(sa, i, L_NOCOPY);  | 
441  | 0  |         l_hashStringToUint64Fast(str, &key);  | 
442  | 0  |         l_hmapLookup(hmap, key, i, L_HMAP_CREATE);  | 
443  | 0  |     }  | 
444  | 0  |     return hmap;  | 
445  | 0  | }  | 
446  |  |  | 
447  |  |  | 
448  |  | /*!  | 
449  |  |  * \brief  sarrayRemoveDupsByHmap()  | 
450  |  |  *  | 
451  |  |  * \param[in]   sas  | 
452  |  |  * \param[out]  psad    hash set of unique values  | 
453  |  |  * \param[out]  phmap   [optional] hashmap used for lookup  | 
454  |  |  * \return  0 if OK; 1 on error  | 
455  |  |  */  | 
456  |  | l_ok  | 
457  |  | sarrayRemoveDupsByHmap(SARRAY      *sas,  | 
458  |  |                        SARRAY     **psad,  | 
459  |  |                        L_HASHMAP  **phmap)  | 
460  | 0  | { | 
461  | 0  | l_int32      i, tabsize;  | 
462  | 0  | char        *str;  | 
463  | 0  | SARRAY      *sad;  | 
464  | 0  | L_HASHITEM  *hitem;  | 
465  | 0  | L_HASHMAP   *hmap;  | 
466  |  | 
  | 
467  | 0  |     if (phmap) *phmap = NULL;  | 
468  | 0  |     if (!psad)  | 
469  | 0  |         return ERROR_INT("&sad not defined", __func__, 1); | 
470  | 0  |     *psad = NULL;  | 
471  | 0  |     if (!sas)  | 
472  | 0  |         return ERROR_INT("sas not defined", __func__, 1); | 
473  |  |  | 
474  |  |         /* Traverse the hashtable lists */  | 
475  | 0  |     if ((hmap = l_hmapCreateFromSarray(sas)) == NULL)  | 
476  | 0  |         return ERROR_INT("hmap not made", __func__, 1); | 
477  | 0  |     sad = sarrayCreate(0);  | 
478  | 0  |     *psad = sad;  | 
479  | 0  |     tabsize = hmap->tabsize;  | 
480  | 0  |     for (i = 0; i < tabsize; i++) { | 
481  | 0  |         hitem = hmap->hashtab[i];  | 
482  | 0  |         while (hitem) { | 
483  | 0  |             str = sarrayGetString(sas, hitem->val, L_COPY);  | 
484  | 0  |             sarrayAddString(sad, str, L_INSERT);  | 
485  | 0  |             hitem = hitem->next;  | 
486  | 0  |         }  | 
487  | 0  |     }  | 
488  |  | 
  | 
489  | 0  |     if (phmap)  | 
490  | 0  |         *phmap = hmap;  | 
491  | 0  |     else  | 
492  | 0  |         l_hmapDestroy(&hmap);  | 
493  | 0  |     return 0;  | 
494  | 0  | }  | 
495  |  |  | 
496  |  |  | 
497  |  | /*!  | 
498  |  |  * \brief  sarrayUnionByHmap()  | 
499  |  |  *  | 
500  |  |  * \param[in]   sa1  | 
501  |  |  * \param[in]   sa2  | 
502  |  |  * \param[out]  *psad     union of the array values  | 
503  |  |  * \return  0 if OK; 1 on error  | 
504  |  |  */  | 
505  |  | l_ok  | 
506  |  | sarrayUnionByHmap(SARRAY   *sa1,  | 
507  |  |                   SARRAY   *sa2,  | 
508  |  |                   SARRAY  **psad)  | 
509  | 0  | { | 
510  | 0  | SARRAY  *sa3;  | 
511  |  | 
  | 
512  | 0  |     if (!psad)  | 
513  | 0  |         return ERROR_INT("&sad not defined", __func__, 1); | 
514  | 0  |     *psad = NULL;  | 
515  | 0  |     if (!sa1)  | 
516  | 0  |         return ERROR_INT("sa1 not defined", __func__, 1); | 
517  | 0  |     if (!sa2)  | 
518  | 0  |         return ERROR_INT("sa2 not defined", __func__, 1); | 
519  |  |  | 
520  | 0  |     sa3 = sarrayCopy(sa1);  | 
521  | 0  |     if (sarrayJoin(sa3, sa2) == 1) { | 
522  | 0  |         sarrayDestroy(&sa3);  | 
523  | 0  |         return ERROR_INT("sa3 join failed", __func__, 1); | 
524  | 0  |     }  | 
525  | 0  |     sarrayRemoveDupsByHmap(sa3, psad, NULL);  | 
526  | 0  |     sarrayDestroy(&sa3);  | 
527  | 0  |     return 0;  | 
528  | 0  | }  | 
529  |  |  | 
530  |  |  | 
531  |  | /*!  | 
532  |  |  * \brief  sarrayIntersectionByHmap()  | 
533  |  |  *  | 
534  |  |  * \param[in]   sa1  | 
535  |  |  * \param[in]   sa2  | 
536  |  |  * \param[out]  *psad     intersection of the array values  | 
537  |  |  * \return  0 if OK; 1 on error  | 
538  |  |  */  | 
539  |  | l_ok  | 
540  |  | sarrayIntersectionByHmap(SARRAY   *sa1,  | 
541  |  |                          SARRAY   *sa2,  | 
542  |  |                          SARRAY  **psad)  | 
543  | 0  | { | 
544  | 0  | l_int32      i, n1, n2, n;  | 
545  | 0  | l_uint64     key;  | 
546  | 0  | char        *str;  | 
547  | 0  | SARRAY      *sa_small, *sa_big, *sa3, *sad;  | 
548  | 0  | L_HASHITEM  *hitem;  | 
549  | 0  | L_HASHMAP   *hmap;  | 
550  |  | 
  | 
551  | 0  |     if (!psad)  | 
552  | 0  |         return ERROR_INT("&sad not defined", __func__, 1); | 
553  | 0  |     *psad = NULL;  | 
554  | 0  |     if (!sa1)  | 
555  | 0  |         return ERROR_INT("sa1 not defined", __func__, 1); | 
556  | 0  |     if (!sa2)  | 
557  | 0  |         return ERROR_INT("sa2 not defined", __func__, 1); | 
558  |  |  | 
559  |  |         /* Make a hashmap for the elements of the biggest array */  | 
560  | 0  |     n1 = sarrayGetCount(sa1);  | 
561  | 0  |     n2 = sarrayGetCount(sa2);  | 
562  | 0  |     sa_small = (n1 < n2) ? sa1 : sa2;   /* do not destroy sa_small */  | 
563  | 0  |     sa_big = (n1 < n2) ? sa2 : sa1;   /* do not destroy sa_big */  | 
564  | 0  |     if ((hmap = l_hmapCreateFromSarray(sa_big)) == NULL)  | 
565  | 0  |         return ERROR_INT("hmap not made", __func__, 1); | 
566  |  |  | 
567  |  |         /* Remove duplicates from the smallest array.  Alternatively,  | 
568  |  |          * we can skip this step and avoid counting duplicates in  | 
569  |  |          * sa_small by modifying the count fields in the sa_big hashitems;  | 
570  |  |          * e.g., see l_hmapIntersectionDna(). */  | 
571  | 0  |     sarrayRemoveDupsByHmap(sa_small, &sa3, NULL);  | 
572  |  |  | 
573  |  |         /* Go through sa3, the set of strings derived from the smallest array,  | 
574  |  |          * hashing into the big array table.  Any string found belongs to both,  | 
575  |  |          * so add it to the output array. */  | 
576  | 0  |     sad = sarrayCreate(0);  | 
577  | 0  |     *psad = sad;  | 
578  | 0  |     n = sarrayGetCount(sa3);  | 
579  | 0  |     for (i = 0; i < n; i++) { | 
580  | 0  |         str = sarrayGetString(sa3, i, L_NOCOPY);  | 
581  | 0  |         l_hashStringToUint64Fast(str, &key);  | 
582  | 0  |         hitem = l_hmapLookup(hmap, key, i, L_HMAP_CHECK);  | 
583  | 0  |         if (hitem)  | 
584  | 0  |             sarrayAddString(sad, str, L_COPY);  | 
585  | 0  |     }  | 
586  | 0  |     l_hmapDestroy(&hmap);  | 
587  | 0  |     sarrayDestroy(&sa3);  | 
588  | 0  |     return 0;  | 
589  | 0  | }  | 
590  |  |  | 
591  |  |  | 
592  |  | /*----------------------------------------------------------------------*  | 
593  |  |  *                      Miscellaneous operations                        *  | 
594  |  |  *----------------------------------------------------------------------*/  | 
595  |  | /*!  | 
596  |  |  * \brief   sarrayGenerateIntegers()  | 
597  |  |  *  | 
598  |  |  * \param[in]   n  | 
599  |  |  * \return  sa  of printed numbers, 1 - n, or NULL on error  | 
600  |  |  */  | 
601  |  | SARRAY *  | 
602  |  | sarrayGenerateIntegers(l_int32  n)  | 
603  | 0  | { | 
604  | 0  | char     buf[32];  | 
605  | 0  | l_int32  i;  | 
606  | 0  | SARRAY  *sa;  | 
607  |  | 
  | 
608  | 0  |     if ((sa = sarrayCreate(n)) == NULL)  | 
609  | 0  |         return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL); | 
610  | 0  |     for (i = 0; i < n; i++) { | 
611  | 0  |         snprintf(buf, sizeof(buf), "%d", i);  | 
612  | 0  |         sarrayAddString(sa, buf, L_COPY);  | 
613  | 0  |     }  | 
614  | 0  |     return sa;  | 
615  | 0  | }  | 
616  |  |  | 
617  |  |  | 
618  |  | /*!  | 
619  |  |  * \brief   sarrayLookupCSKV()  | 
620  |  |  *  | 
621  |  |  * \param[in]    sa          of strings, each being a comma-separated pair  | 
622  |  |  *                           of strings, the first being a key and the  | 
623  |  |  *                           second a value  | 
624  |  |  * \param[in]    keystring   an input string to match with each key in %sa  | 
625  |  |  * \param[out]   pvalstring  the returned value string corresponding to the  | 
626  |  |  *                           input key string, if found; otherwise NULL  | 
627  |  |  * \return  0 if OK, 1 on error  | 
628  |  |  *  | 
629  |  |  * <pre>  | 
630  |  |  * Notes:  | 
631  |  |  *      (1) The input %sa can have other strings that are not in  | 
632  |  |  *          comma-separated key-value format.  These will be ignored.  | 
633  |  |  *      (2) This returns a copy of the first value string in %sa whose  | 
634  |  |  *          key string matches the input %keystring.  | 
635  |  |  *      (3) White space is not ignored; all white space before the ','  | 
636  |  |  *          is used for the keystring in matching.  This allows the  | 
637  |  |  *          key and val strings to have white space (e.g., multiple words).  | 
638  |  |  * </pre>  | 
639  |  |  */  | 
640  |  | l_ok  | 
641  |  | sarrayLookupCSKV(SARRAY      *sa,  | 
642  |  |                  const char  *keystring,  | 
643  |  |                  char       **pvalstring)  | 
644  | 0  | { | 
645  | 0  | char    *key, *val, *str;  | 
646  | 0  | l_int32  i, n;  | 
647  | 0  | SARRAY  *sa1;  | 
648  |  | 
  | 
649  | 0  |     if (!pvalstring)  | 
650  | 0  |         return ERROR_INT("&valstring not defined", __func__, 1); | 
651  | 0  |     *pvalstring = NULL;  | 
652  | 0  |     if (!sa)  | 
653  | 0  |         return ERROR_INT("sa not defined", __func__, 1); | 
654  | 0  |     if (!keystring)  | 
655  | 0  |         return ERROR_INT("keystring not defined", __func__, 1); | 
656  |  |  | 
657  | 0  |     n = sarrayGetCount(sa);  | 
658  | 0  |     for (i = 0; i < n; i++) { | 
659  | 0  |         str = sarrayGetString(sa, i, L_NOCOPY);  | 
660  | 0  |         sa1 = sarrayCreate(2);  | 
661  | 0  |         sarraySplitString(sa1, str, ",");  | 
662  | 0  |         if (sarrayGetCount(sa1) != 2) { | 
663  | 0  |             sarrayDestroy(&sa1);  | 
664  | 0  |             continue;  | 
665  | 0  |         }  | 
666  | 0  |         key = sarrayGetString(sa1, 0, L_NOCOPY);  | 
667  | 0  |         val = sarrayGetString(sa1, 1, L_NOCOPY);  | 
668  | 0  |         if (!strcmp(key, keystring)) { | 
669  | 0  |             *pvalstring = stringNew(val);  | 
670  | 0  |             sarrayDestroy(&sa1);  | 
671  | 0  |             return 0;  | 
672  | 0  |         }  | 
673  | 0  |         sarrayDestroy(&sa1);  | 
674  | 0  |     }  | 
675  |  |  | 
676  | 0  |     return 0;  | 
677  | 0  | }  |