/src/leptonica/src/hashmap.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*====================================================================* |
2 | | - Copyright (C) 2001 Leptonica. All rights reserved. |
3 | | - |
4 | | - Redistribution and use in source and binary forms, with or without |
5 | | - modification, are permitted provided that the following conditions |
6 | | - are met: |
7 | | - 1. Redistributions of source code must retain the above copyright |
8 | | - notice, this list of conditions and the following disclaimer. |
9 | | - 2. Redistributions in binary form must reproduce the above |
10 | | - copyright notice, this list of conditions and the following |
11 | | - disclaimer in the documentation and/or other materials |
12 | | - provided with the distribution. |
13 | | - |
14 | | - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
15 | | - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
16 | | - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
17 | | - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY |
18 | | - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | | - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | | - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | | - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | | - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
23 | | - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
24 | | - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | | *====================================================================*/ |
26 | | |
27 | | /* |
28 | | * \file hashmap.c |
29 | | * <pre> |
30 | | * |
31 | | * Hashmap creation, destruction |
32 | | * L_HASHMAP *l_hmapCreate() |
33 | | * void l_hmapDestroy() |
34 | | * |
35 | | * Hashmap: Accessors and modifiers |
36 | | * L_HASHITEM *l_hmapLookup() |
37 | | * l_int32 l_hmapRehash() |
38 | | * |
39 | | * (1) See also hashmap.h for a brief description of the design. |
40 | | * (2) In a typical use, a set of objects (in an array or associated |
41 | | * with image pixels) is represented by a hashmap. A uint64 key is |
42 | | * produced for each object. This integer is then hashed into an |
43 | | * index in a hashtable, using the mod function with the table size |
44 | | * which is a prime number. Each entry in the hash table is a list |
45 | | * of hash items. In lookup, the appropriate list is traversed, |
46 | | * looking for the object key found earlier. |
47 | | * (3) Hash functions that map points, strings and float64 to uint64 |
48 | | * are given in utils1.c. |
49 | | * (4) Use of the hashmap on points, strings and float64 data are |
50 | | * given in ptafunc2.c, sarray2.c and dnafunc1.c. |
51 | | * (5) Useful rule of thumb for hashing collisions: |
52 | | * For a random hashing function (say, from strings to l_uint64), |
53 | | * the probability of a collision increases as N^2 for N much |
54 | | * less than 2^32. The quadratic behavior switches over to |
55 | | * approaching 1.0 around 2^32, which is the square root of 2^64. |
56 | | * So, for example, if you have 10^7 strings, the probability |
57 | | * of a single collision using an l_uint64 key is on the order of |
58 | | * (10^7/4x10^9)^2 ~ 10^-5. |
59 | | * For a million strings, collisions are very rare (~10^-7 probability). |
60 | | * </pre> |
61 | | */ |
62 | | |
63 | | #ifdef HAVE_CONFIG_H |
64 | | #include <config_auto.h> |
65 | | #endif /* HAVE_CONFIG_H */ |
66 | | |
67 | | #include "allheaders.h" |
68 | | |
69 | | /* Limit on the hashtable size */ |
70 | | static const l_uint32 MaxTabsize = 50000000; |
71 | | /* Default values for creating the hashmap. */ |
72 | | static const l_int32 DefaultInitNItems = 2000; |
73 | | static const l_int32 DefaultMaxOcc = 2; |
74 | | |
75 | | /*--------------------------------------------------------------------------* |
76 | | * Hashmap creation and destruction * |
77 | | *--------------------------------------------------------------------------*/ |
78 | | /*! |
79 | | * \brief l_hmapCreate() |
80 | | * |
81 | | * \param[in] ninit initial estimate of the number of items to be stored; |
82 | | * use 0 for default value. |
83 | | * \param[in] maxocc max average occupancy of each list of hashitme; |
84 | | * it should be in range [1 ... 5]; use 0 for default |
85 | | * \return ptr to new hashmap, or NULL on error |
86 | | * |
87 | | * <pre> |
88 | | * Notes: |
89 | | * (1) If the maximum number n of items to be hashed is known in |
90 | | * advance, suggested values are: |
91 | | * %nint = 0.51 * n |
92 | | * %maxocc = 2 |
93 | | * With these values, the table will not need to be rehashed, |
94 | | * even if all items have unique keys. |
95 | | * (2) The actual initial size of the hashtab is the first prime |
96 | | * number larger than %ninit/%maxocc. |
97 | | * (3) Each entry into the hashtab points to alist of hash items |
98 | | * (key, val, count). |
99 | | * </pre> |
100 | | */ |
101 | | L_HASHMAP * |
102 | | l_hmapCreate(l_int32 ninit, |
103 | | l_int32 maxocc) |
104 | 0 | { |
105 | 0 | l_uint32 size, tabsize; |
106 | 0 | L_HASHMAP *hmap; |
107 | |
|
108 | 0 | ninit = L_MAX(ninit, DefaultInitNItems); |
109 | 0 | if (maxocc <= 0) maxocc = DefaultMaxOcc; |
110 | 0 | if (maxocc > 5) { |
111 | 0 | L_WARNING("maxocc = %d; non-optimal value. Set to default = %d\n", |
112 | 0 | __func__, maxocc, DefaultMaxOcc); |
113 | 0 | maxocc = DefaultMaxOcc; |
114 | 0 | } |
115 | 0 | size = ninit / maxocc; |
116 | 0 | if (size > MaxTabsize) { |
117 | 0 | L_ERROR("ninit/maxocc = %d > MaxTabsize = %d\n", __func__, |
118 | 0 | size, MaxTabsize); |
119 | 0 | return NULL; |
120 | 0 | } |
121 | | |
122 | 0 | hmap = (L_HASHMAP *)LEPT_CALLOC(1, sizeof(L_HASHMAP)); |
123 | 0 | findNextLargerPrime(size, &tabsize); /* at least 101 */ |
124 | 0 | if ((hmap->hashtab = |
125 | 0 | (L_HASHITEM **)LEPT_CALLOC(tabsize, sizeof(L_HASHITEM *))) == NULL) { |
126 | 0 | LEPT_FREE(hmap); |
127 | 0 | return (L_HASHMAP *)ERROR_PTR("hashtab not made", __func__, NULL); |
128 | 0 | } |
129 | | |
130 | 0 | hmap->nitems = 0; |
131 | 0 | hmap->ntogo = ninit; |
132 | 0 | hmap->maxocc = maxocc; |
133 | 0 | hmap->tabsize = tabsize; |
134 | 0 | return hmap; |
135 | 0 | } |
136 | | |
137 | | |
138 | | /*! |
139 | | * \brief l_hmapDestroy() |
140 | | * |
141 | | * \param[in,out] phmap to be nulled, if it exists |
142 | | * \return void |
143 | | */ |
144 | | void |
145 | | l_hmapDestroy(L_HASHMAP **phmap) |
146 | 0 | { |
147 | 0 | l_int32 i; |
148 | 0 | L_HASHITEM *hitem, *next; |
149 | 0 | L_HASHMAP *hmap; |
150 | |
|
151 | 0 | if (phmap == NULL) { |
152 | 0 | L_WARNING("ptr address is NULL!\n", __func__); |
153 | 0 | return; |
154 | 0 | } |
155 | | |
156 | 0 | if ((hmap = *phmap) == NULL) |
157 | 0 | return; |
158 | | |
159 | 0 | for (i = 0; i < hmap->tabsize; i++) { |
160 | 0 | for (hitem = hmap->hashtab[i]; hitem != NULL; hitem = next) { |
161 | 0 | next = hitem->next; |
162 | 0 | LEPT_FREE(hitem); |
163 | 0 | } |
164 | 0 | } |
165 | 0 | LEPT_FREE(hmap->hashtab); |
166 | 0 | LEPT_FREE(hmap); |
167 | 0 | *phmap = NULL; |
168 | 0 | } |
169 | | |
170 | | |
171 | | /*--------------------------------------------------------------------------* |
172 | | * Hashmap: Accessors and modifiers * |
173 | | *--------------------------------------------------------------------------*/ |
174 | | /*! |
175 | | * \brief l_hmapLookup() |
176 | | * |
177 | | * \param[in] hmap |
178 | | * \param[in] key to be hashed into an index in the hashtab |
179 | | * \param[in] val to be stored in the hitem if creating it |
180 | | * \param[in] op L_HMAP_CHECK, L_HMAP_CREATE |
181 | | * \return ptr hitem; or null either on error or if not found |
182 | | * with op == L_HMAP_CHECK. |
183 | | * |
184 | | * <pre> |
185 | | * Notes: |
186 | | * (1) This lookup function will also create a new hitem if requested. |
187 | | * (2) The %op parameter does the following: |
188 | | * op == L_HMAP_CHECK: return the hitem, or null if not found |
189 | | * op == L_HMAP_CREATE: if found, increment the count; otherwise, make |
190 | | * and store a new hitem; always return the hitem. |
191 | | * (3) The key is a uint64. It is made by hashing some data in the object. |
192 | | * (4) The value is an index into an array of the objects from which |
193 | | * the hashtable has been constructed. |
194 | | * (5) If an hitem is found, a pointer to it is returned. It is owned |
195 | | * by the hashtable; do not destroy it. |
196 | | * </pre> |
197 | | */ |
198 | | L_HASHITEM * |
199 | | l_hmapLookup(L_HASHMAP *hmap, |
200 | | l_uint64 key, |
201 | | l_uint64 val, |
202 | | l_int32 op) |
203 | 0 | { |
204 | 0 | l_uint32 index; |
205 | 0 | L_HASHITEM *hlist, *hitem; |
206 | |
|
207 | 0 | if (!hmap) |
208 | 0 | return (L_HASHITEM *)ERROR_PTR("hmap not defined", __func__, NULL); |
209 | 0 | if (op != L_HMAP_CHECK && op != L_HMAP_CREATE) |
210 | 0 | return (L_HASHITEM *)ERROR_PTR("invalid op", __func__, NULL); |
211 | | |
212 | | /* If found, return a ptr to the hitem (not a copy) */ |
213 | 0 | index = key % hmap->tabsize; /* into hashtab */ |
214 | 0 | hlist = hmap->hashtab[index]; /* head of the list */ |
215 | 0 | for (hitem = hlist; hitem != NULL; hitem = hitem->next) { |
216 | 0 | if (key == hitem->key) { |
217 | 0 | if (op == L_HMAP_CREATE) hitem->count++; |
218 | 0 | return hitem; |
219 | 0 | } |
220 | 0 | } |
221 | 0 | if (op == L_HMAP_CHECK) return NULL; |
222 | | |
223 | | /* Not found and %op == L_HMAP_CREATE. |
224 | | * Make a new hitem and add to the head of the list */ |
225 | 0 | hitem = (L_HASHITEM *)LEPT_CALLOC(1, sizeof(L_HASHITEM)); |
226 | 0 | hitem->key = key; |
227 | 0 | hitem->val = val; |
228 | 0 | hitem->count = 1; |
229 | 0 | hitem->next = hlist; |
230 | 0 | hmap->hashtab[index] = hitem; |
231 | 0 | hmap->nitems++; |
232 | 0 | hmap->ntogo--; |
233 | | |
234 | | /* If hmap is full based on average occupancy, rehash */ |
235 | 0 | if (hmap->ntogo == 0) |
236 | 0 | l_hmapRehash(hmap); |
237 | |
|
238 | 0 | return hitem; |
239 | 0 | } |
240 | | |
241 | | |
242 | | /*! |
243 | | * \brief l_hmapRehash() |
244 | | * |
245 | | * \param[in] hmap |
246 | | * \return 0 if OK; 1 on error |
247 | | * |
248 | | * <pre> |
249 | | * Notes: |
250 | | * (1) This is called when the average occupancy reaches maxocc. |
251 | | * It doubles the size of the hashtab, and reuses the hash items. |
252 | | * </pre> |
253 | | */ |
254 | | l_ok |
255 | | l_hmapRehash(L_HASHMAP *hmap) |
256 | 0 | { |
257 | 0 | l_int32 i, index; |
258 | 0 | l_uint32 tabsize; |
259 | 0 | L_HASHITEM *hstore, *hitem, *next; |
260 | |
|
261 | 0 | if (!hmap) |
262 | 0 | return ERROR_INT("hmap not defined", __func__, 1); |
263 | | |
264 | | /* Put hash items in temporary storage in a single list, |
265 | | * successively adding each to the list head. */ |
266 | 0 | hstore = NULL; /* ptr to resulting list */ |
267 | 0 | for (i = 0; i < hmap->tabsize; i++) { |
268 | 0 | for (hitem = hmap->hashtab[i]; hitem != NULL; hitem = next) { |
269 | 0 | next = hitem->next; |
270 | 0 | hitem->next = hstore; |
271 | 0 | hstore = hitem; |
272 | 0 | } |
273 | 0 | } |
274 | | |
275 | | /* Destroy the old hashtab and make a new one that is twice as big */ |
276 | 0 | LEPT_FREE(hmap->hashtab); |
277 | 0 | findNextLargerPrime(2 * hmap->tabsize, &tabsize); |
278 | 0 | hmap->tabsize = tabsize; |
279 | 0 | hmap->hashtab = (L_HASHITEM **)LEPT_CALLOC(tabsize, sizeof(L_HASHITEM *)); |
280 | 0 | if (hmap->hashtab == NULL) { |
281 | 0 | hmap->tabsize = 0; |
282 | 0 | return ERROR_INT("hashtab ptr array not made", __func__, 1); |
283 | 0 | } |
284 | 0 | hmap->ntogo = hmap->maxocc * tabsize - hmap->nitems; |
285 | | |
286 | | /* Populate with the stored hash items */ |
287 | 0 | for (hitem = hstore; hitem != NULL; hitem = next) { |
288 | 0 | next = hitem->next; |
289 | 0 | index = hitem->key % tabsize; /* into new hashtab */ |
290 | 0 | hitem->next = hmap->hashtab[index]; /* link to head of existing list */ |
291 | 0 | hmap->hashtab[index] = hitem; /* put at head */ |
292 | 0 | } |
293 | |
|
294 | 0 | return 0; |
295 | 0 | } |