Coverage Report

Created: 2025-06-13 06:48

/src/leptonica/src/hashmap.c
Line
Count
Source (jump to first uncovered line)
1
/*====================================================================*
2
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
3
 -
4
 -  Redistribution and use in source and binary forms, with or without
5
 -  modification, are permitted provided that the following conditions
6
 -  are met:
7
 -  1. Redistributions of source code must retain the above copyright
8
 -     notice, this list of conditions and the following disclaimer.
9
 -  2. Redistributions in binary form must reproduce the above
10
 -     copyright notice, this list of conditions and the following
11
 -     disclaimer in the documentation and/or other materials
12
 -     provided with the distribution.
13
 -
14
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
 *====================================================================*/
26
27
/*
28
 * \file  hashmap.c
29
 * <pre>
30
 *
31
 *      Hashmap creation, destruction
32
 *          L_HASHMAP    *l_hmapCreate()
33
 *          void          l_hmapDestroy()
34
 *
35
 *      Hashmap: Accessors and modifiers
36
 *          L_HASHITEM   *l_hmapLookup()
37
 *          l_int32       l_hmapRehash()
38
 *
39
 *    (1) See also hashmap.h for a brief description of the design.
40
 *    (2) In a typical use, a set of objects (in an array or associated
41
 *        with image pixels) is represented by a hashmap.  A uint64 key is
42
 *        produced for each object.  This integer is then hashed into an
43
 *        index in a hashtable, using the mod function with the table size
44
 *        which is a prime number.  Each entry in the hash table is a list
45
 *        of hash items.  In lookup, the appropriate list is traversed,
46
 *        looking for the object key found earlier.
47
 *    (3) Hash functions that map points, strings and float64 to uint64
48
 *        are given in utils1.c.
49
 *    (4) Use of the hashmap on points, strings and float64 data are
50
 *        given in ptafunc2.c, sarray2.c and dnafunc1.c.
51
 *    (5) Useful rule of thumb for hashing collisions:
52
 *        For a random hashing function (say, from strings to l_uint64),
53
 *        the probability of a collision increases as N^2 for N much
54
 *        less than 2^32.  The quadratic behavior switches over to
55
 *        approaching 1.0 around 2^32, which is the square root of 2^64.
56
 *        So, for example, if you have 10^7 strings, the probability
57
 *        of a single collision using an l_uint64 key is on the order of
58
 *            (10^7/4x10^9)^2 ~ 10^-5.
59
 *        For a million strings, collisions are very rare (~10^-7 probability).
60
 * </pre>
61
 */
62
63
#ifdef HAVE_CONFIG_H
64
#include <config_auto.h>
65
#endif  /* HAVE_CONFIG_H */
66
67
#include "allheaders.h"
68
69
    /* Limit on the hashtable size */
70
static const l_uint32  MaxTabsize = 50000000;
71
    /* Default values for creating the hashmap. */
72
static const l_int32   DefaultInitNItems = 2000;
73
static const l_int32   DefaultMaxOcc = 2;
74
75
/*--------------------------------------------------------------------------*
76
 *                     Hashmap creation and destruction                     *
77
 *--------------------------------------------------------------------------*/
78
/*!
79
 * \brief  l_hmapCreate()
80
 *
81
 * \param[in]   ninit   initial estimate of the number of items to be stored;
82
 *                      use 0 for default value.
83
 * \param[in]   maxocc  max average occupancy of each list of hashitme;
84
 *                      it should be in range [1 ... 5]; use 0 for default
85
 * \return      ptr to new hashmap, or NULL on error
86
 *
87
 * <pre>
88
 * Notes:
89
 *      (1) If the maximum number n of items to be hashed is known in
90
 *          advance, suggested values are:
91
 *                 %nint = 0.51 * n
92
 *                 %maxocc = 2
93
 *          With these values, the table will not need to be rehashed,
94
 *          even if all items have unique keys.
95
 *      (2) The actual initial size of the hashtab is the first prime
96
 *          number larger than %ninit/%maxocc.
97
 *      (3) Each entry into the hashtab points to alist of hash items
98
 *          (key, val, count).
99
 * </pre>
100
 */
101
L_HASHMAP *
102
l_hmapCreate(l_int32  ninit,
103
             l_int32  maxocc)
104
0
{
105
0
l_uint32    size, tabsize;
106
0
L_HASHMAP  *hmap;
107
108
0
    ninit = L_MAX(ninit, DefaultInitNItems);
109
0
    if (maxocc <= 0) maxocc = DefaultMaxOcc;
110
0
    if (maxocc > 5) {
111
0
        L_WARNING("maxocc = %d; non-optimal value. Set to default = %d\n",
112
0
                  __func__, maxocc, DefaultMaxOcc);
113
0
        maxocc = DefaultMaxOcc;
114
0
    }
115
0
    size = ninit / maxocc;
116
0
    if (size > MaxTabsize) {
117
0
        L_ERROR("ninit/maxocc = %d > MaxTabsize = %d\n", __func__,
118
0
                size, MaxTabsize);
119
0
        return NULL;
120
0
    }
121
122
0
    hmap = (L_HASHMAP *)LEPT_CALLOC(1, sizeof(L_HASHMAP));
123
0
    findNextLargerPrime(size, &tabsize);  /* at least 101 */
124
0
    if ((hmap->hashtab =
125
0
        (L_HASHITEM **)LEPT_CALLOC(tabsize, sizeof(L_HASHITEM *))) == NULL) {
126
0
        LEPT_FREE(hmap);
127
0
        return (L_HASHMAP *)ERROR_PTR("hashtab not made", __func__, NULL);
128
0
    }
129
130
0
    hmap->nitems = 0;
131
0
    hmap->ntogo = ninit;
132
0
    hmap->maxocc = maxocc;
133
0
    hmap->tabsize = tabsize;
134
0
    return hmap;
135
0
}
136
137
138
/*!
139
 * \brief  l_hmapDestroy()
140
 *
141
 * \param[in,out]   phmap  to be nulled, if it exists
142
 * \return          void
143
 */
144
void
145
l_hmapDestroy(L_HASHMAP  **phmap)
146
0
{
147
0
l_int32      i;
148
0
L_HASHITEM  *hitem, *next;
149
0
L_HASHMAP   *hmap;
150
151
0
    if (phmap == NULL) {
152
0
        L_WARNING("ptr address is NULL!\n", __func__);
153
0
        return;
154
0
    }
155
156
0
    if ((hmap = *phmap) == NULL)
157
0
        return;
158
159
0
    for (i = 0; i < hmap->tabsize; i++) {
160
0
        for (hitem = hmap->hashtab[i]; hitem != NULL; hitem = next) {
161
0
            next = hitem->next;
162
0
            LEPT_FREE(hitem);
163
0
        }
164
0
    }
165
0
    LEPT_FREE(hmap->hashtab);
166
0
    LEPT_FREE(hmap);
167
0
    *phmap = NULL;
168
0
}
169
170
171
/*--------------------------------------------------------------------------*
172
 *                    Hashmap: Accessors and modifiers                      *
173
 *--------------------------------------------------------------------------*/
174
/*!
175
 * \brief  l_hmapLookup()
176
 *
177
 * \param[in]   hmap
178
 * \param[in]   key     to be hashed into an index in the hashtab
179
 * \param[in]   val     to be stored in the hitem if creating it
180
 * \param[in]   op      L_HMAP_CHECK, L_HMAP_CREATE
181
 * \return      ptr     hitem; or null either on error or if not found
182
 *                      with op == L_HMAP_CHECK.
183
 *
184
 * <pre>
185
 * Notes:
186
 *      (1) This lookup function will also create a new hitem if requested.
187
 *      (2) The %op parameter does the following:
188
 *          op == L_HMAP_CHECK: return the hitem, or null if not found
189
 *          op == L_HMAP_CREATE: if found, increment the count; otherwise, make
190
 *                               and store a new hitem; always return the hitem.
191
 *      (3) The key is a uint64.  It is made by hashing some data in the object.
192
 *      (4) The value is an index into an array of the objects from which
193
 *          the hashtable has been constructed.
194
 *      (5) If an hitem is found, a pointer to it is returned.  It is owned
195
 *          by the hashtable; do not destroy it.
196
 * </pre>
197
 */
198
L_HASHITEM *
199
l_hmapLookup(L_HASHMAP  *hmap,
200
             l_uint64    key,
201
             l_uint64    val,
202
             l_int32     op)
203
0
{
204
0
l_uint32     index;
205
0
L_HASHITEM  *hlist, *hitem;
206
207
0
    if (!hmap)
208
0
        return (L_HASHITEM *)ERROR_PTR("hmap not defined", __func__, NULL);
209
0
    if (op != L_HMAP_CHECK && op != L_HMAP_CREATE)
210
0
        return (L_HASHITEM *)ERROR_PTR("invalid op", __func__, NULL);
211
212
        /* If found, return a ptr to the hitem (not a copy) */
213
0
    index = key % hmap->tabsize;  /* into hashtab */
214
0
    hlist = hmap->hashtab[index];  /* head of the list */
215
0
    for (hitem = hlist; hitem != NULL; hitem = hitem->next) {
216
0
        if (key == hitem->key) {
217
0
            if (op == L_HMAP_CREATE) hitem->count++;
218
0
            return hitem;
219
0
        }
220
0
    }
221
0
    if (op == L_HMAP_CHECK) return NULL;
222
223
        /* Not found and %op == L_HMAP_CREATE.
224
         * Make a new hitem and add to the head of the list */
225
0
    hitem = (L_HASHITEM *)LEPT_CALLOC(1, sizeof(L_HASHITEM));
226
0
    hitem->key = key;
227
0
    hitem->val = val;
228
0
    hitem->count = 1;
229
0
    hitem->next = hlist;
230
0
    hmap->hashtab[index] = hitem;
231
0
    hmap->nitems++;
232
0
    hmap->ntogo--;
233
234
        /* If hmap is full based on average occupancy, rehash */
235
0
    if (hmap->ntogo == 0)
236
0
        l_hmapRehash(hmap);
237
238
0
    return hitem;
239
0
}
240
241
242
/*!
243
 * \brief  l_hmapRehash()
244
 *
245
 * \param[in]  hmap
246
 * \return     0 if OK; 1 on error
247
 *
248
 * <pre>
249
 * Notes:
250
 *      (1) This is called when the average occupancy reaches maxocc.
251
 *          It doubles the size of the hashtab, and reuses the hash items.
252
 * </pre>
253
 */
254
l_ok
255
l_hmapRehash(L_HASHMAP  *hmap)
256
0
{
257
0
l_int32      i, index;
258
0
l_uint32     tabsize;
259
0
L_HASHITEM  *hstore, *hitem, *next;
260
261
0
    if (!hmap)
262
0
        return ERROR_INT("hmap not defined", __func__, 1);
263
264
        /* Put hash items in temporary storage in a single list,
265
         * successively adding each to the list head. */
266
0
    hstore = NULL;  /* ptr to resulting list */
267
0
    for (i = 0; i < hmap->tabsize; i++) {
268
0
        for (hitem = hmap->hashtab[i]; hitem != NULL; hitem = next) {
269
0
            next = hitem->next;
270
0
            hitem->next = hstore;
271
0
            hstore = hitem;
272
0
        }
273
0
    }
274
275
        /* Destroy the old hashtab and make a new one that is twice as big */
276
0
    LEPT_FREE(hmap->hashtab);
277
0
    findNextLargerPrime(2 * hmap->tabsize, &tabsize);
278
0
    hmap->tabsize = tabsize;
279
0
    hmap->hashtab = (L_HASHITEM **)LEPT_CALLOC(tabsize, sizeof(L_HASHITEM *));
280
0
    if (hmap->hashtab == NULL) {
281
0
        hmap->tabsize = 0;
282
0
        return ERROR_INT("hashtab ptr array not made", __func__, 1);
283
0
    }
284
0
    hmap->ntogo = hmap->maxocc * tabsize - hmap->nitems;
285
286
        /* Populate with the stored hash items */
287
0
    for (hitem = hstore; hitem != NULL; hitem = next) {
288
0
        next = hitem->next;
289
0
        index = hitem->key % tabsize;  /* into new hashtab */
290
0
        hitem->next = hmap->hashtab[index];  /* link to head of existing list */
291
0
        hmap->hashtab[index] = hitem;  /* put at head */
292
0
    }
293
294
0
    return 0;
295
0
}