/src/leptonica/src/sarray2.c

Source (jump to first uncovered line)
/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -
 -  Redistribution and use in source and binary forms, with or without
 -  modification, are permitted provided that the following conditions
 -  are met:
 -  1. Redistributions of source code must retain the above copyright
 -     notice, this list of conditions and the following disclaimer.
 -  2. Redistributions in binary form must reproduce the above
 -     copyright notice, this list of conditions and the following
 -     disclaimer in the documentation and/or other materials
 -     provided with the distribution.
 -
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *====================================================================*/

/*!
 * \file  sarray2.c
 * <pre>
 *
 *      Sort
 *          SARRAY     *sarraySort()
 *          SARRAY     *sarraySortByIndex()
 *          l_int32     stringCompareLexical()
 *
 *      Set operations using aset (rbtree)
 *          L_ASET     *l_asetCreateFromSarray()
 *          l_int32     sarrayRemoveDupsByAset()
 *          l_int32     sarrayUnionByAset()
 *          l_int32     sarrayIntersectionByAset()
 *
 *      Hashmap operations
 *          L_HASHMAP  *l_hmapCreateFromSarray()
 *          l_int32     sarrayRemoveDupsByHmap()
 *          l_int32     sarrayUnionByHmap()
 *          l_int32     sarrayIntersectionByHmap()
 *
 *      Miscellaneous operations
 *          SARRAY     *sarrayGenerateIntegers()
 *          l_int32     sarrayLookupCSKV()
 *
 *
 * We have two implementations of set operations on an array of strings:
 *
 *   (1) Using an underlying tree (rbtree).
 *       This uses a good 64 bit hashing function for the key,
 *       that is not expected to have hash collisions (and we do
 *       not test for them).  The tree is built up of the hash
 *       values, and if the hash is found in the tree, it is
 *       assumed that the string has already been found.
 *
 *   (2) Building a hashmap from the keys (hashmap).
 *       This uses a fast 64 bit hashing function for the key, which
 *       is then hashed into a hashtable.  Collisions of hashkeys are
 *       very rare, but the hashtable is designed to allow more than one
 *       hashitem in a table entry.  The hashitems are put in a list at
 *       each hashtable entry, which is traversed looking for the key.
 * </pre>
 */

#ifdef HAVE_CONFIG_H
#include <config_auto.h>
#endif  /* HAVE_CONFIG_H */

#include <string.h>
#include "allheaders.h"
#include "array_internal.h"

/*----------------------------------------------------------------------*
 *                                   Sort                               *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarraySort()
 *
 * \param[in]    saout       output sarray; can be NULL or equal to sain
 * \param[in]    sain        input sarray
 * \param[in]    sortorder   L_SORT_INCREASING or L_SORT_DECREASING
 * \return  saout output sarray, sorted by ascii value, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Set saout = sain for in-place; otherwise, set naout = NULL.
 *      (2) Shell sort, modified from K&R, 2nd edition, p.62.
 *          Slow but simple O(n logn) sort.
 * </pre>
 */
SARRAY *
sarraySort(SARRAY  *saout,
           SARRAY  *sain,
           l_int32  sortorder)
{
char   **array;
char    *tmp;
l_int32  n, i, j, gap;

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);

        /* Make saout if necessary; otherwise do in-place */
    if (!saout)
        saout = sarrayCopy(sain);
    else if (sain != saout)
        return (SARRAY *)ERROR_PTR("invalid: not in-place", __func__, NULL);
    array = saout->array;  /* operate directly on the array */
    n = sarrayGetCount(saout);

        /* Shell sort */
    for (gap = n/2; gap > 0; gap = gap / 2) {
        for (i = gap; i < n; i++) {
            for (j = i - gap; j >= 0; j -= gap) {
                if ((sortorder == L_SORT_INCREASING &&
                     stringCompareLexical(array[j], array[j + gap])) ||
                    (sortorder == L_SORT_DECREASING &&
                     stringCompareLexical(array[j + gap], array[j])))
                {
                    tmp = array[j];
                    array[j] = array[j + gap];
                    array[j + gap] = tmp;
                }
            }
        }
    }

    return saout;
}


/*!
 * \brief   sarraySortByIndex()
 *
 * \param[in]    sain
 * \param[in]    naindex   na that maps from the new sarray to the input sarray
 * \return  saout sorted, or NULL on error
 */
SARRAY *
sarraySortByIndex(SARRAY  *sain,
                  NUMA    *naindex)
{
char    *str;
l_int32  i, n, index;
SARRAY  *saout;

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
    if (!naindex)
        return (SARRAY *)ERROR_PTR("naindex not defined", __func__, NULL);

    n = sarrayGetCount(sain);
    saout = sarrayCreate(n);
    for (i = 0; i < n; i++) {
        numaGetIValue(naindex, i, &index);
        str = sarrayGetString(sain, index, L_COPY);
        sarrayAddString(saout, str, L_INSERT);
    }

    return saout;
}


/*!
 * \brief   stringCompareLexical()
 *
 * \param[in]    str1
 * \param[in]    str2
 * \return  1 if str1 > str2 lexically; 0 otherwise
 *
 * <pre>
 * Notes:
 *      (1) If the lexical values are identical, return a 0, to
 *          indicate that no swapping is required to sort the strings.
 * </pre>
 */
l_int32
stringCompareLexical(const char *str1,
                     const char *str2)
{
l_int32  i, len1, len2, len;

    if (!str1)
        return ERROR_INT("str1 not defined", __func__, 1);
    if (!str2)
        return ERROR_INT("str2 not defined", __func__, 1);

    len1 = strlen(str1);
    len2 = strlen(str2);
    len = L_MIN(len1, len2);

    for (i = 0; i < len; i++) {
        if (str1[i] == str2[i])
            continue;
        if (str1[i] > str2[i])
            return 1;
        else
            return 0;
    }

    if (len1 > len2)
        return 1;
    else
        return 0;
}


/*----------------------------------------------------------------------*
 *                   Set operations using aset (rbtree)                 *
 *----------------------------------------------------------------------*/
/*!
 * \brief   l_asetCreateFromSarray()
 *
 * \param[in]    sa
 * \return  set using a string hash into a uint64 as the key
 */
L_ASET *
l_asetCreateFromSarray(SARRAY  *sa)
{
char     *str;
l_int32   i, n;
l_uint64  hash;
L_ASET   *set;
RB_TYPE   key;

    if (!sa)
        return (L_ASET *)ERROR_PTR("sa not defined", __func__, NULL);

    set = l_asetCreate(L_UINT_TYPE);
    n = sarrayGetCount(sa);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        l_hashStringToUint64Fast(str, &hash);
        key.utype = hash;
        l_asetInsert(set, key);
    }

    return set;
}


/*!
 * \brief   sarrayRemoveDupsByAset()
 *
 * \param[in]    sas
 * \param[out]   psad      with duplicates removed
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This is O(nlogn), considerably slower than
 *          sarrayRemoveDupsByHmap() for large string arrays.
 *      (2) The key for each string is a 64-bit hash.
 *      (3) Build a set, using hashed strings as keys.  As the set is
 *          built, first do a find; if not found, add the key to the
 *          set and add the string to the output sarray.
 * </pre>
 */
l_ok
sarrayRemoveDupsByAset(SARRAY   *sas,
                       SARRAY  **psad)
{
char     *str;
l_int32   i, n;
l_uint64  hash;
L_ASET   *set;
RB_TYPE   key;
SARRAY   *sad;

    if (!psad)
        return ERROR_INT("&sad not defined", __func__, 1);
    *psad = NULL;
    if (!sas)
        return ERROR_INT("sas not defined", __func__, 1);

    set = l_asetCreate(L_UINT_TYPE);
    sad = sarrayCreate(0);
    *psad = sad;
    n = sarrayGetCount(sas);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sas, i, L_NOCOPY);
        l_hashStringToUint64Fast(str, &hash);
        key.utype = hash;
        if (!l_asetFind(set, key)) {
            sarrayAddString(sad, str, L_COPY);
            l_asetInsert(set, key);
        }
    }

    l_asetDestroy(&set);
    return 0;
}


/*!
 * \brief   sarrayUnionByAset()
 *
 * \param[in]    sa1
 * \param[in]    sa2
 * \param[out]   psad      union of the two string arrays
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Duplicates are removed from the concatenation of the two arrays.
 *      (2) The key for each string is a 64-bit hash.
 *      (2) Algorithm: Concatenate the two sarrays.  Then build a set,
 *          using hashed strings as keys.  As the set is built, first do
 *          a find; if not found, add the key to the set and add the string
 *          to the output sarray.  This is O(nlogn).
 * </pre>
 */
l_ok
sarrayUnionByAset(SARRAY   *sa1,
                  SARRAY   *sa2,
                  SARRAY  **psad)
{
SARRAY  *sa3;

    if (!psad)
        return ERROR_INT("&sad not defined", __func__, 1);
    *psad = NULL;
    if (!sa1)
        return ERROR_INT("sa1 not defined", __func__, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", __func__, 1);

        /* Join */
    sa3 = sarrayCopy(sa1);
    if (sarrayJoin(sa3, sa2) == 1) {
        sarrayDestroy(&sa3);
        return ERROR_INT("join failed for sa3", __func__, 1);
    }

        /* Eliminate duplicates */
    sarrayRemoveDupsByAset(sa3, psad);
    sarrayDestroy(&sa3);
    return 0;
}


/*!
 * \brief   sarrayIntersectionByAset()
 *
 * \param[in]    sa1
 * \param[in]    sa2
 * \param[out]   psad      intersection of the two string arrays
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Algorithm: put the larger sarray into a set, using the string
 *          hashes as the key values.  Then run through the smaller sarray,
 *          building an output sarray and a second set from the strings
 *          in the larger array: if a string is in the first set but
 *          not in the second, add the string to the output sarray and hash
 *          it into the second set.  The second set is required to make
 *          sure only one instance of each string is put into the output sarray.
 *          This is O(mlogn), {m,n} = sizes of {smaller,larger} input arrays.
 * </pre>
 */
l_ok
sarrayIntersectionByAset(SARRAY   *sa1,
                         SARRAY   *sa2,
                         SARRAY  **psad)
{
char     *str;
l_int32   n1, n2, i, n;
l_uint64  hash;
L_ASET   *set1, *set2;
RB_TYPE   key;
SARRAY   *sa_small, *sa_big, *sad;

    if (!psad)
        return ERROR_INT("&sad not defined", __func__, 1);
    *psad = NULL;
    if (!sa1)
        return ERROR_INT("sa1 not defined", __func__, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", __func__, 1);

        /* Put the elements of the biggest array into a set */
    n1 = sarrayGetCount(sa1);
    n2 = sarrayGetCount(sa2);
    sa_small = (n1 < n2) ? sa1 : sa2;   /* do not destroy sa_small */
    sa_big = (n1 < n2) ? sa2 : sa1;   /* do not destroy sa_big */
    set1 = l_asetCreateFromSarray(sa_big);

        /* Build up the intersection of strings */
    sad = sarrayCreate(0);
    *psad = sad;
    n = sarrayGetCount(sa_small);
    set2 = l_asetCreate(L_UINT_TYPE);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa_small, i, L_NOCOPY);
        l_hashStringToUint64Fast(str, &hash);
        key.utype = hash;
        if (l_asetFind(set1, key) && !l_asetFind(set2, key)) {
            sarrayAddString(sad, str, L_COPY);
            l_asetInsert(set2, key);
        }
    }

    l_asetDestroy(&set1);
    l_asetDestroy(&set2);
    return 0;
}


/*----------------------------------------------------------------------*
 *                          Hashmap operations                          *
 *----------------------------------------------------------------------*/
/*!
 * \brief  l_hmapCreateFromSarray()
 *
 * \param[in]   sa     input sarray
 * \return      hmap   hashmap, or NULL on error
 */
L_HASHMAP *
l_hmapCreateFromSarray(SARRAY  *sa)
{
l_int32      i, n;
l_uint64     key;
char        *str;
L_HASHMAP   *hmap;

    if (!sa)
        return (L_HASHMAP *)ERROR_PTR("sa not defined", __func__, NULL);

    n = sarrayGetCount(sa);
    if ((hmap = l_hmapCreate(0.51 * n, 2)) == NULL)
        return (L_HASHMAP *)ERROR_PTR("hmap not made", __func__, NULL);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        l_hashStringToUint64Fast(str, &key);
        l_hmapLookup(hmap, key, i, L_HMAP_CREATE);
    }
    return hmap;
}


/*!
 * \brief  sarrayRemoveDupsByHmap()
 *
 * \param[in]   sas
 * \param[out]  psad    hash set of unique values
 * \param[out]  phmap   [optional] hashmap used for lookup
 * \return  0 if OK; 1 on error
 */
l_ok
sarrayRemoveDupsByHmap(SARRAY      *sas,
                       SARRAY     **psad,
                       L_HASHMAP  **phmap)
{
l_int32      i, tabsize;
char        *str;
SARRAY      *sad;
L_HASHITEM  *hitem;
L_HASHMAP   *hmap;

    if (phmap) *phmap = NULL;
    if (!psad)
        return ERROR_INT("&sad not defined", __func__, 1);
    *psad = NULL;
    if (!sas)
        return ERROR_INT("sas not defined", __func__, 1);

        /* Traverse the hashtable lists */
    if ((hmap = l_hmapCreateFromSarray(sas)) == NULL)
        return ERROR_INT("hmap not made", __func__, 1);
    sad = sarrayCreate(0);
    *psad = sad;
    tabsize = hmap->tabsize;
    for (i = 0; i < tabsize; i++) {
        hitem = hmap->hashtab[i];
        while (hitem) {
            str = sarrayGetString(sas, hitem->val, L_COPY);
            sarrayAddString(sad, str, L_INSERT);
            hitem = hitem->next;
        }
    }

    if (phmap)
        *phmap = hmap;
    else
        l_hmapDestroy(&hmap);
    return 0;
}


/*!
 * \brief  sarrayUnionByHmap()
 *
 * \param[in]   sa1
 * \param[in]   sa2
 * \param[out]  *psad     union of the array values
 * \return  0 if OK; 1 on error
 */
l_ok
sarrayUnionByHmap(SARRAY   *sa1,
                  SARRAY   *sa2,
                  SARRAY  **psad)
{
SARRAY  *sa3;

    if (!psad)
        return ERROR_INT("&sad not defined", __func__, 1);
    *psad = NULL;
    if (!sa1)
        return ERROR_INT("sa1 not defined", __func__, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", __func__, 1);

    sa3 = sarrayCopy(sa1);
    if (sarrayJoin(sa3, sa2) == 1) {
        sarrayDestroy(&sa3);
        return ERROR_INT("sa3 join failed", __func__, 1);
    }
    sarrayRemoveDupsByHmap(sa3, psad, NULL);
    sarrayDestroy(&sa3);
    return 0;
}


/*!
 * \brief  sarrayIntersectionByHmap()
 *
 * \param[in]   sa1
 * \param[in]   sa2
 * \param[out]  *psad     intersection of the array values
 * \return  0 if OK; 1 on error
 */
l_ok
sarrayIntersectionByHmap(SARRAY   *sa1,
                         SARRAY   *sa2,
                         SARRAY  **psad)
{
l_int32      i, n1, n2, n;
l_uint64     key;
char        *str;
SARRAY      *sa_small, *sa_big, *sa3, *sad;
L_HASHITEM  *hitem;
L_HASHMAP   *hmap;

    if (!psad)
        return ERROR_INT("&sad not defined", __func__, 1);
    *psad = NULL;
    if (!sa1)
        return ERROR_INT("sa1 not defined", __func__, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", __func__, 1);

        /* Make a hashmap for the elements of the biggest array */
    n1 = sarrayGetCount(sa1);
    n2 = sarrayGetCount(sa2);
    sa_small = (n1 < n2) ? sa1 : sa2;   /* do not destroy sa_small */
    sa_big = (n1 < n2) ? sa2 : sa1;   /* do not destroy sa_big */
    if ((hmap = l_hmapCreateFromSarray(sa_big)) == NULL)
        return ERROR_INT("hmap not made", __func__, 1);

        /* Remove duplicates from the smallest array.  Alternatively,
         * we can skip this step and avoid counting duplicates in
         * sa_small by modifying the count fields in the sa_big hashitems;
         * e.g., see l_hmapIntersectionDna(). */
    sarrayRemoveDupsByHmap(sa_small, &sa3, NULL);

        /* Go through sa3, the set of strings derived from the smallest array,
         * hashing into the big array table.  Any string found belongs to both,
         * so add it to the output array. */
    sad = sarrayCreate(0);
    *psad = sad;
    n = sarrayGetCount(sa3);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa3, i, L_NOCOPY);
        l_hashStringToUint64Fast(str, &key);
        hitem = l_hmapLookup(hmap, key, i, L_HMAP_CHECK);
        if (hitem)
            sarrayAddString(sad, str, L_COPY);
    }
    l_hmapDestroy(&hmap);
    sarrayDestroy(&sa3);
    return 0;
}


/*----------------------------------------------------------------------*
 *                      Miscellaneous operations                        *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayGenerateIntegers()
 *
 * \param[in]   n
 * \return  sa  of printed numbers, 1 - n, or NULL on error
 */
SARRAY *
sarrayGenerateIntegers(l_int32  n)
{
char     buf[32];
l_int32  i;
SARRAY  *sa;

    if ((sa = sarrayCreate(n)) == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
    for (i = 0; i < n; i++) {
        snprintf(buf, sizeof(buf), "%d", i);
        sarrayAddString(sa, buf, L_COPY);
    }
    return sa;
}


/*!
 * \brief   sarrayLookupCSKV()
 *
 * \param[in]    sa          of strings, each being a comma-separated pair
 *                           of strings, the first being a key and the
 *                           second a value
 * \param[in]    keystring   an input string to match with each key in %sa
 * \param[out]   pvalstring  the returned value string corresponding to the
 *                           input key string, if found; otherwise NULL
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The input %sa can have other strings that are not in
 *          comma-separated key-value format.  These will be ignored.
 *      (2) This returns a copy of the first value string in %sa whose
 *          key string matches the input %keystring.
 *      (3) White space is not ignored; all white space before the ','
 *          is used for the keystring in matching.  This allows the
 *          key and val strings to have white space (e.g., multiple words).
 * </pre>
 */
l_ok
sarrayLookupCSKV(SARRAY      *sa,
                 const char  *keystring,
                 char       **pvalstring)
{
char    *key, *val, *str;
l_int32  i, n;
SARRAY  *sa1;

    if (!pvalstring)
        return ERROR_INT("&valstring not defined", __func__, 1);
    *pvalstring = NULL;
    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);
    if (!keystring)
        return ERROR_INT("keystring not defined", __func__, 1);

    n = sarrayGetCount(sa);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        sa1 = sarrayCreate(2);
        sarraySplitString(sa1, str, ",");
        if (sarrayGetCount(sa1) != 2) {
            sarrayDestroy(&sa1);
            continue;
        }
        key = sarrayGetString(sa1, 0, L_NOCOPY);
        val = sarrayGetString(sa1, 1, L_NOCOPY);
        if (!strcmp(key, keystring)) {
            *pvalstring = stringNew(val);
            sarrayDestroy(&sa1);
            return 0;
        }
        sarrayDestroy(&sa1);
    }

    return 0;
}

Coverage Report

Created: 2025-06-13 07:02

Line	Count	Source (jump to first uncovered line)
1		/====================================================================
2		- Copyright (C) 2001 Leptonica. All rights reserved.
3		-
4		- Redistribution and use in source and binary forms, with or without
5		- modification, are permitted provided that the following conditions
6		- are met:
7		- 1. Redistributions of source code must retain the above copyright
8		- notice, this list of conditions and the following disclaimer.
9		- 2. Redistributions in binary form must reproduce the above
10		- copyright notice, this list of conditions and the following
11		- disclaimer in the documentation and/or other materials
12		- provided with the distribution.
13		-
14		- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15		- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16		- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17		- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18		- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19		- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20		- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21		- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22		- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23		- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24		- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25		====================================================================/
26
27		/*!
28		* \file sarray2.c
29		* <pre>
30		*
31		* Sort
32		* SARRAY *sarraySort()
33		* SARRAY *sarraySortByIndex()
34		* l_int32 stringCompareLexical()
35		*
36		* Set operations using aset (rbtree)
37		* L_ASET *l_asetCreateFromSarray()
38		* l_int32 sarrayRemoveDupsByAset()
39		* l_int32 sarrayUnionByAset()
40		* l_int32 sarrayIntersectionByAset()
41		*
42		* Hashmap operations
43		* L_HASHMAP *l_hmapCreateFromSarray()
44		* l_int32 sarrayRemoveDupsByHmap()
45		* l_int32 sarrayUnionByHmap()
46		* l_int32 sarrayIntersectionByHmap()
47		*
48		* Miscellaneous operations
49		* SARRAY *sarrayGenerateIntegers()
50		* l_int32 sarrayLookupCSKV()
51		*
52		*
53		* We have two implementations of set operations on an array of strings:
54		*
55		* (1) Using an underlying tree (rbtree).
56		* This uses a good 64 bit hashing function for the key,
57		* that is not expected to have hash collisions (and we do
58		* not test for them). The tree is built up of the hash
59		* values, and if the hash is found in the tree, it is
60		* assumed that the string has already been found.
61		*
62		* (2) Building a hashmap from the keys (hashmap).
63		* This uses a fast 64 bit hashing function for the key, which
64		* is then hashed into a hashtable. Collisions of hashkeys are
65		* very rare, but the hashtable is designed to allow more than one
66		* hashitem in a table entry. The hashitems are put in a list at
67		* each hashtable entry, which is traversed looking for the key.
68		* </pre>
69		*/
70
71		#ifdef HAVE_CONFIG_H
72		#include <config_auto.h>
73		#endif /* HAVE_CONFIG_H */
74
75		#include <string.h>
76		#include "allheaders.h"
77		#include "array_internal.h"
78
79		/----------------------------------------------------------------------
80		* Sort *
81		----------------------------------------------------------------------/
82		/*!
83		* \brief sarraySort()
84		*
85		* \param[in] saout output sarray; can be NULL or equal to sain
86		* \param[in] sain input sarray
87		* \param[in] sortorder L_SORT_INCREASING or L_SORT_DECREASING
88		* \return saout output sarray, sorted by ascii value, or NULL on error
89		*
90		* <pre>
91		* Notes:
92		* (1) Set saout = sain for in-place; otherwise, set naout = NULL.
93		* (2) Shell sort, modified from K&R, 2nd edition, p.62.
94		* Slow but simple O(n logn) sort.
95		* </pre>
96		*/
97		SARRAY *
98		sarraySort(SARRAY *saout,
99		SARRAY *sain,
100		l_int32 sortorder)
101	0	{
102	0	char **array;
103	0	char *tmp;
104	0	l_int32 n, i, j, gap;
105
106	0	if (!sain)
107	0	return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
108
109		/* Make saout if necessary; otherwise do in-place */
110	0	if (!saout)
111	0	saout = sarrayCopy(sain);
112	0	else if (sain != saout)
113	0	return (SARRAY *)ERROR_PTR("invalid: not in-place", __func__, NULL);
114	0	array = saout->array; /* operate directly on the array */
115	0	n = sarrayGetCount(saout);
116
117		/* Shell sort */
118	0	for (gap = n/2; gap > 0; gap = gap / 2) {
119	0	for (i = gap; i < n; i++) {
120	0	for (j = i - gap; j >= 0; j -= gap) {
121	0	if ((sortorder == L_SORT_INCREASING &&
122	0	stringCompareLexical(array[j], array[j + gap])) \|\|
123	0	(sortorder == L_SORT_DECREASING &&
124	0	stringCompareLexical(array[j + gap], array[j])))
125	0	{
126	0	tmp = array[j];
127	0	array[j] = array[j + gap];
128	0	array[j + gap] = tmp;
129	0	}
130	0	}
131	0	}
132	0	}
133
134	0	return saout;
135	0	}
136
137
138		/*!
139		* \brief sarraySortByIndex()
140		*
141		* \param[in] sain
142		* \param[in] naindex na that maps from the new sarray to the input sarray
143		* \return saout sorted, or NULL on error
144		*/
145		SARRAY *
146		sarraySortByIndex(SARRAY *sain,
147		NUMA *naindex)
148	0	{
149	0	char *str;
150	0	l_int32 i, n, index;
151	0	SARRAY *saout;
152
153	0	if (!sain)
154	0	return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
155	0	if (!naindex)
156	0	return (SARRAY *)ERROR_PTR("naindex not defined", __func__, NULL);
157
158	0	n = sarrayGetCount(sain);
159	0	saout = sarrayCreate(n);
160	0	for (i = 0; i < n; i++) {
161	0	numaGetIValue(naindex, i, &index);
162	0	str = sarrayGetString(sain, index, L_COPY);
163	0	sarrayAddString(saout, str, L_INSERT);
164	0	}
165
166	0	return saout;
167	0	}
168
169
170		/*!
171		* \brief stringCompareLexical()
172		*
173		* \param[in] str1
174		* \param[in] str2
175		* \return 1 if str1 > str2 lexically; 0 otherwise
176		*
177		* <pre>
178		* Notes:
179		* (1) If the lexical values are identical, return a 0, to
180		* indicate that no swapping is required to sort the strings.
181		* </pre>
182		*/
183		l_int32
184		stringCompareLexical(const char *str1,
185		const char *str2)
186	0	{
187	0	l_int32 i, len1, len2, len;
188
189	0	if (!str1)
190	0	return ERROR_INT("str1 not defined", __func__, 1);
191	0	if (!str2)
192	0	return ERROR_INT("str2 not defined", __func__, 1);
193
194	0	len1 = strlen(str1);
195	0	len2 = strlen(str2);
196	0	len = L_MIN(len1, len2);
197
198	0	for (i = 0; i < len; i++) {
199	0	if (str1[i] == str2[i])
200	0	continue;
201	0	if (str1[i] > str2[i])
202	0	return 1;
203	0	else
204	0	return 0;
205	0	}
206
207	0	if (len1 > len2)
208	0	return 1;
209	0	else
210	0	return 0;
211	0	}
212
213
214		/----------------------------------------------------------------------
215		* Set operations using aset (rbtree) *
216		----------------------------------------------------------------------/
217		/*!
218		* \brief l_asetCreateFromSarray()
219		*
220		* \param[in] sa
221		* \return set using a string hash into a uint64 as the key
222		*/
223		L_ASET *
224		l_asetCreateFromSarray(SARRAY *sa)
225	0	{
226	0	char *str;
227	0	l_int32 i, n;
228	0	l_uint64 hash;
229	0	L_ASET *set;
230	0	RB_TYPE key;
231
232	0	if (!sa)
233	0	return (L_ASET *)ERROR_PTR("sa not defined", __func__, NULL);
234
235	0	set = l_asetCreate(L_UINT_TYPE);
236	0	n = sarrayGetCount(sa);
237	0	for (i = 0; i < n; i++) {
238	0	str = sarrayGetString(sa, i, L_NOCOPY);
239	0	l_hashStringToUint64Fast(str, &hash);
240	0	key.utype = hash;
241	0	l_asetInsert(set, key);
242	0	}
243
244	0	return set;
245	0	}
246
247
248		/*!
249		* \brief sarrayRemoveDupsByAset()
250		*
251		* \param[in] sas
252		* \param[out] psad with duplicates removed
253		* \return 0 if OK; 1 on error
254		*
255		* <pre>
256		* Notes:
257		* (1) This is O(nlogn), considerably slower than
258		* sarrayRemoveDupsByHmap() for large string arrays.
259		* (2) The key for each string is a 64-bit hash.
260		* (3) Build a set, using hashed strings as keys. As the set is
261		* built, first do a find; if not found, add the key to the
262		* set and add the string to the output sarray.
263		* </pre>
264		*/
265		l_ok
266		sarrayRemoveDupsByAset(SARRAY *sas,
267		SARRAY **psad)
268	0	{
269	0	char *str;
270	0	l_int32 i, n;
271	0	l_uint64 hash;
272	0	L_ASET *set;
273	0	RB_TYPE key;
274	0	SARRAY *sad;
275
276	0	if (!psad)
277	0	return ERROR_INT("&sad not defined", __func__, 1);
278	0	*psad = NULL;
279	0	if (!sas)
280	0	return ERROR_INT("sas not defined", __func__, 1);
281
282	0	set = l_asetCreate(L_UINT_TYPE);
283	0	sad = sarrayCreate(0);
284	0	*psad = sad;
285	0	n = sarrayGetCount(sas);
286	0	for (i = 0; i < n; i++) {
287	0	str = sarrayGetString(sas, i, L_NOCOPY);
288	0	l_hashStringToUint64Fast(str, &hash);
289	0	key.utype = hash;
290	0	if (!l_asetFind(set, key)) {
291	0	sarrayAddString(sad, str, L_COPY);
292	0	l_asetInsert(set, key);
293	0	}
294	0	}
295
296	0	l_asetDestroy(&set);
297	0	return 0;
298	0	}
299
300
301		/*!
302		* \brief sarrayUnionByAset()
303		*
304		* \param[in] sa1
305		* \param[in] sa2
306		* \param[out] psad union of the two string arrays
307		* \return 0 if OK; 1 on error
308		*
309		* <pre>
310		* Notes:
311		* (1) Duplicates are removed from the concatenation of the two arrays.
312		* (2) The key for each string is a 64-bit hash.
313		* (2) Algorithm: Concatenate the two sarrays. Then build a set,
314		* using hashed strings as keys. As the set is built, first do
315		* a find; if not found, add the key to the set and add the string
316		* to the output sarray. This is O(nlogn).
317		* </pre>
318		*/
319		l_ok
320		sarrayUnionByAset(SARRAY *sa1,
321		SARRAY *sa2,
322		SARRAY **psad)
323	0	{
324	0	SARRAY *sa3;
325
326	0	if (!psad)
327	0	return ERROR_INT("&sad not defined", __func__, 1);
328	0	*psad = NULL;
329	0	if (!sa1)
330	0	return ERROR_INT("sa1 not defined", __func__, 1);
331	0	if (!sa2)
332	0	return ERROR_INT("sa2 not defined", __func__, 1);
333
334		/* Join */
335	0	sa3 = sarrayCopy(sa1);
336	0	if (sarrayJoin(sa3, sa2) == 1) {
337	0	sarrayDestroy(&sa3);
338	0	return ERROR_INT("join failed for sa3", __func__, 1);
339	0	}
340
341		/* Eliminate duplicates */
342	0	sarrayRemoveDupsByAset(sa3, psad);
343	0	sarrayDestroy(&sa3);
344	0	return 0;
345	0	}
346
347
348		/*!
349		* \brief sarrayIntersectionByAset()
350		*
351		* \param[in] sa1
352		* \param[in] sa2
353		* \param[out] psad intersection of the two string arrays
354		* \return 0 if OK; 1 on error
355		*
356		* <pre>
357		* Notes:
358		* (1) Algorithm: put the larger sarray into a set, using the string
359		* hashes as the key values. Then run through the smaller sarray,
360		* building an output sarray and a second set from the strings
361		* in the larger array: if a string is in the first set but
362		* not in the second, add the string to the output sarray and hash
363		* it into the second set. The second set is required to make
364		* sure only one instance of each string is put into the output sarray.
365		* This is O(mlogn), {m,n} = sizes of {smaller,larger} input arrays.
366		* </pre>
367		*/
368		l_ok
369		sarrayIntersectionByAset(SARRAY *sa1,
370		SARRAY *sa2,
371		SARRAY **psad)
372	0	{
373	0	char *str;
374	0	l_int32 n1, n2, i, n;
375	0	l_uint64 hash;
376	0	L_ASET set1, set2;
377	0	RB_TYPE key;
378	0	SARRAY sa_small, sa_big, *sad;
379
380	0	if (!psad)
381	0	return ERROR_INT("&sad not defined", __func__, 1);
382	0	*psad = NULL;
383	0	if (!sa1)
384	0	return ERROR_INT("sa1 not defined", __func__, 1);
385	0	if (!sa2)
386	0	return ERROR_INT("sa2 not defined", __func__, 1);
387
388		/* Put the elements of the biggest array into a set */
389	0	n1 = sarrayGetCount(sa1);
390	0	n2 = sarrayGetCount(sa2);
391	0	sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */
392	0	sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */
393	0	set1 = l_asetCreateFromSarray(sa_big);
394
395		/* Build up the intersection of strings */
396	0	sad = sarrayCreate(0);
397	0	*psad = sad;
398	0	n = sarrayGetCount(sa_small);
399	0	set2 = l_asetCreate(L_UINT_TYPE);
400	0	for (i = 0; i < n; i++) {
401	0	str = sarrayGetString(sa_small, i, L_NOCOPY);
402	0	l_hashStringToUint64Fast(str, &hash);
403	0	key.utype = hash;
404	0	if (l_asetFind(set1, key) && !l_asetFind(set2, key)) {
405	0	sarrayAddString(sad, str, L_COPY);
406	0	l_asetInsert(set2, key);
407	0	}
408	0	}
409
410	0	l_asetDestroy(&set1);
411	0	l_asetDestroy(&set2);
412	0	return 0;
413	0	}
414
415
416		/----------------------------------------------------------------------
417		* Hashmap operations *
418		----------------------------------------------------------------------/
419		/*!
420		* \brief l_hmapCreateFromSarray()
421		*
422		* \param[in] sa input sarray
423		* \return hmap hashmap, or NULL on error
424		*/
425		L_HASHMAP *
426		l_hmapCreateFromSarray(SARRAY *sa)
427	0	{
428	0	l_int32 i, n;
429	0	l_uint64 key;
430	0	char *str;
431	0	L_HASHMAP *hmap;
432
433	0	if (!sa)
434	0	return (L_HASHMAP *)ERROR_PTR("sa not defined", __func__, NULL);
435
436	0	n = sarrayGetCount(sa);
437	0	if ((hmap = l_hmapCreate(0.51 * n, 2)) == NULL)
438	0	return (L_HASHMAP *)ERROR_PTR("hmap not made", __func__, NULL);
439	0	for (i = 0; i < n; i++) {
440	0	str = sarrayGetString(sa, i, L_NOCOPY);
441	0	l_hashStringToUint64Fast(str, &key);
442	0	l_hmapLookup(hmap, key, i, L_HMAP_CREATE);
443	0	}
444	0	return hmap;
445	0	}
446
447
448		/*!
449		* \brief sarrayRemoveDupsByHmap()
450		*
451		* \param[in] sas
452		* \param[out] psad hash set of unique values
453		* \param[out] phmap [optional] hashmap used for lookup
454		* \return 0 if OK; 1 on error
455		*/
456		l_ok
457		sarrayRemoveDupsByHmap(SARRAY *sas,
458		SARRAY **psad,
459		L_HASHMAP **phmap)
460	0	{
461	0	l_int32 i, tabsize;
462	0	char *str;
463	0	SARRAY *sad;
464	0	L_HASHITEM *hitem;
465	0	L_HASHMAP *hmap;
466
467	0	if (phmap) *phmap = NULL;
468	0	if (!psad)
469	0	return ERROR_INT("&sad not defined", __func__, 1);
470	0	*psad = NULL;
471	0	if (!sas)
472	0	return ERROR_INT("sas not defined", __func__, 1);
473
474		/* Traverse the hashtable lists */
475	0	if ((hmap = l_hmapCreateFromSarray(sas)) == NULL)
476	0	return ERROR_INT("hmap not made", __func__, 1);
477	0	sad = sarrayCreate(0);
478	0	*psad = sad;
479	0	tabsize = hmap->tabsize;
480	0	for (i = 0; i < tabsize; i++) {
481	0	hitem = hmap->hashtab[i];
482	0	while (hitem) {
483	0	str = sarrayGetString(sas, hitem->val, L_COPY);
484	0	sarrayAddString(sad, str, L_INSERT);
485	0	hitem = hitem->next;
486	0	}
487	0	}
488
489	0	if (phmap)
490	0	*phmap = hmap;
491	0	else
492	0	l_hmapDestroy(&hmap);
493	0	return 0;
494	0	}
495
496
497		/*!
498		* \brief sarrayUnionByHmap()
499		*
500		* \param[in] sa1
501		* \param[in] sa2
502		* \param[out] *psad union of the array values
503		* \return 0 if OK; 1 on error
504		*/
505		l_ok
506		sarrayUnionByHmap(SARRAY *sa1,
507		SARRAY *sa2,
508		SARRAY **psad)
509	0	{
510	0	SARRAY *sa3;
511
512	0	if (!psad)
513	0	return ERROR_INT("&sad not defined", __func__, 1);
514	0	*psad = NULL;
515	0	if (!sa1)
516	0	return ERROR_INT("sa1 not defined", __func__, 1);
517	0	if (!sa2)
518	0	return ERROR_INT("sa2 not defined", __func__, 1);
519
520	0	sa3 = sarrayCopy(sa1);
521	0	if (sarrayJoin(sa3, sa2) == 1) {
522	0	sarrayDestroy(&sa3);
523	0	return ERROR_INT("sa3 join failed", __func__, 1);
524	0	}
525	0	sarrayRemoveDupsByHmap(sa3, psad, NULL);
526	0	sarrayDestroy(&sa3);
527	0	return 0;
528	0	}
529
530
531		/*!
532		* \brief sarrayIntersectionByHmap()
533		*
534		* \param[in] sa1
535		* \param[in] sa2
536		* \param[out] *psad intersection of the array values
537		* \return 0 if OK; 1 on error
538		*/
539		l_ok
540		sarrayIntersectionByHmap(SARRAY *sa1,
541		SARRAY *sa2,
542		SARRAY **psad)
543	0	{
544	0	l_int32 i, n1, n2, n;
545	0	l_uint64 key;
546	0	char *str;
547	0	SARRAY sa_small, sa_big, sa3, sad;
548	0	L_HASHITEM *hitem;
549	0	L_HASHMAP *hmap;
550
551	0	if (!psad)
552	0	return ERROR_INT("&sad not defined", __func__, 1);
553	0	*psad = NULL;
554	0	if (!sa1)
555	0	return ERROR_INT("sa1 not defined", __func__, 1);
556	0	if (!sa2)
557	0	return ERROR_INT("sa2 not defined", __func__, 1);
558
559		/* Make a hashmap for the elements of the biggest array */
560	0	n1 = sarrayGetCount(sa1);
561	0	n2 = sarrayGetCount(sa2);
562	0	sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */
563	0	sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */
564	0	if ((hmap = l_hmapCreateFromSarray(sa_big)) == NULL)
565	0	return ERROR_INT("hmap not made", __func__, 1);
566
567		/* Remove duplicates from the smallest array. Alternatively,
568		* we can skip this step and avoid counting duplicates in
569		* sa_small by modifying the count fields in the sa_big hashitems;
570		* e.g., see l_hmapIntersectionDna(). */
571	0	sarrayRemoveDupsByHmap(sa_small, &sa3, NULL);
572
573		/* Go through sa3, the set of strings derived from the smallest array,
574		* hashing into the big array table. Any string found belongs to both,
575		* so add it to the output array. */
576	0	sad = sarrayCreate(0);
577	0	*psad = sad;
578	0	n = sarrayGetCount(sa3);
579	0	for (i = 0; i < n; i++) {
580	0	str = sarrayGetString(sa3, i, L_NOCOPY);
581	0	l_hashStringToUint64Fast(str, &key);
582	0	hitem = l_hmapLookup(hmap, key, i, L_HMAP_CHECK);
583	0	if (hitem)
584	0	sarrayAddString(sad, str, L_COPY);
585	0	}
586	0	l_hmapDestroy(&hmap);
587	0	sarrayDestroy(&sa3);
588	0	return 0;
589	0	}
590
591
592		/----------------------------------------------------------------------
593		* Miscellaneous operations *
594		----------------------------------------------------------------------/
595		/*!
596		* \brief sarrayGenerateIntegers()
597		*
598		* \param[in] n
599		* \return sa of printed numbers, 1 - n, or NULL on error
600		*/
601		SARRAY *
602		sarrayGenerateIntegers(l_int32 n)
603	0	{
604	0	char buf[32];
605	0	l_int32 i;
606	0	SARRAY *sa;
607
608	0	if ((sa = sarrayCreate(n)) == NULL)
609	0	return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
610	0	for (i = 0; i < n; i++) {
611	0	snprintf(buf, sizeof(buf), "%d", i);
612	0	sarrayAddString(sa, buf, L_COPY);
613	0	}
614	0	return sa;
615	0	}
616
617
618		/*!
619		* \brief sarrayLookupCSKV()
620		*
621		* \param[in] sa of strings, each being a comma-separated pair
622		* of strings, the first being a key and the
623		* second a value
624		* \param[in] keystring an input string to match with each key in %sa
625		* \param[out] pvalstring the returned value string corresponding to the
626		* input key string, if found; otherwise NULL
627		* \return 0 if OK, 1 on error
628		*
629		* <pre>
630		* Notes:
631		* (1) The input %sa can have other strings that are not in
632		* comma-separated key-value format. These will be ignored.
633		* (2) This returns a copy of the first value string in %sa whose
634		* key string matches the input %keystring.
635		* (3) White space is not ignored; all white space before the ','
636		* is used for the keystring in matching. This allows the
637		* key and val strings to have white space (e.g., multiple words).
638		* </pre>
639		*/
640		l_ok
641		sarrayLookupCSKV(SARRAY *sa,
642		const char *keystring,
643		char **pvalstring)
644	0	{
645	0	char key, val, *str;
646	0	l_int32 i, n;
647	0	SARRAY *sa1;
648
649	0	if (!pvalstring)
650	0	return ERROR_INT("&valstring not defined", __func__, 1);
651	0	*pvalstring = NULL;
652	0	if (!sa)
653	0	return ERROR_INT("sa not defined", __func__, 1);
654	0	if (!keystring)
655	0	return ERROR_INT("keystring not defined", __func__, 1);
656
657	0	n = sarrayGetCount(sa);
658	0	for (i = 0; i < n; i++) {
659	0	str = sarrayGetString(sa, i, L_NOCOPY);
660	0	sa1 = sarrayCreate(2);
661	0	sarraySplitString(sa1, str, ",");
662	0	if (sarrayGetCount(sa1) != 2) {
663	0	sarrayDestroy(&sa1);
664	0	continue;
665	0	}
666	0	key = sarrayGetString(sa1, 0, L_NOCOPY);
667	0	val = sarrayGetString(sa1, 1, L_NOCOPY);
668	0	if (!strcmp(key, keystring)) {
669	0	*pvalstring = stringNew(val);
670	0	sarrayDestroy(&sa1);
671	0	return 0;
672	0	}
673	0	sarrayDestroy(&sa1);
674	0	}
675
676	0	return 0;
677	0	}