twain3.0/3rdparty/hgOCR/leptonica/dnahash.c

590 lines
18 KiB
C

/*====================================================================*
- Copyright (C) 2001 Leptonica. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*====================================================================*/
/*!
* \file dnahash.c
* <pre>
*
* DnaHash creation, destruction
* L_DNAHASH *l_dnaHashCreate()
* void l_dnaHashDestroy()
*
* DnaHash: Accessors and modifiers *
* l_int32 l_dnaHashGetCount()
* l_int32 l_dnaHashGetTotalCount()
* L_DNA *l_dnaHashGetDna()
* void l_dnaHashAdd()
*
* DnaHash: Operations on Dna
* L_DNAHASH *l_dnaHashCreateFromDna()
* l_int32 l_dnaRemoveDupsByHash()
* l_int32 l_dnaMakeHistoByHash()
* L_DNA *l_dnaIntersectionByHash()
* l_int32 l_dnaFindValByHash()
*
* (1) The DnaHash is an array of Dna. It is useful for fast
* storage and lookup for sets and maps. If the set or map
* is on a Dna itself, the hash is a simple function that
* maps a double to a l_uint64; otherwise the function will
* map a string or a (x,y) point to a l_uint64. The result of
* the map is the "key", which is then used with the mod
* function to select which Dna array is to be used. The
* number of arrays in a DnaHash should be a prime number.
* If there are N items, we set up the DnaHash array to have
* approximately N/20 Dna, so the average size of these arrays
* will be about 20 when fully populated. The number 20 was
* found empirically to be in a broad maximum of efficiency.
* (2) Note that the word "hash" is overloaded. There are actually
* two hashing steps: the first hashes the object to a l_uint64,
* called the "key", and the second uses the mod function to
* "hash" the "key" to the index of a particular Dna in the
* DnaHash array.
* (3) Insertion and lookup time for DnaHash is O(1). Hash collisions
* are easily handled (we expect an average of 20 for each key),
* so we can use simple (fast) hash functions: we deal with
* collisions by storing an array for each hash key.
* This can be contrasted with using rbtree for sets and
* maps, where insertion and lookup are O(logN) and hash functions
* are slower because they must be good enough (i.e, random
* enough with arbitrary input) to avoid collisions.
* (4) Hash functions that map points, strings and floats to l_uint64
* are given in utils.c.
* (5) The use of the DnaHash (and RBTree) with strings and
* (x,y) points can be found in string2.c and ptafunc2.c, rsp.
* This file has similar hash set functions, using DnaHash on
* two input Dna, for removing duplicates and finding the
* intersection. It also uses DnaHash as a hash map to find
* a histogram of counts from an input Dna.
* (6) Comparisons in running time, between DnaHash and RBTree, for
* large sets of strings and points, are given in prog/hashtest.c.
* (7) This is a very simple implementation, that expects that you
* know approximately (i.e., within a factor of 2 or 3) how many
* items are to be stored when you initialize the DnaHash.
* (It would be nice to modify the l_dnaHashAdd() function
* to increase the number of bins when the average occupation
* exceeds 40 or so.)
* (8) Useful rule of thumb for hashing collisions:
* For a random hashing function (say, from strings to l_uint64),
* the probability of a collision increases as N^2 for N much
* less than 2^32. The quadratic behavior switches over to
* approaching 1.0 around 2^32, which is the square root of 2^64.
* So, for example, if you have 10^7 strings, the probability
* of a single collision using an l_uint64 key is on the order of
* (10^7/10^9)^2 ~ 10^-4.
* For a million strings you don't need to worry about collisons
* (~10-6 probability), and for most applications can use the
* RBTree (sorting) implementation with confidence.
* </pre>
*/
#include "allheaders.h"
/*--------------------------------------------------------------------------*
* Dna hash: Creation and destruction *
*--------------------------------------------------------------------------*/
/*!
* \brief l_dnaHashCreate()
*
* \param[in] nbuckets the number of buckets in the hash table,
* which should be prime.
* \param[in] initsize initial size of each allocated dna; 0 for default
* \return ptr to new dnahash, or NULL on error
*
* <pre>
* Notes:
* (1) Actual dna are created only as required by l_dnaHashAdd()
* </pre>
*/
L_DNAHASH *
l_dnaHashCreate(l_int32 nbuckets,
l_int32 initsize)
{
L_DNAHASH *dahash;
PROCNAME("l_dnaHashCreate");
if (nbuckets <= 0)
return (L_DNAHASH *)ERROR_PTR("negative hash size", procName, NULL);
dahash = (L_DNAHASH *)LEPT_CALLOC(1, sizeof(L_DNAHASH));
if ((dahash->dna = (L_DNA **)LEPT_CALLOC(nbuckets, sizeof(L_DNA *)))
== NULL) {
LEPT_FREE(dahash);
return (L_DNAHASH *)ERROR_PTR("dna ptr array not made", procName, NULL);
}
dahash->nbuckets = nbuckets;
dahash->initsize = initsize;
return dahash;
}
/*!
* \brief l_dnaHashDestroy()
*
* \param[in,out] pdahash will be set to null before returning
* \return void
*/
void
l_dnaHashDestroy(L_DNAHASH **pdahash)
{
L_DNAHASH *dahash;
l_int32 i;
PROCNAME("l_dnaHashDestroy");
if (pdahash == NULL) {
L_WARNING("ptr address is NULL!\n", procName);
return;
}
if ((dahash = *pdahash) == NULL)
return;
for (i = 0; i < dahash->nbuckets; i++)
l_dnaDestroy(&dahash->dna[i]);
LEPT_FREE(dahash->dna);
LEPT_FREE(dahash);
*pdahash = NULL;
}
/*--------------------------------------------------------------------------*
* Dna hash: Accessors and modifiers *
*--------------------------------------------------------------------------*/
/*!
* \brief l_dnaHashGetCount()
*
* \param[in] dahash
* \return nbuckets allocated, or 0 on error
*/
l_int32
l_dnaHashGetCount(L_DNAHASH *dahash)
{
PROCNAME("l_dnaHashGetCount");
if (!dahash)
return ERROR_INT("dahash not defined", procName, 0);
return dahash->nbuckets;
}
/*!
* \brief l_dnaHashGetTotalCount()
*
* \param[in] dahash
* \return n number of numbers in all dna, or 0 on error
*/
l_int32
l_dnaHashGetTotalCount(L_DNAHASH *dahash)
{
l_int32 i, n;
L_DNA *da;
PROCNAME("l_dnaHashGetTotalCount");
if (!dahash)
return ERROR_INT("dahash not defined", procName, 0);
for (i = 0, n = 0; i < dahash->nbuckets; i++) {
da = l_dnaHashGetDna(dahash, i, L_NOCOPY);
if (da)
n += l_dnaGetCount(da);
}
return n;
}
/*!
* \brief l_dnaHashGetDna()
*
* \param[in] dahash
* \param[in] key key to be hashed into a bucket number
* \param[in] copyflag L_NOCOPY, L_COPY, L_CLONE
* \return ptr to dna
*/
L_DNA *
l_dnaHashGetDna(L_DNAHASH *dahash,
l_uint64 key,
l_int32 copyflag)
{
l_int32 bucket;
L_DNA *da;
PROCNAME("l_dnaHashGetDna");
if (!dahash)
return (L_DNA *)ERROR_PTR("dahash not defined", procName, NULL);
bucket = key % dahash->nbuckets;
da = dahash->dna[bucket];
if (da) {
if (copyflag == L_NOCOPY)
return da;
else if (copyflag == L_COPY)
return l_dnaCopy(da);
else
return l_dnaClone(da);
}
else
return NULL;
}
/*!
* \brief l_dnaHashAdd()
*
* \param[in] dahash
* \param[in] key key to be hashed into a bucket number
* \param[in] value float value to be appended to the specific dna
* \return 0 if OK; 1 on error
*/
l_ok
l_dnaHashAdd(L_DNAHASH *dahash,
l_uint64 key,
l_float64 value)
{
l_int32 bucket;
L_DNA *da;
PROCNAME("l_dnaHashAdd");
if (!dahash)
return ERROR_INT("dahash not defined", procName, 1);
bucket = key % dahash->nbuckets;
da = dahash->dna[bucket];
if (!da) {
if ((da = l_dnaCreate(dahash->initsize)) == NULL)
return ERROR_INT("da not made", procName, 1);
dahash->dna[bucket] = da;
}
l_dnaAddNumber(da, value);
return 0;
}
/*--------------------------------------------------------------------------*
* DnaHash: Operations on Dna *
*--------------------------------------------------------------------------*/
/*!
* \brief l_dnaHashCreateFromDna()
*
* \param[in] da
* \return dahash if OK; 1 on error
*
* <pre>
* Notes:
* (1) The values stored in the %dahash are indices into %da;
* %dahash has no use without %da.
* </pre>
*/
L_DNAHASH *
l_dnaHashCreateFromDna(L_DNA *da)
{
l_int32 i, n;
l_uint32 nsize;
l_uint64 key;
l_float64 val;
L_DNAHASH *dahash;
PROCNAME("l_dnaHashCreateFromDna");
if (!da)
return (L_DNAHASH *)ERROR_PTR("da not defined", procName, NULL);
n = l_dnaGetCount(da);
findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
dahash = l_dnaHashCreate(nsize, 8);
for (i = 0; i < n; i++) {
l_dnaGetDValue(da, i, &val);
l_hashFloat64ToUint64(nsize, val, &key);
l_dnaHashAdd(dahash, key, (l_float64)i);
}
return dahash;
}
/*!
* \brief l_dnaRemoveDupsByHash()
*
* \param[in] das
* \param[out] pdad hash set
* \param[out] pdahash [optional] dnahash used for lookup
* \return 0 if OK; 1 on error
*
* <pre>
* Notes:
* (1) Generates a dna with unique values.
* (2) The dnahash is built up with dad to assure uniqueness.
* It can be used to find if an element is in the set:
* l_dnaFindValByHash(dad, dahash, val, &index)
* </pre>
*/
l_ok
l_dnaRemoveDupsByHash(L_DNA *das,
L_DNA **pdad,
L_DNAHASH **pdahash)
{
l_int32 i, n, index, items;
l_uint32 nsize;
l_uint64 key;
l_float64 val;
L_DNA *dad;
L_DNAHASH *dahash;
PROCNAME("l_dnaRemoveDupsByHash");
if (pdahash) *pdahash = NULL;
if (!pdad)
return ERROR_INT("&dad not defined", procName, 1);
*pdad = NULL;
if (!das)
return ERROR_INT("das not defined", procName, 1);
n = l_dnaGetCount(das);
findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
dahash = l_dnaHashCreate(nsize, 8);
dad = l_dnaCreate(n);
*pdad = dad;
for (i = 0, items = 0; i < n; i++) {
l_dnaGetDValue(das, i, &val);
l_dnaFindValByHash(dad, dahash, val, &index);
if (index < 0) { /* not found */
l_hashFloat64ToUint64(nsize, val, &key);
l_dnaHashAdd(dahash, key, (l_float64)items);
l_dnaAddNumber(dad, val);
items++;
}
}
if (pdahash)
*pdahash = dahash;
else
l_dnaHashDestroy(&dahash);
return 0;
}
/*!
* \brief l_dnaMakeHistoByHash()
*
* \param[in] das
* \param[out] pdahash hash map: val --> index
* \param[out] pdav array of values: index --> val
* \param[out] pdac histo array of counts: index --> count
* \return 0 if OK; 1 on error
*
* <pre>
* Notes:
* (1) Generates and returns a dna of occurrences (histogram),
* an aligned dna of values, and an associated hashmap.
* The hashmap takes %dav and a value, and points into the
* histogram in %dac.
* (2) The dna of values, %dav, is aligned with the histogram %dac,
* and is needed for fast lookup. It is a hash set, because
* the values are unique.
* (3) Lookup is simple:
* l_dnaFindValByHash(dav, dahash, val, &index);
* if (index >= 0)
* l_dnaGetIValue(dac, index, &icount);
* else
* icount = 0;
* </pre>
*/
l_ok
l_dnaMakeHistoByHash(L_DNA *das,
L_DNAHASH **pdahash,
L_DNA **pdav,
L_DNA **pdac)
{
l_int32 i, n, nitems, index, count;
l_uint32 nsize;
l_uint64 key;
l_float64 val;
L_DNA *dac, *dav;
L_DNAHASH *dahash;
PROCNAME("l_dnaMakeHistoByHash");
if (pdahash) *pdahash = NULL;
if (pdac) *pdac = NULL;
if (pdav) *pdav = NULL;
if (!pdahash || !pdac || !pdav)
return ERROR_INT("&dahash, &dac, &dav not all defined", procName, 1);
if (!das)
return ERROR_INT("das not defined", procName, 1);
if ((n = l_dnaGetCount(das)) == 0)
return ERROR_INT("no data in das", procName, 1);
findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
dahash = l_dnaHashCreate(nsize, 8);
dac = l_dnaCreate(n); /* histogram */
dav = l_dnaCreate(n); /* the values */
for (i = 0, nitems = 0; i < n; i++) {
l_dnaGetDValue(das, i, &val);
/* Is this value already stored in dav? */
l_dnaFindValByHash(dav, dahash, val, &index);
if (index >= 0) { /* found */
l_dnaGetIValue(dac, (l_float64)index, &count);
l_dnaSetValue(dac, (l_float64)index, count + 1);
} else { /* not found */
l_hashFloat64ToUint64(nsize, val, &key);
l_dnaHashAdd(dahash, key, (l_float64)nitems);
l_dnaAddNumber(dav, val);
l_dnaAddNumber(dac, 1);
nitems++;
}
}
*pdahash = dahash;
*pdac = dac;
*pdav = dav;
return 0;
}
/*!
* \brief l_dnaIntersectionByHash()
*
* \param[in] da1, da2
* \return dad intersection of the number arrays, or NULL on error
*
* <pre>
* Notes:
* (1) This uses the same method for building the intersection set
* as ptaIntersectionByHash() and sarrayIntersectionByHash().
* </pre>
*/
L_DNA *
l_dnaIntersectionByHash(L_DNA *da1,
L_DNA *da2)
{
l_int32 n1, n2, nsmall, nbuckets, i, index1, index2;
l_uint32 nsize2;
l_uint64 key;
l_float64 val;
L_DNAHASH *dahash1, *dahash2;
L_DNA *da_small, *da_big, *dad;
PROCNAME("l_dnaIntersectionByHash");
if (!da1)
return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
if (!da2)
return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);
/* Put the elements of the biggest array into a dnahash */
n1 = l_dnaGetCount(da1);
n2 = l_dnaGetCount(da2);
da_small = (n1 < n2) ? da1 : da2; /* do not destroy da_small */
da_big = (n1 < n2) ? da2 : da1; /* do not destroy da_big */
dahash1 = l_dnaHashCreateFromDna(da_big);
/* Build up the intersection of numbers. Add to %dad
* if the number is in da_big (using dahash1) but hasn't
* yet been seen in the traversal of da_small (using dahash2). */
dad = l_dnaCreate(0);
nsmall = l_dnaGetCount(da_small);
findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */
dahash2 = l_dnaHashCreate(nsize2, 0);
nbuckets = l_dnaHashGetCount(dahash2);
for (i = 0; i < nsmall; i++) {
l_dnaGetDValue(da_small, i, &val);
l_dnaFindValByHash(da_big, dahash1, val, &index1);
if (index1 >= 0) { /* found */
l_dnaFindValByHash(da_small, dahash2, val, &index2);
if (index2 == -1) { /* not found */
l_dnaAddNumber(dad, val);
l_hashFloat64ToUint64(nbuckets, val, &key);
l_dnaHashAdd(dahash2, key, (l_float64)i);
}
}
}
l_dnaHashDestroy(&dahash1);
l_dnaHashDestroy(&dahash2);
return dad;
}
/*!
* \brief l_dnaFindValByHash()
*
* \param[in] da
* \param[in] dahash containing indices into %da
* \param[in] val searching for this number in %da
* \param[out] pindex index into da if found; -1 otherwise
* \return 0 if OK; 1 on error
*
* <pre>
* Notes:
* (1) Algo: hash %val into a key; hash the key to get the dna
* in %dahash (that holds indices into %da); traverse
* the dna of indices looking for %val in %da.
* </pre>
*/
l_ok
l_dnaFindValByHash(L_DNA *da,
L_DNAHASH *dahash,
l_float64 val,
l_int32 *pindex)
{
l_int32 i, nbuckets, nvals, indexval;
l_float64 vali;
l_uint64 key;
L_DNA *da1;
PROCNAME("l_dnaFindValByHash");
if (!pindex)
return ERROR_INT("&index not defined", procName, 1);
*pindex = -1;
if (!da)
return ERROR_INT("da not defined", procName, 1);
if (!dahash)
return ERROR_INT("dahash not defined", procName, 1);
nbuckets = l_dnaHashGetCount(dahash);
l_hashFloat64ToUint64(nbuckets, val, &key);
da1 = l_dnaHashGetDna(dahash, key, L_NOCOPY);
if (!da1) return 0;
/* Run through da1, looking for this %val */
nvals = l_dnaGetCount(da1);
for (i = 0; i < nvals; i++) {
l_dnaGetIValue(da1, i, &indexval);
l_dnaGetDValue(da, indexval, &vali);
if (val == vali) {
*pindex = indexval;
return 0;
}
}
return 0;
}