twain3.0/3rdparty/hgOCR/leptonica/partition.c

660 lines
24 KiB
C

/*====================================================================*
- Copyright (C) 2001 Leptonica. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*====================================================================*/
/*!
* \file partition.c
* <pre>
*
* Whitespace block extraction
* BOXA *boxaGetWhiteblocks()
*
* Helpers
* static PARTEL *partelCreate()
* static void partelDestroy()
* static l_int32 partelSetSize()
* static BOXA *boxaGenerateSubboxes()
* static BOX *boxaSelectPivotBox()
* static l_int32 boxaCheckIfOverlapIsSmall()
* BOXA *boxaPruneSortedOnOverlap()
* </pre>
*/
#include "allheaders.h"
/*! Partition element */
struct PartitionElement {
l_float32 size; /* sorting key */
BOX *box; /* region of the element */
BOXA *boxa; /* set of intersecting boxes */
};
typedef struct PartitionElement PARTEL;
static PARTEL * partelCreate(BOX *box);
static void partelDestroy(PARTEL **ppartel);
static l_int32 partelSetSize(PARTEL *partel, l_int32 sortflag);
static BOXA * boxaGenerateSubboxes(BOX *box, BOXA *boxa, l_int32 maxperim,
l_float32 fract);
static BOX * boxaSelectPivotBox(BOX *box, BOXA *boxa, l_int32 maxperim,
l_float32 fract);
static l_int32 boxCheckIfOverlapIsBig(BOX *box, BOXA *boxa,
l_float32 maxoverlap);
static const l_int32 DefaultMaxPops = 20000;
#ifndef NO_CONSOLE_IO
#define OUTPUT_HEAP_STATS 0
#endif /* ~NO_CONSOLE_IO */
/*------------------------------------------------------------------*
* Whitespace block extraction *
*------------------------------------------------------------------*/
/*!
* \brief boxaGetWhiteblocks()
*
* \param[in] boxas typ. a set of bounding boxes of fg components
* \param[in] box initial region; typically including all boxes
* in boxas; if null, it computes the region to
* include all boxes in boxas
* \param[in] sortflag L_SORT_BY_WIDTH, L_SORT_BY_HEIGHT,
* L_SORT_BY_MIN_DIMENSION, L_SORT_BY_MAX_DIMENSION,
* L_SORT_BY_PERIMETER, L_SORT_BY_AREA
* \param[in] maxboxes max number of output whitespace boxes; e.g., 100
* \param[in] maxoverlap maximum fractional overlap of a box by any
* of the larger boxes; e.g., 0.2
* \param[in] maxperim maximum half-perimeter, in pixels, for which
* pivot is selected by proximity to box centroid;
* e.g., 200
* \param[in] fract fraction of box diagonal that is an acceptable
* distance from the box centroid to select
* the pivot; e.g., 0.2
* \param[in] maxpops max number of pops from the heap; use 0 as default
* \return boxa of sorted whitespace boxes, or NULL on error
*
* <pre>
* Notes:
* (1) This uses the elegant Breuel algorithm, found in "Two
* Geometric Algorithms for Layout Analysis", 2002,
* url: "citeseer.ist.psu.edu/breuel02two.html".
* It starts with the bounding boxes (b.b.) of the connected
* components (c.c.) in a region, along with the rectangle
* representing that region. It repeatedly divides the
* rectangle into four maximal rectangles that exclude a
* pivot rectangle, sorting them in a priority queue
* according to one of the six sort flags. It returns a boxa
* of the "largest" set that have no intersection with boxes
* from the input boxas.
* (2) If box == NULL, the initial region is the minimal region
* that includes the origin and every box in boxas.
* (3) maxboxes is the maximum number of whitespace boxes that will
* be returned. The actual number will depend on the image
* and the values chosen for maxoverlap and maxpops. In many
* cases, the actual number will be 'maxboxes'.
* (4) maxoverlap allows pruning of whitespace boxes depending on
* the overlap. To avoid all pruning, use maxoverlap = 1.0.
* To select only boxes that have no overlap with each other
* (maximal pruning), choose maxoverlap = 0.0.
* Otherwise, no box can have more than the 'maxoverlap' fraction
* of its area overlapped by any larger (in the sense of the
* sortflag) box.
* (5) Choose maxperim (actually, maximum half-perimeter) to
* represent a c.c. that is small enough so that you don't care
* about the white space that could be inside of it. For all such
* c.c., the pivot for 'quadfurcation' of a rectangle is selected
* as having a reasonable proximity to the rectangle centroid.
* (6) Use fract in the range [0.0 ... 1.0]. Set fract = 0.0
* to choose the small box nearest the centroid as the pivot.
* If you choose fract > 0.0, it is suggested that you call
* boxaPermuteRandom() first, to permute the boxes (see usage below).
* This should reduce the search time for each of the pivot boxes.
* (7) Choose maxpops to be the maximum number of rectangles that
* are popped from the heap. This is an indirect way to limit the
* execution time. Use 0 for default (a fairly large number).
* At any time, you can expect the heap to contain about
* 2.5 times as many boxes as have been popped off.
* (8) The output result is a sorted set of overlapping
* boxes, constrained by 'maxboxes', 'maxoverlap' and 'maxpops'.
* (9) The main defect of the method is that it abstracts out the
* actual components, retaining only the b.b. for analysis.
* Consider a component with a large b.b. If this is chosen
* as a pivot, all white space inside is immediately taken
* out of consideration. Furthermore, even if it is never chosen
* as a pivot, as the partitioning continues, at no time will
* any of the whitespace inside this component be part of a
* rectangle with zero overlapping boxes. Thus, the interiors
* of all boxes are necessarily excluded from the union of
* the returned whitespace boxes.
* (10) It should be noted that the algorithm puts a large number
* of partels on the queue. Setting a limit of X partels to
* remove from the queue, one typically finds that there will be
* several times that number (say, 2X - 3X) left on the queue.
* For an efficient algorithm to find the largest white or
* or black rectangles, without permitting them to overlap,
* see pixFindLargeRectangles().
* (11) USAGE: One way to accommodate to this weakness is to remove such
* large b.b. before starting the computation. For example,
* if 'box' is an input image region containing 'boxa' b.b. of c.c.:
*
* // Faster pivot choosing
* boxaPermuteRandom(boxa, boxa);
*
* // Remove anything either large width or height
* boxat = boxaSelectBySize(boxa, maxwidth, maxheight,
* L_SELECT_IF_BOTH, L_SELECT_IF_LT,
* NULL);
*
* boxad = boxaGetWhiteblocks(boxat, box, type, maxboxes,
* maxoverlap, maxperim, fract,
* maxpops);
*
* The result will be rectangular regions of "white space" that
* extend into (and often through) the excluded components.
* (11) As a simple example, suppose you wish to find the columns on a page.
* First exclude large c.c. that may block the columns, and then call:
*
* boxad = boxaGetWhiteblocks(boxa, box, L_SORT_BY_HEIGHT,
* 20, 0.15, 200, 0.2, 2000);
*
* to get the 20 tallest boxes with no more than 0.15 overlap
* between a box and any of the taller ones, and avoiding the
* use of any c.c. with a b.b. half perimeter greater than 200
* as a pivot.
* </pre>
*/
BOXA *
boxaGetWhiteblocks(BOXA *boxas,
BOX *box,
l_int32 sortflag,
l_int32 maxboxes,
l_float32 maxoverlap,
l_int32 maxperim,
l_float32 fract,
l_int32 maxpops)
{
l_int32 i, w, h, n, nsub, npush, npop;
BOX *boxsub;
BOXA *boxa, *boxa4, *boxasub, *boxad;
PARTEL *partel;
L_HEAP *lh;
PROCNAME("boxaGetWhiteblocks");
if (!boxas)
return (BOXA *)ERROR_PTR("boxas not defined", procName, NULL);
if (sortflag != L_SORT_BY_WIDTH && sortflag != L_SORT_BY_HEIGHT &&
sortflag != L_SORT_BY_MIN_DIMENSION &&
sortflag != L_SORT_BY_MAX_DIMENSION &&
sortflag != L_SORT_BY_PERIMETER && sortflag != L_SORT_BY_AREA)
return (BOXA *)ERROR_PTR("invalid sort flag", procName, NULL);
if (maxboxes < 1) {
maxboxes = 1;
L_WARNING("setting maxboxes = 1\n", procName);
}
if (maxoverlap < 0.0 || maxoverlap > 1.0)
return (BOXA *)ERROR_PTR("invalid maxoverlap", procName, NULL);
if (maxpops == 0)
maxpops = DefaultMaxPops;
if (!box) {
boxaGetExtent(boxas, &w, &h, NULL);
box = boxCreate(0, 0, w, h);
}
/* Prime the heap */
lh = lheapCreate(20, L_SORT_DECREASING);
partel = partelCreate(box);
partel->boxa = boxaCopy(boxas, L_CLONE);
partelSetSize(partel, sortflag);
lheapAdd(lh, partel);
npush = 1;
npop = 0;
boxad = boxaCreate(0);
while (1) {
if ((partel = (PARTEL *)lheapRemove(lh)) == NULL) /* we're done */
break;
npop++; /* How many boxes have we retrieved from the queue? */
if (npop > maxpops) {
partelDestroy(&partel);
break;
}
/* Extract the contents */
boxa = boxaCopy(partel->boxa, L_CLONE);
box = boxClone(partel->box);
partelDestroy(&partel);
/* Can we output this one? */
n = boxaGetCount(boxa);
if (n == 0) {
if (boxCheckIfOverlapIsBig(box, boxad, maxoverlap) == 0)
boxaAddBox(boxad, box, L_INSERT);
else
boxDestroy(&box);
boxaDestroy(&boxa);
if (boxaGetCount(boxad) >= maxboxes) /* we're done */
break;
continue;
}
/* Generate up to 4 subboxes and put them on the heap */
boxa4 = boxaGenerateSubboxes(box, boxa, maxperim, fract);
boxDestroy(&box);
nsub = boxaGetCount(boxa4);
for (i = 0; i < nsub; i++) {
boxsub = boxaGetBox(boxa4, i, L_CLONE);
boxasub = boxaIntersectsBox(boxa, boxsub);
partel = partelCreate(boxsub);
partel->boxa = boxasub;
partelSetSize(partel, sortflag);
lheapAdd(lh, partel);
boxDestroy(&boxsub);
}
npush += nsub; /* How many boxes have we put on the queue? */
/* boxaWriteStream(stderr, boxa4); */
boxaDestroy(&boxa4);
boxaDestroy(&boxa);
}
#if OUTPUT_HEAP_STATS
fprintf(stderr, "Heap statistics:\n");
fprintf(stderr, " Number of boxes pushed: %d\n", npush);
fprintf(stderr, " Number of boxes popped: %d\n", npop);
fprintf(stderr, " Number of boxes on heap: %d\n", lheapGetCount(lh));
#endif /* OUTPUT_HEAP_STATS */
/* Clean up the heap */
while ((partel = (PARTEL *)lheapRemove(lh)) != NULL)
partelDestroy(&partel);
lheapDestroy(&lh, FALSE);
return boxad;
}
/*------------------------------------------------------------------*
* Helpers *
*------------------------------------------------------------------*/
/*!
* \brief partelCreate()
*
* \param[in] box region; inserts a copy
* \return partel, or NULL on error
*/
static PARTEL *
partelCreate(BOX *box)
{
PARTEL *partel;
partel = (PARTEL *)LEPT_CALLOC(1, sizeof(PARTEL));
partel->box = boxCopy(box);
return partel;
}
/*!
* \brief partelDestroy()
*
* \param[in,out] ppartel contents will be set to null before returning
* \return void
*/
static void
partelDestroy(PARTEL **ppartel)
{
PARTEL *partel;
PROCNAME("partelDestroy");
if (ppartel == NULL) {
L_WARNING("ptr address is null!\n", procName);
return;
}
if ((partel = *ppartel) == NULL)
return;
boxDestroy(&partel->box);
boxaDestroy(&partel->boxa);
LEPT_FREE(partel);
*ppartel = NULL;
return;
}
/*!
* \brief partelSetSize()
*
* \param[in] partel
* \param[in] sortflag L_SORT_BY_WIDTH, L_SORT_BY_HEIGHT,
* L_SORT_BY_MIN_DIMENSION, L_SORT_BY_MAX_DIMENSION,
* L_SORT_BY_PERIMETER, L_SORT_BY_AREA
* \return 0 if OK, 1 on error
*/
static l_int32
partelSetSize(PARTEL *partel,
l_int32 sortflag)
{
l_int32 w, h;
PROCNAME("partelSetSize");
if (!partel)
return ERROR_INT("partel not defined", procName, 1);
boxGetGeometry(partel->box, NULL, NULL, &w, &h);
if (sortflag == L_SORT_BY_WIDTH)
partel->size = (l_float32)w;
else if (sortflag == L_SORT_BY_HEIGHT)
partel->size = (l_float32)h;
else if (sortflag == L_SORT_BY_MIN_DIMENSION)
partel->size = (l_float32)L_MIN(w, h);
else if (sortflag == L_SORT_BY_MAX_DIMENSION)
partel->size = (l_float32)L_MAX(w, h);
else if (sortflag == L_SORT_BY_PERIMETER)
partel->size = (l_float32)(w + h);
else if (sortflag == L_SORT_BY_AREA)
partel->size = (l_float32)(w * h);
else
return ERROR_INT("invalid sortflag", procName, 1);
return 0;
}
/*!
* \brief boxaGenerateSubboxes()
*
* \param[in] box region to be split into up to four overlapping
* subregions
* \param[in] boxa boxes of rectangles intersecting the box
* \param[in] maxperim maximum half-perimeter for which pivot
* is selected by proximity to box centroid
* \param[in] fract fraction of box diagonal that is an acceptable
* distance from the box centroid to select the pivot
* \return boxa of four or less overlapping subrectangles of
* the box, or NULL on error
*/
static BOXA *
boxaGenerateSubboxes(BOX *box,
BOXA *boxa,
l_int32 maxperim,
l_float32 fract)
{
l_int32 x, y, w, h, xp, yp, wp, hp;
BOX *boxp; /* pivot box */
BOX *boxsub;
BOXA *boxa4;
PROCNAME("boxaGenerateSubboxes");
if (!box)
return (BOXA *)ERROR_PTR("box not defined", procName, NULL);
if (!boxa)
return (BOXA *)ERROR_PTR("boxa not defined", procName, NULL);
boxa4 = boxaCreate(4);
boxp = boxaSelectPivotBox(box, boxa, maxperim, fract);
boxGetGeometry(box, &x, &y, &w, &h);
boxGetGeometry(boxp, &xp, &yp, &wp, &hp);
boxDestroy(&boxp);
if (xp > x) { /* left sub-box */
boxsub = boxCreate(x, y, xp - x, h);
boxaAddBox(boxa4, boxsub, L_INSERT);
}
if (yp > y) { /* top sub-box */
boxsub = boxCreate(x, y, w, yp - y);
boxaAddBox(boxa4, boxsub, L_INSERT);
}
if (xp + wp < x + w) { /* right sub-box */
boxsub = boxCreate(xp + wp, y, x + w - xp - wp, h);
boxaAddBox(boxa4, boxsub, L_INSERT);
}
if (yp + hp < y + h) { /* bottom sub-box */
boxsub = boxCreate(x, yp + hp, w, y + h - yp - hp);
boxaAddBox(boxa4, boxsub, L_INSERT);
}
return boxa4;
}
/*!
* \brief boxaSelectPivotBox()
*
* \param[in] box containing box; to be split by the pivot box
* \param[in] boxa boxes of rectangles, from which 1 is to be chosen
* \param[in] maxperim maximum half-perimeter for which pivot
* is selected by proximity to box centroid
* \param[in] fract fraction of box diagonal that is an acceptable
* distance from the box centroid to select the pivot
* \return box pivot box for subdivision into 4 rectangles,
* or NULL on error
*
* <pre>
* Notes:
* (1) This is a tricky piece that wasn't discussed in the
* Breuel's 2002 paper.
* (2) Selects a box from boxa whose centroid is reasonably close to
* the centroid of the containing box (xc, yc) and whose
* half-perimeter does not exceed the maxperim value.
* (3) If there are no boxes in the boxa that are small enough,
* then it selects the smallest of the larger boxes,
* without reference to its location in the containing box.
* (4) If a small box has a centroid at a distance from the
* centroid of the containing box that is not more than
* the fraction 'fract' of the diagonal of the containing
* box, that box is chosen as the pivot, terminating the
* search for the nearest small box.
* (5) Use fract in the range [0.0 ... 1.0]. Set fract = 0.0
* to choose the small box nearest the centroid.
* (6) Choose maxperim to represent a connected component that is
* small enough so that you don't care about the white space
* that could be inside of it.
* </pre>
*/
static BOX *
boxaSelectPivotBox(BOX *box,
BOXA *boxa,
l_int32 maxperim,
l_float32 fract)
{
l_int32 i, n, bw, bh, w, h;
l_int32 smallfound, minindex, perim, minsize;
l_float32 delx, dely, mindist, threshdist, dist, x, y, cx, cy;
BOX *boxt;
PROCNAME("boxaSelectPivotBox");
if (!box)
return (BOX *)ERROR_PTR("box not defined", procName, NULL);
if (!boxa)
return (BOX *)ERROR_PTR("boxa not defined", procName, NULL);
n = boxaGetCount(boxa);
if (n == 0)
return (BOX *)ERROR_PTR("no boxes in boxa", procName, NULL);
if (fract < 0.0 || fract > 1.0) {
L_WARNING("fract out of bounds; using 0.0\n", procName);
fract = 0.0;
}
boxGetGeometry(box, NULL, NULL, &w, &h);
boxGetCenter(box, &x, &y);
threshdist = fract * (w * w + h * h);
mindist = 1000000000.;
minindex = 0;
smallfound = FALSE;
for (i = 0; i < n; i++) {
boxt = boxaGetBox(boxa, i, L_CLONE);
boxGetGeometry(boxt, NULL, NULL, &bw, &bh);
boxGetCenter(boxt, &cx, &cy);
boxDestroy(&boxt);
if (bw + bh > maxperim)
continue;
smallfound = TRUE;
delx = cx - x;
dely = cy - y;
dist = delx * delx + dely * dely;
if (dist <= threshdist)
return boxaGetBox(boxa, i, L_COPY);
if (dist < mindist) {
minindex = i;
mindist = dist;
}
}
/* If there are small boxes but none are within 'fract' of the
* centroid, return the nearest one. */
if (smallfound == TRUE)
return boxaGetBox(boxa, minindex, L_COPY);
/* No small boxes; return the smallest of the large boxes */
minsize = 1000000000;
minindex = 0;
for (i = 0; i < n; i++) {
boxaGetBoxGeometry(boxa, i, NULL, NULL, &bw, &bh);
perim = bw + bh;
if (perim < minsize) {
minsize = perim;
minindex = i;
}
}
return boxaGetBox(boxa, minindex, L_COPY);
}
/*!
* \brief boxCheckIfOverlapIsBig()
*
* \param[in] box to be tested
* \param[in] boxa of boxes already stored
* \param[in] maxoverlap maximum fractional overlap of the input box
* by any of the boxes in boxa
* \return 0 if box has small overlap with every box in boxa;
* 1 otherwise or on error
*/
static l_int32
boxCheckIfOverlapIsBig(BOX *box,
BOXA *boxa,
l_float32 maxoverlap)
{
l_int32 i, n, bigoverlap;
l_float32 fract;
BOX *boxt;
PROCNAME("boxCheckIfOverlapIsBig");
if (!box)
return ERROR_INT("box not defined", procName, 1);
if (!boxa)
return ERROR_INT("boxa not defined", procName, 1);
if (maxoverlap < 0.0 || maxoverlap > 1.0)
return ERROR_INT("invalid maxoverlap", procName, 1);
n = boxaGetCount(boxa);
if (n == 0 || maxoverlap == 1.0)
return 0;
bigoverlap = 0;
for (i = 0; i < n; i++) {
boxt = boxaGetBox(boxa, i, L_CLONE);
boxOverlapFraction(boxt, box, &fract);
boxDestroy(&boxt);
if (fract > maxoverlap) {
bigoverlap = 1;
break;
}
}
return bigoverlap;
}
/*!
* \brief boxaPruneSortedOnOverlap()
*
* \param[in] boxas sorted by size in decreasing order
* \param[in] maxoverlap maximum fractional overlap of a box by any
* of the larger boxes
* \return boxad pruned, or NULL on error
*
* <pre>
* Notes:
* (1) This selectively removes smaller boxes when they are overlapped
* by any larger box by more than the input 'maxoverlap' fraction.
* (2) To avoid all pruning, use maxoverlap = 1.0. To select only
* boxes that have no overlap with each other (maximal pruning),
* set maxoverlap = 0.0.
* (3) If there are no boxes in boxas, returns an empty boxa.
* </pre>
*/
BOXA *
boxaPruneSortedOnOverlap(BOXA *boxas,
l_float32 maxoverlap)
{
l_int32 i, j, n, remove;
l_float32 fract;
BOX *box1, *box2;
BOXA *boxad;
PROCNAME("boxaPruneSortedOnOverlap");
if (!boxas)
return (BOXA *)ERROR_PTR("boxas not defined", procName, NULL);
if (maxoverlap < 0.0 || maxoverlap > 1.0)
return (BOXA *)ERROR_PTR("invalid maxoverlap", procName, NULL);
n = boxaGetCount(boxas);
if (n == 0 || maxoverlap == 1.0)
return boxaCopy(boxas, L_COPY);
boxad = boxaCreate(0);
box2 = boxaGetBox(boxas, 0, L_COPY);
boxaAddBox(boxad, box2, L_INSERT);
for (j = 1; j < n; j++) { /* prune on j */
box2 = boxaGetBox(boxas, j, L_COPY);
remove = FALSE;
for (i = 0; i < j; i++) { /* test on i */
box1 = boxaGetBox(boxas, i, L_CLONE);
boxOverlapFraction(box1, box2, &fract);
boxDestroy(&box1);
if (fract > maxoverlap) {
remove = TRUE;
break;
}
}
if (remove == TRUE)
boxDestroy(&box2);
else
boxaAddBox(boxad, box2, L_INSERT);
}
return boxad;
}