twain3.0/3rdparty/hgOCR/leptonica/pageseg.c

2361 lines
86 KiB
C

/*====================================================================*
- Copyright (C) 2001 Leptonica. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*====================================================================*/
/*!
* \file pageseg.c
* <pre>
*
* Top level page segmentation
* l_int32 pixGetRegionsBinary()
*
* Halftone region extraction
* PIX *pixGenHalftoneMask() **Deprecated wrapper**
* PIX *pixGenerateHalftoneMask()
*
* Textline extraction
* PIX *pixGenTextlineMask()
*
* Textblock extraction
* PIX *pixGenTextblockMask()
*
* Location of page foreground
* PIX *pixFindPageForeground()
*
* Extraction of characters from image with only text
* l_int32 pixSplitIntoCharacters()
* BOXA *pixSplitComponentWithProfile()
*
* Extraction of lines of text
* PIXA *pixExtractTextlines()
* PIXA *pixExtractRawTextlines()
*
* How many text columns
* l_int32 pixCountTextColumns()
*
* Decision: text vs photo
* l_int32 pixDecideIfText()
* l_int32 pixFindThreshFgExtent()
*
* Decision: table vs text
* l_int32 pixDecideIfTable()
* Pix *pixPrepare1bpp()
*
* Estimate the grayscale background value
* l_int32 pixEstimateBackground()
*
* Largest white or black rectangles in an image
* l_int32 pixFindLargeRectangles()
* l_int32 pixFindLargestRectangle()
*
* Generate rectangle inside connected component
* BOX *pixFindRectangleInCC()
* </pre>
*/
#include "allheaders.h"
#include "math.h"
/* These functions are not intended to work on very low-res images */
static const l_int32 MinWidth = 100;
static const l_int32 MinHeight = 100;
/*------------------------------------------------------------------*
* Top level page segmentation *
*------------------------------------------------------------------*/
/*!
* \brief pixGetRegionsBinary()
*
* \param[in] pixs 1 bpp, assumed to be 300 to 400 ppi
* \param[out] ppixhm [optional] halftone mask
* \param[out] ppixtm [optional] textline mask
* \param[out] ppixtb [optional] textblock mask
* \param[in] pixadb input for collecting debug pix; use NULL to skip
* \return 0 if OK, 1 on error
*
* <pre>
* Notes:
* (1) It is best to deskew the image before segmenting.
* (2) Passing in %pixadb enables debug output.
* </pre>
*/
l_ok
pixGetRegionsBinary(PIX *pixs,
PIX **ppixhm,
PIX **ppixtm,
PIX **ppixtb,
PIXA *pixadb)
{
l_int32 w, h, htfound, tlfound;
PIX *pixr, *pix1, *pix2;
PIX *pixtext; /* text pixels only */
PIX *pixhm2; /* halftone mask; 2x reduction */
PIX *pixhm; /* halftone mask; */
PIX *pixtm2; /* textline mask; 2x reduction */
PIX *pixtm; /* textline mask */
PIX *pixvws; /* vertical white space mask */
PIX *pixtb2; /* textblock mask; 2x reduction */
PIX *pixtbf2; /* textblock mask; 2x reduction; small comps filtered */
PIX *pixtb; /* textblock mask */
PROCNAME("pixGetRegionsBinary");
if (ppixhm) *ppixhm = NULL;
if (ppixtm) *ppixtm = NULL;
if (ppixtb) *ppixtb = NULL;
if (!pixs || pixGetDepth(pixs) != 1)
return ERROR_INT("pixs undefined or not 1 bpp", procName, 1);
pixGetDimensions(pixs, &w, &h, NULL);
if (w < MinWidth || h < MinHeight) {
L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h);
return 1;
}
/* 2x reduce, to 150 -200 ppi */
pixr = pixReduceRankBinaryCascade(pixs, 1, 0, 0, 0);
if (pixadb) pixaAddPix(pixadb, pixr, L_COPY);
/* Get the halftone mask */
pixhm2 = pixGenerateHalftoneMask(pixr, &pixtext, &htfound, pixadb);
/* Get the textline mask from the text pixels */
pixtm2 = pixGenTextlineMask(pixtext, &pixvws, &tlfound, pixadb);
/* Get the textblock mask from the textline mask */
pixtb2 = pixGenTextblockMask(pixtm2, pixvws, pixadb);
pixDestroy(&pixr);
pixDestroy(&pixtext);
pixDestroy(&pixvws);
/* Remove small components from the mask, where a small
* component is defined as one with both width and height < 60 */
pixtbf2 = NULL;
if (pixtb2) {
pixtbf2 = pixSelectBySize(pixtb2, 60, 60, 4, L_SELECT_IF_EITHER,
L_SELECT_IF_GTE, NULL);
pixDestroy(&pixtb2);
if (pixadb) pixaAddPix(pixadb, pixtbf2, L_COPY);
}
/* Expand all masks to full resolution, and do filling or
* small dilations for better coverage. */
pixhm = pixExpandReplicate(pixhm2, 2);
pix1 = pixSeedfillBinary(NULL, pixhm, pixs, 8);
pixOr(pixhm, pixhm, pix1);
pixDestroy(&pixhm2);
pixDestroy(&pix1);
if (pixadb) pixaAddPix(pixadb, pixhm, L_COPY);
pix1 = pixExpandReplicate(pixtm2, 2);
pixtm = pixDilateBrick(NULL, pix1, 3, 3);
pixDestroy(&pixtm2);
pixDestroy(&pix1);
if (pixadb) pixaAddPix(pixadb, pixtm, L_COPY);
if (pixtbf2) {
pix1 = pixExpandReplicate(pixtbf2, 2);
pixtb = pixDilateBrick(NULL, pix1, 3, 3);
pixDestroy(&pixtbf2);
pixDestroy(&pix1);
if (pixadb) pixaAddPix(pixadb, pixtb, L_COPY);
} else {
pixtb = pixCreateTemplate(pixs); /* empty mask */
}
/* Debug: identify objects that are neither text nor halftone image */
if (pixadb) {
pix1 = pixSubtract(NULL, pixs, pixtm); /* remove text pixels */
pix2 = pixSubtract(NULL, pix1, pixhm); /* remove halftone pixels */
pixaAddPix(pixadb, pix2, L_INSERT);
pixDestroy(&pix1);
}
/* Debug: display textline components with random colors */
if (pixadb) {
l_int32 w, h;
BOXA *boxa;
PIXA *pixa;
boxa = pixConnComp(pixtm, &pixa, 8);
pixGetDimensions(pixtm, &w, &h, NULL);
pix1 = pixaDisplayRandomCmap(pixa, w, h);
pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255);
pixaAddPix(pixadb, pix1, L_INSERT);
pixaDestroy(&pixa);
boxaDestroy(&boxa);
}
/* Debug: identify the outlines of each textblock */
if (pixadb) {
PIXCMAP *cmap;
PTAA *ptaa;
ptaa = pixGetOuterBordersPtaa(pixtb);
lept_mkdir("lept/pageseg");
ptaaWriteDebug("/tmp/lept/pageseg/tb_outlines.ptaa", ptaa, 1);
pix1 = pixRenderRandomCmapPtaa(pixtb, ptaa, 1, 16, 1);
cmap = pixGetColormap(pix1);
pixcmapResetColor(cmap, 0, 130, 130, 130);
pixaAddPix(pixadb, pix1, L_INSERT);
ptaaDestroy(&ptaa);
}
/* Debug: get b.b. for all mask components */
if (pixadb) {
BOXA *bahm, *batm, *batb;
bahm = pixConnComp(pixhm, NULL, 4);
batm = pixConnComp(pixtm, NULL, 4);
batb = pixConnComp(pixtb, NULL, 4);
boxaWriteDebug("/tmp/lept/pageseg/htmask.boxa", bahm);
boxaWriteDebug("/tmp/lept/pageseg/textmask.boxa", batm);
boxaWriteDebug("/tmp/lept/pageseg/textblock.boxa", batb);
boxaDestroy(&bahm);
boxaDestroy(&batm);
boxaDestroy(&batb);
}
if (pixadb) {
pixaConvertToPdf(pixadb, 0, 1.0, 0, 0, "Debug page segmentation",
"/tmp/lept/pageseg/debug.pdf");
L_INFO("Writing debug pdf to /tmp/lept/pageseg/debug.pdf\n", procName);
}
if (ppixhm)
*ppixhm = pixhm;
else
pixDestroy(&pixhm);
if (ppixtm)
*ppixtm = pixtm;
else
pixDestroy(&pixtm);
if (ppixtb)
*ppixtb = pixtb;
else
pixDestroy(&pixtb);
return 0;
}
/*------------------------------------------------------------------*
* Halftone region extraction *
*------------------------------------------------------------------*/
/*!
* \brief pixGenHalftoneMask()
*
* <pre>
* Deprecated:
* This wrapper avoids an ABI change with tesseract 3.0.4.
* It should be removed when we no longer need to support 3.0.4.
* The debug parameter is ignored (assumed 0).
* </pre>
*/
PIX *
pixGenHalftoneMask(PIX *pixs,
PIX **ppixtext,
l_int32 *phtfound,
l_int32 debug)
{
return pixGenerateHalftoneMask(pixs, ppixtext, phtfound, NULL);
}
/*!
* \brief pixGenerateHalftoneMask()
*
* \param[in] pixs 1 bpp, assumed to be 150 to 200 ppi
* \param[out] ppixtext [optional] text part of pixs
* \param[out] phtfound [optional] 1 if the mask is not empty
* \param[in] pixadb input for collecting debug pix; use NULL to skip
* \return pixd halftone mask, or NULL on error
*
* <pre>
* Notes:
* (1) This is not intended to work on small thumbnails. The
* dimensions of pixs must be at least MinWidth x MinHeight.
* </pre>
*/
PIX *
pixGenerateHalftoneMask(PIX *pixs,
PIX **ppixtext,
l_int32 *phtfound,
PIXA *pixadb)
{
l_int32 w, h, empty;
PIX *pix1, *pix2, *pixhs, *pixhm, *pixd;
PROCNAME("pixGenerateHalftoneMask");
if (ppixtext) *ppixtext = NULL;
if (phtfound) *phtfound = 0;
if (!pixs || pixGetDepth(pixs) != 1)
return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
pixGetDimensions(pixs, &w, &h, NULL);
if (w < MinWidth || h < MinHeight) {
L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h);
return NULL;
}
/* Compute seed for halftone parts at 8x reduction */
pix1 = pixReduceRankBinaryCascade(pixs, 4, 4, 3, 0);
pix2 = pixOpenBrick(NULL, pix1, 5, 5);
pixhs = pixExpandReplicate(pix2, 8); /* back to 2x reduction */
pixDestroy(&pix1);
pixDestroy(&pix2);
if (pixadb) pixaAddPix(pixadb, pixhs, L_COPY);
/* Compute mask for connected regions */
pixhm = pixCloseSafeBrick(NULL, pixs, 4, 4);
if (pixadb) pixaAddPix(pixadb, pixhm, L_COPY);
/* Fill seed into mask to get halftone mask */
pixd = pixSeedfillBinary(NULL, pixhs, pixhm, 4);
#if 0
/* Moderate opening to remove thin lines, etc. */
pixOpenBrick(pixd, pixd, 10, 10);
#endif
/* Check if mask is empty */
pixZero(pixd, &empty);
if (phtfound && !empty)
*phtfound = 1;
/* Optionally, get all pixels that are not under the halftone mask */
if (ppixtext) {
if (empty)
*ppixtext = pixCopy(NULL, pixs);
else
*ppixtext = pixSubtract(NULL, pixs, pixd);
if (pixadb) pixaAddPix(pixadb, *ppixtext, L_COPY);
}
pixDestroy(&pixhs);
pixDestroy(&pixhm);
return pixd;
}
/*------------------------------------------------------------------*
* Textline extraction *
*------------------------------------------------------------------*/
/*!
* \brief pixGenTextlineMask()
*
* \param[in] pixs 1 bpp, assumed to be 150 to 200 ppi
* \param[out] ppixvws vertical whitespace mask
* \param[out] ptlfound [optional] 1 if the mask is not empty
* \param[in] pixadb input for collecting debug pix; use NULL to skip
* \return pixd textline mask, or NULL on error
*
* <pre>
* Notes:
* (1) The input pixs should be deskewed.
* (2) pixs should have no halftone pixels.
* (3) This is not intended to work on small thumbnails. The
* dimensions of pixs must be at least MinWidth x MinHeight.
* (4) Both the input image and the returned textline mask
* are at the same resolution.
* </pre>
*/
PIX *
pixGenTextlineMask(PIX *pixs,
PIX **ppixvws,
l_int32 *ptlfound,
PIXA *pixadb)
{
l_int32 w, h, empty;
PIX *pix1, *pix2, *pixvws, *pixd;
PROCNAME("pixGenTextlineMask");
if (ptlfound) *ptlfound = 0;
if (!ppixvws)
return (PIX *)ERROR_PTR("&pixvws not defined", procName, NULL);
*ppixvws = NULL;
if (!pixs || pixGetDepth(pixs) != 1)
return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
pixGetDimensions(pixs, &w, &h, NULL);
if (w < MinWidth || h < MinHeight) {
L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h);
return NULL;
}
/* First we need a vertical whitespace mask. Invert the image. */
pix1 = pixInvert(NULL, pixs);
/* The whitespace mask will break textlines where there
* is a large amount of white space below or above.
* This can be prevented by identifying regions of the
* inverted image that have large horizontal extent (bigger than
* the separation between columns) and significant
* vertical extent (bigger than the separation between
* textlines), and subtracting this from the bg. */
pix2 = pixMorphCompSequence(pix1, "o80.60", 0);
pixSubtract(pix1, pix1, pix2);
if (pixadb) pixaAddPix(pixadb, pix1, L_COPY);
pixDestroy(&pix2);
/* Identify vertical whitespace by opening the remaining bg.
* o5.1 removes thin vertical bg lines and o1.200 extracts
* long vertical bg lines. */
pixvws = pixMorphCompSequence(pix1, "o5.1 + o1.200", 0);
*ppixvws = pixvws;
if (pixadb) pixaAddPix(pixadb, pixvws, L_COPY);
pixDestroy(&pix1);
/* Three steps to getting text line mask:
* (1) close the characters and words in the textlines
* (2) open the vertical whitespace corridors back up
* (3) small opening to remove noise */
pix1 = pixMorphSequence(pixs, "c30.1", 0);
if (pixadb) pixaAddPix(pixadb, pix1, L_COPY);
pixd = pixSubtract(NULL, pix1, pixvws);
pixOpenBrick(pixd, pixd, 3, 3);
if (pixadb) pixaAddPix(pixadb, pixd, L_COPY);
pixDestroy(&pix1);
/* Check if text line mask is empty */
if (ptlfound) {
pixZero(pixd, &empty);
if (!empty)
*ptlfound = 1;
}
return pixd;
}
/*------------------------------------------------------------------*
* Textblock extraction *
*------------------------------------------------------------------*/
/*!
* \brief pixGenTextblockMask()
*
* \param[in] pixs 1 bpp, textline mask, assumed to be 150 to 200 ppi
* \param[in] pixvws vertical white space mask
* \param[in] pixadb input for collecting debug pix; use NULL to skip
* \return pixd textblock mask, or NULL if empty or on error
*
* <pre>
* Notes:
* (1) Both the input masks (textline and vertical white space) and
* the returned textblock mask are at the same resolution.
* (2) This is not intended to work on small thumbnails. The
* dimensions of pixs must be at least MinWidth x MinHeight.
* (3) The result is somewhat noisy, in that small "blocks" of
* text may be included. These can be removed by post-processing,
* using, e.g.,
* pixSelectBySize(pix, 60, 60, 4, L_SELECT_IF_EITHER,
* L_SELECT_IF_GTE, NULL);
* </pre>
*/
PIX *
pixGenTextblockMask(PIX *pixs,
PIX *pixvws,
PIXA *pixadb)
{
l_int32 w, h, empty;
PIX *pix1, *pix2, *pix3, *pixd;
PROCNAME("pixGenTextblockMask");
if (!pixs || pixGetDepth(pixs) != 1)
return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
pixGetDimensions(pixs, &w, &h, NULL);
if (w < MinWidth || h < MinHeight) {
L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h);
return NULL;
}
if (!pixvws)
return (PIX *)ERROR_PTR("pixvws not defined", procName, NULL);
/* Join pixels vertically to make a textblock mask */
pix1 = pixMorphSequence(pixs, "c1.10 + o4.1", 0);
pixZero(pix1, &empty);
if (empty) {
pixDestroy(&pix1);
L_INFO("no fg pixels in textblock mask\n", procName);
return NULL;
}
if (pixadb) pixaAddPix(pixadb, pix1, L_COPY);
/* Solidify the textblock mask and remove noise:
* (1) For each cc, close the blocks and dilate slightly
* to form a solid mask.
* (2) Small horizontal closing between components.
* (3) Open the white space between columns, again.
* (4) Remove small components. */
pix2 = pixMorphSequenceByComponent(pix1, "c30.30 + d3.3", 8, 0, 0, NULL);
pixCloseSafeBrick(pix2, pix2, 10, 1);
if (pixadb) pixaAddPix(pixadb, pix2, L_COPY);
pix3 = pixSubtract(NULL, pix2, pixvws);
if (pixadb) pixaAddPix(pixadb, pix3, L_COPY);
pixd = pixSelectBySize(pix3, 25, 5, 8, L_SELECT_IF_BOTH,
L_SELECT_IF_GTE, NULL);
if (pixadb) pixaAddPix(pixadb, pixd, L_COPY);
pixDestroy(&pix1);
pixDestroy(&pix2);
pixDestroy(&pix3);
return pixd;
}
/*------------------------------------------------------------------*
* Location of page foreground *
*------------------------------------------------------------------*/
/*!
* \brief pixFindPageForeground()
*
* \param[in] pixs full resolution (any type or depth
* \param[in] threshold for binarization; typically about 128
* \param[in] mindist min distance of text from border to allow
* cleaning near border; at 2x reduction, this
* should be larger than 50; typically about 70
* \param[in] erasedist when conditions are satisfied, erase anything
* within this distance of the edge;
* typically 20-30 at 2x reduction
* \param[in] showmorph debug: set to a negative integer to show steps
* in generating masks; this is typically used
* for debugging region extraction
* \param[in] pixac debug: allocate outside and pass this in to
* accumulate results of each call to this function,
* which can be displayed in a mosaic or a pdf.
* \return box region including foreground, with some pixel noise
* removed, or NULL if not found
*
* <pre>
* Notes:
* (1) This doesn't simply crop to the fg. It attempts to remove
* pixel noise and junk at the edge of the image before cropping.
* The input %threshold is used if pixs is not 1 bpp.
* (2) This is not intended to work on small thumbnails. The
* dimensions of pixs must be at least MinWidth x MinHeight.
* (3) Debug: set showmorph to display the intermediate image in
* the morphological operations on this page.
* (4) Debug: to get pdf output of results when called repeatedly,
* call with an existing pixac, which will add an image of this page,
* with the fg outlined. If no foreground is found, there is
* no output for this page image.
* </pre>
*/
BOX *
pixFindPageForeground(PIX *pixs,
l_int32 threshold,
l_int32 mindist,
l_int32 erasedist,
l_int32 showmorph,
PIXAC *pixac)
{
l_int32 flag, nbox, intersects;
l_int32 w, h, bx, by, bw, bh, left, right, top, bottom;
PIX *pixb, *pixb2, *pixseed, *pixsf, *pixm, *pix1, *pixg2;
BOX *box, *boxfg, *boxin, *boxd;
BOXA *ba1, *ba2;
PROCNAME("pixFindPageForeground");
if (!pixs)
return (BOX *)ERROR_PTR("pixs not defined", procName, NULL);
pixGetDimensions(pixs, &w, &h, NULL);
if (w < MinWidth || h < MinHeight) {
L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h);
return NULL;
}
/* Binarize, downscale by 0.5, remove the noise to generate a seed,
* and do a seedfill back from the seed into those 8-connected
* components of the binarized image for which there was at least
* one seed pixel. Also clear out any components that are within
* 10 pixels of the edge at 2x reduction. */
flag = (showmorph) ? 100 : 0;
pixb = pixConvertTo1(pixs, threshold);
pixb2 = pixScale(pixb, 0.5, 0.5);
pixseed = pixMorphSequence(pixb2, "o1.2 + c9.9 + o3.3", flag);
pix1 = pixMorphSequence(pixb2, "o50.1", 0);
pixOr(pixseed, pixseed, pix1);
pixDestroy(&pix1);
pix1 = pixMorphSequence(pixb2, "o1.50", 0);
pixOr(pixseed, pixseed, pix1);
pixDestroy(&pix1);
pixsf = pixSeedfillBinary(NULL, pixseed, pixb2, 8);
pixSetOrClearBorder(pixsf, 10, 10, 10, 10, PIX_SET);
pixm = pixRemoveBorderConnComps(pixsf, 8);
/* Now, where is the main block of text? We want to remove noise near
* the edge of the image, but to do that, we have to be convinced that
* (1) there is noise and (2) it is far enough from the text block
* and close enough to the edge. For each edge, if the block
* is more than mindist from that edge, then clean 'erasedist'
* pixels from the edge. */
pix1 = pixMorphSequence(pixm, "c50.50", flag);
ba1 = pixConnComp(pix1, NULL, 8);
ba2 = boxaSort(ba1, L_SORT_BY_AREA, L_SORT_DECREASING, NULL);
pixGetDimensions(pix1, &w, &h, NULL);
nbox = boxaGetCount(ba2);
if (nbox > 1) {
box = boxaGetBox(ba2, 0, L_CLONE);
boxGetGeometry(box, &bx, &by, &bw, &bh);
left = (bx > mindist) ? erasedist : 0;
right = (w - bx - bw > mindist) ? erasedist : 0;
top = (by > mindist) ? erasedist : 0;
bottom = (h - by - bh > mindist) ? erasedist : 0;
pixSetOrClearBorder(pixm, left, right, top, bottom, PIX_CLR);
boxDestroy(&box);
}
pixDestroy(&pix1);
boxaDestroy(&ba1);
boxaDestroy(&ba2);
/* Locate the foreground region; don't bother cropping */
pixClipToForeground(pixm, NULL, &boxfg);
/* Sanity check the fg region. Make sure it's not confined
* to a thin boundary on the left and right sides of the image,
* in which case it is likely to be noise. */
if (boxfg) {
boxin = boxCreate(0.1 * w, 0, 0.8 * w, h);
boxIntersects(boxfg, boxin, &intersects);
boxDestroy(&boxin);
if (!intersects) boxDestroy(&boxfg);
}
boxd = NULL;
if (boxfg) {
boxAdjustSides(boxfg, boxfg, -2, 2, -2, 2); /* tiny expansion */
boxd = boxTransform(boxfg, 0, 0, 2.0, 2.0);
/* Save the debug image showing the box for this page */
if (pixac) {
pixg2 = pixConvert1To4Cmap(pixb);
pixRenderBoxArb(pixg2, boxd, 3, 255, 0, 0);
pixacompAddPix(pixac, pixg2, IFF_DEFAULT);
pixDestroy(&pixg2);
}
}
pixDestroy(&pixb);
pixDestroy(&pixb2);
pixDestroy(&pixseed);
pixDestroy(&pixsf);
pixDestroy(&pixm);
boxDestroy(&boxfg);
return boxd;
}
/*------------------------------------------------------------------*
* Extraction of characters from image with only text *
*------------------------------------------------------------------*/
/*!
* \brief pixSplitIntoCharacters()
*
* \param[in] pixs 1 bpp, contains only deskewed text
* \param[in] minw min component width for initial filtering; typ. 4
* \param[in] minh min component height for initial filtering; typ. 4
* \param[out] pboxa [optional] character bounding boxes
* \param[out] ppixa [optional] character images
* \param[out] ppixdebug [optional] showing splittings
*
* \return 0 if OK, 1 on error
*
* <pre>
* Notes:
* (1) This is a simple function that attempts to find split points
* based on vertical pixel profiles.
* (2) It should be given an image that has an arbitrary number
* of text characters.
* (3) The returned pixa includes the boxes from which the
* (possibly split) components are extracted.
* </pre>
*/
l_ok
pixSplitIntoCharacters(PIX *pixs,
l_int32 minw,
l_int32 minh,
BOXA **pboxa,
PIXA **ppixa,
PIX **ppixdebug)
{
l_int32 ncomp, i, xoff, yoff;
BOXA *boxa1, *boxa2, *boxat1, *boxat2, *boxad;
BOXAA *baa;
PIX *pix, *pix1, *pix2, *pixdb;
PIXA *pixa1, *pixadb;
PROCNAME("pixSplitIntoCharacters");
if (pboxa) *pboxa = NULL;
if (ppixa) *ppixa = NULL;
if (ppixdebug) *ppixdebug = NULL;
if (!pixs || pixGetDepth(pixs) != 1)
return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
/* Remove the small stuff */
pix1 = pixSelectBySize(pixs, minw, minh, 8, L_SELECT_IF_BOTH,
L_SELECT_IF_GT, NULL);
/* Small vertical close for consolidation */
pix2 = pixMorphSequence(pix1, "c1.10", 0);
pixDestroy(&pix1);
/* Get the 8-connected components */
boxa1 = pixConnComp(pix2, &pixa1, 8);
pixDestroy(&pix2);
boxaDestroy(&boxa1);
/* Split the components if obvious */
ncomp = pixaGetCount(pixa1);
boxa2 = boxaCreate(ncomp);
pixadb = (ppixdebug) ? pixaCreate(ncomp) : NULL;
for (i = 0; i < ncomp; i++) {
pix = pixaGetPix(pixa1, i, L_CLONE);
if (ppixdebug) {
boxat1 = pixSplitComponentWithProfile(pix, 10, 7, &pixdb);
if (pixdb)
pixaAddPix(pixadb, pixdb, L_INSERT);
} else {
boxat1 = pixSplitComponentWithProfile(pix, 10, 7, NULL);
}
pixaGetBoxGeometry(pixa1, i, &xoff, &yoff, NULL, NULL);
boxat2 = boxaTransform(boxat1, xoff, yoff, 1.0, 1.0);
boxaJoin(boxa2, boxat2, 0, -1);
pixDestroy(&pix);
boxaDestroy(&boxat1);
boxaDestroy(&boxat2);
}
pixaDestroy(&pixa1);
/* Generate the debug image */
if (ppixdebug) {
if (pixaGetCount(pixadb) > 0) {
*ppixdebug = pixaDisplayTiledInRows(pixadb, 32, 1500,
1.0, 0, 20, 1);
}
pixaDestroy(&pixadb);
}
/* Do a 2D sort on the bounding boxes, and flatten the result to 1D */
baa = boxaSort2d(boxa2, NULL, 0, 0, 5);
boxad = boxaaFlattenToBoxa(baa, NULL, L_CLONE);
boxaaDestroy(&baa);
boxaDestroy(&boxa2);
/* Optionally extract the pieces from the input image */
if (ppixa)
*ppixa = pixClipRectangles(pixs, boxad);
if (pboxa)
*pboxa = boxad;
else
boxaDestroy(&boxad);
return 0;
}
/*!
* \brief pixSplitComponentWithProfile()
*
* \param[in] pixs 1 bpp, exactly one connected component
* \param[in] delta distance used in extrema finding in a numa; typ. 10
* \param[in] mindel minimum required difference between profile
* minimum and profile values +2 and -2 away; typ. 7
* \param[out] ppixdebug [optional] debug image of splitting
* \return boxa of c.c. after splitting, or NULL on error
*
* <pre>
* Notes:
* (1) This will split the most obvious cases of touching characters.
* The split points it is searching for are narrow and deep
* minimima in the vertical pixel projection profile, after a
* large vertical closing has been applied to the component.
* </pre>
*/
BOXA *
pixSplitComponentWithProfile(PIX *pixs,
l_int32 delta,
l_int32 mindel,
PIX **ppixdebug)
{
l_int32 w, h, n2, i, firstmin, xmin, xshift;
l_int32 nmin, nleft, nright, nsplit, isplit, ncomp;
l_int32 *array1, *array2;
BOX *box;
BOXA *boxad;
NUMA *na1, *na2, *nasplit;
PIX *pix1, *pixdb;
PROCNAME("pixSplitComponentsWithProfile");
if (ppixdebug) *ppixdebug = NULL;
if (!pixs || pixGetDepth(pixs) != 1)
return (BOXA *)ERROR_PTR("pixa undefined or not 1 bpp", procName, NULL);
pixGetDimensions(pixs, &w, &h, NULL);
/* Closing to consolidate characters vertically */
pix1 = pixCloseSafeBrick(NULL, pixs, 1, 100);
/* Get extrema of column projections */
boxad = boxaCreate(2);
na1 = pixCountPixelsByColumn(pix1); /* w elements */
pixDestroy(&pix1);
na2 = numaFindExtrema(na1, delta, NULL);
n2 = numaGetCount(na2);
if (n2 < 3) { /* no split possible */
box = boxCreate(0, 0, w, h);
boxaAddBox(boxad, box, L_INSERT);
numaDestroy(&na1);
numaDestroy(&na2);
return boxad;
}
/* Look for sufficiently deep and narrow minima.
* All minima of of interest must be surrounded by max on each
* side. firstmin is the index of first possible minimum. */
array1 = numaGetIArray(na1);
array2 = numaGetIArray(na2);
if (ppixdebug) numaWriteStream(stderr, na2);
firstmin = (array1[array2[0]] > array1[array2[1]]) ? 1 : 2;
nasplit = numaCreate(n2); /* will hold split locations */
for (i = firstmin; i < n2 - 1; i+= 2) {
xmin = array2[i];
nmin = array1[xmin];
if (xmin + 2 >= w) break; /* no more splits possible */
nleft = array1[xmin - 2];
nright = array1[xmin + 2];
if (ppixdebug) {
fprintf(stderr,
"Splitting: xmin = %d, w = %d; nl = %d, nmin = %d, nr = %d\n",
xmin, w, nleft, nmin, nright);
}
if (nleft - nmin >= mindel && nright - nmin >= mindel) /* split */
numaAddNumber(nasplit, xmin);
}
nsplit = numaGetCount(nasplit);
#if 0
if (ppixdebug && nsplit > 0) {
lept_mkdir("lept/split");
gplotSimple1(na1, GPLOT_PNG, "/tmp/lept/split/split", NULL);
}
#endif
numaDestroy(&na1);
numaDestroy(&na2);
LEPT_FREE(array1);
LEPT_FREE(array2);
if (nsplit == 0) { /* no splitting */
numaDestroy(&nasplit);
box = boxCreate(0, 0, w, h);
boxaAddBox(boxad, box, L_INSERT);
return boxad;
}
/* Use split points to generate b.b. after splitting */
for (i = 0, xshift = 0; i < nsplit; i++) {
numaGetIValue(nasplit, i, &isplit);
box = boxCreate(xshift, 0, isplit - xshift, h);
boxaAddBox(boxad, box, L_INSERT);
xshift = isplit + 1;
}
box = boxCreate(xshift, 0, w - xshift, h);
boxaAddBox(boxad, box, L_INSERT);
numaDestroy(&nasplit);
if (ppixdebug) {
pixdb = pixConvertTo32(pixs);
ncomp = boxaGetCount(boxad);
for (i = 0; i < ncomp; i++) {
box = boxaGetBox(boxad, i, L_CLONE);
pixRenderBoxBlend(pixdb, box, 1, 255, 0, 0, 0.5);
boxDestroy(&box);
}
*ppixdebug = pixdb;
}
return boxad;
}
/*------------------------------------------------------------------*
* Extraction of lines of text *
*------------------------------------------------------------------*/
/*!
* \brief pixExtractTextlines()
*
* \param[in] pixs any depth, assumed to have nearly horizontal text
* \param[in] maxw, maxh initial filtering: remove any components in pixs
* with components larger than maxw or maxh
* \param[in] minw, minh final filtering: remove extracted 'lines'
* with sizes smaller than minw or minh; use
* 0 for default.
* \param[in] adjw, adjh final adjustment of boxes representing each
* text line. If > 0, these increase the box
* size at each edge by this amount.
* \param[in] pixadb pixa for saving intermediate steps; NULL to omit
* \return pixa of textline images, including bounding boxes, or
* NULL on error
*
* <pre>
* Notes:
* (1) This function assumes that textline fragments have sufficient
* vertical separation and small enough skew so that a
* horizontal dilation sufficient to join words will not join
* textlines. It does not guarantee that horizontally adjacent
* textline fragments on the same line will be joined.
* (2) For images with multiple columns, it attempts to avoid joining
* textlines across the space between columns. If that is not
* a concern, you can also use pixExtractRawTextlines(),
* which will join them with alacrity.
* (3) This first removes components from pixs that are either
* wide (> %maxw) or tall (> %maxh).
* (4) A final filtering operation removes small components, such
* that width < %minw or height < %minh.
* (5) For reasonable accuracy, the resolution of pixs should be
* at least 100 ppi. For reasonable efficiency, the resolution
* should not exceed 600 ppi.
* (6) This can be used to determine if some region of a scanned
* image is horizontal text.
* (7) As an example, for a pix with resolution 300 ppi, a reasonable
* set of parameters is:
* pixExtractTextlines(pix, 150, 150, 36, 20, 5, 5, NULL);
* The defaults minw and minh for 300 ppi are about 36 and 20,
* so the same result is obtained with:
* pixExtractTextlines(pix, 150, 150, 0, 0, 5, 5, NULL);
* (8) The output pixa is composed of subimages, one for each textline,
* and the boxa in the pixa tells where in %pixs each textline goes.
* </pre>
*/
PIXA *
pixExtractTextlines(PIX *pixs,
l_int32 maxw,
l_int32 maxh,
l_int32 minw,
l_int32 minh,
l_int32 adjw,
l_int32 adjh,
PIXA *pixadb)
{
char buf[64];
l_int32 res, csize, empty;
BOXA *boxa1, *boxa2, *boxa3;
PIX *pix1, *pix2, *pix3;
PIXA *pixa1, *pixa2, *pixa3;
PROCNAME("pixExtractTextlines");
if (!pixs)
return (PIXA *)ERROR_PTR("pixs not defined", procName, NULL);
/* Binarize carefully, if necessary */
if (pixGetDepth(pixs) > 1) {
pix2 = pixConvertTo8(pixs, FALSE);
pix3 = pixCleanBackgroundToWhite(pix2, NULL, NULL, 1.0, 70, 190);
pix1 = pixThresholdToBinary(pix3, 150);
pixDestroy(&pix2);
pixDestroy(&pix3);
} else {
pix1 = pixClone(pixs);
}
pixZero(pix1, &empty);
if (empty) {
pixDestroy(&pix1);
L_INFO("no fg pixels in input image\n", procName);
return NULL;
}
if (pixadb) pixaAddPix(pixadb, pix1, L_COPY);
/* Remove any very tall or very wide connected components */
pix2 = pixSelectBySize(pix1, maxw, maxh, 8, L_SELECT_IF_BOTH,
L_SELECT_IF_LT, NULL);
if (pixadb) pixaAddPix(pixadb, pix2, L_COPY);
pixDestroy(&pix1);
/* Filter to solidify the text lines within the x-height region.
* The closing (csize) bridges gaps between words. The opening
* removes isolated bridges between textlines. */
if ((res = pixGetXRes(pixs)) == 0) {
L_INFO("Resolution is not set: setting to 300 ppi\n", procName);
res = 300;
}
csize = L_MIN(120., 60.0 * res / 300.0);
snprintf(buf, sizeof(buf), "c%d.1 + o%d.1", csize, csize / 3);
pix3 = pixMorphCompSequence(pix2, buf, 0);
if (pixadb) pixaAddPix(pixadb, pix3, L_COPY);
/* Extract the connected components. These should be dilated lines */
boxa1 = pixConnComp(pix3, &pixa1, 4);
if (pixadb) {
pix1 = pixaDisplayRandomCmap(pixa1, 0, 0);
pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255);
pixaAddPix(pixadb, pix1, L_INSERT);
}
/* Set minw, minh if default is requested */
minw = (minw != 0) ? minw : (l_int32)(0.12 * res);
minh = (minh != 0) ? minh : (l_int32)(0.07 * res);
/* Remove line components that are too small */
pixa2 = pixaSelectBySize(pixa1, minw, minh, L_SELECT_IF_BOTH,
L_SELECT_IF_GTE, NULL);
if (pixadb) {
pix1 = pixaDisplayRandomCmap(pixa2, 0, 0);
pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255);
pixaAddPix(pixadb, pix1, L_INSERT);
pix1 = pixConvertTo32(pix2);
pixRenderBoxaArb(pix1, pixa2->boxa, 2, 255, 0, 0);
pixaAddPix(pixadb, pix1, L_INSERT);
}
/* Selectively AND with the version before dilation, and save */
boxa2 = pixaGetBoxa(pixa2, L_CLONE);
boxa3 = boxaAdjustSides(boxa2, -adjw, adjw, -adjh, adjh);
pixa3 = pixClipRectangles(pix2, boxa3);
if (pixadb) {
pix1 = pixaDisplayRandomCmap(pixa3, 0, 0);
pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255);
pixaAddPix(pixadb, pix1, L_INSERT);
}
pixDestroy(&pix2);
pixDestroy(&pix3);
pixaDestroy(&pixa1);
pixaDestroy(&pixa2);
boxaDestroy(&boxa1);
boxaDestroy(&boxa2);
boxaDestroy(&boxa3);
return pixa3;
}
/*!
* \brief pixExtractRawTextlines()
*
* \param[in] pixs any depth, assumed to have nearly horizontal text
* \param[in] maxw, maxh initial filtering: remove any components in pixs
* with components larger than maxw or maxh;
* use 0 for default values.
* \param[in] adjw, adjh final adjustment of boxes representing each
* text line. If > 0, these increase the box
* size at each edge by this amount.
* \param[in] pixadb pixa for saving intermediate steps; NULL to omit
* \return pixa of textline images, including bounding boxes, or
* NULL on error
*
* <pre>
* Notes:
* (1) This function assumes that textlines have sufficient
* vertical separation and small enough skew so that a
* horizontal dilation sufficient to join words will not join
* textlines. It aggressively joins textlines across multiple
* columns, so if that is not desired, you must either (a) make
* sure that %pixs is a single column of text or (b) use instead
* pixExtractTextlines(), which is more conservative
* about joining text fragments that have vertical overlap.
* (2) This first removes components from pixs that are either
* very wide (> %maxw) or very tall (> %maxh).
* (3) For reasonable accuracy, the resolution of pixs should be
* at least 100 ppi. For reasonable efficiency, the resolution
* should not exceed 600 ppi.
* (4) This can be used to determine if some region of a scanned
* image is horizontal text.
* (5) As an example, for a pix with resolution 300 ppi, a reasonable
* set of parameters is:
* pixExtractRawTextlines(pix, 150, 150, 0, 0, NULL);
* (6) The output pixa is composed of subimages, one for each textline,
* and the boxa in the pixa tells where in %pixs each textline goes.
* </pre>
*/
PIXA *
pixExtractRawTextlines(PIX *pixs,
l_int32 maxw,
l_int32 maxh,
l_int32 adjw,
l_int32 adjh,
PIXA *pixadb)
{
char buf[64];
l_int32 res, csize, empty;
BOXA *boxa1, *boxa2, *boxa3;
BOXAA *baa1;
PIX *pix1, *pix2, *pix3;
PIXA *pixa1, *pixa2;
PROCNAME("pixExtractRawTextlines");
if (!pixs)
return (PIXA *)ERROR_PTR("pixs not defined", procName, NULL);
/* Set maxw, maxh if default is requested */
if ((res = pixGetXRes(pixs)) == 0) {
L_INFO("Resolution is not set: setting to 300 ppi\n", procName);
res = 300;
}
maxw = (maxw != 0) ? maxw : (l_int32)(0.5 * res);
maxh = (maxh != 0) ? maxh : (l_int32)(0.5 * res);
/* Binarize carefully, if necessary */
if (pixGetDepth(pixs) > 1) {
pix2 = pixConvertTo8(pixs, FALSE);
pix3 = pixCleanBackgroundToWhite(pix2, NULL, NULL, 1.0, 70, 190);
pix1 = pixThresholdToBinary(pix3, 150);
pixDestroy(&pix2);
pixDestroy(&pix3);
} else {
pix1 = pixClone(pixs);
}
pixZero(pix1, &empty);
if (empty) {
pixDestroy(&pix1);
L_INFO("no fg pixels in input image\n", procName);
return NULL;
}
if (pixadb) pixaAddPix(pixadb, pix1, L_COPY);
/* Remove any very tall or very wide connected components */
pix2 = pixSelectBySize(pix1, maxw, maxh, 8, L_SELECT_IF_BOTH,
L_SELECT_IF_LT, NULL);
if (pixadb) pixaAddPix(pixadb, pix2, L_COPY);
pixDestroy(&pix1);
/* Filter to solidify the text lines within the x-height region.
* The closing (csize) bridges gaps between words. */
csize = L_MIN(120., 60.0 * res / 300.0);
snprintf(buf, sizeof(buf), "c%d.1", csize);
pix3 = pixMorphCompSequence(pix2, buf, 0);
if (pixadb) pixaAddPix(pixadb, pix3, L_COPY);
/* Extract the connected components. These should be dilated lines */
boxa1 = pixConnComp(pix3, &pixa1, 4);
if (pixadb) {
pix1 = pixaDisplayRandomCmap(pixa1, 0, 0);
pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255);
pixaAddPix(pixadb, pix1, L_INSERT);
}
/* Do a 2-d sort, and generate a bounding box for each set of text
* line segments that is aligned horizontally (i.e., has vertical
* overlap) into a box representing a single text line. */
baa1 = boxaSort2d(boxa1, NULL, -1, -1, 5);
boxaaGetExtent(baa1, NULL, NULL, NULL, &boxa2);
if (pixadb) {
pix1 = pixConvertTo32(pix2);
pixRenderBoxaArb(pix1, boxa2, 2, 255, 0, 0);
pixaAddPix(pixadb, pix1, L_INSERT);
}
/* Optionally adjust the sides of each text line box, and then
* use the boxes to generate a pixa of the text lines. */
boxa3 = boxaAdjustSides(boxa2, -adjw, adjw, -adjh, adjh);
pixa2 = pixClipRectangles(pix2, boxa3);
if (pixadb) {
pix1 = pixaDisplayRandomCmap(pixa2, 0, 0);
pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255);
pixaAddPix(pixadb, pix1, L_INSERT);
}
pixDestroy(&pix2);
pixDestroy(&pix3);
pixaDestroy(&pixa1);
boxaDestroy(&boxa1);
boxaDestroy(&boxa2);
boxaDestroy(&boxa3);
boxaaDestroy(&baa1);
return pixa2;
}
/*------------------------------------------------------------------*
* How many text columns *
*------------------------------------------------------------------*/
/*!
* \brief pixCountTextColumns()
*
* \param[in] pixs 1 bpp
* \param[in] deltafract fraction of (max - min) to be used in the delta
* for extrema finding; typ 0.3
* \param[in] peakfract fraction of (max - min) to be used to threshold
* the peak value; typ. 0.5
* \param[in] clipfract fraction of image dimension removed on each side;
* typ. 0.1, which leaves w and h reduced by 0.8
* \param[out] pncols number of columns; -1 if not determined
* \param[in] pixadb [optional] pre-allocated, for showing
* intermediate computation; use null to skip
* \return 0 if OK, 1 on error
*
* <pre>
* Notes:
* (1) It is assumed that pixs has the correct resolution set.
* If the resolution is 0, we set to 300 and issue a warning.
* (2) If necessary, the image is scaled to between 37 and 75 ppi;
* most of the processing is done at this resolution.
* (3) If no text is found (essentially a blank page),
* this returns ncols = 0.
* (4) For debug output, input a pre-allocated pixa.
* </pre>
*/
l_ok
pixCountTextColumns(PIX *pixs,
l_float32 deltafract,
l_float32 peakfract,
l_float32 clipfract,
l_int32 *pncols,
PIXA *pixadb)
{
l_int32 w, h, res, i, n, npeak;
l_float32 scalefact, redfact, minval, maxval, val4, val5, fract;
BOX *box;
NUMA *na1, *na2, *na3, *na4, *na5;
PIX *pix1, *pix2, *pix3, *pix4, *pix5;
PROCNAME("pixCountTextColumns");
if (!pncols)
return ERROR_INT("&ncols not defined", procName, 1);
*pncols = -1; /* init */
if (!pixs || pixGetDepth(pixs) != 1)
return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
if (deltafract < 0.15 || deltafract > 0.75)
L_WARNING("deltafract not in [0.15 ... 0.75]\n", procName);
if (peakfract < 0.25 || peakfract > 0.9)
L_WARNING("peakfract not in [0.25 ... 0.9]\n", procName);
if (clipfract < 0.0 || clipfract >= 0.5)
return ERROR_INT("clipfract not in [0.0 ... 0.5)\n", procName, 1);
if (pixadb) pixaAddPix(pixadb, pixs, L_COPY);
/* Scale to between 37.5 and 75 ppi */
if ((res = pixGetXRes(pixs)) == 0) {
L_WARNING("resolution undefined; set to 300\n", procName);
pixSetResolution(pixs, 300, 300);
res = 300;
}
if (res < 37) {
L_WARNING("resolution %d very low\n", procName, res);
scalefact = 37.5 / res;
pix1 = pixScale(pixs, scalefact, scalefact);
} else {
redfact = (l_float32)res / 37.5;
if (redfact < 2.0)
pix1 = pixClone(pixs);
else if (redfact < 4.0)
pix1 = pixReduceRankBinaryCascade(pixs, 1, 0, 0, 0);
else if (redfact < 8.0)
pix1 = pixReduceRankBinaryCascade(pixs, 1, 2, 0, 0);
else if (redfact < 16.0)
pix1 = pixReduceRankBinaryCascade(pixs, 1, 2, 2, 0);
else
pix1 = pixReduceRankBinaryCascade(pixs, 1, 2, 2, 2);
}
if (pixadb) pixaAddPix(pixadb, pix1, L_COPY);
/* Crop inner 80% of image */
pixGetDimensions(pix1, &w, &h, NULL);
box = boxCreate(clipfract * w, clipfract * h,
(1.0 - 2 * clipfract) * w, (1.0 - 2 * clipfract) * h);
pix2 = pixClipRectangle(pix1, box, NULL);
pixGetDimensions(pix2, &w, &h, NULL);
boxDestroy(&box);
if (pixadb) pixaAddPix(pixadb, pix2, L_COPY);
/* Deskew */
pix3 = pixDeskew(pix2, 0);
if (pixadb) pixaAddPix(pixadb, pix3, L_COPY);
/* Close to increase column counts for text */
pix4 = pixCloseSafeBrick(NULL, pix3, 5, 21);
if (pixadb) pixaAddPix(pixadb, pix4, L_COPY);
pixInvert(pix4, pix4);
na1 = pixCountByColumn(pix4, NULL);
if (pixadb) {
gplotSimple1(na1, GPLOT_PNG, "/tmp/lept/plot", NULL);
pix5 = pixRead("/tmp/lept/plot.png");
pixaAddPix(pixadb, pix5, L_INSERT);
}
/* Analyze the column counts. na4 gives the locations of
* the extrema in normalized units (0.0 to 1.0) across the
* cropped image. na5 gives the magnitude of the
* extrema, normalized to the dynamic range. The peaks
* are values that are at least peakfract of (max - min). */
numaGetMax(na1, &maxval, NULL);
numaGetMin(na1, &minval, NULL);
fract = (l_float32)(maxval - minval) / h; /* is there much at all? */
if (fract < 0.05) {
L_INFO("very little content on page; 0 text columns\n", procName);
*pncols = 0;
} else {
na2 = numaFindExtrema(na1, deltafract * (maxval - minval), &na3);
na4 = numaTransform(na2, 0, 1.0 / w);
na5 = numaTransform(na3, -minval, 1.0 / (maxval - minval));
n = numaGetCount(na4);
for (i = 0, npeak = 0; i < n; i++) {
numaGetFValue(na4, i, &val4);
numaGetFValue(na5, i, &val5);
if (val4 > 0.3 && val4 < 0.7 && val5 >= peakfract) {
npeak++;
L_INFO("Peak(loc,val) = (%5.3f,%5.3f)\n", procName, val4, val5);
}
}
*pncols = npeak + 1;
numaDestroy(&na2);
numaDestroy(&na3);
numaDestroy(&na4);
numaDestroy(&na5);
}
pixDestroy(&pix1);
pixDestroy(&pix2);
pixDestroy(&pix3);
pixDestroy(&pix4);
numaDestroy(&na1);
return 0;
}
/*------------------------------------------------------------------*
* Decision text vs photo *
*------------------------------------------------------------------*/
/*!
* \brief pixDecideIfText()
*
* \param[in] pixs any depth
* \param[in] box [optional] if null, use entire pixs
* \param[out] pistext 1 if text; 0 if photo; -1 if not determined or empty
* \param[in] pixadb [optional] pre-allocated, for showing intermediate
* computation; use NULL to skip
* \return 0 if OK, 1 on error
*
* <pre>
* Notes:
* (1) It is assumed that pixs has the correct resolution set.
* If the resolution is 0, we set to 300 and issue a warning.
* (2) If necessary, the image is scaled to 300 ppi; most of the
* processing is done at this resolution.
* (3) Text is assumed to be in horizontal lines.
* (4) Because thin vertical lines are removed before filtering for
* text lines, this should identify tables as text.
* (5) If %box is null and pixs contains both text lines and line art,
* this function might return %istext == true.
* (6) If the input pixs is empty, or for some other reason the
* result can not be determined, return -1.
* (7) For debug output, input a pre-allocated pixa.
* </pre>
*/
l_ok
pixDecideIfText(PIX *pixs,
BOX *box,
l_int32 *pistext,
PIXA *pixadb)
{
l_int32 i, empty, maxw, w, h, n1, n2, n3, minlines, big_comp;
l_float32 ratio1, ratio2;
L_BMF *bmf;
BOXA *boxa1, *boxa2, *boxa3, *boxa4, *boxa5;
PIX *pix1, *pix2, *pix3, *pix4, *pix5, *pix6, *pix7;
PIXA *pixa1;
SEL *sel1;
PROCNAME("pixDecideIfText");
if (!pistext)
return ERROR_INT("&istext not defined", procName, 1);
*pistext = -1;
if (!pixs)
return ERROR_INT("pixs not defined", procName, 1);
/* Crop, convert to 1 bpp, 300 ppi */
if ((pix1 = pixPrepare1bpp(pixs, box, 0.1, 300)) == NULL)
return ERROR_INT("pix1 not made", procName, 1);
pixZero(pix1, &empty);
if (empty) {
pixDestroy(&pix1);
L_INFO("pix is empty\n", procName);
return 0;
}
w = pixGetWidth(pix1);
/* Identify and remove tall, thin vertical lines (as found in tables)
* that are up to 9 pixels wide. Make a hit-miss sel with an
* 81 pixel vertical set of hits and with 3 pairs of misses that
* are 10 pixels apart horizontally. It is necessary to use a
* hit-miss transform; if we only opened with a vertical line of
* hits, we would remove solid regions of pixels that are not
* text or vertical lines. */
pix2 = pixCreate(11, 81, 1);
for (i = 0; i < 81; i++)
pixSetPixel(pix2, 5, i, 1);
sel1 = selCreateFromPix(pix2, 40, 5, NULL);
selSetElement(sel1, 20, 0, SEL_MISS);
selSetElement(sel1, 20, 10, SEL_MISS);
selSetElement(sel1, 40, 0, SEL_MISS);
selSetElement(sel1, 40, 10, SEL_MISS);
selSetElement(sel1, 60, 0, SEL_MISS);
selSetElement(sel1, 60, 10, SEL_MISS);
pix3 = pixHMT(NULL, pix1, sel1);
pix4 = pixSeedfillBinaryRestricted(NULL, pix3, pix1, 8, 5, 1000);
pix5 = pixXor(NULL, pix1, pix4);
pixDestroy(&pix2);
selDestroy(&sel1);
/* Convert the text lines to separate long horizontal components */
pix6 = pixMorphCompSequence(pix5, "c30.1 + o15.1 + c60.1 + o2.2", 0);
/* Estimate the distance to the bottom of the significant region */
if (box) { /* use full height */
pixGetDimensions(pix6, NULL, &h, NULL);
} else { /* use height of region that has text lines */
pixFindThreshFgExtent(pix6, 400, NULL, &h);
}
if (pixadb) {
bmf = bmfCreate(NULL, 6);
pixaAddPixWithText(pixadb, pix1, 1, bmf, "threshold/crop to binary",
0x0000ff00, L_ADD_BELOW);
pixaAddPixWithText(pixadb, pix3, 2, bmf, "hit-miss for vertical line",
0x0000ff00, L_ADD_BELOW);
pixaAddPixWithText(pixadb, pix4, 2, bmf, "restricted seed-fill",
0x0000ff00, L_ADD_BELOW);
pixaAddPixWithText(pixadb, pix5, 2, bmf, "remove using xor",
0x0000ff00, L_ADD_BELOW);
pixaAddPixWithText(pixadb, pix6, 2, bmf, "make long horiz components",
0x0000ff00, L_ADD_BELOW);
}
/* Extract the connected components */
if (pixadb) {
boxa1 = pixConnComp(pix6, &pixa1, 8);
pix7 = pixaDisplayRandomCmap(pixa1, 0, 0);
pixcmapResetColor(pixGetColormap(pix7), 0, 255, 255, 255);
pixaAddPixWithText(pixadb, pix7, 2, bmf, "show connected components",
0x0000ff00, L_ADD_BELOW);
pixDestroy(&pix7);
pixaDestroy(&pixa1);
bmfDestroy(&bmf);
} else {
boxa1 = pixConnComp(pix6, NULL, 8);
}
/* Analyze the connected components. The following conditions
* at 300 ppi must be satisfied if the image is text:
* (1) There are no components that are wider than 400 pixels and
* taller than 175 pixels.
* (2) The second longest component is at least 60% of the
* (possibly cropped) image width. This catches images
* that don't have any significant content.
* (3) Of the components that are at least 40% of the length
* of the longest (n2), at least 80% of them must not exceed
* 60 pixels in height.
* (4) The number of those long, thin components (n3) must
* equal or exceed a minimum that scales linearly with the
* image height.
* Most images that are not text fail more than one of these
* conditions. */
boxa2 = boxaSort(boxa1, L_SORT_BY_WIDTH, L_SORT_DECREASING, NULL);
boxaGetBoxGeometry(boxa2, 1, NULL, NULL, &maxw, NULL); /* 2nd longest */
boxa3 = boxaSelectBySize(boxa1, 0.4 * maxw, 0, L_SELECT_WIDTH,
L_SELECT_IF_GTE, NULL);
boxa4 = boxaSelectBySize(boxa3, 0, 60, L_SELECT_HEIGHT,
L_SELECT_IF_LTE, NULL);
boxa5 = boxaSelectBySize(boxa1, 400, 175, L_SELECT_IF_BOTH,
L_SELECT_IF_GT, NULL);
big_comp = (boxaGetCount(boxa5) == 0) ? 0 : 1;
n1 = boxaGetCount(boxa1);
n2 = boxaGetCount(boxa3);
n3 = boxaGetCount(boxa4);
ratio1 = (l_float32)maxw / (l_float32)w;
ratio2 = (l_float32)n3 / (l_float32)n2;
minlines = L_MAX(2, h / 125);
if (big_comp || ratio1 < 0.6 || ratio2 < 0.8 || n3 < minlines)
*pistext = 0;
else
*pistext = 1;
if (pixadb) {
if (*pistext == 1) {
L_INFO("This is text: \n n1 = %d, n2 = %d, n3 = %d, "
"minlines = %d\n maxw = %d, ratio1 = %4.2f, h = %d, "
"big_comp = %d\n", procName, n1, n2, n3, minlines,
maxw, ratio1, h, big_comp);
} else {
L_INFO("This is not text: \n n1 = %d, n2 = %d, n3 = %d, "
"minlines = %d\n maxw = %d, ratio1 = %4.2f, h = %d, "
"big_comp = %d\n", procName, n1, n2, n3, minlines,
maxw, ratio1, h, big_comp);
}
}
boxaDestroy(&boxa1);
boxaDestroy(&boxa2);
boxaDestroy(&boxa3);
boxaDestroy(&boxa4);
boxaDestroy(&boxa5);
pixDestroy(&pix1);
pixDestroy(&pix3);
pixDestroy(&pix4);
pixDestroy(&pix5);
pixDestroy(&pix6);
return 0;
}
/*!
* \brief pixFindThreshFgExtent()
*
* \param[in] pixs 1 bpp
* \param[in] thresh threshold number of pixels in row
* \param[out] ptop [optional] location of top of region
* \param[out] pbot [optional] location of bottom of region
* \return 0 if OK, 1 on error
*/
l_ok
pixFindThreshFgExtent(PIX *pixs,
l_int32 thresh,
l_int32 *ptop,
l_int32 *pbot)
{
l_int32 i, n;
l_int32 *array;
NUMA *na;
PROCNAME("pixFindThreshFgExtent");
if (ptop) *ptop = 0;
if (pbot) *pbot = 0;
if (!ptop && !pbot)
return ERROR_INT("nothing to determine", procName, 1);
if (!pixs || pixGetDepth(pixs) != 1)
return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
na = pixCountPixelsByRow(pixs, NULL);
n = numaGetCount(na);
array = numaGetIArray(na);
if (ptop) {
for (i = 0; i < n; i++) {
if (array[i] >= thresh) {
*ptop = i;
break;
}
}
}
if (pbot) {
for (i = n - 1; i >= 0; i--) {
if (array[i] >= thresh) {
*pbot = i;
break;
}
}
}
LEPT_FREE(array);
numaDestroy(&na);
return 0;
}
/*------------------------------------------------------------------*
* Decision: table vs text *
*------------------------------------------------------------------*/
/*!
* \brief pixDecideIfTable()
*
* \param[in] pixs any depth, any resolution >= 75 ppi
* \param[in] box [optional] if null, use entire pixs
* \param[in] orient L_PORTRAIT_MODE, L_LANDSCAPE_MODE
* \param[out] pscore 0 - 4; -1 if not determined
* \param[in] pixadb [optional] pre-allocated, for showing intermediate
* computation; use NULL to skip
* \return 0 if OK, 1 on error
*
* <pre>
* Notes:
* (1) It is assumed that pixs has the correct resolution set.
* If the resolution is 0, we assume it is 300 ppi and issue a warning.
* (2) If %orient == L_LANDSCAPE_MODE, the image is rotated 90 degrees
* clockwise before being analyzed.
* (3) The interpretation of the returned score:
* -1 undetermined
* 0 no table
* 1 unlikely to have a table
* 2 likely to have a table
* 3 even more likely to have a table
* 4 extremely likely to have a table
* * Setting the condition for finding a table at score >= 2 works
* well, except for false positives on kanji and landscape text.
* * These false positives can be removed by setting the condition
* at score >= 3, but recall is lowered because it will not find
* tables without either horizontal or vertical lines.
* (4) Most of the processing takes place at 75 ppi.
* (5) Internally, three numbers are determined, for horizontal and
* vertical fg lines, and for vertical bg lines. From these,
* four tests are made to decide if there is a table occupying
* a significant part of the image.
* (6) Images have arbitrary content and would be likely to trigger
* this detector, so they are checked for first, and if found,
* return with a 0 (no table) score.
* (7) Musical scores (tablature) are likely to trigger the detector.
* (8) Tables of content with more than 2 columns are likely to
* trigger the detector.
* (9) For debug output, input a pre-allocated pixa.
* </pre>
*/
l_ok
pixDecideIfTable(PIX *pixs,
BOX *box,
l_int32 orient,
l_int32 *pscore,
PIXA *pixadb)
{
l_int32 empty, nhb, nvb, nvw, score, htfound;
PIX *pix1, *pix2, *pix3, *pix4, *pix5, *pix6, *pix7, *pix8, *pix9;
PROCNAME("pixDecideIfTable");
if (!pscore)
return ERROR_INT("&score not defined", procName, 1);
*pscore = -1;
if (!pixs)
return ERROR_INT("pixs not defined", procName, 1);
/* Check if there is an image region. First convert to 1 bpp
* at 175 ppi. If an image is found, assume there is no table. */
pix1 = pixPrepare1bpp(pixs, box, 0.1, 175);
pix2 = pixGenerateHalftoneMask(pix1, NULL, &htfound, NULL);
if (htfound && pixadb) pixaAddPix(pixadb, pix2, L_COPY);
pixDestroy(&pix1);
pixDestroy(&pix2);
if (htfound) {
*pscore = 0;
L_INFO("pix has an image region\n", procName);
return 0;
}
/* Crop, convert to 1 bpp, 75 ppi */
if ((pix1 = pixPrepare1bpp(pixs, box, 0.05, 75)) == NULL)
return ERROR_INT("pix1 not made", procName, 1);
pixZero(pix1, &empty);
if (empty) {
*pscore = 0;
pixDestroy(&pix1);
L_INFO("pix is empty\n", procName);
return 0;
}
/* The 2x2 dilation on 75 ppi makes these two approaches very similar:
* (1) pix1 = pixPrepare1bpp(..., 300); // 300 ppi resolution
* pix2 = pixReduceRankBinaryCascade(pix1, 1, 1, 0, 0);
* (2) pix1 = pixPrepare1bpp(..., 75); // 75 ppi resolution
* pix2 = pixDilateBrick(NULL, pix1, 2, 2);
* But (2) is more efficient if the input image to pixPrepare1bpp()
* is not at 300 ppi. */
pix2 = pixDilateBrick(NULL, pix1, 2, 2);
/* Deskew both horizontally and vertically; rotate by 90
* degrees if in landscape mode. */
pix3 = pixDeskewBoth(pix2, 1);
if (pixadb) {
pixaAddPix(pixadb, pix2, L_COPY);
pixaAddPix(pixadb, pix3, L_COPY);
}
if (orient == L_LANDSCAPE_MODE)
pix4 = pixRotate90(pix3, 1);
else
pix4 = pixClone(pix3);
pixDestroy(&pix1);
pixDestroy(&pix2);
pixDestroy(&pix3);
pix1 = pixClone(pix4);
pixDestroy(&pix4);
/* Look for horizontal and vertical lines */
pix2 = pixMorphSequence(pix1, "o100.1 + c1.4", 0);
pix3 = pixSeedfillBinary(NULL, pix2, pix1, 8);
pix4 = pixMorphSequence(pix1, "o1.100 + c4.1", 0);
pix5 = pixSeedfillBinary(NULL, pix4, pix1, 8);
pix6 = pixOr(NULL, pix3, pix5);
if (pixadb) {
pixaAddPix(pixadb, pix2, L_COPY);
pixaAddPix(pixadb, pix4, L_COPY);
pixaAddPix(pixadb, pix3, L_COPY);
pixaAddPix(pixadb, pix5, L_COPY);
pixaAddPix(pixadb, pix6, L_COPY);
}
pixCountConnComp(pix2, 8, &nhb); /* number of horizontal black lines */
pixCountConnComp(pix4, 8, &nvb); /* number of vertical black lines */
/* Remove the lines */
pixSubtract(pix1, pix1, pix6);
if (pixadb) pixaAddPix(pixadb, pix1, L_COPY);
/* Remove noise pixels */
pix7 = pixMorphSequence(pix1, "c4.1 + o8.1", 0);
if (pixadb) pixaAddPix(pixadb, pix7, L_COPY);
/* Look for vertical white space. Invert to convert white bg
* to fg. Use a single rank-1 2x reduction, which closes small
* fg holes, for the final processing at 37.5 ppi.
* The vertical opening is then about 3 inches on a 300 ppi image.
* We also remove vertical whitespace that is less than 5 pixels
* wide at this resolution (about 0.1 inches) */
pixInvert(pix7, pix7);
pix8 = pixMorphSequence(pix7, "r1 + o1.100", 0);
pix9 = pixSelectBySize(pix8, 5, 0, 8, L_SELECT_WIDTH,
L_SELECT_IF_GTE, NULL);
pixCountConnComp(pix9, 8, &nvw); /* number of vertical white lines */
if (pixadb) {
pixaAddPix(pixadb, pixScale(pix8, 2.0, 2.0), L_INSERT);
pixaAddPix(pixadb, pixScale(pix9, 2.0, 2.0), L_INSERT);
}
/* Require at least 2 of the following 4 conditions for a table.
* Some tables do not have black (fg) lines, and for those we
* require more than 6 long vertical whitespace (bg) lines. */
score = 0;
if (nhb > 1) score++;
if (nvb > 2) score++;
if (nvw > 3) score++;
if (nvw > 6) score++;
*pscore = score;
pixDestroy(&pix1);
pixDestroy(&pix2);
pixDestroy(&pix3);
pixDestroy(&pix4);
pixDestroy(&pix5);
pixDestroy(&pix6);
pixDestroy(&pix7);
pixDestroy(&pix8);
pixDestroy(&pix9);
return 0;
}
/*!
* \brief pixPrepare1bpp()
*
* \param[in] pixs any depth
* \param[in] box [optional] if null, use entire pixs
* \param[in] cropfract fraction to be removed from the boundary;
* use 0.0 to retain the entire image
* \param[in] outres desired resolution of output image; if the
* input image resolution is not set, assume
* 300 ppi; use 0 to skip scaling.
* \return pixd if OK, NULL on error
*
* <pre>
* Notes:
* (1) This handles some common pre-processing operations,
* where the page segmentation algorithm takes a 1 bpp image.
* </pre>
*/
PIX *
pixPrepare1bpp(PIX *pixs,
BOX *box,
l_float32 cropfract,
l_int32 outres)
{
l_int32 w, h, res;
l_float32 factor;
BOX *box1;
PIX *pix1, *pix2, *pix3, *pix4, *pix5;
PROCNAME("pixPrepare1bpp");
if (!pixs)
return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
/* Crop the image. If no box is given, use %cropfract to remove
* pixels near the image boundary; this helps avoid false
* negatives from noise that is often found there. */
if (box) {
pix1 = pixClipRectangle(pixs, box, NULL);
} else {
pixGetDimensions(pixs, &w, &h, NULL);
box1 = boxCreate((l_int32)(cropfract * w), (l_int32)(cropfract * h),
(l_int32)((1.0 - 2 * cropfract) * w),
(l_int32)((1.0 - 2 * cropfract) * h));
pix1 = pixClipRectangle(pixs, box1, NULL);
boxDestroy(&box1);
}
/* Convert to 1 bpp with adaptive background cleaning */
if (pixGetDepth(pixs) > 1) {
pix2 = pixConvertTo8(pix1, 0);
pix3 = pixCleanBackgroundToWhite(pix2, NULL, NULL, 1.0, 70, 160);
pixDestroy(&pix1);
pixDestroy(&pix2);
if (!pix3) {
L_INFO("pix cleaning failed\n", procName);
return NULL;
}
pix4 = pixThresholdToBinary(pix3, 200);
pixDestroy(&pix3);
} else {
pix4 = pixClone(pix1);
pixDestroy(&pix1);
}
/* Scale the image to the requested output resolution;
do not scale if %outres <= 0 */
if (outres <= 0)
return pix4;
if ((res = pixGetXRes(pixs)) == 0) {
L_WARNING("Resolution is not set: using 300 ppi\n", procName);
res = 300;
}
if (res != outres) {
factor = (l_float32)outres / (l_float32)res;
pix5 = pixScale(pix4, factor, factor);
} else {
pix5 = pixClone(pix4);
}
pixDestroy(&pix4);
return pix5;
}
/*------------------------------------------------------------------*
* Estimate the grayscale background value *
*------------------------------------------------------------------*/
/*!
* \brief pixEstimateBackground()
*
* \param[in] pixs 8 bpp, with or without colormap
* \param[in] darkthresh pixels below this value are never considered
* part of the background; typ. 70; use 0 to skip
* \param[in] edgecrop fraction of half-width on each side, and of
* half-height at top and bottom, that are cropped
* \param[out] pbg estimated background, or 0 on error
* \return 0 if OK, 1 on error
*
* <pre>
* Notes:
* (1) Caller should check that return bg value is > 0.
* </pre>
*/
l_ok
pixEstimateBackground(PIX *pixs,
l_int32 darkthresh,
l_float32 edgecrop,
l_int32 *pbg)
{
l_int32 w, h, sampling;
l_float32 fbg;
BOX *box;
PIX *pix1, *pix2, *pixm;
PROCNAME("pixEstimateBackground");
if (!pbg)
return ERROR_INT("&bg not defined", procName, 1);
*pbg = 0;
if (!pixs || pixGetDepth(pixs) != 8)
return ERROR_INT("pixs not defined or not 8 bpp", procName, 1);
if (darkthresh > 128)
L_WARNING("darkthresh unusually large\n", procName);
if (edgecrop < 0.0 || edgecrop >= 1.0)
return ERROR_INT("edgecrop not in [0.0 ... 1.0)", procName, 1);
pix1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
pixGetDimensions(pix1, &w, &h, NULL);
/* Optionally crop inner part of image */
if (edgecrop > 0.0) {
box = boxCreate(0.5 * edgecrop * w, 0.5 * edgecrop * h,
(1.0 - edgecrop) * w, (1.0 - edgecrop) * h);
pix2 = pixClipRectangle(pix1, box, NULL);
boxDestroy(&box);
} else {
pix2 = pixClone(pix1);
}
/* We will use no more than 50K samples */
sampling = L_MAX(1, (l_int32)sqrt((l_float64)(w * h) / 50000. + 0.5));
/* Optionally make a mask over all pixels lighter than %darkthresh */
pixm = NULL;
if (darkthresh > 0) {
pixm = pixThresholdToBinary(pix2, darkthresh);
pixInvert(pixm, pixm);
}
pixGetRankValueMasked(pix2, pixm, 0, 0, sampling, 0.5, &fbg, NULL);
*pbg = (l_int32)(fbg + 0.5);
pixDestroy(&pix1);
pixDestroy(&pix2);
pixDestroy(&pixm);
return 0;
}
/*---------------------------------------------------------------------*
* Largest white or black rectangles in an image *
*---------------------------------------------------------------------*/
/*!
* \brief pixFindLargeRectangles()
*
* \param[in] pixs 1 bpp
* \param[in] polarity 0 within background, 1 within foreground
* \param[in] nrect number of rectangles to be found
* \param[out] pboxa largest rectangles, sorted by decreasing area
* \param[in,out] ppixdb optional return output with rectangles drawn on it
* \return 0 if OK, 1 on error
*
* <pre>
* Notes:
* (1) This does a greedy search to find the largest rectangles,
* either black or white and without overlaps, in %pix.
* (2) See pixFindLargestRectangle(), which is called multiple
* times, for details. On each call, the largest rectangle
* found is painted, so that none of its pixels can be
* used later, before calling it again.
* (3) This function is surprisingly fast. Although
* pixFindLargestRectangle() runs at about 50 MPix/sec, when it
* is run multiple times by pixFindLargeRectangles(), it processes
* at 150 - 250 MPix/sec, and the time is approximately linear
* in %nrect. For example, for a 1 MPix image, searching for
* the largest 50 boxes takes about 0.2 seconds.
* </pre>
*/
l_ok
pixFindLargeRectangles(PIX *pixs,
l_int32 polarity,
l_int32 nrect,
BOXA **pboxa,
PIX **ppixdb)
{
l_int32 i, op, bx, by, bw, bh;
BOX *box;
BOXA *boxa;
PIX *pix;
PROCNAME("pixFindLargeRectangles");
if (ppixdb) *ppixdb = NULL;
if (!pboxa)
return ERROR_INT("&boxa not defined", procName, 1);
*pboxa = NULL;
if (!pixs || pixGetDepth(pixs) != 1)
return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
if (polarity != 0 && polarity != 1)
return ERROR_INT("invalid polarity", procName, 1);
if (nrect > 1000) {
L_WARNING("large num rectangles = %d requested; using 1000\n",
procName, nrect);
nrect = 1000;
}
pix = pixCopy(NULL, pixs);
boxa = boxaCreate(nrect);
*pboxa = boxa;
/* Sequentially find largest rectangle and fill with opposite color */
for (i = 0; i < nrect; i++) {
if (pixFindLargestRectangle(pix, polarity, &box, NULL) == 1) {
boxDestroy(&box);
L_ERROR("failure in pixFindLargestRectangle\n", procName);
break;
}
boxaAddBox(boxa, box, L_INSERT);
op = (polarity == 0) ? PIX_SET : PIX_CLR;
boxGetGeometry(box, &bx, &by, &bw, &bh);
pixRasterop(pix, bx, by, bw, bh, op, NULL, 0, 0);
}
if (ppixdb)
*ppixdb = pixDrawBoxaRandom(pixs, boxa, 3);
pixDestroy(&pix);
return 0;
}
/*!
* \brief pixFindLargestRectangle()
*
* \param[in] pixs 1 bpp
* \param[in] polarity 0 within background, 1 within foreground
* \param[out] pbox largest area rectangle
* \param[in,out] ppixdb optional return output with rectangle drawn on it
* \return 0 if OK, 1 on error
*
* <pre>
* Notes:
* (1) This is a simple and elegant solution to a problem in
* computational geometry that at first appears to be quite
* difficult: what is the largest rectangle that can be
* placed in the image, covering only pixels of one polarity
* (bg or fg)? The solution is O(n), where n is the number
* of pixels in the image, and it requires nothing more than
* using a simple recursion relation in a single sweep of the image.
* (2) In a sweep from UL to LR with left-to-right being the fast
* direction, calculate the largest white rectangle at (x, y),
* using previously calculated values at pixels #1 and #2:
* #1: (x, y - 1)
* #2: (x - 1, y)
* We also need the most recent "black" pixels that were seen
* in the current row and column.
* Consider the largest area. There are only two possibilities:
* (a) Min(w(1), horizdist) * (h(1) + 1)
* (b) Min(h(2), vertdist) * (w(2) + 1)
* where
* horizdist: the distance from the rightmost "black" pixel seen
* in the current row across to the current pixel
* vertdist: the distance from the lowest "black" pixel seen
* in the current column down to the current pixel
* and we choose the Max of (a) and (b).
* (3) To convince yourself that these recursion relations are correct,
* it helps to draw the maximum rectangles at #1 and #2.
* Then for #1, you try to extend the rectangle down one line,
* so that the height is h(1) + 1. Do you get the full
* width of #1, w(1)? It depends on where the black pixels are
* in the current row. You know the final width is bounded by w(1)
* and w(2) + 1, but the actual value depends on the distribution
* of black pixels in the current row that are at a distance
* from the current pixel that is between these limits.
* We call that value "horizdist", and the area is then given
* by the expression (a) above. Using similar reasoning for #2,
* where you attempt to extend the rectangle to the right
* by 1 pixel, you arrive at (b). The largest rectangle is
* then found by taking the Max.
* </pre>
*/
l_ok
pixFindLargestRectangle(PIX *pixs,
l_int32 polarity,
BOX **pbox,
PIX **ppixdb)
{
l_int32 i, j, w, h, d, wpls, val;
l_int32 wp, hp, w1, w2, h1, h2, wmin, hmin, area1, area2;
l_int32 xmax, ymax; /* LR corner of the largest rectangle */
l_int32 maxarea, wmax, hmax, vertdist, horizdist, prevfg;
l_int32 *lowestfg;
l_uint32 *datas, *lines;
l_uint32 **linew, **lineh;
BOX *box;
PIX *pixw, *pixh; /* keeps the width and height for the largest */
/* rectangles whose LR corner is located there. */
PROCNAME("pixFindLargestRectangle");
if (ppixdb) *ppixdb = NULL;
if (!pbox)
return ERROR_INT("&box not defined", procName, 1);
*pbox = NULL;
if (!pixs)
return ERROR_INT("pixs not defined", procName, 1);
pixGetDimensions(pixs, &w, &h, &d);
if (d != 1)
return ERROR_INT("pixs not 1 bpp", procName, 1);
if (polarity != 0 && polarity != 1)
return ERROR_INT("invalid polarity", procName, 1);
/* Initialize lowest "fg" seen so far for each column */
lowestfg = (l_int32 *)LEPT_CALLOC(w, sizeof(l_int32));
for (i = 0; i < w; i++)
lowestfg[i] = -1;
/* The combination (val ^ polarity) is the color for which we
* are searching for the maximum rectangle. For polarity == 0,
* we search in the bg (white). */
pixw = pixCreate(w, h, 32); /* stores width */
pixh = pixCreate(w, h, 32); /* stores height */
linew = (l_uint32 **)pixGetLinePtrs(pixw, NULL);
lineh = (l_uint32 **)pixGetLinePtrs(pixh, NULL);
datas = pixGetData(pixs);
wpls = pixGetWpl(pixs);
maxarea = xmax = ymax = wmax = hmax = 0;
for (i = 0; i < h; i++) {
lines = datas + i * wpls;
prevfg = -1;
for (j = 0; j < w; j++) {
val = GET_DATA_BIT(lines, j);
if ((val ^ polarity) == 0) { /* bg (0) if polarity == 0, etc. */
if (i == 0 && j == 0) {
wp = hp = 1;
} else if (i == 0) {
wp = linew[i][j - 1] + 1;
hp = 1;
} else if (j == 0) {
wp = 1;
hp = lineh[i - 1][j] + 1;
} else {
/* Expand #1 prev rectangle down */
w1 = linew[i - 1][j];
h1 = lineh[i - 1][j];
horizdist = j - prevfg;
wmin = L_MIN(w1, horizdist); /* width of new rectangle */
area1 = wmin * (h1 + 1);
/* Expand #2 prev rectangle to right */
w2 = linew[i][j - 1];
h2 = lineh[i][j - 1];
vertdist = i - lowestfg[j];
hmin = L_MIN(h2, vertdist); /* height of new rectangle */
area2 = hmin * (w2 + 1);
if (area1 > area2) {
wp = wmin;
hp = h1 + 1;
} else {
wp = w2 + 1;
hp = hmin;
}
}
} else { /* fg (1) if polarity == 0; bg (0) if polarity == 1 */
prevfg = j;
lowestfg[j] = i;
wp = hp = 0;
}
linew[i][j] = wp;
lineh[i][j] = hp;
if (wp * hp > maxarea) {
maxarea = wp * hp;
xmax = j;
ymax = i;
wmax = wp;
hmax = hp;
}
}
}
/* Translate from LR corner to Box coords (UL corner, w, h) */
box = boxCreate(xmax - wmax + 1, ymax - hmax + 1, wmax, hmax);
*pbox = box;
if (ppixdb) {
*ppixdb = pixConvertTo8(pixs, TRUE);
pixRenderHashBoxArb(*ppixdb, box, 6, 2, L_NEG_SLOPE_LINE, 1, 255, 0, 0);
}
LEPT_FREE(linew);
LEPT_FREE(lineh);
LEPT_FREE(lowestfg);
pixDestroy(&pixw);
pixDestroy(&pixh);
return 0;
}
/*---------------------------------------------------------------------*
* Generate rectangle inside connected component *
*---------------------------------------------------------------------*/
/*!
* \brief pixFindRectangleInCC()
*
* \param[in] pixs 1 bpp, with sufficient closings to make the fg be
* a single c.c. that is a convex hull
* \param[in] boxs [optional] if NULL, %pixs should be a minimum
* container of a single c.c.
* \param[in] fract first and all consecutive lines found must be at
* least this fraction of the fast scan dimension
* \param[in] dir L_SCAN_HORIZONTAL, L_SCAN_VERTICAL; direction of
* fast scan
* \param[in] select L_GEOMETRIC_UNION, L_GEOMETRIC_INTERSECTION,
* L_LARGEST_AREA, L_SMALEST_AREA
* \param[in] debug if 1, generates output pdf showing intermediate
* computation and final result
* \return box of included rectangle, or NULL on error
*
* <pre>
* Notes:
* (1) Computation is similar to pixFindLargestRectangle(), but allows
* a different set of results to choose from.
* (2) Select the fast scan direction. Then, scanning in the slow
* direction, finds the longest run of ON pixels in the fast
* scan direction and look for the first first run that is longer
* than %fract of the dimension. Continues until a shorter run
* is found. This generates a box of ON pixels fitting into the c.c.
* (3) Do this from both slow scan directions and use %select to get
* a resulting box from these two.
* (4) The extracted rectangle is not necessarily the largest that
* can fit in the c.c. To get that, use pixFindLargestRectangle().
*/
BOX *
pixFindRectangleInCC(PIX *pixs,
BOX *boxs,
l_float32 fract,
l_int32 dir,
l_int32 select,
l_int32 debug)
{
l_int32 x, y, i, j, w, h, w1, h1, w2, h2, found, res;
l_int32 xfirst, xlast, xstart, yfirst, ylast, length;
BOX *box1, *box2, *box3, *box4, *box5;
PIX *pix1, *pix2, *pixdb1, *pixdb2;
PIXA *pixadb;
PROCNAME("pixFindRectangleInCC");
if (!pixs || pixGetDepth(pixs) != 1)
return (BOX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
if (fract <= 0.0 || fract > 1.0)
return (BOX *)ERROR_PTR("invalid fraction", procName, NULL);
if (dir != L_SCAN_VERTICAL && dir != L_SCAN_HORIZONTAL)
return (BOX *)ERROR_PTR("invalid scan direction", procName, NULL);
if (select != L_GEOMETRIC_UNION && select != L_GEOMETRIC_INTERSECTION &&
select != L_LARGEST_AREA && select != L_SMALLEST_AREA)
return (BOX *)ERROR_PTR("invalid select", procName, NULL);
/* Extract the c.c. if necessary */
x = y = 0;
if (boxs) {
pix1 = pixClipRectangle(pixs, boxs, NULL);
boxGetGeometry(boxs, &x, &y, NULL, NULL);
} else {
pix1 = pixClone(pixs);
}
/* All fast scans are horizontal; rotate 90 deg cw if necessary */
if (dir == L_SCAN_VERTICAL)
pix2 = pixRotate90(pix1, 1);
else /* L_SCAN_HORIZONTAL */
pix2 = pixClone(pix1);
pixGetDimensions(pix2, &w, &h, NULL);
pixadb = (debug) ? pixaCreate(0) : NULL;
pixdb1 = NULL;
if (pixadb) {
lept_mkdir("lept/rect");
pixaAddPix(pixadb, pix1, L_CLONE);
pixdb1 = pixConvertTo32(pix2);
}
pixDestroy(&pix1);
/* Scanning down, find the first scanline with a long enough run.
* That run goes from (xfirst, yfirst) to (xlast, yfirst). */
found = FALSE;
for (i = 0; i < h; i++) {
pixFindMaxHorizontalRunOnLine(pix2, i, &xstart, &length);
if (length >= (l_int32)(fract * w + 0.5)) {
yfirst = i;
xfirst = xstart;
xlast = xfirst + length - 1;
found = TRUE;
break;
}
}
if (!found) {
L_WARNING("no run of sufficient size was found\n", procName);
pixDestroy(&pix2);
pixDestroy(&pixdb1);
pixaDestroy(&pixadb);
return NULL;
}
/* Continue down until the condition fails */
w1 = xlast - xfirst + 1;
h1 = h - yfirst; /* initialize */
for (i = yfirst + 1; i < h; i++) {
pixFindMaxHorizontalRunOnLine(pix2, i, &xstart, &length);
if (xstart > xfirst || (xstart + length - 1 < xlast) ||
i == h - 1) {
ylast = i - 1;
h1 = ylast - yfirst + 1;
break;
}
}
box1 = boxCreate(xfirst, yfirst, w1, h1);
/* Scanning up, find the first scanline with a long enough run.
* That run goes from (xfirst, ylast) to (xlast, ylast). */
for (i = h - 1; i >= 0; i--) {
pixFindMaxHorizontalRunOnLine(pix2, i, &xstart, &length);
if (length >= (l_int32)(fract * w + 0.5)) {
ylast = i;
xfirst = xstart;
xlast = xfirst + length - 1;
break;
}
}
/* Continue up until the condition fails */
w2 = xlast - xfirst + 1;
h2 = ylast + 1; /* initialize */
for (i = ylast - 1; i >= 0; i--) {
pixFindMaxHorizontalRunOnLine(pix2, i, &xstart, &length);
if (xstart > xfirst || (xstart + length - 1 < xlast) ||
i == 0) {
yfirst = i + 1;
h2 = ylast - yfirst + 1;
break;
}
}
box2 = boxCreate(xfirst, yfirst, w2, h2);
pixDestroy(&pix2);
if (pixadb) {
pixRenderBoxArb(pixdb1, box1, 2, 255, 0, 0);
pixRenderBoxArb(pixdb1, box2, 2, 0, 255, 0);
pixaAddPix(pixadb, pixdb1, L_INSERT);
}
/* Select the final result from the two boxes */
if (select == L_GEOMETRIC_UNION)
box3 = boxBoundingRegion(box1, box2);
else if (select == L_GEOMETRIC_INTERSECTION)
box3 = boxOverlapRegion(box1, box2);
else if (select == L_LARGEST_AREA)
box3 = (w1 * h1 >= w2 * h2) ? boxCopy(box1) : boxCopy(box2);
else /* select == L_SMALLEST_AREA) */
box3 = (w1 * h1 <= w2 * h2) ? boxCopy(box1) : boxCopy(box2);
boxDestroy(&box1);
boxDestroy(&box2);
/* Rotate the box 90 degrees ccw if necessary */
box4 = NULL;
if (box3) {
if (dir == L_SCAN_VERTICAL)
box4 = boxRotateOrth(box3, w, h, 3);
else
box4 = boxCopy(box3);
}
/* Transform back to global coordinates if %boxs exists */
box5 = (box4) ? boxTransform(box4, x, y, 1.0, 1.0) : NULL;
boxDestroy(&box3);
boxDestroy(&box4);
/* Debug output */
if (pixadb) {
pixdb1 = pixConvertTo8(pixs, 0);
pixAddConstantGray(pixdb1, 190);
pixdb2 = pixConvertTo32(pixdb1);
if (box5) pixRenderBoxArb(pixdb2, box5, 4, 0, 0, 255);
pixaAddPix(pixadb, pixdb2, L_INSERT);
res = pixGetXRes(pixs);
L_INFO("Writing debug files to /tmp/lept/rect/\n", procName);
pixaConvertToPdf(pixadb, res, 1.0, L_DEFAULT_ENCODE, 75, NULL,
"/tmp/lept/rect/fitrect.pdf");
pix1 = pixaDisplayTiledAndScaled(pixadb, 32, 800, 1, 0, 40, 2);
pixWrite("/tmp/lept/rect/fitrect.png", pix1, IFF_PNG);
pixDestroy(&pix1);
pixDestroy(&pixdb1);
pixaDestroy(&pixadb);
}
return box5;
}