initialize

gb 2022-05-03 11:56:07 +08:00
commit d1907a5191
960 changed files with 591304 additions and 0 deletions

build.sh Normal file

@ -0,0 +1,33 @@
cd ../
mkdir build
cd build
mkdir imgpc
cd imgpc
rm -rf *
cmake ../../device/hgdriver/ImageProcess
make
cd ..
mkdir hgdev
cd hgdev
rm -rf *
cmake ../../device/hgdriver/hgdev
make
cd ..
mkdir wrapper
cd wrapper
rm -rf *
cmake ../../device/hgdriver/huagaoxxx_warraper_ex
make
cd ..
mkdir hgsane
cd hgsane
rm -rf *
cmake ../../device/hgsane
make
sudo cp ../../release/Linux/x86_64/libsane-hgsane.so /usr/lib/x86_64-linux-gnu/sane/libsane-hgsane.so.1

hgdriver/3rdparty/cyusb/inc/CyAPI.h vendored Normal file

@ -0,0 +1,462 @@
//______________________________________________________________________________
//
// Copyright (c) Cypress Semiconductor, 2003
// All rights reserved.
//
//______________________________________________________________________________
#ifndef CyUSBH
#define CyUSBH
#ifndef __USB200_H__
#define __USB200_H__
#include <Windows.h>
#pragma pack(push,1)
typedef struct _USB_DEVICE_DESCRIPTOR { // device descriptor
UCHAR bLength; // descriptor length
UCHAR bDescriptorType; // descriptor type
USHORT bcdUSB; // USB specification release (BCD)
UCHAR bDeviceClass; // device class
UCHAR bDeviceSubClass; // device subclass
UCHAR bDeviceProtocol; // device protocol
UCHAR bMaxPacketSize0; // maximum packet size for endpoint 0
USHORT idVendor; // vendor ID
USHORT idProduct; // product ID
USHORT bcdDevice; // device release number (BCD)
UCHAR iManufacturer; // manufacturer string index
UCHAR iProduct; // product string index
UCHAR iSerialNumber; // serial number string index
UCHAR bNumConfigurations; // number of configurations
} USB_DEVICE_DESCRIPTOR, *PUSB_DEVICE_DESCRIPTOR;
typedef struct _USB_ENDPOINT_DESCRIPTOR { // endpoint descriptor
UCHAR bLength; // descriptor length
UCHAR bDescriptorType; // descriptor type
UCHAR bEndpointAddress; // endpoint address
UCHAR bmAttributes; // endpoint attributes
USHORT wMaxPacketSize; // maximum packet size
UCHAR bInterval; // polling interval
} USB_ENDPOINT_DESCRIPTOR, *PUSB_ENDPOINT_DESCRIPTOR;
typedef struct _USB_CONFIGURATION_DESCRIPTOR { // configuration descriptor
UCHAR bLength;
UCHAR bDescriptorType;
USHORT wTotalLength;
UCHAR bNumInterfaces;
UCHAR bConfigurationValue;
UCHAR iConfiguration;
UCHAR bmAttributes;
UCHAR MaxPower;
} USB_CONFIGURATION_DESCRIPTOR, *PUSB_CONFIGURATION_DESCRIPTOR;
typedef struct _USB_INTERFACE_DESCRIPTOR { // interface descriptor
UCHAR bLength;
UCHAR bDescriptorType;
UCHAR bInterfaceNumber;
UCHAR bAlternateSetting;
UCHAR bNumEndpoints;
UCHAR bInterfaceClass;
UCHAR bInterfaceSubClass;
UCHAR bInterfaceProtocol;
UCHAR iInterface;
} USB_INTERFACE_DESCRIPTOR, *PUSB_INTERFACE_DESCRIPTOR;
typedef struct _USB_STRING_DESCRIPTOR { // string descriptor
UCHAR bLength;
UCHAR bDescriptorType;
WCHAR bString[1];
} USB_STRING_DESCRIPTOR, *PUSB_STRING_DESCRIPTOR;
typedef struct _USB_COMMON_DESCRIPTOR { // common descriptor header
UCHAR bLength;
UCHAR bDescriptorType;
} USB_COMMON_DESCRIPTOR, *PUSB_COMMON_DESCRIPTOR;
#pragma pack(pop)
#endif
//______________________________________________________________________________
class CCyIsoPktInfo { // packet info
public:
LONG Status; // packet status
LONG Length; // packet length
};
//______________________________________________________________________________
// {AE18AA60-7F6A-11d4-97DD-00010229B959}
static GUID CYUSBDRV_GUID = {0xae18aa60, 0x7f6a, 0x11d4, 0x97, 0xdd, 0x0, 0x1, 0x2, 0x29, 0xb9, 0x59};
typedef enum {TGT_DEVICE, TGT_INTFC, TGT_ENDPT, TGT_OTHER } CTL_XFER_TGT_TYPE;
typedef enum {REQ_STD, REQ_CLASS, REQ_VENDOR } CTL_XFER_REQ_TYPE;
typedef enum {DIR_TO_DEVICE, DIR_FROM_DEVICE } CTL_XFER_DIR_TYPE;
typedef enum {XMODE_BUFFERED, XMODE_DIRECT } XFER_MODE_TYPE;
const int MAX_ENDPTS = 16;
const int MAX_INTERFACES = 8;
const int USB_STRING_MAXLEN = 256;
////////////////////////////////////////////////////////////////////////////////
//
// The CCyEndPoint ABSTRACT Class
//
////////////////////////////////////////////////////////////////////////////////
class CCyUSBEndPoint
{
protected:
bool WaitForIO(OVERLAPPED *ovLapStatus);
virtual PUCHAR BeginDirectXfer(PUCHAR buf, LONG bufLen, OVERLAPPED *ov); // direct transfer mode
virtual PUCHAR BeginBufferedXfer(PUCHAR buf, LONG bufLen, OVERLAPPED *ov); // buffered transfer mode
public:
CCyUSBEndPoint(void);
CCyUSBEndPoint(CCyUSBEndPoint& ept);
CCyUSBEndPoint(HANDLE h, PUSB_ENDPOINT_DESCRIPTOR pEndPtDescriptor);
HANDLE hDevice;
// The fields of an EndPoint Descriptor
UCHAR DscLen;
UCHAR DscType;
UCHAR Address;
UCHAR Attributes;
USHORT MaxPktSize;
USHORT PktsPerFrame;
UCHAR Interval;
// Other fields
ULONG TimeOut;
ULONG UsbdStatus;
ULONG NtStatus;
DWORD bytesWritten;
DWORD LastError;
bool bIn;
XFER_MODE_TYPE XferMode;
bool XferData(PUCHAR buf, LONG &len, CCyIsoPktInfo* pktInfos = NULL);
bool XferData(PUCHAR buf, LONG &bufLen, CCyIsoPktInfo* pktInfos, bool pktMode);
virtual PUCHAR BeginDataXfer(PUCHAR buf, LONG len, OVERLAPPED *ov) = 0;
virtual bool FinishDataXfer(PUCHAR buf, LONG &len, OVERLAPPED *ov, PUCHAR pXmitBuf, CCyIsoPktInfo* pktInfos = NULL);
bool WaitForXfer(OVERLAPPED *ov, ULONG tOut);
ULONG GetXferSize(void);
void SetXferSize(ULONG xfer);
bool Reset(void);
bool Abort(void);
private:
};
////////////////////////////////////////////////////////////////////////////////
//
// The Control Endpoint Class
//
////////////////////////////////////////////////////////////////////////////////
class CCyControlEndPoint : public CCyUSBEndPoint
{
private:
public:
CCyControlEndPoint(void);
CCyControlEndPoint(CCyControlEndPoint& ept);
CCyControlEndPoint(HANDLE h, PUSB_ENDPOINT_DESCRIPTOR pEndPtDescriptor);
CTL_XFER_TGT_TYPE Target;
CTL_XFER_REQ_TYPE ReqType;
CTL_XFER_DIR_TYPE Direction;
UCHAR ReqCode;
WORD Value;
WORD Index;
bool Read(PUCHAR buf, LONG &len);
bool Write(PUCHAR buf, LONG &len);
PUCHAR BeginDataXfer(PUCHAR buf, LONG len, OVERLAPPED *ov);
};
////////////////////////////////////////////////////////////////////////////////
//
// The Isoc Endpoint Class
//
////////////////////////////////////////////////////////////////////////////////
class CCyIsocEndPoint : public CCyUSBEndPoint
{
protected:
virtual PUCHAR BeginDirectXfer(PUCHAR buf, LONG bufLen, OVERLAPPED *ov);
virtual PUCHAR BeginBufferedXfer(PUCHAR buf, LONG bufLen, OVERLAPPED *ov);
public:
CCyIsocEndPoint(void);
CCyIsocEndPoint(HANDLE h, PUSB_ENDPOINT_DESCRIPTOR pEndPtDescriptor);
PUCHAR BeginDataXfer(PUCHAR buf, LONG len, OVERLAPPED *ov);
CCyIsoPktInfo* CreatePktInfos(LONG bufLen, int &packets);
};
////////////////////////////////////////////////////////////////////////////////
//
// The Bulk Endpoint Class
//
////////////////////////////////////////////////////////////////////////////////
class CCyBulkEndPoint : public CCyUSBEndPoint
{
public:
CCyBulkEndPoint(void);
CCyBulkEndPoint(HANDLE h, PUSB_ENDPOINT_DESCRIPTOR pEndPtDescriptor);
PUCHAR BeginDataXfer(PUCHAR buf, LONG len, OVERLAPPED *ov);
};
////////////////////////////////////////////////////////////////////////////////
//
// The Interrupt Endpoint Class
//
////////////////////////////////////////////////////////////////////////////////
class CCyInterruptEndPoint : public CCyUSBEndPoint
{
public:
CCyInterruptEndPoint(void);
CCyInterruptEndPoint(HANDLE h, PUSB_ENDPOINT_DESCRIPTOR pEndPtDescriptor);
PUCHAR BeginDataXfer(PUCHAR buf, LONG len, OVERLAPPED *ov);
};
////////////////////////////////////////////////////////////////////////////////
//
// The Interface Class
//
////////////////////////////////////////////////////////////////////////////////
class CCyUSBInterface
{
private:
protected:
public:
CCyUSBEndPoint *EndPoints[MAX_ENDPTS]; // Holds pointers to all the interface's endpoints, plus a pointer to the Control endpoint zero
UCHAR bLength;
UCHAR bDescriptorType;
UCHAR bInterfaceNumber;
UCHAR bAlternateSetting;
UCHAR bNumEndpoints; // Not counting the control endpoint
UCHAR bInterfaceClass;
UCHAR bInterfaceSubClass;
UCHAR bInterfaceProtocol;
UCHAR iInterface;
UCHAR bAltSettings;
USHORT wTotalLength; // Needed in case Intfc has additional (non-endpt) descriptors
CCyUSBInterface(HANDLE h, PUSB_INTERFACE_DESCRIPTOR pIntfcDescriptor);
CCyUSBInterface(CCyUSBInterface& ifc); // Copy Constructor
~CCyUSBInterface(void);
};
////////////////////////////////////////////////////////////////////////////////
//
// The Config Class
//
////////////////////////////////////////////////////////////////////////////////
class CCyUSBConfig
{
private:
protected:
public:
CCyUSBInterface *Interfaces[MAX_INTERFACES];
UCHAR bLength;
UCHAR bDescriptorType;
USHORT wTotalLength;
UCHAR bNumInterfaces;
UCHAR bConfigurationValue;
UCHAR iConfiguration;
UCHAR bmAttributes;
UCHAR MaxPower;
UCHAR AltInterfaces;
CCyUSBConfig(void);
CCyUSBConfig(CCyUSBConfig& cfg); // Copy Constructor
CCyUSBConfig(HANDLE h, PUSB_CONFIGURATION_DESCRIPTOR pConfigDescr);
~CCyUSBConfig(void);
};
////////////////////////////////////////////////////////////////////////////////
//
// The USB Device Class - This is the main class that contains members of all the
// other classes.
//
// To use the library, create an instance of this class and call its Open method
//
////////////////////////////////////////////////////////////////////////////////
class CCyUSBDevice
{
// The public members are accessible (i.e. corruptible) by the user of the library
// Algorithms of the class don't rely on any public members. Instead, they use the
// private members of the class for their calculations.
public:
CCyUSBDevice(HANDLE hnd = NULL, GUID guid = CYUSBDRV_GUID, BOOL bOpen = true);
~CCyUSBDevice(void);
CCyUSBEndPoint **EndPoints; // Shortcut to USBCfgs[CfgNum]->Interfaces[IntfcIndex]->Endpoints
CCyUSBEndPoint *EndPointOf(UCHAR addr);
CCyControlEndPoint *ControlEndPt;
CCyIsocEndPoint *IsocInEndPt;
CCyIsocEndPoint *IsocOutEndPt;
CCyBulkEndPoint *BulkInEndPt;
CCyBulkEndPoint *BulkOutEndPt;
CCyInterruptEndPoint *InterruptInEndPt;
CCyInterruptEndPoint *InterruptOutEndPt;
USHORT StrLangID;
ULONG UsbdStatus;
ULONG NtStatus;
ULONG DriverVersion;
ULONG USBDIVersion;
char DeviceName[USB_STRING_MAXLEN];
char FriendlyName[USB_STRING_MAXLEN];
wchar_t Manufacturer[USB_STRING_MAXLEN];
wchar_t Product[USB_STRING_MAXLEN];
wchar_t SerialNumber[USB_STRING_MAXLEN];
CHAR DevPath[USB_STRING_MAXLEN];
USHORT BcdUSB;
USHORT VendorID;
USHORT ProductID;
UCHAR USBAddress;
UCHAR DevClass;
UCHAR DevSubClass;
UCHAR DevProtocol;
UCHAR MaxPacketSize;
USHORT BcdDevice;
UCHAR ConfigValue;
UCHAR ConfigAttrib;
UCHAR MaxPower;
UCHAR IntfcClass;
UCHAR IntfcSubClass;
UCHAR IntfcProtocol;
bool bHighSpeed;
DWORD BytesXfered;
UCHAR DeviceCount(void);
UCHAR ConfigCount(void);
UCHAR IntfcCount(void);
UCHAR AltIntfcCount(void);
UCHAR EndPointCount(void);
UCHAR Config(void) { return CfgNum; } // Normally 0
void SetConfig(UCHAR cfg);
UCHAR Interface(void) { return IntfcNum; } // Usually 0
// No SetInterface method since only 1 intfc per device (per Windows)
UCHAR AltIntfc(void);
bool SetAltIntfc(UCHAR alt);
GUID DriverGUID(void) { return DrvGuid; }
HANDLE DeviceHandle(void) { return hDevice; }
void UsbdStatusString(ULONG stat, PCHAR s);
bool CreateHandle(UCHAR dev);
void DestroyHandle();
bool Open(UCHAR dev);
void Close(void);
bool Reset(void);
bool ReConnect(void);
bool Suspend(void);
bool Resume(void);
bool IsOpen(void) { return (hDevice != INVALID_HANDLE_VALUE); }
UCHAR PowerState(void);
void GetDeviceDescriptor(PUSB_DEVICE_DESCRIPTOR descr);
void GetConfigDescriptor(PUSB_CONFIGURATION_DESCRIPTOR descr);
void GetIntfcDescriptor(PUSB_INTERFACE_DESCRIPTOR descr);
CCyUSBConfig GetUSBConfig(int index);
private:
USB_DEVICE_DESCRIPTOR USBDeviceDescriptor;
PUSB_CONFIGURATION_DESCRIPTOR USBConfigDescriptors[2];
CCyUSBConfig *USBCfgs[2];
HANDLE hWnd;
HANDLE hDevice;
HANDLE hDevNotification;
HANDLE hHndNotification;
GUID DrvGuid;
UCHAR Devices;
UCHAR Interfaces;
UCHAR AltInterfaces;
UCHAR Configs;
UCHAR DevNum;
UCHAR CfgNum;
UCHAR IntfcNum; // The current selected interface's bInterfaceNumber
UCHAR IntfcIndex; // The entry in the Config's interfaces table matching IntfcNum and AltSetting
void GetDevDescriptor(void);
void GetCfgDescriptor(int descIndex);
void GetString(wchar_t *s, UCHAR sIndex);
void SetStringDescrLanguage(void);
void SetAltIntfcParams(UCHAR alt);
bool IoControl(ULONG cmd, PUCHAR buf, ULONG len);
void SetEndPointPtrs(void);
void GetDeviceName(void);
void GetFriendlyName(void);
void GetDriverVer(void);
void GetUSBDIVer(void);
void GetSpeed(void);
void GetUSBAddress(void);
//void CloseEndPtHandles(void);
bool RegisterForPnpEvents(HANDLE h);
};
//---------------------------------------------------------------------------
#endif
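
The class comment above says to create a CCyUSBDevice instance and call its Open method. The following is a minimal usage sketch, not part of the header: it assumes the CyAPI library is linked on Windows and that a device bound to CYUSBDRV_GUID is attached; the device index, command bytes, and buffer sizes are illustrative only.

#include <windows.h>
#include "CyAPI.h"

int main()
{
    CCyUSBDevice dev;                            // enumerates devices bound to CYUSBDRV_GUID
    if (dev.DeviceCount() == 0 || !dev.Open(0))  // open the first attached device
        return 1;
    if (dev.BulkOutEndPt && dev.BulkInEndPt)
    {
        UCHAR cmd[2] = { 0x01, 0x00 };           // illustrative command bytes
        LONG len = sizeof(cmd);
        dev.BulkOutEndPt->TimeOut = 1000;        // 1 s timeout
        dev.BulkOutEndPt->XferData(cmd, len);    // synchronous bulk OUT transfer
        UCHAR buf[512];
        len = sizeof(buf);
        dev.BulkInEndPt->TimeOut = 1000;
        dev.BulkInEndPt->XferData(buf, len);     // bulk IN; len returns the bytes actually read
    }
    dev.Close();
    return 0;
}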


@ -0,0 +1,34 @@
#pragma once
#include <chrono>
class StopWatch
{
public:
    StopWatch() {
        _start = std::chrono::steady_clock::now();
    }
    void reset() {
        _start = std::chrono::steady_clock::now();
    }
    double elapsed_s() {
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - _start).count();
    }
    double elapsed_ms() {
        return std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - _start).count();
    }
    double elapsed_us() {
        return std::chrono::duration<double, std::micro>(std::chrono::steady_clock::now() - _start).count();
    }
    double elapsed_ns() {
        return std::chrono::duration<double, std::nano>(std::chrono::steady_clock::now() - _start).count();
    }
private:
    std::chrono::steady_clock::time_point _start;
};
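
A short usage sketch for the timer class above (the header name is assumed; any C++11 compiler suffices):

#include <iostream>
#include "StopWatch.h"   // assumed header name for the class above

int main()
{
    StopWatch sw;                                  // timing starts at construction
    volatile double x = 0;
    for (int i = 0; i < 1000000; ++i) x += i;      // some work to measure
    std::cout << "loop took " << sw.elapsed_ms() << " ms\n";
    sw.reset();                                    // restart for the next measurement
    return 0;
}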


@ -0,0 +1,33 @@
///////////////////////////////////////////////////////////////////////
// File: apitypes.h
// Description: Types used in both the API and internally
// Author: Ray Smith
// Created: Wed Mar 03 09:22:53 PST 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_API_APITYPES_H__
#define TESSERACT_API_APITYPES_H__
#include "publictypes.h"
// The types used by the API and Page/ResultIterator can be found in:
// ccstruct/publictypes.h
// ccmain/resultiterator.h
// ccmain/pageiterator.h
// API interfaces and API users should be sure to include this file, rather
// than the lower-level one, and lower-level code should be sure to include
// only the lower-level file.
#endif // TESSERACT_API_APITYPES_H__

File diff suppressed because it is too large


@ -0,0 +1,922 @@
///////////////////////////////////////////////////////////////////////
// File: baseapi.h
// Description: Simple API for calling tesseract.
// Author: Ray Smith
// Created: Fri Oct 06 15:35:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_API_BASEAPI_H__
#define TESSERACT_API_BASEAPI_H__
#define TESSERACT_VERSION_STR "3.05.02"
#define TESSERACT_VERSION 0x030502
#define MAKE_VERSION(major, minor, patch) (((major) << 16) | ((minor) << 8) | \
(patch))
#include <stdio.h>
// To avoid collision with other typenames include the ABSOLUTE MINIMUM
// complexity of includes here. Use forward declarations wherever possible
// and hide includes of complex types in baseapi.cpp.
#include "platform.h"
#include "apitypes.h"
#include "thresholder.h"
#include "unichar.h"
#include "tesscallback.h"
#include "publictypes.h"
#include "pageiterator.h"
#include "resultiterator.h"
template <typename T> class GenericVector;
class PAGE_RES;
class PAGE_RES_IT;
class ParagraphModel;
struct BlamerBundle;
class BLOCK_LIST;
class DENORM;
class MATRIX;
class ROW;
class STRING;
class WERD;
struct Pix;
struct Box;
struct Pixa;
struct Boxa;
class ETEXT_DESC;
struct OSResults;
class TBOX;
class UNICHARSET;
class WERD_CHOICE_LIST;
struct INT_FEATURE_STRUCT;
typedef INT_FEATURE_STRUCT *INT_FEATURE;
struct TBLOB;
namespace tesseract {
#ifndef NO_CUBE_BUILD
class CubeRecoContext;
#endif // NO_CUBE_BUILD
class Dawg;
class Dict;
class EquationDetect;
class PageIterator;
class LTRResultIterator;
class ResultIterator;
class MutableIterator;
class TessResultRenderer;
class Tesseract;
class Trie;
class Wordrec;
typedef int (Dict::*DictFunc)(void* void_dawg_args,
UNICHAR_ID unichar_id, bool word_end) const;
typedef double (Dict::*ProbabilityInContextFunc)(const char* lang,
const char* context,
int context_bytes,
const char* character,
int character_bytes);
typedef float (Dict::*ParamsModelClassifyFunc)(
const char *lang, void *path);
typedef void (Wordrec::*FillLatticeFunc)(const MATRIX &ratings,
const WERD_CHOICE_LIST &best_choices,
const UNICHARSET &unicharset,
BlamerBundle *blamer_bundle);
typedef TessCallback4<const UNICHARSET &, int, PageIterator *, Pix *>
TruthCallback;
/**
* Base class for all tesseract APIs.
* Specific classes can add ability to work on different inputs or produce
* different outputs.
* This class is mostly an interface layer on top of the Tesseract instance
* class to hide the data types so that users of this class don't have to
* include any other Tesseract headers.
*/
class TESS_API TessBaseAPI {
public:
TessBaseAPI();
virtual ~TessBaseAPI();
/**
* Returns the version identifier as a static string. Do not delete.
*/
static const char* Version();
/**
* If compiled with OpenCL AND an available OpenCL
* device is deemed faster than serial code, then
* "device" is populated with the cl_device_id
* and returns sizeof(cl_device_id)
* otherwise *device=NULL and returns 0.
*/
static size_t getOpenCLDevice(void **device);
/**
* Writes the thresholded image to stderr as a PBM file on receipt of a
* SIGSEGV, SIGFPE, or SIGBUS signal. (Linux/Unix only).
*/
static void CatchSignals();
/**
* Set the name of the input file. Needed for training and
* reading a UNLV zone file, and for searchable PDF output.
*/
void SetInputName(const char* name);
/**
* These functions are required for searchable PDF output.
* We need our hands on the input file so that we can include
* it in the PDF without transcoding. If that is not possible,
* we need the original image. Finally, resolution metadata
* is stored in the PDF so we need that as well.
*/
const char* GetInputName();
// Takes ownership of the input pix.
void SetInputImage(Pix *pix);
Pix* GetInputImage();
int GetSourceYResolution();
const char* GetDatapath();
/** Set the name of the bonus output files. Needed only for debugging. */
void SetOutputName(const char* name);
/**
* Set the value of an internal "parameter."
* Supply the name of the parameter and the value as a string, just as
* you would in a config file.
* Returns false if the name lookup failed.
* Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
* Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
* SetVariable may be used before Init, but settings will revert to
* defaults on End().
*
* Note: Must be called after Init(). Only works for non-init variables
* (init variables should be passed to Init()).
*/
bool SetVariable(const char* name, const char* value);
bool SetDebugVariable(const char* name, const char* value);
/**
* Returns true if the parameter was found among Tesseract parameters.
* Fills in value with the value of the parameter.
*/
bool GetIntVariable(const char *name, int *value) const;
bool GetBoolVariable(const char *name, bool *value) const;
bool GetDoubleVariable(const char *name, double *value) const;
/**
* Returns the pointer to the string that represents the value of the
* parameter if it was found among Tesseract parameters.
*/
const char *GetStringVariable(const char *name) const;
/**
* Print Tesseract parameters to the given file.
*/
void PrintVariables(FILE *fp) const;
/**
* Get value of named variable as a string, if it exists.
*/
bool GetVariableAsString(const char *name, STRING *val);
/**
* Instances are now mostly thread-safe and totally independent,
* but some global parameters remain. Basically it is safe to use multiple
* TessBaseAPIs in different threads in parallel, UNLESS:
* you use SetVariable on some of the Params in classify and textord.
* If you do, then the effect will be to change it for all your instances.
*
* Start tesseract. Returns zero on success and -1 on failure.
* NOTE that the only members that may be called before Init are those
* listed above here in the class definition.
*
* The datapath must be the name of the parent directory of tessdata and
* must end in / . Any name after the last / will be stripped.
* The language is (usually) an ISO 639-3 string; if NULL, it defaults to eng.
* It is entirely safe (and eventually will be efficient too) to call
* Init multiple times on the same instance to change language, or just
* to reset the classifier.
* The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating
* that multiple languages are to be loaded. Eg hin+eng will load Hindi and
* English. Languages may specify internally that they want to be loaded
* with one or more other languages, so the ~ sign is available to override
* that. Eg if hin were set to load eng by default, then hin+~eng would force
* loading only hin. The number of loaded languages is limited only by
* memory, with the caveat that loading additional languages will impact
* both speed and accuracy, as there is more work to do to decide on the
* applicable language, and there is more chance of hallucinating incorrect
* words.
* WARNING: On changing languages, all Tesseract parameters are reset
* back to their default values. (Which may vary between languages.)
* If you have a rare need to set a Variable that controls
* initialization for a second call to Init you should explicitly
* call End() and then use SetVariable before Init. This is only a very
* rare use case, since there are very few uses that require any parameters
* to be set before Init.
*
* If set_only_non_debug_params is true, only params that do not contain
* "debug" in the name will be set.
*/
int Init(const char* datapath, const char* language, OcrEngineMode mode,
char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params);
int Init(const char* datapath, const char* language, OcrEngineMode oem) {
return Init(datapath, language, oem, NULL, 0, NULL, NULL, false);
}
int Init(const char* datapath, const char* language) {
return Init(datapath, language, OEM_DEFAULT, NULL, 0, NULL, NULL, false);
}
/**
* Returns the languages string used in the last valid initialization.
* If the last initialization specified "deu+hin" then that will be
* returned. If hin loaded eng automatically as well, then that will
* not be included in this list. To find the languages actually
* loaded use GetLoadedLanguagesAsVector.
* The returned string should NOT be deleted.
*/
const char* GetInitLanguagesAsString() const;
/**
* Returns the loaded languages in the vector of STRINGs.
* Includes all languages loaded by the last Init, including those loaded
* as dependencies of other loaded languages.
*/
void GetLoadedLanguagesAsVector(GenericVector<STRING>* langs) const;
/**
* Returns the available languages in the vector of STRINGs.
*/
void GetAvailableLanguagesAsVector(GenericVector<STRING>* langs) const;
/**
* Init only the lang model component of Tesseract. The only functions
* that work after this init are SetVariable and IsValidWord.
* WARNING: temporary! This function will be removed from here and placed
* in a separate API at some future time.
*/
int InitLangMod(const char* datapath, const char* language);
/**
* Init only for page layout analysis. Use only for calls to SetImage and
* AnalysePage. Calls that attempt recognition will generate an error.
*/
void InitForAnalysePage();
/**
* Read a "config" file containing a set of param, value pairs.
* Searches the standard places: tessdata/configs, tessdata/tessconfigs
* and also accepts a relative or absolute path name.
* Note: only non-init params will be set (init params are set by Init()).
*/
void ReadConfigFile(const char* filename);
/** Same as above, but only set debug params from the given config file. */
void ReadDebugConfigFile(const char* filename);
/**
* Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
* The mode is stored as an IntParam so it can also be modified by
* ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
*/
void SetPageSegMode(PageSegMode mode);
/** Return the current page segmentation mode. */
PageSegMode GetPageSegMode() const;
/**
* Recognize a rectangle from an image and return the result as a string.
* May be called many times for a single Init.
* Currently has no error checking.
* Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
* Palette color images will not work properly and must be converted to
* 24 bit.
* Binary images of 1 bit per pixel may also be given but they must be
* byte packed with the MSB of the first byte being the first pixel, and a
* 1 represents WHITE. For binary images set bytes_per_pixel=0.
* The recognized text is returned as a char* which is coded
* as UTF8 and must be freed with the delete [] operator.
*
* Note that TesseractRect is the simplified convenience interface.
* For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
* and one or more of the Get*Text functions below.
*/
char* TesseractRect(const unsigned char* imagedata,
int bytes_per_pixel, int bytes_per_line,
int left, int top, int width, int height);
/**
* Call between pages or documents etc to free up memory and forget
* adaptive data.
*/
void ClearAdaptiveClassifier();
/**
* @defgroup AdvancedAPI Advanced API
* The following methods break TesseractRect into pieces, so you can
* get hold of the thresholded image, get the text in different formats,
* get bounding boxes, confidences etc.
*/
/* @{ */
/**
* Provide an image for Tesseract to recognize. Format is as
* TesseractRect above. Copies the image buffer and converts to Pix.
* SetImage clears all recognition results, and sets the rectangle to the
* full image, so it may be followed immediately by a GetUTF8Text, and it
* will automatically perform recognition.
*/
void SetImage(const unsigned char* imagedata, int width, int height,
int bytes_per_pixel, int bytes_per_line);
/**
* Provide an image for Tesseract to recognize. As with SetImage above,
* Tesseract takes its own copy of the image, so it need not persist until
* after Recognize.
* Pix vs raw, which to use?
* Use Pix where possible. Tesseract uses Pix as its internal representation
* and it is therefore more efficient to provide a Pix directly.
*/
void SetImage(Pix* pix);
/**
* Set the resolution of the source image in pixels per inch so font size
* information can be calculated in results. Call this after SetImage().
*/
void SetSourceResolution(int ppi);
/**
* Restrict recognition to a sub-rectangle of the image. Call after SetImage.
* Each SetRectangle clears the recognition results so multiple rectangles
* can be recognized with the same image.
*/
void SetRectangle(int left, int top, int width, int height);
/**
* In extreme cases only, usually with a subclass of Thresholder, it
* is possible to provide a different Thresholder. The Thresholder may
* be preloaded with an image, settings etc, or they may be set after.
* Note that Tesseract takes ownership of the Thresholder and will
* delete it when it is replaced or the API is destroyed.
*/
void SetThresholder(ImageThresholder* thresholder) {
delete thresholder_;
thresholder_ = thresholder;
ClearResults();
}
/**
* Get a copy of the internal thresholded image from Tesseract.
* Caller takes ownership of the Pix and must pixDestroy it.
* May be called any time after SetImage, or after TesseractRect.
*/
Pix* GetThresholdedImage();
/**
* Get the result of page layout analysis as a leptonica-style
* Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
*/
Boxa* GetRegions(Pixa** pixa);
/**
* Get the textlines as a leptonica-style
* Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
* If raw_image is true, then extract from the original image instead of the
* thresholded image and pad by raw_padding pixels.
* If blockids is not NULL, the block-id of each line is also returned as an
* array of one element per line. delete [] after use.
* If paraids is not NULL, the paragraph-id of each line within its block is
* also returned as an array of one element per line. delete [] after use.
*/
Boxa* GetTextlines(const bool raw_image, const int raw_padding,
Pixa** pixa, int** blockids, int** paraids);
/*
Helper method to extract from the thresholded image. (most common usage)
*/
Boxa* GetTextlines(Pixa** pixa, int** blockids) {
return GetTextlines(false, 0, pixa, blockids, NULL);
}
/**
* Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
* pair, in reading order. Enables downstream handling of non-rectangular
* regions.
* Can be called before or after Recognize.
* If blockids is not NULL, the block-id of each line is also returned as an
* array of one element per line. delete [] after use.
*/
Boxa* GetStrips(Pixa** pixa, int** blockids);
/**
* Get the words as a leptonica-style
* Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
*/
Boxa* GetWords(Pixa** pixa);
/**
* Gets the individual connected (text) components (created
* after the page segmentation step, but before recognition)
* as a leptonica-style Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
* Note: the caller is responsible for calling boxaDestroy()
* on the returned Boxa array and pixaDestroy() on cc array.
*/
Boxa* GetConnectedComponents(Pixa** cc);
/**
* Get the given level kind of components (block, textline, word etc.) as a
* leptonica-style Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
* If blockids is not NULL, the block-id of each component is also returned
* as an array of one element per component. delete [] after use.
* If paraids is not NULL, the paragraph-id of each component within its block
* is also returned as an array of one element per component. delete [] after
* use.
* If raw_image is true, then portions of the original image are extracted
* instead of the thresholded image and padded with raw_padding.
* If text_only is true, then only text components are returned.
*/
Boxa* GetComponentImages(const PageIteratorLevel level,
const bool text_only, const bool raw_image,
const int raw_padding,
Pixa** pixa, int** blockids, int** paraids);
// Helper function to get binary images with no padding (most common usage).
Boxa* GetComponentImages(const PageIteratorLevel level,
const bool text_only,
Pixa** pixa, int** blockids) {
return GetComponentImages(level, text_only, false, 0, pixa, blockids, NULL);
}
/**
* Returns the scale factor of the thresholded image that would be returned by
* GetThresholdedImage() and the various GetX() methods that call
* GetComponentImages().
* Returns 0 if no thresholder has been set.
*/
int GetThresholdedImageScaleFactor() const;
/**
* Dump the internal binary image to a PGM file.
* @deprecated Use GetThresholdedImage and write the image using pixWrite
* instead if possible.
*/
void DumpPGM(const char* filename);
/**
* Runs page layout analysis in the mode set by SetPageSegMode.
* May optionally be called prior to Recognize to get access to just
* the page layout results. Returns an iterator to the results.
* If merge_similar_words is true, words are combined where suitable for use
* with a line recognizer. Use if you want to use AnalyseLayout to find the
* textlines, and then want to process textline fragments with an external
* line recognizer.
* Returns NULL on error or an empty page.
* The returned iterator must be deleted after use.
* WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
* DetectOS, or anything else that changes the internal PAGE_RES.
*/
PageIterator* AnalyseLayout();
PageIterator* AnalyseLayout(bool merge_similar_words);
int AnalyseLayout1();
/**
* Recognize the image from SetAndThresholdImage, generating Tesseract
* internal structures. Returns 0 on success.
* Optional. The Get*Text functions below will call Recognize if needed.
* After Recognize, the output is kept internally until the next SetImage.
*/
int Recognize(ETEXT_DESC* monitor);
/**
* Methods to retrieve information after SetAndThresholdImage(),
* Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
*/
/** Variant on Recognize used for testing chopper. */
int RecognizeForChopTest(ETEXT_DESC* monitor);
/**
* Turns images into symbolic text.
*
* filename can point to a single image, a multi-page TIFF,
* or a plain text list of image filenames.
*
* retry_config is useful for debugging. If not NULL, you can fall
* back to an alternate configuration if a page fails for some
* reason.
*
* timeout_millisec terminates processing if any single page
* takes too long. Set to 0 for unlimited time.
*
* renderer is responsible for creating the output. For example,
* use the TessTextRenderer if you want plaintext output, or
* the TessPDFRenderer to produce searchable PDF.
*
* If tessedit_page_number is non-negative, will only process that
* single page. Works for multi-page tiff file, or filelist.
*
* Returns true if successful, false on error.
*/
bool ProcessPages(const char* filename, const char* retry_config,
int timeout_millisec, TessResultRenderer* renderer);
// Does the real work of ProcessPages.
bool ProcessPagesInternal(const char* filename, const char* retry_config,
int timeout_millisec, TessResultRenderer* renderer);
/**
* Turn a single image into symbolic text.
*
* The pix is the image processed. filename and page_index are
* metadata used by side-effect processes, such as reading a box
* file or formatting as hOCR.
*
* See ProcessPages for descriptions of other parameters.
*/
bool ProcessPage(Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer,
const char* jpgdata, int len);
/**
* Get a reading-order iterator to the results of LayoutAnalysis and/or
* Recognize. The returned iterator must be deleted after use.
* WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
* DetectOS, or anything else that changes the internal PAGE_RES.
*/
ResultIterator* GetIterator();
/**
* Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
* The returned iterator must be deleted after use.
* WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
* DetectOS, or anything else that changes the internal PAGE_RES.
*/
MutableIterator* GetMutableIterator();
/**
* The recognized text is returned as a char* which is coded
* as UTF8 and must be freed with the delete [] operator.
*/
char* GetUTF8Text();
/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
* page_number is 0-based but will appear in the output as 1-based.
* monitor can be used to
* cancel the recognition
* receive progress callbacks
*/
char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
* page_number is 0-based but will appear in the output as 1-based.
*/
char* GetHOCRText(int page_number);
/**
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.
*/
char* GetTSVText(int page_number);
/**
* The recognized text is returned as a char* which is coded in the same
* format as a box file used in training. Returned string must be freed with
* the delete [] operator.
* Constructs coordinates in the original image - not just the rectangle.
* page_number is a 0-based page index that will appear in the box file.
*/
char* GetBoxText(int page_number);
/**
* The recognized text is returned as a char* which is coded
* as UNLV format Latin-1 with specific reject and suspect codes
* and must be freed with the delete [] operator.
*/
char* GetUNLVText();
/**
* Detect the orientation of the input image and apparent script (alphabet).
* orient_deg is the detected clockwise rotation of the input image in degrees (0, 90, 180, 270)
* orient_conf is the confidence (15.0 is reasonably confident)
* script_name is an ASCII string, the name of the script, e.g. "Latin"
* script_conf is confidence level in the script
* Returns true on success and writes values to each parameter as an output
*/
bool DetectOrientationScript(int* orient_deg, float* orient_conf, const char** script_name, float* script_conf);
/**
* The recognized text is returned as a char* which is coded
* as UTF8 and must be freed with the delete [] operator.
* page_number is a 0-based page index that will appear in the osd file.
*/
char* GetOsdText(int page_number);
/** Returns the (average) confidence value between 0 and 100. */
int MeanTextConf();
/**
* Returns all word confidences (between 0 and 100) in an array, terminated
* by -1. The calling function must delete [] after use.
* The number of confidences should correspond to the number of space-
* delimited words in GetUTF8Text.
*/
int* AllWordConfidences();
/**
* Applies the given word to the adaptive classifier if possible.
* The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
* tell the boundaries of the graphemes.
* Assumes that SetImage/SetRectangle have been used to set the image
* to the given word. The mode arg should be PSM_SINGLE_WORD or
* PSM_CIRCLE_WORD, as that will be used to control layout analysis.
* The currently set PageSegMode is preserved.
* Returns false if adaptation was not possible for some reason.
*/
bool AdaptToWordStr(PageSegMode mode, const char* wordstr);
/**
* Free up recognition results and any stored image data, without actually
* freeing any recognition data that would be time-consuming to reload.
* Afterwards, you must call SetImage or TesseractRect before doing
* any Recognize or Get* operation.
*/
void Clear();
/**
* Close down tesseract and free up all memory. End() is equivalent to
* destructing and reconstructing your TessBaseAPI.
* Once End() has been used, none of the other API functions may be used
* other than Init and anything declared above it in the class definition.
*/
void End();
/**
* Clear any library-level memory caches.
* There are a variety of expensive-to-load constant data structures (mostly
* language dictionaries) that are cached globally -- surviving the Init()
* and End() of individual TessBaseAPI's. This function allows the clearing
* of these caches.
**/
static void ClearPersistentCache();
/**
* Check whether a word is valid according to Tesseract's language model
* @return 0 if the word is invalid, non-zero if valid.
* @warning temporary! This function will be removed from here and placed
* in a separate API at some future time.
*/
int IsValidWord(const char *word);
// Returns true if utf8_character is defined in the UniCharset.
bool IsValidCharacter(const char *utf8_character);
bool GetTextDirection(int* out_offset, float* out_slope);
/** Sets Dict::letter_is_okay_ function to point to the given function. */
void SetDictFunc(DictFunc f);
/** Sets Dict::probability_in_context_ function to point to the given
* function.
*/
void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
/** Sets Wordrec::fill_lattice_ function to point to the given function. */
void SetFillLatticeFunc(FillLatticeFunc f);
/**
* Estimates the Orientation And Script of the image.
* @return true if the image was processed successfully.
*/
bool DetectOS(OSResults*);
/** This method returns the features associated with the input image. */
void GetFeaturesForBlob(TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* feature_outline_index);
/**
* This method returns the row to which a box of specified dimensions would
* belong. If no good match is found, it returns NULL.
*/
static ROW* FindRowForBox(BLOCK_LIST* blocks, int left, int top,
int right, int bottom);
/**
* Method to run adaptive classifier on a blob.
* It returns at max num_max_matches results.
*/
void RunAdaptiveClassifier(TBLOB* blob,
int num_max_matches,
int* unichar_ids,
float* ratings,
int* num_matches_returned);
/** This method returns the string form of the specified unichar. */
const char* GetUnichar(int unichar_id);
/** Return the pointer to the i-th dawg loaded into tesseract_ object. */
const Dawg *GetDawg(int i) const;
/** Return the number of dawgs loaded into tesseract_ object. */
int NumDawgs() const;
/** Returns a ROW object created from the input row specification. */
static ROW *MakeTessOCRRow(float baseline, float xheight,
float descender, float ascender);
/** Returns a TBLOB corresponding to the entire input image. */
static TBLOB *MakeTBLOB(Pix *pix);
/**
* This method baseline normalizes a TBLOB in-place. The input row is used
* for normalization. The denorm is an optional parameter in which the
* normalization-antidote is returned.
*/
static void NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode);
Tesseract* tesseract() const { return tesseract_; }
OcrEngineMode oem() const { return last_oem_requested_; }
void InitTruthCallback(TruthCallback *cb) { truth_cb_ = cb; }
#ifndef NO_CUBE_BUILD
/** Return a pointer to underlying CubeRecoContext object if present. */
CubeRecoContext *GetCubeRecoContext() const;
#endif // NO_CUBE_BUILD
void set_min_orientation_margin(double margin);
/**
* Return text orientation of each block as determined by an earlier run
* of layout analysis.
*/
void GetBlockTextOrientations(int** block_orientation,
bool** vertical_writing);
/** Find lines from the image making the BLOCK_LIST. */
BLOCK_LIST* FindLinesCreateBlockList();
/**
* Delete a block list.
* This is to keep BLOCK_LIST pointer opaque
* and let go of including the other headers.
*/
static void DeleteBlockList(BLOCK_LIST* block_list);
/* @} */
protected:
/** Common code for setting the image. Returns true if Init has been called. */
TESS_LOCAL bool InternalSetImage();
/**
* Run the thresholder to make the thresholded image. If pix is not NULL,
* the source is thresholded to pix instead of the internal IMAGE.
*/
TESS_LOCAL virtual void Threshold(Pix** pix);
/**
* Find lines from the image making the BLOCK_LIST.
* @return 0 on success.
*/
TESS_LOCAL int FindLines();
/** Delete the pageres and block list ready for a new page. */
void ClearResults();
/**
* Return an LTR Result Iterator -- used only for training, as we really want
* to ignore all BiDi smarts at that point.
* delete once you're done with it.
*/
TESS_LOCAL LTRResultIterator* GetLTRIterator();
/**
* Return the length of the output text string, as UTF8, assuming
* one newline per line and one per block, with a terminator,
* and assuming a single character reject marker for each rejected character.
* Also return the number of recognized blobs in blob_count.
*/
TESS_LOCAL int TextLength(int* blob_count);
/** @defgroup ocropusAddOns ocropus add-ons */
/* @{ */
/**
* Adapt to recognize the current image as the given character.
* The image must be preloaded and be just an image of a single character.
*/
TESS_LOCAL void AdaptToCharacter(const char *unichar_repr,
int length,
float baseline,
float xheight,
float descender,
float ascender);
/** Recognize text doing one pass only, using settings for a given pass. */
TESS_LOCAL PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list);
TESS_LOCAL PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list,
PAGE_RES* pass1_result);
//// paragraphs.cpp ////////////////////////////////////////////////////
TESS_LOCAL void DetectParagraphs(bool after_text_recognition);
/**
* Extract the OCR results, costs (penalty points for uncertainty),
* and the bounding boxes of the characters.
*/
TESS_LOCAL static int TesseractExtractResult(char** text,
int** lengths,
float** costs,
int** x0,
int** y0,
int** x1,
int** y1,
PAGE_RES* page_res);
TESS_LOCAL const PAGE_RES* GetPageRes() const { return page_res_; }
/* @} */
protected:
Tesseract* tesseract_; ///< The underlying data object.
Tesseract* osd_tesseract_; ///< For orientation & script detection.
EquationDetect* equ_detect_; ///<The equation detector.
ImageThresholder* thresholder_; ///< Image thresholding module.
GenericVector<ParagraphModel *>* paragraph_models_;
BLOCK_LIST* block_list_; ///< The page layout.
PAGE_RES* page_res_; ///< The page-level data.
STRING* input_file_; ///< Name used by training code.
STRING* output_file_; ///< Name used by debug code.
STRING* datapath_; ///< Current location of tessdata.
STRING* language_; ///< Last initialized language.
OcrEngineMode last_oem_requested_; ///< Last OCR engine mode requested.
bool recognition_done_; ///< page_res_ contains recognition data.
TruthCallback *truth_cb_; /// fxn for setting truth_* in WERD_RES
/**
* @defgroup ThresholderParams Thresholder Parameters
* Parameters saved from the Thresholder. Needed to rebuild coordinates.
*/
/* @{ */
int rect_left_;
int rect_top_;
int rect_width_;
int rect_height_;
int image_width_;
int image_height_;
/* @} */
private:
// A list of image filenames gets special consideration
bool ProcessPagesFileList(FILE *fp,
STRING *buf,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer,
int tessedit_page_number);
// TIFF supports multipage so gets special consideration.
bool ProcessPagesMultipageTiff(const unsigned char *data,
size_t size,
const char* filename,
const char* retry_config,
int timeout_millisec,
TessResultRenderer* renderer,
int tessedit_page_number);
// There's currently no way to pass a document title from the
// Tesseract command line, and we have multiple places that choose
// to set the title to an empty string. Using a single named
// variable will hopefully reduce confusion if the situation changes
// in the future.
const char *unknown_title_;
}; // class TessBaseAPI.
/** Escape a char string - remove &<>"' with HTML codes. */
STRING HOcrEscape(const char* text);
} // namespace tesseract.
#endif // TESSERACT_API_BASEAPI_H__
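
The comments above describe the basic workflow: Init with a datapath ending in '/', SetImage, then one of the Get*Text calls, freeing the returned string with delete []. Below is a minimal C++ sketch under those assumptions; the tessdata path and image file name are placeholders, and leptonica's pixRead/pixDestroy are used to manage the Pix:

#include <cstdio>
#include "baseapi.h"        // this header
#include "allheaders.h"     // leptonica, for pixRead/pixDestroy

int main()
{
    tesseract::TessBaseAPI api;
    if (api.Init("./tessdata/", "eng") != 0) {     // datapath must end in '/'
        fprintf(stderr, "could not initialize tesseract\n");
        return 1;
    }
    Pix* image = pixRead("page.png");              // placeholder input image
    api.SetImage(image);                           // takes a copy and clears old results
    char* text = api.GetUTF8Text();                // UTF-8; free with delete []
    printf("%s", text);
    delete [] text;
    api.End();                                     // free everything (or rely on the destructor)
    pixDestroy(&image);
    return 0;
}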


@ -0,0 +1,826 @@
///////////////////////////////////////////////////////////////////////
// File: capi.cpp
// Description: C-API TessBaseAPI
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESS_CAPI_INCLUDE_BASEAPI
# define TESS_CAPI_INCLUDE_BASEAPI
#endif
#include "capi.h"
#include "genericvector.h"
#include "strngs.h"
TESS_API const char* TESS_CALL TessVersion()
{
return TessBaseAPI::Version();
}
TESS_API void TESS_CALL TessDeleteText(char* text)
{
delete[] text;
}
TESS_API void TESS_CALL TessDeleteTextArray(char** arr)
{
for (char** pos = arr; *pos != NULL; ++pos)
delete[] * pos;
delete[] arr;
}
TESS_API void TESS_CALL TessDeleteIntArray(int* arr)
{
delete[] arr;
}
TESS_API void TESS_CALL TessDeleteBlockList(BLOCK_LIST* block_list)
{
TessBaseAPI::DeleteBlockList(block_list);
}
TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase)
{
return new TessTextRenderer(outputbase);
}
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase)
{
return new TessHOcrRenderer(outputbase);
}
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info)
{
return new TessHOcrRenderer(outputbase, font_info);
}
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir)
{
return new TessPDFRenderer(outputbase, datadir, false);
}
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreateTextonly(const char* outputbase, const char* datadir,
BOOL textonly)
{
return new TessPDFRenderer(outputbase, datadir, textonly);
}
TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase)
{
return new TessUnlvRenderer(outputbase);
}
TESS_API TessResultRenderer* TESS_CALL TessBoxTextRendererCreate(const char* outputbase)
{
return new TessBoxTextRenderer(outputbase);
}
TESS_API void TESS_CALL TessDeleteResultRenderer(TessResultRenderer* renderer)
{
delete renderer;
}
TESS_API void TESS_CALL TessResultRendererInsert(TessResultRenderer* renderer, TessResultRenderer* next)
{
renderer->insert(next);
}
TESS_API TessResultRenderer* TESS_CALL TessResultRendererNext(TessResultRenderer* renderer)
{
return renderer->next();
}
TESS_API BOOL TESS_CALL TessResultRendererBeginDocument(TessResultRenderer* renderer, const char* title)
{
return renderer->BeginDocument(title);
}
TESS_API BOOL TESS_CALL TessResultRendererAddImage(TessResultRenderer* renderer, TessBaseAPI* api)
{
return renderer->AddImage(api, nullptr, 0);
}
TESS_API BOOL TESS_CALL TessResultRendererEndDocument(TessResultRenderer* renderer)
{
return renderer->EndDocument();
}
TESS_API const char* TESS_CALL TessResultRendererExtention(TessResultRenderer* renderer)
{
return renderer->file_extension();
}
TESS_API const char* TESS_CALL TessResultRendererTitle(TessResultRenderer* renderer)
{
return renderer->title();
}
TESS_API int TESS_CALL TessResultRendererImageNum(TessResultRenderer* renderer)
{
return renderer->imagenum();
}
TESS_API TessBaseAPI* TESS_CALL TessBaseAPICreate()
{
return new TessBaseAPI;
}
TESS_API void TESS_CALL TessBaseAPIDelete(TessBaseAPI* handle)
{
delete handle;
}
TESS_API size_t TESS_CALL TessBaseAPIGetOpenCLDevice(TessBaseAPI* handle, void **device)
{
return handle->getOpenCLDevice(device);
}
TESS_API void TESS_CALL TessBaseAPISetInputName(TessBaseAPI* handle, const char* name)
{
handle->SetInputName(name);
}
TESS_API const char* TESS_CALL TessBaseAPIGetInputName(TessBaseAPI* handle)
{
return handle->GetInputName();
}
TESS_API void TESS_CALL TessBaseAPISetInputImage(TessBaseAPI* handle, Pix* pix)
{
handle->SetInputImage(pix);
}
TESS_API Pix* TESS_CALL TessBaseAPIGetInputImage(TessBaseAPI* handle)
{
return handle->GetInputImage();
}
TESS_API int TESS_CALL TessBaseAPIGetSourceYResolution(TessBaseAPI* handle)
{
return handle->GetSourceYResolution();
}
TESS_API const char* TESS_CALL TessBaseAPIGetDatapath(TessBaseAPI* handle)
{
return handle->GetDatapath();
}
TESS_API void TESS_CALL TessBaseAPISetOutputName(TessBaseAPI* handle, const char* name)
{
handle->SetOutputName(name);
}
TESS_API BOOL TESS_CALL TessBaseAPISetVariable(TessBaseAPI* handle, const char* name, const char* value)
{
return handle->SetVariable(name, value) ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessBaseAPISetDebugVariable(TessBaseAPI* handle, const char* name, const char* value)
{
return handle->SetDebugVariable(name, value) ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessBaseAPIGetIntVariable(const TessBaseAPI* handle, const char* name, int* value)
{
return handle->GetIntVariable(name, value) ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessBaseAPIGetBoolVariable(const TessBaseAPI* handle, const char* name, BOOL* value)
{
bool boolValue;
if (handle->GetBoolVariable(name, &boolValue))
{
*value = boolValue ? TRUE : FALSE;
return TRUE;
}
else
{
return FALSE;
}
}
TESS_API BOOL TESS_CALL TessBaseAPIGetDoubleVariable(const TessBaseAPI* handle, const char* name, double* value)
{
return handle->GetDoubleVariable(name, value) ? TRUE : FALSE;
}
TESS_API const char* TESS_CALL TessBaseAPIGetStringVariable(const TessBaseAPI* handle, const char* name)
{
return handle->GetStringVariable(name);
}
TESS_API void TESS_CALL TessBaseAPIPrintVariables(const TessBaseAPI* handle, FILE* fp)
{
handle->PrintVariables(fp);
}
TESS_API BOOL TESS_CALL TessBaseAPIPrintVariablesToFile(const TessBaseAPI* handle, const char* filename)
{
FILE* fp = fopen(filename, "w");
if (fp != NULL)
{
handle->PrintVariables(fp);
fclose(fp);
return TRUE;
}
return FALSE;
}
TESS_API BOOL TESS_CALL TessBaseAPIGetVariableAsString(TessBaseAPI* handle, const char* name, STRING* val)
{
return handle->GetVariableAsString(name, val) ? TRUE : FALSE;
}
TESS_API int TESS_CALL TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath, const char* language,
TessOcrEngineMode mode, char** configs, int configs_size,
char** vars_vec, char** vars_values, size_t vars_vec_size,
BOOL set_only_non_debug_params)
{
GenericVector<STRING> varNames;
GenericVector<STRING> varValues;
if (vars_vec != NULL && vars_values != NULL) {
for (size_t i = 0; i < vars_vec_size; i++) {
varNames.push_back(STRING(vars_vec[i]));
varValues.push_back(STRING(vars_values[i]));
}
}
return handle->Init(datapath, language, mode, configs, configs_size, &varNames, &varValues, set_only_non_debug_params);
}
TESS_API int TESS_CALL TessBaseAPIInit1(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode oem,
char** configs, int configs_size)
{
return handle->Init(datapath, language, oem, configs, configs_size, NULL, NULL, false);
}
TESS_API int TESS_CALL TessBaseAPIInit2(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode oem)
{
return handle->Init(datapath, language, oem);
}
TESS_API int TESS_CALL TessBaseAPIInit3(TessBaseAPI* handle, const char* datapath, const char* language)
{
return handle->Init(datapath, language);
}
TESS_API const char* TESS_CALL TessBaseAPIGetInitLanguagesAsString(const TessBaseAPI* handle)
{
return handle->GetInitLanguagesAsString();
}
TESS_API char** TESS_CALL TessBaseAPIGetLoadedLanguagesAsVector(const TessBaseAPI* handle)
{
GenericVector<STRING> languages;
handle->GetLoadedLanguagesAsVector(&languages);
char** arr = new char*[languages.size() + 1];
for (int index = 0; index < languages.size(); ++index)
arr[index] = languages[index].strdup();
arr[languages.size()] = NULL;
return arr;
}
TESS_API char** TESS_CALL TessBaseAPIGetAvailableLanguagesAsVector(const TessBaseAPI* handle)
{
GenericVector<STRING> languages;
handle->GetAvailableLanguagesAsVector(&languages);
char** arr = new char*[languages.size() + 1];
for (int index = 0; index < languages.size(); ++index)
arr[index] = languages[index].strdup();
arr[languages.size()] = NULL;
return arr;
}
TESS_API int TESS_CALL TessBaseAPIInitLangMod(TessBaseAPI* handle, const char* datapath, const char* language)
{
return handle->InitLangMod(datapath, language);
}
TESS_API void TESS_CALL TessBaseAPIInitForAnalysePage(TessBaseAPI* handle)
{
handle->InitForAnalysePage();
}
TESS_API void TESS_CALL TessBaseAPIReadConfigFile(TessBaseAPI* handle, const char* filename)
{
handle->ReadConfigFile(filename);
}
TESS_API void TESS_CALL TessBaseAPIReadDebugConfigFile(TessBaseAPI* handle, const char* filename)
{
handle->ReadDebugConfigFile(filename);
}
TESS_API void TESS_CALL TessBaseAPISetPageSegMode(TessBaseAPI* handle, TessPageSegMode mode)
{
handle->SetPageSegMode(mode);
}
TESS_API TessPageSegMode TESS_CALL TessBaseAPIGetPageSegMode(const TessBaseAPI* handle)
{
return handle->GetPageSegMode();
}
TESS_API char* TESS_CALL TessBaseAPIRect(TessBaseAPI* handle, const unsigned char* imagedata,
int bytes_per_pixel, int bytes_per_line,
int left, int top, int width, int height)
{
return handle->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top, width, height);
}
TESS_API void TESS_CALL TessBaseAPIClearAdaptiveClassifier(TessBaseAPI* handle)
{
handle->ClearAdaptiveClassifier();
}
TESS_API void TESS_CALL TessBaseAPISetImage(TessBaseAPI* handle, const unsigned char* imagedata, int width, int height,
int bytes_per_pixel, int bytes_per_line)
{
handle->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
}
TESS_API void TESS_CALL TessBaseAPISetImage2(TessBaseAPI* handle, struct Pix* pix)
{
return handle->SetImage(pix);
}
TESS_API void TESS_CALL TessBaseAPISetSourceResolution(TessBaseAPI* handle, int ppi)
{
handle->SetSourceResolution(ppi);
}
TESS_API void TESS_CALL TessBaseAPISetRectangle(TessBaseAPI* handle, int left, int top, int width, int height)
{
handle->SetRectangle(left, top, width, height);
}
TESS_API void TESS_CALL TessBaseAPISetThresholder(TessBaseAPI* handle, TessImageThresholder* thresholder)
{
handle->SetThresholder(thresholder);
}
TESS_API struct Pix* TESS_CALL TessBaseAPIGetThresholdedImage(TessBaseAPI* handle)
{
return handle->GetThresholdedImage();
}
TESS_API struct Boxa* TESS_CALL TessBaseAPIGetRegions(TessBaseAPI* handle, struct Pixa** pixa)
{
return handle->GetRegions(pixa);
}
TESS_API struct Boxa* TESS_CALL TessBaseAPIGetTextlines(TessBaseAPI* handle, struct Pixa** pixa, int** blockids)
{
return handle->GetTextlines(pixa, blockids);
}
TESS_API struct Boxa* TESS_CALL TessBaseAPIGetTextlines1(TessBaseAPI* handle, const BOOL raw_image, const int raw_padding,
struct Pixa** pixa, int** blockids, int** paraids)
{
return handle->GetTextlines(raw_image, raw_padding, pixa, blockids, paraids);
}
TESS_API struct Boxa* TESS_CALL TessBaseAPIGetStrips(TessBaseAPI* handle, struct Pixa** pixa, int** blockids)
{
return handle->GetStrips(pixa, blockids);
}
TESS_API struct Boxa* TESS_CALL TessBaseAPIGetWords(TessBaseAPI* handle, struct Pixa** pixa)
{
return handle->GetWords(pixa);
}
TESS_API struct Boxa* TESS_CALL TessBaseAPIGetConnectedComponents(TessBaseAPI* handle, struct Pixa** cc)
{
return handle->GetConnectedComponents(cc);
}
TESS_API struct Boxa* TESS_CALL TessBaseAPIGetComponentImages(TessBaseAPI* handle, TessPageIteratorLevel level, BOOL text_only, struct Pixa** pixa, int** blockids)
{
return handle->GetComponentImages(level, text_only != FALSE, pixa, blockids);
}
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetComponentImages1(TessBaseAPI* handle, const TessPageIteratorLevel level, const BOOL text_only,
const BOOL raw_image, const int raw_padding,
struct Pixa** pixa, int** blockids, int** paraids)
{
return handle->GetComponentImages(level, text_only != FALSE, raw_image, raw_padding, pixa, blockids, paraids);
}
TESS_API int TESS_CALL TessBaseAPIGetThresholdedImageScaleFactor(const TessBaseAPI* handle)
{
return handle->GetThresholdedImageScaleFactor();
}
TESS_API void TESS_CALL TessBaseAPIDumpPGM(TessBaseAPI* handle, const char* filename)
{
handle->DumpPGM(filename);
}
TESS_API TessPageIterator* TESS_CALL TessBaseAPIAnalyseLayout(TessBaseAPI* handle)
{
return handle->AnalyseLayout();
}
TESS_API int TESS_CALL TessBaseAPIRecognize(TessBaseAPI* handle, ETEXT_DESC* monitor)
{
return handle->Recognize(monitor);
}
TESS_API int TESS_CALL TessBaseAPIRecognizeForChopTest(TessBaseAPI* handle, ETEXT_DESC* monitor)
{
return handle->RecognizeForChopTest(monitor);
}
TESS_API BOOL TESS_CALL TessBaseAPIProcessPages(TessBaseAPI* handle, const char* filename, const char* retry_config,
int timeout_millisec, TessResultRenderer* renderer)
{
if (handle->ProcessPages(filename, retry_config, timeout_millisec, renderer))
return TRUE;
else
return FALSE;
}
TESS_API BOOL TESS_CALL TessBaseAPIProcessPage(TessBaseAPI* handle, struct Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec, TessResultRenderer* renderer)
{
if (handle->ProcessPage(pix, page_index, filename, retry_config, timeout_millisec, renderer, nullptr, 0))
return TRUE;
else
return FALSE;
}
TESS_API TessResultIterator* TESS_CALL TessBaseAPIGetIterator(TessBaseAPI* handle)
{
return handle->GetIterator();
}
TESS_API TessMutableIterator* TESS_CALL TessBaseAPIGetMutableIterator(TessBaseAPI* handle)
{
return handle->GetMutableIterator();
}
TESS_API char* TESS_CALL TessBaseAPIGetUTF8Text(TessBaseAPI* handle)
{
return handle->GetUTF8Text();
}
TESS_API char* TESS_CALL TessBaseAPIGetHOCRText(TessBaseAPI* handle, int page_number)
{
return handle->GetHOCRText(NULL, page_number);
}
TESS_API char* TESS_CALL TessBaseAPIGetBoxText(TessBaseAPI* handle, int page_number)
{
return handle->GetBoxText(page_number);
}
TESS_API char* TESS_CALL TessBaseAPIGetUNLVText(TessBaseAPI* handle)
{
return handle->GetUNLVText();
}
TESS_API int TESS_CALL TessBaseAPIMeanTextConf(TessBaseAPI* handle)
{
return handle->MeanTextConf();
}
TESS_API int* TESS_CALL TessBaseAPIAllWordConfidences(TessBaseAPI* handle)
{
return handle->AllWordConfidences();
}
TESS_API BOOL TESS_CALL TessBaseAPIAdaptToWordStr(TessBaseAPI* handle, TessPageSegMode mode, const char* wordstr)
{
return handle->AdaptToWordStr(mode, wordstr) ? TRUE : FALSE;
}
TESS_API void TESS_CALL TessBaseAPIClear(TessBaseAPI* handle)
{
handle->Clear();
}
TESS_API void TESS_CALL TessBaseAPIEnd(TessBaseAPI* handle)
{
handle->End();
}
TESS_API int TESS_CALL TessBaseAPIIsValidWord(TessBaseAPI* handle, const char* word)
{
return handle->IsValidWord(word);
}
TESS_API BOOL TESS_CALL TessBaseAPIGetTextDirection(TessBaseAPI* handle, int* out_offset, float* out_slope)
{
return handle->GetTextDirection(out_offset, out_slope) ? TRUE : FALSE;
}
TESS_API void TESS_CALL TessBaseAPISetDictFunc(TessBaseAPI* handle, TessDictFunc f)
{
handle->SetDictFunc(f);
}
TESS_API void TESS_CALL TessBaseAPIClearPersistentCache(TessBaseAPI* handle)
{
handle->ClearPersistentCache();
}
TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* handle, TessProbabilityInContextFunc f)
{
handle->SetProbabilityInContextFunc(f);
}
TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results)
{
return FALSE; // Unsafe ABI, return FALSE always
}
TESS_API BOOL TESS_CALL TessBaseAPIDetectOrientationScript(TessBaseAPI* handle,
int* orient_deg, float* orient_conf, const char** script_name, float* script_conf)
{
bool success;
success = handle->DetectOrientationScript(orient_deg, orient_conf, script_name, script_conf);
return (BOOL)success;
}
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* FeatureOutlineIndex)
{
handle->GetFeaturesForBlob(blob, int_features, num_features, FeatureOutlineIndex);
}
TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom)
{
return TessBaseAPI::FindRowForBox(blocks, left, top, right, bottom);
}
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
int* unichar_ids, float* ratings, int* num_matches_returned)
{
handle->RunAdaptiveClassifier(blob, num_max_matches, unichar_ids, ratings, num_matches_returned);
}
TESS_API const char* TESS_CALL TessBaseAPIGetUnichar(TessBaseAPI* handle, int unichar_id)
{
return handle->GetUnichar(unichar_id);
}
TESS_API const TessDawg* TESS_CALL TessBaseAPIGetDawg(const TessBaseAPI* handle, int i)
{
return handle->GetDawg(i);
}
TESS_API int TESS_CALL TessBaseAPINumDawgs(const TessBaseAPI* handle)
{
return handle->NumDawgs();
}
TESS_API ROW* TESS_CALL TessMakeTessOCRRow(float baseline, float xheight, float descender, float ascender)
{
return TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
}
TESS_API TBLOB* TESS_CALL TessMakeTBLOB(struct Pix* pix)
{
return TessBaseAPI::MakeTBLOB(pix);
}
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB* tblob, ROW* row, BOOL numeric_mode)
{
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE);
}
TESS_API TessOcrEngineMode TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle)
{
return handle->oem();
}
TESS_API void TESS_CALL TessBaseAPIInitTruthCallback(TessBaseAPI* handle, TessTruthCallback* cb)
{
handle->InitTruthCallback(cb);
}
#ifndef NO_CUBE_BUILD
TESS_API TessCubeRecoContext* TESS_CALL TessBaseAPIGetCubeRecoContext(const TessBaseAPI* handle)
{
return handle->GetCubeRecoContext();
}
#endif // NO_CUBE_BUILD
TESS_API void TESS_CALL TessBaseAPISetMinOrientationMargin(TessBaseAPI* handle, double margin)
{
handle->set_min_orientation_margin(margin);
}
TESS_API void TESS_CALL TessBaseGetBlockTextOrientations(TessBaseAPI* handle, int** block_orientation, bool** vertical_writing)
{
handle->GetBlockTextOrientations(block_orientation, vertical_writing);
}
TESS_API BLOCK_LIST* TESS_CALL TessBaseAPIFindLinesCreateBlockList(TessBaseAPI* handle)
{
return handle->FindLinesCreateBlockList();
}
TESS_API void TESS_CALL TessPageIteratorDelete(TessPageIterator* handle)
{
delete handle;
}
TESS_API TessPageIterator* TESS_CALL TessPageIteratorCopy(const TessPageIterator* handle)
{
return new TessPageIterator(*handle);
}
TESS_API void TESS_CALL TessPageIteratorBegin(TessPageIterator* handle)
{
handle->Begin();
}
TESS_API BOOL TESS_CALL TessPageIteratorNext(TessPageIterator* handle, TessPageIteratorLevel level)
{
return handle->Next(level) ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessPageIteratorIsAtBeginningOf(const TessPageIterator* handle, TessPageIteratorLevel level)
{
return handle->IsAtBeginningOf(level) ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessPageIteratorIsAtFinalElement(const TessPageIterator* handle, TessPageIteratorLevel level,
TessPageIteratorLevel element)
{
return handle->IsAtFinalElement(level, element) ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessPageIteratorBoundingBox(const TessPageIterator* handle, TessPageIteratorLevel level,
int* left, int* top, int* right, int* bottom)
{
return handle->BoundingBox(level, left, top, right, bottom) ? TRUE : FALSE;
}
TESS_API TessPolyBlockType TESS_CALL TessPageIteratorBlockType(const TessPageIterator* handle)
{
return handle->BlockType();
}
TESS_API struct Pix* TESS_CALL TessPageIteratorGetBinaryImage(const TessPageIterator* handle, TessPageIteratorLevel level)
{
return handle->GetBinaryImage(level);
}
TESS_API struct Pix* TESS_CALL TessPageIteratorGetImage(const TessPageIterator* handle, TessPageIteratorLevel level, int padding,
struct Pix* original_image, int* left, int* top)
{
return handle->GetImage(level, padding, original_image, left, top);
}
TESS_API BOOL TESS_CALL TessPageIteratorBaseline(const TessPageIterator* handle, TessPageIteratorLevel level,
int* x1, int* y1, int* x2, int* y2)
{
return handle->Baseline(level, x1, y1, x2, y2) ? TRUE : FALSE;
}
TESS_API void TESS_CALL TessPageIteratorOrientation(TessPageIterator* handle, TessOrientation* orientation,
TessWritingDirection* writing_direction, TessTextlineOrder* textline_order,
float* deskew_angle)
{
handle->Orientation(orientation, writing_direction, textline_order, deskew_angle);
}
TESS_API void TESS_CALL TessPageIteratorParagraphInfo(TessPageIterator* handle, TessParagraphJustification* justification,
BOOL *is_list_item, BOOL *is_crown, int *first_line_indent)
{
bool bool_is_list_item, bool_is_crown;
handle->ParagraphInfo(justification, &bool_is_list_item, &bool_is_crown, first_line_indent);
if (is_list_item)
*is_list_item = bool_is_list_item ? TRUE : FALSE;
if (is_crown)
*is_crown = bool_is_crown ? TRUE : FALSE;
}
TESS_API void TESS_CALL TessResultIteratorDelete(TessResultIterator* handle)
{
delete handle;
}
TESS_API TessResultIterator* TESS_CALL TessResultIteratorCopy(const TessResultIterator* handle)
{
return new TessResultIterator(*handle);
}
TESS_API TessPageIterator* TESS_CALL TessResultIteratorGetPageIterator(TessResultIterator* handle)
{
return handle;
}
TESS_API const TessPageIterator* TESS_CALL TessResultIteratorGetPageIteratorConst(const TessResultIterator* handle)
{
return handle;
}
TESS_API TessChoiceIterator* TESS_CALL TessResultIteratorGetChoiceIterator(const TessResultIterator* handle)
{
return new TessChoiceIterator(*handle);
}
TESS_API BOOL TESS_CALL TessResultIteratorNext(TessResultIterator* handle, TessPageIteratorLevel level)
{
return handle->Next(level);
}
TESS_API char* TESS_CALL TessResultIteratorGetUTF8Text(const TessResultIterator* handle, TessPageIteratorLevel level)
{
return handle->GetUTF8Text(level);
}
TESS_API float TESS_CALL TessResultIteratorConfidence(const TessResultIterator* handle, TessPageIteratorLevel level)
{
return handle->Confidence(level);
}
TESS_API const char* TESS_CALL TessResultIteratorWordRecognitionLanguage(const TessResultIterator* handle)
{
return handle->WordRecognitionLanguage();
}
TESS_API const char* TESS_CALL TessResultIteratorWordFontAttributes(const TessResultIterator* handle, BOOL* is_bold, BOOL* is_italic,
BOOL* is_underlined, BOOL* is_monospace, BOOL* is_serif,
BOOL* is_smallcaps, int* pointsize, int* font_id)
{
bool bool_is_bold, bool_is_italic, bool_is_underlined, bool_is_monospace, bool_is_serif, bool_is_smallcaps;
const char* ret = handle->WordFontAttributes(&bool_is_bold, &bool_is_italic, &bool_is_underlined, &bool_is_monospace, &bool_is_serif,
&bool_is_smallcaps, pointsize, font_id);
if (is_bold)
*is_bold = bool_is_bold ? TRUE : FALSE;
if (is_italic)
*is_italic = bool_is_italic ? TRUE : FALSE;
if (is_underlined)
*is_underlined = bool_is_underlined ? TRUE : FALSE;
if (is_monospace)
*is_monospace = bool_is_monospace ? TRUE : FALSE;
if (is_serif)
*is_serif = bool_is_serif ? TRUE : FALSE;
if (is_smallcaps)
*is_smallcaps = bool_is_smallcaps ? TRUE : FALSE;
return ret;
}
TESS_API BOOL TESS_CALL TessResultIteratorWordIsFromDictionary(const TessResultIterator* handle)
{
return handle->WordIsFromDictionary() ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessResultIteratorWordIsNumeric(const TessResultIterator* handle)
{
return handle->WordIsNumeric() ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessResultIteratorSymbolIsSuperscript(const TessResultIterator* handle)
{
return handle->SymbolIsSuperscript() ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessResultIteratorSymbolIsSubscript(const TessResultIterator* handle)
{
return handle->SymbolIsSubscript() ? TRUE : FALSE;
}
TESS_API BOOL TESS_CALL TessResultIteratorSymbolIsDropcap(const TessResultIterator* handle)
{
return handle->SymbolIsDropcap() ? TRUE : FALSE;
}
TESS_API void TESS_CALL TessChoiceIteratorDelete(TessChoiceIterator* handle)
{
delete handle;
}
TESS_API BOOL TESS_CALL TessChoiceIteratorNext(TessChoiceIterator* handle)
{
return handle->Next();
}
TESS_API const char* TESS_CALL TessChoiceIteratorGetUTF8Text(const TessChoiceIterator* handle)
{
return handle->GetUTF8Text();
}
TESS_API float TESS_CALL TessChoiceIteratorConfidence(const TessChoiceIterator* handle)
{
return handle->Confidence();
}
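/* Illustrative usage sketch (not part of capi.cpp): a minimal end-to-end call
 * sequence through the C wrappers above. Assumes leptonica's pixRead()/
 * pixDestroy(), an installed "eng" traineddata and <stdio.h>; paths are
 * placeholders. */
#if 0
static int ocr_one_image(const char* tessdata_dir, const char* image_path)
{
    TessBaseAPI* api = TessBaseAPICreate();
    if (TessBaseAPIInit3(api, tessdata_dir, "eng") != 0) {    /* 0 == success */
        TessBaseAPIDelete(api);
        return -1;
    }
    struct Pix* pix = pixRead(image_path);                    /* leptonica image loader */
    if (pix == NULL) {
        TessBaseAPIDelete(api);
        return -1;
    }
    TessBaseAPISetImage2(api, pix);
    char* text = TessBaseAPIGetUTF8Text(api);                 /* caller owns the string */
    if (text != NULL) {
        printf("%s", text);
        TessDeleteText(text);                                 /* free with the matching API */
    }
    pixDestroy(&pix);
    TessBaseAPIEnd(api);
    TessBaseAPIDelete(api);
    return 0;
}
#endif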

View File

@ -0,0 +1,409 @@
///////////////////////////////////////////////////////////////////////
// File: capi.h
// Description: C-API TessBaseAPI
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef API_CAPI_H_
#define API_CAPI_H_
#ifdef TESS_CAPI_INCLUDE_BASEAPI
# include "baseapi.h"
# include "pageiterator.h"
# include "resultiterator.h"
# include "renderer.h"
#else
# include "platform.h"
# include <stdio.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
#ifndef TESS_CALL
# if defined(WIN32)
# define TESS_CALL __cdecl
# else
# define TESS_CALL
# endif
#endif
#ifndef BOOL
# define BOOL int
# define TRUE 1
# define FALSE 0
#endif
#ifdef TESS_CAPI_INCLUDE_BASEAPI
typedef tesseract::TessResultRenderer TessResultRenderer;
typedef tesseract::TessTextRenderer TessTextRenderer;
typedef tesseract::TessHOcrRenderer TessHOcrRenderer;
typedef tesseract::TessPDFRenderer TessPDFRenderer;
typedef tesseract::TessUnlvRenderer TessUnlvRenderer;
typedef tesseract::TessBoxTextRenderer TessBoxTextRenderer;
typedef tesseract::TessBaseAPI TessBaseAPI;
typedef tesseract::PageIterator TessPageIterator;
typedef tesseract::ResultIterator TessResultIterator;
typedef tesseract::MutableIterator TessMutableIterator;
typedef tesseract::ChoiceIterator TessChoiceIterator;
typedef tesseract::OcrEngineMode TessOcrEngineMode;
typedef tesseract::PageSegMode TessPageSegMode;
typedef tesseract::ImageThresholder TessImageThresholder;
typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
typedef tesseract::DictFunc TessDictFunc;
typedef tesseract::ProbabilityInContextFunc TessProbabilityInContextFunc;
// typedef tesseract::ParamsModelClassifyFunc TessParamsModelClassifyFunc;
typedef tesseract::FillLatticeFunc TessFillLatticeFunc;
typedef tesseract::Dawg TessDawg;
typedef tesseract::TruthCallback TessTruthCallback;
#ifndef NO_CUBE_BUILD
typedef tesseract::CubeRecoContext TessCubeRecoContext;
#endif // NO_CUBE_BUILD
typedef tesseract::Orientation TessOrientation;
typedef tesseract::ParagraphJustification TessParagraphJustification;
typedef tesseract::WritingDirection TessWritingDirection;
typedef tesseract::TextlineOrder TessTextlineOrder;
typedef PolyBlockType TessPolyBlockType;
#else
typedef struct TessResultRenderer TessResultRenderer;
typedef struct TessTextRenderer TessTextRenderer;
typedef struct TessHOcrRenderer TessHOcrRenderer;
typedef struct TessPDFRenderer TessPDFRenderer;
typedef struct TessUnlvRenderer TessUnlvRenderer;
typedef struct TessBoxTextRenderer TessBoxTextRenderer;
typedef struct TessBaseAPI TessBaseAPI;
typedef struct TessPageIterator TessPageIterator;
typedef struct TessResultIterator TessResultIterator;
typedef struct TessMutableIterator TessMutableIterator;
typedef struct TessChoiceIterator TessChoiceIterator;
typedef enum TessOcrEngineMode { OEM_TESSERACT_ONLY, OEM_CUBE_ONLY, OEM_TESSERACT_CUBE_COMBINED, OEM_DEFAULT } TessOcrEngineMode;
typedef enum TessPageSegMode { PSM_OSD_ONLY, PSM_AUTO_OSD, PSM_AUTO_ONLY, PSM_AUTO, PSM_SINGLE_COLUMN, PSM_SINGLE_BLOCK_VERT_TEXT,
PSM_SINGLE_BLOCK, PSM_SINGLE_LINE, PSM_SINGLE_WORD, PSM_CIRCLE_WORD, PSM_SINGLE_CHAR, PSM_SPARSE_TEXT,
PSM_SPARSE_TEXT_OSD, PSM_COUNT } TessPageSegMode;
typedef enum TessPageIteratorLevel { RIL_BLOCK, RIL_PARA, RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL} TessPageIteratorLevel;
typedef enum TessPolyBlockType { PT_UNKNOWN, PT_FLOWING_TEXT, PT_HEADING_TEXT, PT_PULLOUT_TEXT, PT_EQUATION, PT_INLINE_EQUATION,
PT_TABLE, PT_VERTICAL_TEXT, PT_CAPTION_TEXT, PT_FLOWING_IMAGE, PT_HEADING_IMAGE,
PT_PULLOUT_IMAGE, PT_HORZ_LINE, PT_VERT_LINE, PT_NOISE, PT_COUNT } TessPolyBlockType;
typedef enum TessOrientation { ORIENTATION_PAGE_UP, ORIENTATION_PAGE_RIGHT, ORIENTATION_PAGE_DOWN, ORIENTATION_PAGE_LEFT } TessOrientation;
typedef enum TessParagraphJustification { JUSTIFICATION_UNKNOWN, JUSTIFICATION_LEFT, JUSTIFICATION_CENTER, JUSTIFICATION_RIGHT } TessParagraphJustification;
typedef enum TessWritingDirection { WRITING_DIRECTION_LEFT_TO_RIGHT, WRITING_DIRECTION_RIGHT_TO_LEFT, WRITING_DIRECTION_TOP_TO_BOTTOM } TessWritingDirection;
typedef enum TessTextlineOrder { TEXTLINE_ORDER_LEFT_TO_RIGHT, TEXTLINE_ORDER_RIGHT_TO_LEFT, TEXTLINE_ORDER_TOP_TO_BOTTOM } TessTextlineOrder;
typedef struct ETEXT_DESC ETEXT_DESC;
#endif
struct Pix;
struct Boxa;
struct Pixa;
/* General free functions */
TESS_API const char*
TESS_CALL TessVersion();
TESS_API void TESS_CALL TessDeleteText(char* text);
TESS_API void TESS_CALL TessDeleteTextArray(char** arr);
TESS_API void TESS_CALL TessDeleteIntArray(int* arr);
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API void TESS_CALL TessDeleteBlockList(BLOCK_LIST* block_list);
#endif
/* Renderer API */
TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info);
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir);
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreateTextonly(const char* outputbase, const char* datadir,
BOOL textonly);
TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessBoxTextRendererCreate(const char* outputbase);
TESS_API void TESS_CALL TessDeleteResultRenderer(TessResultRenderer* renderer);
TESS_API void TESS_CALL TessResultRendererInsert(TessResultRenderer* renderer, TessResultRenderer* next);
TESS_API TessResultRenderer*
TESS_CALL TessResultRendererNext(TessResultRenderer* renderer);
TESS_API BOOL TESS_CALL TessResultRendererBeginDocument(TessResultRenderer* renderer, const char* title);
TESS_API BOOL TESS_CALL TessResultRendererAddImage(TessResultRenderer* renderer, TessBaseAPI* api);
TESS_API BOOL TESS_CALL TessResultRendererEndDocument(TessResultRenderer* renderer);
TESS_API const char* TESS_CALL TessResultRendererExtention(TessResultRenderer* renderer);
TESS_API const char* TESS_CALL TessResultRendererTitle(TessResultRenderer* renderer);
TESS_API int TESS_CALL TessResultRendererImageNum(TessResultRenderer* renderer);
/* Base API */
TESS_API TessBaseAPI*
TESS_CALL TessBaseAPICreate();
TESS_API void TESS_CALL TessBaseAPIDelete(TessBaseAPI* handle);
TESS_API size_t TESS_CALL TessBaseAPIGetOpenCLDevice(TessBaseAPI* handle, void **device);
TESS_API void TESS_CALL TessBaseAPISetInputName( TessBaseAPI* handle, const char* name);
TESS_API const char* TESS_CALL TessBaseAPIGetInputName(TessBaseAPI* handle);
TESS_API void TESS_CALL TessBaseAPISetInputImage(TessBaseAPI* handle, struct Pix* pix);
TESS_API struct Pix* TESS_CALL TessBaseAPIGetInputImage(TessBaseAPI* handle);
TESS_API int TESS_CALL TessBaseAPIGetSourceYResolution(TessBaseAPI* handle);
TESS_API const char* TESS_CALL TessBaseAPIGetDatapath(TessBaseAPI* handle);
TESS_API void TESS_CALL TessBaseAPISetOutputName(TessBaseAPI* handle, const char* name);
TESS_API BOOL TESS_CALL TessBaseAPISetVariable(TessBaseAPI* handle, const char* name, const char* value);
TESS_API BOOL TESS_CALL TessBaseAPISetDebugVariable(TessBaseAPI* handle, const char* name, const char* value);
TESS_API BOOL TESS_CALL TessBaseAPIGetIntVariable( const TessBaseAPI* handle, const char* name, int* value);
TESS_API BOOL TESS_CALL TessBaseAPIGetBoolVariable( const TessBaseAPI* handle, const char* name, BOOL* value);
TESS_API BOOL TESS_CALL TessBaseAPIGetDoubleVariable(const TessBaseAPI* handle, const char* name, double* value);
TESS_API const char*
TESS_CALL TessBaseAPIGetStringVariable(const TessBaseAPI* handle, const char* name);
TESS_API void TESS_CALL TessBaseAPIPrintVariables( const TessBaseAPI* handle, FILE* fp);
TESS_API BOOL TESS_CALL TessBaseAPIPrintVariablesToFile(const TessBaseAPI* handle, const char* filename);
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API BOOL TESS_CALL TessBaseAPIGetVariableAsString(TessBaseAPI* handle, const char* name, STRING* val);
#endif
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API int TESS_CALL TessBaseAPIInit(TessBaseAPI* handle, const char* datapath, const char* language,
TessOcrEngineMode mode, char** configs, int configs_size,
const STRING* vars_vec, size_t vars_vec_size,
const STRING* vars_values, size_t vars_values_size, BOOL set_only_init_params);
#endif
TESS_API int TESS_CALL TessBaseAPIInit1(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode oem,
char** configs, int configs_size);
TESS_API int TESS_CALL TessBaseAPIInit2(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode oem);
TESS_API int TESS_CALL TessBaseAPIInit3(TessBaseAPI* handle, const char* datapath, const char* language);
TESS_API int TESS_CALL TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode mode,
char** configs, int configs_size,
char** vars_vec, char** vars_values, size_t vars_vec_size,
BOOL set_only_non_debug_params);
TESS_API const char*
TESS_CALL TessBaseAPIGetInitLanguagesAsString(const TessBaseAPI* handle);
TESS_API char**
TESS_CALL TessBaseAPIGetLoadedLanguagesAsVector(const TessBaseAPI* handle);
TESS_API char**
TESS_CALL TessBaseAPIGetAvailableLanguagesAsVector(const TessBaseAPI* handle);
TESS_API int TESS_CALL TessBaseAPIInitLangMod(TessBaseAPI* handle, const char* datapath, const char* language);
TESS_API void TESS_CALL TessBaseAPIInitForAnalysePage(TessBaseAPI* handle);
TESS_API void TESS_CALL TessBaseAPIReadConfigFile(TessBaseAPI* handle, const char* filename);
TESS_API void TESS_CALL TessBaseAPIReadDebugConfigFile(TessBaseAPI* handle, const char* filename);
TESS_API void TESS_CALL TessBaseAPISetPageSegMode(TessBaseAPI* handle, TessPageSegMode mode);
TESS_API TessPageSegMode
TESS_CALL TessBaseAPIGetPageSegMode(const TessBaseAPI* handle);
TESS_API char* TESS_CALL TessBaseAPIRect(TessBaseAPI* handle, const unsigned char* imagedata,
int bytes_per_pixel, int bytes_per_line,
int left, int top, int width, int height);
TESS_API void TESS_CALL TessBaseAPIClearAdaptiveClassifier(TessBaseAPI* handle);
TESS_API void TESS_CALL TessBaseAPISetImage(TessBaseAPI* handle, const unsigned char* imagedata, int width, int height,
int bytes_per_pixel, int bytes_per_line);
TESS_API void TESS_CALL TessBaseAPISetImage2(TessBaseAPI* handle, struct Pix* pix);
TESS_API void TESS_CALL TessBaseAPISetSourceResolution(TessBaseAPI* handle, int ppi);
TESS_API void TESS_CALL TessBaseAPISetRectangle(TessBaseAPI* handle, int left, int top, int width, int height);
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API void TESS_CALL TessBaseAPISetThresholder(TessBaseAPI* handle, TessImageThresholder* thresholder);
#endif
TESS_API struct Pix*
TESS_CALL TessBaseAPIGetThresholdedImage( TessBaseAPI* handle);
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetRegions( TessBaseAPI* handle, struct Pixa** pixa);
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetTextlines( TessBaseAPI* handle, struct Pixa** pixa, int** blockids);
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetTextlines1( TessBaseAPI* handle, const BOOL raw_image, const int raw_padding,
struct Pixa** pixa, int** blockids, int** paraids);
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetStrips( TessBaseAPI* handle, struct Pixa** pixa, int** blockids);
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetWords( TessBaseAPI* handle, struct Pixa** pixa);
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetConnectedComponents(TessBaseAPI* handle, struct Pixa** cc);
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetComponentImages( TessBaseAPI* handle, const TessPageIteratorLevel level, const BOOL text_only,
struct Pixa** pixa, int** blockids);
TESS_API struct Boxa*
TESS_CALL TessBaseAPIGetComponentImages1( TessBaseAPI* handle, const TessPageIteratorLevel level, const BOOL text_only,
const BOOL raw_image, const int raw_padding,
struct Pixa** pixa, int** blockids, int** paraids);
TESS_API int TESS_CALL TessBaseAPIGetThresholdedImageScaleFactor(const TessBaseAPI* handle);
TESS_API void TESS_CALL TessBaseAPIDumpPGM(TessBaseAPI* handle, const char* filename);
TESS_API TessPageIterator*
TESS_CALL TessBaseAPIAnalyseLayout(TessBaseAPI* handle);
TESS_API int TESS_CALL TessBaseAPIRecognize(TessBaseAPI* handle, ETEXT_DESC* monitor);
TESS_API int TESS_CALL TessBaseAPIRecognizeForChopTest(TessBaseAPI* handle, ETEXT_DESC* monitor);
TESS_API BOOL TESS_CALL TessBaseAPIProcessPages(TessBaseAPI* handle, const char* filename, const char* retry_config,
int timeout_millisec, TessResultRenderer* renderer);
TESS_API BOOL TESS_CALL TessBaseAPIProcessPage(TessBaseAPI* handle, struct Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec, TessResultRenderer* renderer);
TESS_API TessResultIterator*
TESS_CALL TessBaseAPIGetIterator(TessBaseAPI* handle);
TESS_API TessMutableIterator*
TESS_CALL TessBaseAPIGetMutableIterator(TessBaseAPI* handle);
TESS_API char* TESS_CALL TessBaseAPIGetUTF8Text(TessBaseAPI* handle);
TESS_API char* TESS_CALL TessBaseAPIGetHOCRText(TessBaseAPI* handle, int page_number);
TESS_API char* TESS_CALL TessBaseAPIGetBoxText(TessBaseAPI* handle, int page_number);
TESS_API char* TESS_CALL TessBaseAPIGetUNLVText(TessBaseAPI* handle);
TESS_API int TESS_CALL TessBaseAPIMeanTextConf(TessBaseAPI* handle);
TESS_API int* TESS_CALL TessBaseAPIAllWordConfidences(TessBaseAPI* handle);
TESS_API BOOL TESS_CALL TessBaseAPIAdaptToWordStr(TessBaseAPI* handle, TessPageSegMode mode, const char* wordstr);
TESS_API void TESS_CALL TessBaseAPIClear(TessBaseAPI* handle);
TESS_API void TESS_CALL TessBaseAPIEnd(TessBaseAPI* handle);
TESS_API int TESS_CALL TessBaseAPIIsValidWord(TessBaseAPI* handle, const char* word);
TESS_API BOOL TESS_CALL TessBaseAPIGetTextDirection(TessBaseAPI* handle, int* out_offset, float* out_slope);
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API void TESS_CALL TessBaseAPISetDictFunc(TessBaseAPI* handle, TessDictFunc f);
TESS_API void TESS_CALL TessBaseAPIClearPersistentCache(TessBaseAPI* handle);
TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* handle, TessProbabilityInContextFunc f);
TESS_API void TESS_CALL TessBaseAPISetFillLatticeFunc(TessBaseAPI* handle, TessFillLatticeFunc f);
// Deprecated, no longer working
TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results);
// Call TessDeleteText(*best_script_name) to free memory allocated by this function
TESS_API BOOL TESS_CALL TessBaseAPIDetectOrientationScript(TessBaseAPI* handle,
int* orient_deg, float* orient_conf, const char **script_name, float* script_conf);
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* FeatureOutlineIndex);
TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom);
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
int* unichar_ids, float* ratings, int* num_matches_returned);
#endif
TESS_API const char*
TESS_CALL TessBaseAPIGetUnichar(TessBaseAPI* handle, int unichar_id);
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API const TessDawg*
TESS_CALL TessBaseAPIGetDawg(const TessBaseAPI* handle, int i);
TESS_API int TESS_CALL TessBaseAPINumDawgs(const TessBaseAPI* handle);
#endif
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API ROW* TESS_CALL TessMakeTessOCRRow(float baseline, float xheight, float descender, float ascender);
TESS_API TBLOB*
TESS_CALL TessMakeTBLOB(Pix* pix);
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB* tblob, ROW* row, BOOL numeric_mode);
TESS_API TessOcrEngineMode
TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle);
TESS_API void TESS_CALL TessBaseAPIInitTruthCallback(TessBaseAPI* handle, TessTruthCallback* cb);
#ifndef NO_CUBE_BUILD
TESS_API TessCubeRecoContext*
TESS_CALL TessBaseAPIGetCubeRecoContext(const TessBaseAPI* handle);
#endif // NO_CUBE_BUILD
#endif
TESS_API void TESS_CALL TessBaseAPISetMinOrientationMargin(TessBaseAPI* handle, double margin);
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API void TESS_CALL TessBaseGetBlockTextOrientations(TessBaseAPI* handle, int** block_orientation, BOOL** vertical_writing);
TESS_API BLOCK_LIST*
TESS_CALL TessBaseAPIFindLinesCreateBlockList(TessBaseAPI* handle);
#endif
/* Page iterator */
TESS_API void TESS_CALL TessPageIteratorDelete(TessPageIterator* handle);
TESS_API TessPageIterator*
TESS_CALL TessPageIteratorCopy(const TessPageIterator* handle);
TESS_API void TESS_CALL TessPageIteratorBegin(TessPageIterator* handle);
TESS_API BOOL TESS_CALL TessPageIteratorNext(TessPageIterator* handle, TessPageIteratorLevel level);
TESS_API BOOL TESS_CALL TessPageIteratorIsAtBeginningOf(const TessPageIterator* handle, TessPageIteratorLevel level);
TESS_API BOOL TESS_CALL TessPageIteratorIsAtFinalElement(const TessPageIterator* handle, TessPageIteratorLevel level,
TessPageIteratorLevel element);
TESS_API BOOL TESS_CALL TessPageIteratorBoundingBox(const TessPageIterator* handle, TessPageIteratorLevel level,
int* left, int* top, int* right, int* bottom);
TESS_API TessPolyBlockType
TESS_CALL TessPageIteratorBlockType(const TessPageIterator* handle);
TESS_API struct Pix*
TESS_CALL TessPageIteratorGetBinaryImage(const TessPageIterator* handle, TessPageIteratorLevel level);
TESS_API struct Pix*
TESS_CALL TessPageIteratorGetImage(const TessPageIterator* handle, TessPageIteratorLevel level, int padding,
struct Pix* original_image, int* left, int* top);
TESS_API BOOL TESS_CALL TessPageIteratorBaseline(const TessPageIterator* handle, TessPageIteratorLevel level,
int* x1, int* y1, int* x2, int* y2);
TESS_API void TESS_CALL TessPageIteratorOrientation(TessPageIterator* handle, TessOrientation* orientation,
TessWritingDirection* writing_direction, TessTextlineOrder* textline_order,
float* deskew_angle);
TESS_API void TESS_CALL TessPageIteratorParagraphInfo(TessPageIterator* handle, TessParagraphJustification* justification,
BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
/* Result iterator */
TESS_API void TESS_CALL TessResultIteratorDelete(TessResultIterator* handle);
TESS_API TessResultIterator*
TESS_CALL TessResultIteratorCopy(const TessResultIterator* handle);
TESS_API TessPageIterator*
TESS_CALL TessResultIteratorGetPageIterator(TessResultIterator* handle);
TESS_API const TessPageIterator*
TESS_CALL TessResultIteratorGetPageIteratorConst(const TessResultIterator* handle);
TESS_API TessChoiceIterator*
TESS_CALL TessResultIteratorGetChoiceIterator(const TessResultIterator* handle);
TESS_API BOOL TESS_CALL TessResultIteratorNext(TessResultIterator* handle, TessPageIteratorLevel level);
TESS_API char* TESS_CALL TessResultIteratorGetUTF8Text(const TessResultIterator* handle, TessPageIteratorLevel level);
TESS_API float TESS_CALL TessResultIteratorConfidence(const TessResultIterator* handle, TessPageIteratorLevel level);
TESS_API const char*
TESS_CALL TessResultIteratorWordRecognitionLanguage(const TessResultIterator* handle);
TESS_API const char*
TESS_CALL TessResultIteratorWordFontAttributes(const TessResultIterator* handle, BOOL* is_bold, BOOL* is_italic,
BOOL* is_underlined, BOOL* is_monospace, BOOL* is_serif,
BOOL* is_smallcaps, int* pointsize, int* font_id);
TESS_API BOOL TESS_CALL TessResultIteratorWordIsFromDictionary(const TessResultIterator* handle);
TESS_API BOOL TESS_CALL TessResultIteratorWordIsNumeric(const TessResultIterator* handle);
TESS_API BOOL TESS_CALL TessResultIteratorSymbolIsSuperscript(const TessResultIterator* handle);
TESS_API BOOL TESS_CALL TessResultIteratorSymbolIsSubscript(const TessResultIterator* handle);
TESS_API BOOL TESS_CALL TessResultIteratorSymbolIsDropcap(const TessResultIterator* handle);
TESS_API void TESS_CALL TessChoiceIteratorDelete(TessChoiceIterator* handle);
TESS_API BOOL TESS_CALL TessChoiceIteratorNext(TessChoiceIterator* handle);
TESS_API const char* TESS_CALL TessChoiceIteratorGetUTF8Text(const TessChoiceIterator* handle);
TESS_API float TESS_CALL TessChoiceIteratorConfidence(const TessChoiceIterator* handle);
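/* Illustrative usage sketch (not part of this header): chaining renderers and
 * letting TessBaseAPIProcessPages feed every page to the chain. The "out"
 * output base is an assumption. */
#if 0
static void render_text_and_hocr(TessBaseAPI* api, const char* image_file)
{
    TessResultRenderer* renderer = TessTextRendererCreate("out");       /* writes out.txt */
    TessResultRendererInsert(renderer, TessHOcrRendererCreate("out"));  /* appends out.hocr */
    TessBaseAPIProcessPages(api, image_file, NULL, 0, renderer);        /* no retry config, no timeout */
    TessDeleteResultRenderer(renderer);   /* the chained renderer is deleted with it */
}
#endif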
#ifdef __cplusplus
}
#endif
#endif // API_CAPI_H_

View File

@ -0,0 +1,9 @@
#define HAVE_LIBJPEG 1
#define HAVE_LIBTIFF 1
#define HAVE_LIBPNG 1
#define HAVE_LIBZ 1
#define HAVE_LIBGIF 1
#define HAVE_LIBUNGIF 0
#define HAVE_LIBWEBP 1
#define HAVE_LIBJP2K 1
#define LIBJP2K_HEADER <openjpeg.h>

File diff suppressed because it is too large

View File

@ -0,0 +1,283 @@
///////////////////////////////////////////////////////////////////////
// File: renderer.cpp
// Description: Rendering interface to inject into TessBaseAPI
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include <string.h>
#include "baseapi.h"
#include "genericvector.h"
#include "renderer.h"
namespace tesseract {
/**********************************************************************
* Base Renderer interface implementation
**********************************************************************/
TessResultRenderer::TessResultRenderer(const char *outputbase,
const char* extension)
: file_extension_(extension),
title_(""), imagenum_(-1),
fout_(stdout),
next_(NULL),
happy_(true) {
if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) {
STRING outfile = STRING(outputbase) + STRING(".") + STRING(file_extension_);
fout_ = fopen(outfile.string(), "wb");
if (fout_ == NULL) {
happy_ = false;
}
}
}
TessResultRenderer::~TessResultRenderer() {
if (fout_ != NULL) {
if (fout_ != stdout)
fclose(fout_);
else
clearerr(fout_);
}
delete next_;
}
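// For example, after r->insert(b) followed by r->insert(c) the chain is
// r -> c -> b; deleting r frees the whole chain because ~TessResultRenderer
// deletes next_.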
void TessResultRenderer::insert(TessResultRenderer* next) {
if (next == NULL) return;
TessResultRenderer* remainder = next_;
next_ = next;
if (remainder) {
while (next->next_ != NULL) {
next = next->next_;
}
next->next_ = remainder;
}
}
bool TessResultRenderer::BeginDocument(const char* title) {
if (!happy_) return false;
title_ = title;
imagenum_ = -1;
bool ok = BeginDocumentHandler();
if (next_) {
ok = next_->BeginDocument(title) && ok;
}
return ok;
}
bool TessResultRenderer::AddImage(TessBaseAPI* api, const char* jpgdata, int len) {
if (!happy_) return false;
++imagenum_;
bool ok = AddImageHandler(api, jpgdata, len);
if (next_) {
ok = next_->AddImage(api, jpgdata, len) && ok;
}
return ok;
}
bool TessResultRenderer::EndDocument() {
if (!happy_) return false;
bool ok = EndDocumentHandler();
if (next_) {
ok = next_->EndDocument() && ok;
}
return ok;
}
void TessResultRenderer::AppendString(const char* s) {
AppendData(s, strlen(s));
}
void TessResultRenderer::AppendData(const char* s, int len) {
int n = fwrite(s, 1, len, fout_);
if (n != len) happy_ = false;
}
bool TessResultRenderer::BeginDocumentHandler() {
return happy_;
}
bool TessResultRenderer::EndDocumentHandler() {
return happy_;
}
/**********************************************************************
* UTF8 Text Renderer interface implementation
**********************************************************************/
TessTextRenderer::TessTextRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "txt") {
}
bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {
char* utf8 = api->GetUTF8Text();
if (utf8 == NULL) {
return false;
}
AppendString(utf8);
delete[] utf8;
bool pageBreak = false;
api->GetBoolVariable("include_page_breaks", &pageBreak);
const char* pageSeparator = api->GetStringVariable("page_separator");
if (pageBreak) {
AppendString(pageSeparator);
}
return true;
}
/**********************************************************************
* HOcr Text Renderer interface implementation
**********************************************************************/
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "hocr") {
font_info_ = false;
}
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
: TessResultRenderer(outputbase, "hocr") {
font_info_ = font_info;
}
bool TessHOcrRenderer::BeginDocumentHandler() {
AppendString(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
"lang=\"en\">\n <head>\n <title>");
AppendString(title());
AppendString(
"</title>\n"
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\" />\n"
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
"' />\n"
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
" ocr_line ocrx_word");
if (font_info_)
AppendString(
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf");
AppendString(
"'/>\n"
"</head>\n<body>\n");
return true;
}
bool TessHOcrRenderer::EndDocumentHandler() {
AppendString(" </body>\n</html>\n");
return true;
}
bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
char* hocr = api->GetHOCRText(imagenum());
if (hocr == NULL) return false;
AppendString(hocr);
delete[] hocr;
return true;
}
/**********************************************************************
* TSV Text Renderer interface implementation
**********************************************************************/
TessTsvRenderer::TessTsvRenderer(const char* outputbase)
: TessResultRenderer(outputbase, "tsv") {
font_info_ = false;
}
TessTsvRenderer::TessTsvRenderer(const char* outputbase, bool font_info)
: TessResultRenderer(outputbase, "tsv") {
font_info_ = font_info;
}
bool TessTsvRenderer::BeginDocumentHandler() {
// Output TSV column headings
AppendString(
"level\tpage_num\tblock_num\tpar_num\tline_num\tword_"
"num\tleft\ttop\twidth\theight\tconf\ttext\n");
return true;
}
bool TessTsvRenderer::EndDocumentHandler() { return true; }
bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
char* tsv = api->GetTSVText(imagenum());
if (tsv == NULL) return false;
AppendString(tsv);
delete[] tsv;
return true;
}
/**********************************************************************
* UNLV Text Renderer interface implementation
**********************************************************************/
TessUnlvRenderer::TessUnlvRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "unlv") {
}
bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) {
char* unlv = api->GetUNLVText();
if (unlv == NULL) return false;
AppendString(unlv);
delete[] unlv;
return true;
}
/**********************************************************************
* BoxText Renderer interface implementation
**********************************************************************/
TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "box") {
}
bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) {
char* text = api->GetBoxText(imagenum());
if (text == NULL) return false;
AppendString(text);
delete[] text;
return true;
}
/**********************************************************************
* Osd Text Renderer interface implementation
**********************************************************************/
TessOsdRenderer::TessOsdRenderer(const char* outputbase)
: TessResultRenderer(outputbase, "osd") {}
bool TessOsdRenderer::AddImageHandler(TessBaseAPI* api) {
char* osd = api->GetOsdText(imagenum());
if (osd == NULL) return false;
AppendString(osd);
delete[] osd;
return true;
}
} // namespace tesseract

View File

@ -0,0 +1,271 @@
///////////////////////////////////////////////////////////////////////
// File: renderer.h
// Description: Rendering interface to inject into TessBaseAPI
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_API_RENDERER_H_
#define TESSERACT_API_RENDERER_H_
// To avoid collision with other typenames include the ABSOLUTE MINIMUM
// complexity of includes here. Use forward declarations wherever possible
// and hide includes of complex types in baseapi.cpp.
#include "genericvector.h"
#include "platform.h"
#include "publictypes.h"
namespace tesseract {
class TessBaseAPI;
/**
* Interface for rendering tesseract results into a document, such as text,
* HOCR or pdf. This class is abstract. Specific classes handle individual
* formats. This interface is then used to inject the renderer class into
* tesseract when processing images.
*
* For simplicity of implementation with tesseract version 3.01,
* the renderer contains document state that is cleared from document
* to document just as the TessBaseAPI is. This way the base API can just
* delegate its rendering functionality to injected renderers, and the
* renderers can manage the associated state needed for the specific formats
* in addition to the heuristics for producing it.
*/
class TESS_API TessResultRenderer {
public:
virtual ~TessResultRenderer();
// Takes ownership of the pointer, so it must be a new'd instance.
// Renderers aren't ordered; this appends the chain of the next parameter
// to the existing next() chain. The renderers should be unique across both lists.
void insert(TessResultRenderer* next);
// Returns the next renderer or NULL.
TessResultRenderer* next() { return next_; }
/**
* Starts a new document with the given title.
* This clears the contents of the output data.
* Title should use UTF-8 encoding.
*/
bool BeginDocument(const char* title);
/**
* Adds the recognized text from the source image to the current document.
* Invalid if BeginDocument not yet called.
*
* Note that this API is a bit weird but is designed to fit into the
* current TessBaseAPI implementation where the api has lots of state
* information that we might want to add in.
*/
bool AddImage(TessBaseAPI * api, const char * jpgdata, int len);
/**
* Finishes the document and finalizes the output data
* Invalid if BeginDocument not yet called.
*/
bool EndDocument();
const char* file_extension() const { return file_extension_; }
const char* title() const { return title_.c_str(); }
/**
* Returns the index of the last image given to AddImage
* (i.e. the counter is incremented whether or not the image succeeded).
*
* This is always defined. Depending on where you are in the document
* lifecycle, it is the number of the current image, of the last image
* ended, or of the pages in the completed document.
* Will return -1 if a document was never started.
*/
int imagenum() const { return imagenum_; }
protected:
/**
* Called by concrete classes.
*
* outputbase is the name of the output file excluding
* extension. For example, "/path/to/chocolate-chip-cookie-recipe"
*
* extension indicates the file extension to be used for output
* files. For example "pdf" will produce a .pdf file, and "hocr"
* will produce .hocr files.
*/
TessResultRenderer(const char *outputbase,
const char* extension);
// Hook for specialized handling in BeginDocument()
virtual bool BeginDocumentHandler();
// This must be overridden to render the OCR'd results
virtual bool AddImageHandler(TessBaseAPI* api) = 0;
virtual bool AddImageHandler(TessBaseAPI* api, const char* jpgdata, int len) = 0;
// Hook for specialized handling in EndDocument()
virtual bool EndDocumentHandler();
// Renderers can call this to append '\0' terminated strings into
// the output string returned by GetOutput.
// This method will grow the output buffer if needed.
void AppendString(const char* s);
// Renderers can call this to append binary byte sequences into
// the output string returned by GetOutput. Note that s is not necessarily
// '\0' terminated (and can contain '\0' within it).
// This method will grow the output buffer if needed.
void AppendData(const char* s, int len);
private:
const char* file_extension_; // standard extension for generated output
STRING title_; // title of document being rendered
int imagenum_; // index of last image added
FILE* fout_; // output file pointer
TessResultRenderer* next_; // Can link multiple renderers together
bool happy_; // I get grumpy when the disk fills up, etc.
};
/**
* Renders tesseract output into a plain UTF-8 text string
*/
class TESS_API TessTextRenderer : public TessResultRenderer {
public:
explicit TessTextRenderer(const char *outputbase);
protected:
virtual bool AddImageHandler(TessBaseAPI* api);
virtual bool AddImageHandler(TessBaseAPI* api, const char* jpgdata, int len) { return false; }
};
/**
* Renders tesseract output into an hocr text string
*/
class TESS_API TessHOcrRenderer : public TessResultRenderer {
public:
explicit TessHOcrRenderer(const char *outputbase, bool font_info);
explicit TessHOcrRenderer(const char *outputbase);
protected:
virtual bool BeginDocumentHandler();
virtual bool AddImageHandler(TessBaseAPI* api);
virtual bool AddImageHandler(TessBaseAPI* api, const char* jpgdata, int len) { return false; }
virtual bool EndDocumentHandler();
private:
bool font_info_; // whether to print font information
};
/**
* Renders Tesseract output into a TSV string
*/
class TESS_API TessTsvRenderer : public TessResultRenderer {
public:
explicit TessTsvRenderer(const char* outputbase, bool font_info);
explicit TessTsvRenderer(const char* outputbase);
protected:
virtual bool BeginDocumentHandler();
virtual bool AddImageHandler(TessBaseAPI* api);
virtual bool AddImageHandler(TessBaseAPI* api, const char* jpgdata, int len) { return false; }
virtual bool EndDocumentHandler();
private:
bool font_info_; // whether to print font information
};
/**
* Renders tesseract output into searchable PDF
*/
class TESS_API TessPDFRenderer : public TessResultRenderer {
public:
// datadir is the location of the TESSDATA. We need it because
// we load a custom PDF font from this location.
TessPDFRenderer(const char* outputbase, const char* datadir);
TessPDFRenderer(const char* outputbase, const char* datadir, bool textonly);
protected:
virtual bool AddImageHandler(TessBaseAPI* api, const char* jpgdata, int len);
virtual bool BeginDocumentHandler();
virtual bool AddImageHandler(TessBaseAPI* api);
virtual bool EndDocumentHandler();
private:
// We don't want to have every image in memory at once,
// so we store some metadata as we go along producing
// PDFs one page at a time. At the end, that metadata is
// used to make everything that isn't easily handled in a
// streaming fashion.
long int obj_; // counter for PDF objects
GenericVector<long int> offsets_; // offset of every PDF object in bytes
GenericVector<long int> pages_; // object number for every /Page object
const char *datadir_; // where to find the custom font
bool textonly_; // skip images if set
// Bookkeeping only. DIY = Do It Yourself.
void AppendPDFObjectDIY(size_t objectsize);
// Bookkeeping + emit data.
void AppendPDFObject(const char *data);
// Create the /Contents object for an entire page.
char* GetPDFTextObjects(TessBaseAPI* api, double width, double height);
// Turn an image into a PDF object. Only transcode if we have to.
static bool imageToPDFObj(Pix *pix, char *filename, long int objnum,
char **pdf_object, long int *pdf_object_size);
static bool imageToPDFObj(const char* jpgdata, int len, long int objnum,
char **pdf_object, long int *pdf_object_size);
};
/**
* Renders tesseract output into a plain UTF-8 text string
*/
class TESS_API TessUnlvRenderer : public TessResultRenderer {
public:
explicit TessUnlvRenderer(const char *outputbase);
protected:
virtual bool AddImageHandler(TessBaseAPI* api);
virtual bool AddImageHandler(TessBaseAPI* api, const char* jpgdata, int len) { return false; }
};
/**
* Renders tesseract output into a plain UTF-8 text string
*/
class TESS_API TessBoxTextRenderer : public TessResultRenderer {
public:
explicit TessBoxTextRenderer(const char *outputbase);
protected:
virtual bool AddImageHandler(TessBaseAPI* api);
virtual bool AddImageHandler(TessBaseAPI* api, const char* jpgdata, int len) { return false; }
};
/**
* Renders tesseract output into an osd text string
*/
class TESS_API TessOsdRenderer : public TessResultRenderer {
public:
explicit TessOsdRenderer(const char* outputbase);
protected:
virtual bool AddImageHandler(TessBaseAPI* api);
virtual bool AddImageHandler(TessBaseAPI* api, const char* jpgdata, int len) { return false; }
};
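/* Illustrative usage sketch (not part of this header): injecting a renderer
 * chain into TessBaseAPI from C++. Assumes baseapi.h is included; the output
 * base "out", the "eng" language and the tessdata path are assumptions. */
#if 0
inline bool RunOcrToTxtAndPdf(const char* image, const char* tessdata) {
  tesseract::TessBaseAPI api;
  if (api.Init(tessdata, "eng") != 0) return false;           // 0 == success
  tesseract::TessResultRenderer* renderer =
      new tesseract::TessTextRenderer("out");                 // produces out.txt
  renderer->insert(new tesseract::TessPDFRenderer("out", tessdata));  // produces out.pdf
  bool ok = api.ProcessPages(image, NULL, 0, renderer);       // every page goes to the chain
  delete renderer;                                            // also deletes the chained renderer
  return ok;
}
#endif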
} // namespace tesseract.
#endif // TESSERACT_API_RENDERER_H_

View File

@ -0,0 +1,546 @@
/**********************************************************************
* File: tessedit.cpp (Formerly tessedit.c)
* Description: Main program for merge of tess and editor.
* Author: Ray Smith
* Created: Tue Jan 07 15:21:46 GMT 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// Include automatically generated configuration file if running autoconf
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include <iostream>
#include "allheaders.h"
#include "baseapi.h"
#include "basedir.h"
#include "dict.h"
#include "openclwrapper.h"
#include "osdetect.h"
#include "renderer.h"
#include "strngs.h"
#include "tprintf.h"
#include "StopWatch.h"
#if defined(HAVE_TIFFIO_H) && defined(_WIN32)
#include <tiffio.h>
static void Win32WarningHandler(const char* module, const char* fmt,
va_list ap) {
if (module != NULL) {
fprintf(stderr, "%s: ", module);
}
fprintf(stderr, "Warning, ");
vfprintf(stderr, fmt, ap);
fprintf(stderr, ".\n");
}
#endif /* HAVE_TIFFIO_H && _WIN32 */
void PrintVersionInfo() {
char* versionStrP;
printf("tesseract %s\n", tesseract::TessBaseAPI::Version());
versionStrP = getLeptonicaVersion();
printf(" %s\n", versionStrP);
lept_free(versionStrP);
versionStrP = getImagelibVersions();
printf(" %s\n", versionStrP);
lept_free(versionStrP);
#ifdef USE_OPENCL
cl_platform_id platform[4];
cl_uint num_platforms;
printf(" OpenCL info:\n");
if (clGetPlatformIDs(4, platform, &num_platforms) == CL_SUCCESS) {
printf(" Found %u platform(s).\n", num_platforms);
for (unsigned n = 0; n < num_platforms; n++) {
char info[256];
if (clGetPlatformInfo(platform[n], CL_PLATFORM_NAME, 256, info, 0) ==
CL_SUCCESS) {
printf(" Platform %u name: %s.\n", n + 1, info);
}
if (clGetPlatformInfo(platform[n], CL_PLATFORM_VERSION, 256, info, 0) ==
CL_SUCCESS) {
printf(" Version: %s.\n", info);
}
cl_device_id devices[2];
cl_uint num_devices;
if (clGetDeviceIDs(platform[n], CL_DEVICE_TYPE_ALL, 2, devices,
&num_devices) == CL_SUCCESS) {
printf(" Found %u device(s).\n", num_devices);
for (unsigned i = 0; i < num_devices; ++i) {
if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 256, info, 0) ==
CL_SUCCESS) {
printf(" Device %u name: %s.\n", i + 1, info);
}
}
}
}
}
#endif
}
void PrintUsage(const char* program) {
printf(
"Usage:\n"
" %s --help | --help-psm | --help-oem | --version\n"
" %s --list-langs [--tessdata-dir PATH]\n"
" %s --print-parameters [options...] [configfile...]\n"
" %s imagename|stdin outputbase|stdout [options...] [configfile...]\n",
program, program, program, program);
}
void PrintHelpForPSM() {
const char* msg =
"Page segmentation modes:\n"
" 0 Orientation and script detection (OSD) only.\n"
" 1 Automatic page segmentation with OSD.\n"
" 2 Automatic page segmentation, but no OSD, or OCR.\n"
" 3 Fully automatic page segmentation, but no OSD. (Default)\n"
" 4 Assume a single column of text of variable sizes.\n"
" 5 Assume a single uniform block of vertically aligned text.\n"
" 6 Assume a single uniform block of text.\n"
" 7 Treat the image as a single text line.\n"
" 8 Treat the image as a single word.\n"
" 9 Treat the image as a single word in a circle.\n"
" 10 Treat the image as a single character.\n"
" 11 Sparse text. Find as much text as possible in no"
" particular order.\n"
" 12 Sparse text with OSD.\n"
" 13 Raw line. Treat the image as a single text line,\n"
"\t\t\tbypassing hacks that are Tesseract-specific.\n";
printf("%s", msg);
}
void PrintHelpForOEM() {
const char* msg =
"OCR Engine modes:\n"
" 0 Original Tesseract only.\n"
" 1 Cube only.\n"
" 2 Tesseract + cube.\n"
" 3 Default, based on what is available.\n";
printf("%s", msg);
}
void PrintHelpMessage(const char* program) {
PrintUsage(program);
const char* ocr_options =
"OCR options:\n"
" --tessdata-dir PATH Specify the location of tessdata path.\n"
" --user-words PATH Specify the location of user words file.\n"
" --user-patterns PATH Specify the location of user patterns file.\n"
" -l LANG[+LANG] Specify language(s) used for OCR.\n"
" -c VAR=VALUE Set value for config variables.\n"
" Multiple -c arguments are allowed.\n"
" --psm NUM Specify page segmentation mode.\n"
" --oem NUM Specify OCR Engine mode.\n"
"NOTE: These options must occur before any configfile.\n";
printf("\n%s\n", ocr_options);
PrintHelpForPSM();
PrintHelpForOEM();
const char* single_options =
"Single options:\n"
" -h, --help Show this help message.\n"
" --help-psm Show page segmentation modes.\n"
" --help-oem Show OCR Engine modes.\n"
" -v, --version Show version information.\n"
" --list-langs List available languages for tesseract engine.\n"
" --print-parameters Print tesseract parameters to stdout.\n";
printf("\n%s", single_options);
}
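// Illustrative invocations (file names are hypothetical) matching the help
// text above:
//   tesseract page.tif out -l eng --psm 6      # OCR a single text block
//   tesseract page.tif stdout --oem 0          # print recognized text only
//   tesseract --list-langs --tessdata-dir ./tessdata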
void SetVariablesFromCLArgs(tesseract::TessBaseAPI* api, int argc,
char** argv) {
char opt1[256], opt2[255];
for (int i = 0; i < argc; i++) {
if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) {
strncpy(opt1, argv[i + 1], 255);
opt1[255] = '\0';
char* p = strchr(opt1, '=');
if (!p) {
fprintf(stderr, "Missing = in configvar assignment\n");
exit(1);
}
*p = 0;
strncpy(opt2, strchr(argv[i + 1], '=') + 1, 255);
opt2[254] = 0;
++i;
if (!api->SetVariable(opt1, opt2)) {
fprintf(stderr, "Could not set option: %s=%s\n", opt1, opt2);
}
}
}
}
void PrintLangsList(tesseract::TessBaseAPI* api) {
GenericVector<STRING> languages;
api->GetAvailableLanguagesAsVector(&languages);
printf("List of available languages (%d):\n", languages.size());
for (int index = 0; index < languages.size(); ++index) {
STRING& string = languages[index];
printf("%s\n", string.string());
}
api->End();
}
void PrintBanner() {
tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n",
tesseract::TessBaseAPI::Version());
}
/**
* We have 2 possible sources of pagesegmode: a config file and
* the command line. For backwards compatibility reasons, the
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
* default for this program is tesseract::PSM_AUTO. We will let
* the config file take priority, so the command-line default
* can take priority over the tesseract default, so we use the
* value from the command line only if the retrieved mode
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
* in any config file. Therefore the only way to force
* tesseract::PSM_SINGLE_BLOCK is from the command line.
* It would be simpler if we could set the value before Init,
* but that doesn't work.
*/
void FixPageSegMode(tesseract::TessBaseAPI* api,
tesseract::PageSegMode pagesegmode) {
if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
api->SetPageSegMode(pagesegmode);
}
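// Example of the rule above: if no config file touched the mode,
// GetPageSegMode() still returns the library default PSM_SINGLE_BLOCK after
// Init, so the command-line / program default is applied; if a config file
// already set, say, PSM_SPARSE_TEXT, that value is kept.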
// NOTE: arg_i is used here to avoid ugly *i so many times in this function
void ParseArgs(const int argc, char** argv, const char** lang,
const char** image, const char** outputbase,
const char** datapath, bool* list_langs, bool* print_parameters,
GenericVector<STRING>* vars_vec,
GenericVector<STRING>* vars_values, int* arg_i,
tesseract::PageSegMode* pagesegmode,
tesseract::OcrEngineMode* enginemode) {
if (argc == 1) {
PrintHelpMessage(argv[0]);
exit(0);
}
if (argc == 2) {
if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) {
PrintHelpMessage(argv[0]);
exit(0);
}
if ((strcmp(argv[1], "--help-psm") == 0)) {
PrintHelpForPSM();
exit(0);
}
if ((strcmp(argv[1], "--help-oem") == 0)) {
PrintHelpForOEM();
exit(0);
}
if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) {
PrintVersionInfo();
exit(0);
}
}
bool noocr = false;
int i = 1;
while (i < argc && (*outputbase == NULL || argv[i][0] == '-')) {
if (strcmp(argv[i], "-l") == 0 && i + 1 < argc) {
*lang = argv[i + 1];
++i;
}
else if (strcmp(argv[i], "--tessdata-dir") == 0 && i + 1 < argc) {
*datapath = argv[i + 1];
++i;
}
else if (strcmp(argv[i], "--user-words") == 0 && i + 1 < argc) {
vars_vec->push_back("user_words_file");
vars_values->push_back(argv[i + 1]);
++i;
}
else if (strcmp(argv[i], "--user-patterns") == 0 && i + 1 < argc) {
vars_vec->push_back("user_patterns_file");
vars_values->push_back(argv[i + 1]);
++i;
}
else if (strcmp(argv[i], "--list-langs") == 0) {
noocr = true;
*list_langs = true;
}
else if (strcmp(argv[i], "-psm") == 0 && i + 1 < argc) {
// The parameter -psm is deprecated and was replaced by --psm.
// It is still supported for compatibility reasons.
*pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[i + 1]));
++i;
}
else if (strcmp(argv[i], "--psm") == 0 && i + 1 < argc) {
*pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[i + 1]));
++i;
}
else if (strcmp(argv[i], "--oem") == 0 && i + 1 < argc) {
*enginemode = static_cast<tesseract::OcrEngineMode>(atoi(argv[i + 1]));
++i;
}
else if (strcmp(argv[i], "--print-parameters") == 0) {
noocr = true;
*print_parameters = true;
}
else if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) {
// handled properly after api init
++i;
}
else if (*image == NULL) {
*image = argv[i];
}
else if (*outputbase == NULL) {
*outputbase = argv[i];
}
++i;
}
*arg_i = i;
if (argc == 2 && strcmp(argv[1], "--list-langs") == 0) {
*list_langs = true;
noocr = true;
}
if (*outputbase == NULL && noocr == false) {
PrintHelpMessage(argv[0]);
exit(1);
}
}
void PreloadRenderers(
tesseract::TessBaseAPI* api,
tesseract::PointerVector<tesseract::TessResultRenderer>* renderers,
tesseract::PageSegMode pagesegmode, const char* outputbase) {
if (pagesegmode == tesseract::PSM_OSD_ONLY) {
renderers->push_back(new tesseract::TessOsdRenderer(outputbase));
}
else {
bool b;
api->GetBoolVariable("tessedit_create_hocr", &b);
if (b) {
bool font_info;
api->GetBoolVariable("hocr_font_info", &font_info);
renderers->push_back(
new tesseract::TessHOcrRenderer(outputbase, font_info));
}
api->GetBoolVariable("tessedit_create_tsv", &b);
if (b) {
bool font_info;
api->GetBoolVariable("hocr_font_info", &font_info);
renderers->push_back(
new tesseract::TessTsvRenderer(outputbase, font_info));
}
api->GetBoolVariable("tessedit_create_pdf", &b);
if (b) {
bool textonly;
api->GetBoolVariable("textonly_pdf", &textonly);
renderers->push_back(new tesseract::TessPDFRenderer(
outputbase, api->GetDatapath(), textonly));
}
api->GetBoolVariable("tessedit_write_unlv", &b);
if (b) {
renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
}
api->GetBoolVariable("tessedit_create_boxfile", &b);
if (b) {
renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
}
api->GetBoolVariable("tessedit_create_txt", &b);
if (b || renderers->empty()) {
renderers->push_back(new tesseract::TessTextRenderer(outputbase));
}
}
if (!renderers->empty()) {
// Since the PointerVector auto-deletes, null-out the renderers that are
// added to the root, and leave the root in the vector.
for (int r = 1; r < renderers->size(); ++r) {
(*renderers)[0]->insert((*renderers)[r]);
(*renderers)[r] = NULL;
}
}
}
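// Illustrative outcome: with tessedit_create_hocr=1 and tessedit_create_pdf=1
// the vector first holds {TessHOcrRenderer, TessPDFRenderer}; the loop above
// chains the PDF renderer into the hOCR renderer and nulls its slot, so the
// caller only has to drive renderers[0].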
/**********************************************************************
* main()
*
**********************************************************************/
int main(int argc, char** argv) {
const char* lang = "osd";
const char* image = NULL;
const char* outputbase = NULL;
const char* datapath = NULL;
bool list_langs = false;
bool print_parameters = false;
int arg_i = 1;
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO_OSD;
tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT;
/* main() calls functions like ParseArgs which call exit().
* This results in memory leaks if vars_vec and vars_values are
* declared as auto variables (destructor is not called then). */
static GenericVector<STRING> vars_vec;
static GenericVector<STRING> vars_values;
#ifdef NDEBUG
// Disable debugging and informational messages from Leptonica.
setMsgSeverity(L_SEVERITY_ERROR);
#endif
#if defined(HAVE_TIFFIO_H) && defined(_WIN32)
/* Show libtiff warnings on console (not in GUI). */
TIFFSetWarningHandler(Win32WarningHandler);
#endif /* HAVE_TIFFIO_H && _WIN32 */
ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
&print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode,
&enginemode);
bool banner = false;
if (outputbase != NULL && strcmp(outputbase, "-") &&
strcmp(outputbase, "stdout")) {
banner = true;
}
PERF_COUNT_START("Tesseract:main")
// Call GlobalDawgCache here to create the global DawgCache object before
// the TessBaseAPI object. This fixes the order of destructor calls:
// first TessBaseAPI must be destructed, DawgCache must be the last object.
tesseract::Dict::GlobalDawgCache();
// Avoid memory leak caused by auto variable when exit() is called.
static tesseract::TessBaseAPI api;
api.SetOutputName(outputbase);
int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
argc - arg_i, &vars_vec, &vars_values, false);
if (init_failed) {
fprintf(stderr, "Could not initialize tesseract.\n");
getchar();
return EXIT_FAILURE;
}
SetVariablesFromCLArgs(&api, argc, argv);
if (list_langs) {
PrintLangsList(&api);
getchar();
return EXIT_SUCCESS;
}
if (print_parameters) {
FILE* fout = stdout;
fprintf(stdout, "Tesseract parameters:\n");
api.PrintVariables(fout);
api.End();
getchar();
return EXIT_SUCCESS;
}
FixPageSegMode(&api, pagesegmode);
if (pagesegmode == tesseract::PSM_AUTO_OSD) {
int ret_val = EXIT_SUCCESS;
Pix* pixs = pixRead(image);
if (!pixs) {
fprintf(stderr, "Cannot open input file: %s\n", image);
getchar();
return 2;
}
api.SetImage(pixs);
tesseract::Orientation orientation;
tesseract::WritingDirection direction;
tesseract::TextlineOrder order;
float deskew_angle;
tesseract::PageIterator* it = api.AnalyseLayout();
if (it) {
StopWatch timer;
timer.reset();
it->Orientation(&orientation, &direction, &order, &deskew_angle);
tprintf(
"Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n"
"Deskew angle: %.4f\n time: %.4f\n img: %s",
orientation, direction, order, deskew_angle, timer.elapsed_s(), image);
getchar();
}
else {
ret_val = EXIT_FAILURE;
}
delete it;
pixDestroy(&pixs);
return ret_val;
}
// set in_training_mode to true when using one of these configs:
// ambigs.train, box.train, box.train.stderr, linebox, rebox
bool b = false;
bool in_training_mode =
(api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
(api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
(api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
// Avoid memory leak caused by auto variable when exit() is called.
static tesseract::PointerVector<tesseract::TessResultRenderer> renderers;
if (in_training_mode) {
renderers.push_back(NULL);
}
else {
PreloadRenderers(&api, &renderers, pagesegmode, outputbase);
}
if (!renderers.empty()) {
if (banner) PrintBanner();
bool succeed = api.ProcessPages(image, NULL, 0, renderers[0]);
if (!succeed) {
fprintf(stderr, "Error during processing.\n");
return EXIT_FAILURE;
}
}
PERF_COUNT_END
return EXIT_SUCCESS;
}

View File

@ -0,0 +1,126 @@
/**********************************************************************
* File: adaptions.cpp (Formerly adaptions.c)
* Description: Functions used to adapt to blobs already confidently
* identified
* Author: Chris Newton
* Created: Thu Oct 7 10:17:28 BST 1993
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#pragma warning(disable:4305) // int/float warnings
#endif
#ifdef __UNIX__
#include <assert.h>
#endif
#include <ctype.h>
#include <string.h>
#include "tessbox.h"
#include "tessvars.h"
#include "memry.h"
#include "reject.h"
#include "control.h"
#include "stopper.h"
#include "tesseractclass.h"
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
namespace tesseract {
BOOL8 Tesseract::word_adaptable( //should we adapt?
WERD_RES *word,
uinT16 mode) {
if (tessedit_adaption_debug) {
tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
word->best_choice == NULL ? "" :
word->best_choice->unichar_string().string(),
word->best_choice->rating(), word->best_choice->certainty());
}
BOOL8 status = FALSE;
BITS16 flags(mode);
enum MODES
{
ADAPTABLE_WERD,
ACCEPTABLE_WERD,
CHECK_DAWGS,
CHECK_SPACES,
CHECK_ONE_ELL_CONFLICT,
CHECK_AMBIG_WERD
};
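// The mode argument is treated as a bit mask over the MODES enum above
// (assuming BITS16::bit(n) tests bit n of mode): e.g. mode = 0x03 runs only
// the ADAPTABLE_WERD and ACCEPTABLE_WERD checks, while mode = 0x27 also
// enables CHECK_DAWGS and CHECK_AMBIG_WERD.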
/*
0: NO adaption
*/
if (mode == 0) {
if (tessedit_adaption_debug) tprintf("adaption disabled\n");
return FALSE;
}
if (flags.bit(ADAPTABLE_WERD)) {
status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
if (tessedit_adaption_debug && !status) {
tprintf("tess_would_adapt bit is false\n");
}
}
if (flags.bit(ACCEPTABLE_WERD)) {
status |= word->tess_accepted;
if (tessedit_adaption_debug && !status) {
tprintf("tess_accepted bit is false\n");
}
}
if (!status) { // If not set then
return FALSE; // ignore other checks
}
if (flags.bit(CHECK_DAWGS) &&
(word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
(word->best_choice->permuter() != FREQ_DAWG_PERM) &&
(word->best_choice->permuter() != USER_DAWG_PERM) &&
(word->best_choice->permuter() != NUMBER_PERM)) {
if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
return FALSE;
}
if (flags.bit(CHECK_ONE_ELL_CONFLICT) && one_ell_conflict(word, FALSE)) {
if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
return FALSE;
}
if (flags.bit(CHECK_SPACES) &&
(strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
if (tessedit_adaption_debug) tprintf("word contains spaces\n");
return FALSE;
}
if (flags.bit(CHECK_AMBIG_WERD) &&
word->best_choice->dangerous_ambig_found()) {
if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
return FALSE;
}
if (tessedit_adaption_debug) {
tprintf("returning status %d\n", status);
}
return status;
}
} // namespace tesseract

View File

@ -0,0 +1,814 @@
/**********************************************************************
* File: applybox.cpp (Formerly applybox.c)
* Description: Re segment rows according to box file data
* Author: Phil Cheatle
* Created: Wed Nov 24 09:11:23 GMT 1993
*
* (C) Copyright 1993, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
#include <ctype.h>
#include <string.h>
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#endif
#include "allheaders.h"
#include "boxread.h"
#include "chopper.h"
#include "pageres.h"
#include "unichar.h"
#include "unicharset.h"
#include "tesseractclass.h"
#include "genericvector.h"
/** Max number of blobs to classify together in FindSegmentation. */
const int kMaxGroupSize = 4;
/// Max fraction of median allowed as deviation in xheight before switching
/// to median.
const double kMaxXHeightDeviationFraction = 0.125;
/**
* The box file is assumed to contain box definitions, one per line, of the
* following format for blob-level boxes:
* @verbatim
* <UTF8 str> <left> <bottom> <right> <top> <page id>
* @endverbatim
* and for word/line-level boxes:
* @verbatim
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
* @endverbatim
* NOTES:
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
*
* <page id> is 0-based, and the page number is used for multipage input (tiff).
*
* In the blob-level form, each line represents a recognizable unit, which may
* be several UTF-8 bytes, but there is a bounding box around each recognizable
* unit, and no classifier is needed to train in this mode (bootstrapping.)
*
* In the word/line-level form, the line begins with the literal "WordStr", and
* the bounding box bounds either a whole line or a whole word. The recognizable
* units in the word/line are listed after the # at the end of the line and
* are space delimited, ignoring any original spaces on the line.
* Eg.
* @verbatim
* word -> #w o r d
* multi word line -> #m u l t i w o r d l i n e
* @endverbatim
* The recognizable units must be space-delimited in order to allow multiple
* unicodes to be used for a single recognizable unit, eg Hindi.
*
* In this mode, the classifier must have been pre-trained with the desired
* character set, or it will not be able to find the character segmentations.
*/
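// Illustrative box file lines (coordinates are made up):
// blob level: s 26 34 48 70 0
// word/line level: WordStr 20 30 190 72 0 #s a m p l e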
namespace tesseract {
static void clear_any_old_text(BLOCK_LIST *block_list) {
BLOCK_IT block_it(block_list);
for (block_it.mark_cycle_pt();
!block_it.cycled_list(); block_it.forward()) {
ROW_IT row_it(block_it.data()->row_list());
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
WERD_IT word_it(row_it.data()->word_list());
for (word_it.mark_cycle_pt();
!word_it.cycled_list(); word_it.forward()) {
word_it.data()->set_text("");
}
}
}
}
// Applies the box file based on the image name fname, and resegments
// the words in the block_list (page), with:
// blob-mode: one blob per line in the box file, words as input.
// word/line-mode: one blob per space-delimited unit after the #, and one word
// per line in the box file. (See comment above for box file format.)
// If find_segmentation is true, (word/line mode) then the classifier is used
// to re-segment words/lines to match the space-delimited truth string for
// each box. In this case, the input box may be for a word or even a whole
// text line, and the output words will contain multiple blobs corresponding
// to the space-delimited input string.
// With find_segmentation false, no classifier is needed, but the chopper
// can still be used to correctly segment touching characters with the help
// of the input boxes.
// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
// from normal classification, ie. with a word, chopped_word, rebuild_word,
// seam_array, denorm, box_word, and best_state, but NO best_choice or
// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
// Instead, the correct_text member of WERD_RES is set, and this may be later
// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
// is not required before calling ApplyBoxTraining.
PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
bool find_segmentation,
BLOCK_LIST *block_list) {
GenericVector<TBOX> boxes;
GenericVector<STRING> texts, full_texts;
if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
NULL)) {
return NULL; // Can't do it.
}
int box_count = boxes.size();
int box_failures = 0;
// Add an empty box and empty strings to the end as a sentinel.
boxes.push_back(TBOX());
texts.push_back(STRING());
full_texts.push_back(STRING());
// In word mode, we use the boxes to make a word for each box, but
// in blob mode we use the existing words and maximally chop them first.
PAGE_RES* page_res = find_segmentation ?
NULL : SetupApplyBoxes(boxes, block_list);
clear_any_old_text(block_list);
for (int i = 0; i < boxes.size() - 1; i++) {
bool foundit = false;
if (page_res != NULL) {
if (i == 0) {
foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
full_texts[i].string());
}
else {
foundit = ResegmentCharBox(page_res, &boxes[i - 1], boxes[i],
boxes[i + 1], full_texts[i].string());
}
}
else {
foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
texts[i].string());
}
if (!foundit) {
box_failures++;
ReportFailedBox(i, boxes[i], texts[i].string(),
"FAILURE! Couldn't find a matching blob");
}
}
if (page_res == NULL) {
// In word/line mode, we now maximally chop all the words and resegment
// them with the classifier.
page_res = SetupApplyBoxes(boxes, block_list);
ReSegmentByClassification(page_res);
}
if (applybox_debug > 0) {
tprintf("APPLY_BOXES:\n");
tprintf(" Boxes read from boxfile: %6d\n", box_count);
if (box_failures > 0)
tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
}
TidyUp(page_res);
return page_res;
}
// Helper computes median xheight in the image.
static double MedianXHeight(BLOCK_LIST *block_list) {
BLOCK_IT block_it(block_list);
STATS xheights(0, block_it.data()->bounding_box().height());
for (block_it.mark_cycle_pt();
!block_it.cycled_list(); block_it.forward()) {
ROW_IT row_it(block_it.data()->row_list());
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
}
}
return xheights.median();
}
/// Any row xheight that is significantly different from the median is set
/// to the median.
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
double median_xheight = MedianXHeight(block_list);
double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
// Clamp each row's x-height to the median if it deviates too much.
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
ROW* row = r_it.data();
float diff = fabs(row->x_height() - median_xheight);
if (diff > max_deviation) {
if (applybox_debug) {
tprintf("row xheight=%g, but median xheight = %g\n",
row->x_height(), median_xheight);
}
row->set_x_height(static_cast<float>(median_xheight));
}
}
}
}
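// Worked example: with a median x-height of 40px, max_deviation is
// 0.125 * 40 = 5px, so a row measured at 48px is reset to 40 while a row
// at 43px is left unchanged.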
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
/// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
BLOCK_LIST *block_list) {
PreenXHeights(block_list);
// Strip all fuzzy space markers to simplify the PAGE_RES.
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
ROW* row = r_it.data();
WERD_IT w_it(row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
if (word->cblob_list()->empty()) {
delete w_it.extract();
}
else {
word->set_flag(W_FUZZY_SP, false);
word->set_flag(W_FUZZY_NON, false);
}
}
}
}
PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res;
while ((word_res = pr_it.word()) != NULL) {
MaximallyChopWord(boxes, pr_it.block()->block,
pr_it.row()->row, word_res);
pr_it.forward();
}
return page_res;
}
/// Tests the chopper by exhaustively running chop_one_blob.
/// The word_res will contain filled chopped_word, seam_array, denorm,
/// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
BLOCK* block, ROW* row,
WERD_RES* word_res) {
if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
tessedit_ocr_engine_mode, NULL,
classify_bln_numeric_mode,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
row, block)) {
word_res->CloneChoppedToRebuild();
return;
}
if (chop_debug) {
tprintf("Maximally chopping word at:");
word_res->word->bounding_box().print();
}
GenericVector<BLOB_CHOICE*> blob_choices;
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
float rating = static_cast<float>(MAX_INT8);
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
// The rating and certainty are not quite arbitrary. Since
// select_blob_to_chop uses the worst certainty to choose, they all have
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
// in here, and then divide by e each time they are chopped, which
// should guarantee a set of unequal values for the whole tree of blobs
// produced, however much chopping is required. The chops are thus only
// limited by the ability of the chopper to find suitable chop points,
// and not by the value of the certainties.
BLOB_CHOICE* choice =
new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
blob_choices.push_back(choice);
rating -= 0.125f;
}
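// E.g. with three blobs the initial ratings are 127.0, 126.875 and 126.75
// (MAX_INT8 minus 1/8 per blob); each chop below then divides the left
// piece's rating by e, keeping every value in the tree distinct.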
const double e = exp(1.0); // The base of natural logs.
int blob_number;
int right_chop_index = 0;
if (!assume_fixed_pitch_char_segment) {
// We only chop if the language is not fixed pitch like CJK.
SEAM* seam = NULL;
while ((seam = chop_one_blob(boxes, blob_choices, word_res,
&blob_number)) != NULL) {
word_res->InsertSeam(blob_number, seam);
BLOB_CHOICE* left_choice = blob_choices[blob_number];
rating = left_choice->rating() / e;
left_choice->set_rating(rating);
left_choice->set_certainty(-rating);
// combine confidence w/ serial #
BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
rating - 0.125f, -rating, -1,
0.0f, 0.0f, 0.0f, BCC_FAKE);
blob_choices.insert(right_choice, blob_number + 1);
}
}
word_res->CloneChoppedToRebuild();
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
/// Helper to compute the dispute resolution metric.
/// Disputed blob resolution. The aim is to give the blob to the most
/// appropriate boxfile box. Most of the time it is obvious, but if
/// two boxfile boxes overlap significantly it is not. If a small boxfile
/// box takes most of the blob, and a large boxfile box does too, then
/// we want the small boxfile box to get it, but if the small box
/// is much smaller than the blob, we don't want it to get it.
/// Details of the disputed blob resolution:
/// Given a box with area A, and a blob with area B, with overlap area C,
/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
/// miss metric gets the blob.
static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
int overlap_area = box1.intersection(box2).area();
double miss_metric = box1.area() - overlap_area;
miss_metric /= box1.area();
miss_metric *= box2.area() - overlap_area;
miss_metric /= box2.area();
return miss_metric;
}
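// Worked example: a box of area A=100 and a blob of area B=400 with overlap
// C=80 give (100-80)*(400-80)/(100*400) = 0.16; if either region is fully
// contained in the other (C=A or C=B) the metric is 0, so full containment
// always wins the blob.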
/// Gather consecutive blobs that match the given box into the best_state
/// and corresponding correct_text.
///
/// Fights over which box owns which blobs are settled by pre-chopping and
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an appropriate blob for a box.
///
/// This means that occasionally, blobs may be incorrectly segmented if the
/// chopper fails to find a suitable chop point.
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
const TBOX& box, const TBOX& next_box,
const char* correct_text) {
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
}
PAGE_RES_IT page_res_it(page_res);
WERD_RES* word_res;
for (word_res = page_res_it.word(); word_res != NULL;
word_res = page_res_it.forward()) {
if (!word_res->box_word->bounding_box().major_overlap(box))
continue;
if (applybox_debug > 1) {
tprintf("Checking word box:");
word_res->box_word->bounding_box().print();
}
int word_len = word_res->box_word->length();
for (int i = 0; i < word_len; ++i) {
TBOX char_box = TBOX();
int blob_count = 0;
for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
if (!blob_box.major_overlap(box))
break;
if (word_res->correct_text[i + blob_count].length() > 0)
break; // Blob is claimed already.
double current_box_miss_metric = BoxMissMetric(blob_box, box);
double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric)
break; // Blob is a better match for next box.
char_box += blob_box;
}
if (blob_count > 0) {
if (applybox_debug > 1) {
tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
}
if (!char_box.almost_equal(box, 3) &&
(box.x_gap(next_box) < -3 ||
(prev_box != NULL && prev_box->x_gap(box) < -3))) {
return false;
}
// We refine just the box_word, best_state and correct_text here.
// The rebuild_word is made in TidyUp.
// blob_count blobs are put together to match the box. Merge the
// box_word boxes, save the blob_count in the state and the text.
word_res->box_word->MergeBoxes(i, i + blob_count);
word_res->best_state[i] = blob_count;
word_res->correct_text[i] = correct_text;
if (applybox_debug > 2) {
tprintf("%d Blobs match: blob box:", blob_count);
word_res->box_word->BlobBox(i).print();
tprintf("Matches box:");
box.print();
tprintf("With next box:");
next_box.print();
}
// Eliminate the best_state and correct_text entries for the consumed
// blobs.
for (int j = 1; j < blob_count; ++j) {
word_res->best_state.remove(i + 1);
word_res->correct_text.remove(i + 1);
}
// Assume that no box spans multiple source words, so we are done with
// this box.
if (applybox_debug > 1) {
tprintf("Best state = ");
for (int j = 0; j < word_res->best_state.size(); ++j) {
tprintf("%d ", word_res->best_state[j]);
}
tprintf("\n");
tprintf("Correct text = [[ ");
for (int j = 0; j < word_res->correct_text.size(); ++j) {
tprintf("%s ", word_res->correct_text[j].string());
}
tprintf("]]\n");
}
return true;
}
}
}
if (applybox_debug > 0) {
tprintf("FAIL!\n");
}
return false; // Failure.
}
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
const TBOX& box, const TBOX& next_box,
const char* correct_text) {
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
}
WERD* new_word = NULL;
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
if (!box.major_overlap(block->bounding_box()))
continue;
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
ROW* row = r_it.data();
if (!box.major_overlap(row->bounding_box()))
continue;
WERD_IT w_it(row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
if (applybox_debug > 2) {
tprintf("Checking word:");
word->bounding_box().print();
}
if (word->text() != NULL && word->text()[0] != '\0')
continue; // Ignore words that are already done.
if (!box.major_overlap(word->bounding_box()))
continue;
C_BLOB_IT blob_it(word->cblob_list());
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
blob_it.forward()) {
C_BLOB* blob = blob_it.data();
TBOX blob_box = blob->bounding_box();
if (!blob_box.major_overlap(box))
continue;
double current_box_miss_metric = BoxMissMetric(blob_box, box);
double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric)
continue; // Blob is a better match for next box.
if (applybox_debug > 2) {
tprintf("Blob match: blob:");
blob_box.print();
tprintf("Matches box:");
box.print();
tprintf("With next box:");
next_box.print();
}
if (new_word == NULL) {
// Make a new word with a single blob.
new_word = word->shallow_copy();
new_word->set_text(correct_text);
w_it.add_to_end(new_word);
}
C_BLOB_IT new_blob_it(new_word->cblob_list());
new_blob_it.add_to_end(blob_it.extract());
}
}
}
}
if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
return new_word != NULL;
}
/// Resegments the words by running the classifier in an attempt to find the
/// correct segmentation that produces the required string.
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res;
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
WERD* word = word_res->word;
if (word->text() == NULL || word->text()[0] == '\0')
continue; // Ignore words that have no text.
// Convert the correct text to a vector of UNICHAR_ID
GenericVector<UNICHAR_ID> target_text;
if (!ConvertStringToUnichars(word->text(), &target_text)) {
tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
word->text());
pr_it.DeleteCurrentWord();
continue;
}
if (!FindSegmentation(target_text, word_res)) {
tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
word->text());
pr_it.DeleteCurrentWord();
continue;
}
}
}
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
/// @return false if an invalid UNICHAR_ID is encountered.
bool Tesseract::ConvertStringToUnichars(const char* utf8,
GenericVector<UNICHAR_ID>* class_ids) {
for (int step = 0; *utf8 != '\0'; utf8 += step) {
const char* next_space = strchr(utf8, ' ');
if (next_space == NULL)
next_space = utf8 + strlen(utf8);
step = next_space - utf8;
UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
if (class_id == INVALID_UNICHAR_ID) {
return false;
}
while (utf8[step] == ' ')
++step;
class_ids->push_back(class_id);
}
return true;
}
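// E.g. the truth string "w o r d" yields one UNICHAR_ID per space-delimited
// unit ('w', 'o', 'r', 'd'); any unit missing from the unicharset maps to
// INVALID_UNICHAR_ID and makes the whole conversion fail.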
/// Resegments the word to achieve the target_text from the classifier.
/// Returns false if the re-segmentation fails.
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
WERD_RES* word_res) {
// Classify all required combinations of blobs and save results in choices.
int word_length = word_res->box_word->length();
GenericVector<BLOB_CHOICE_LIST*>* choices =
new GenericVector<BLOB_CHOICE_LIST*>[word_length];
for (int i = 0; i < word_length; ++i) {
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
BLOB_CHOICE_LIST* match_result = classify_piece(
word_res->seam_array, i, i + j - 1, "Applybox",
word_res->chopped_word, word_res->blamer_bundle);
if (applybox_debug > 2) {
tprintf("%d+%d:", i, j);
print_ratings_list("Segment:", match_result, unicharset);
}
choices[i].push_back(match_result);
}
}
// Search the segmentation graph for the target text. Must be an exact
// match. Using wildcards makes it difficult to find the correct
// segmentation even when it is there.
word_res->best_state.clear();
GenericVector<int> search_segmentation;
float best_rating = 0.0f;
SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
&search_segmentation, &best_rating, &word_res->best_state);
for (int i = 0; i < word_length; ++i)
choices[i].delete_data_pointers();
delete[] choices;
if (word_res->best_state.empty()) {
// Build the original segmentation and if it is the same length as the
// truth, assume it will do.
int blob_count = 1;
for (int s = 0; s < word_res->seam_array.size(); ++s) {
SEAM* seam = word_res->seam_array[s];
if (!seam->HasAnySplits()) {
word_res->best_state.push_back(blob_count);
blob_count = 1;
}
else {
++blob_count;
}
}
word_res->best_state.push_back(blob_count);
if (word_res->best_state.size() != target_text.size()) {
word_res->best_state.clear(); // No good. Original segmentation bad size.
return false;
}
}
word_res->correct_text.clear();
for (int i = 0; i < target_text.size(); ++i) {
word_res->correct_text.push_back(
STRING(unicharset.id_to_unichar(target_text[i])));
}
return true;
}
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices is an array of GenericVectors, of length choices_length,
/// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos
/// @param choices_length
/// @param target_text
/// @param text_index
/// @param rating
/// @param segmentation
/// @param best_rating
/// @param best_segmentation
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
int choices_pos, int choices_length,
const GenericVector<UNICHAR_ID>& target_text,
int text_index,
float rating, GenericVector<int>* segmentation,
float* best_rating,
GenericVector<int>* best_segmentation) {
const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
for (int length = 1; length <= choices[choices_pos].size(); ++length) {
// Rating of matching choice or worst choice if no match.
float choice_rating = 0.0f;
// Find the corresponding best BLOB_CHOICE.
BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
choice_it.forward()) {
BLOB_CHOICE* choice = choice_it.data();
choice_rating = choice->rating();
UNICHAR_ID class_id = choice->unichar_id();
if (class_id == target_text[text_index]) {
break;
}
// Search ambigs table.
if (class_id < table.size() && table[class_id] != NULL) {
AmbigSpec_IT spec_it(table[class_id]);
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
spec_it.forward()) {
const AmbigSpec *ambig_spec = spec_it.data();
// We'll only do 1-1.
if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
ambig_spec->correct_ngram_id == target_text[text_index])
break;
}
if (!spec_it.cycled_list())
break; // Found an ambig.
}
}
if (choice_it.cycled_list())
continue; // No match.
segmentation->push_back(length);
if (choices_pos + length == choices_length &&
text_index + 1 == target_text.size()) {
// This is a complete match. If the rating is good record a new best.
if (applybox_debug > 2) {
tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
rating + choice_rating, *best_rating, segmentation->size(),
best_segmentation->size());
}
if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
*best_segmentation = *segmentation;
*best_rating = rating + choice_rating;
}
}
else if (choices_pos + length < choices_length &&
text_index + 1 < target_text.size()) {
if (applybox_debug > 3) {
tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
target_text[text_index],
unicharset.id_to_unichar(target_text[text_index]),
choice_it.data()->unichar_id() == target_text[text_index]
? "Match" : "Ambig",
choices_pos, length);
}
SearchForText(choices, choices_pos + length, choices_length, target_text,
text_index + 1, rating + choice_rating, segmentation,
best_rating, best_segmentation);
if (applybox_debug > 3) {
tprintf("End recursion for %d=%s\n", target_text[text_index],
unicharset.id_to_unichar(target_text[text_index]));
}
}
segmentation->truncate(segmentation->size() - 1);
}
}
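// Illustrative trace: for a 3-blob word with target text "ab", choosing
// length 1 at choices_pos 0 (matching 'a') and then length 2 at
// choices_pos 1 (matching 'b') reaches choices_pos + length == choices_length
// on the last target character, so segmentation {1, 2} is recorded whenever
// its summed rating beats the best found so far.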
/// - Counts up the labelled words and the blobs within.
/// - Deletes all unused or emptied words, counting the unused ones.
/// - Resets W_BOL and W_EOL flags correctly.
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
void Tesseract::TidyUp(PAGE_RES* page_res) {
int ok_blob_count = 0;
int bad_blob_count = 0;
int ok_word_count = 0;
int unlabelled_words = 0;
PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res;
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
int ok_in_word = 0;
int blob_count = word_res->correct_text.size();
WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
word_choice->set_permuter(TOP_CHOICE_PERM);
for (int c = 0; c < blob_count; ++c) {
if (word_res->correct_text[c].length() > 0) {
++ok_in_word;
}
// Since we only need a fake word_res->best_choice, the actual
// unichar_ids do not matter. Which is fortunate, since TidyUp()
// can be called while training Tesseract, at the stage where
// unicharset is not meaningful yet.
word_choice->append_unichar_id_space_allocated(
INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
}
if (ok_in_word > 0) {
++ok_word_count;  // Count words that kept at least one labelled blob.
ok_blob_count += ok_in_word;
bad_blob_count += word_res->correct_text.size() - ok_in_word;
word_res->LogNewRawChoice(word_choice);
word_res->LogNewCookedChoice(1, false, word_choice);
}
else {
++unlabelled_words;
if (applybox_debug > 0) {
tprintf("APPLY_BOXES: Unlabelled word at :");
word_res->word->bounding_box().print();
}
pr_it.DeleteCurrentWord();
delete word_choice;
}
}
pr_it.restart_page();
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
// Denormalize back to a BoxWord.
word_res->RebuildBestState();
word_res->SetupBoxWord();
word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
}
if (applybox_debug > 0) {
tprintf(" Found %d good blobs.\n", ok_blob_count);
if (bad_blob_count > 0) {
tprintf(" Leaving %d unlabelled blobs in %d words.\n",
bad_blob_count, ok_word_count);
}
if (unlabelled_words > 0)
tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
}
}
/** Logs a bad box by line in the box file and box coords.*/
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
const char *box_ch, const char *err_msg) {
tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
boxfile_lineno + 1, box_ch,
box.left(), box.bottom(), box.right(), box.top(), err_msg);
}
/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) {
WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
word_res->correct_text.size());
for (int i = 0; i < word_res->correct_text.size(); ++i) {
// The part before the first space is the real ground truth, and the
// rest is the bounding box location and page number.
GenericVector<STRING> tokens;
word_res->correct_text[i].split(' ', &tokens);
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
choice->append_unichar_id_space_allocated(char_id,
word_res->best_state[i],
0.0f, 0.0f);
}
word_res->ClearWordChoices();
word_res->LogNewRawChoice(choice);
word_res->LogNewCookedChoice(1, false, choice);
}
}
/// Calls #LearnWord to extract features for labelled blobs within each word.
/// Features are stored in an internal buffer.
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
int word_count = 0;
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) {
LearnWord(fontname.string(), word_res);
++word_count;
}
tprintf("Generated training data for %d words\n", word_count);
}
} // namespace tesseract

File diff suppressed because it is too large

View File

@ -0,0 +1,44 @@
/**********************************************************************
* File: control.h (Formerly control.h)
* Description: Module-independent matcher controller.
* Author: Ray Smith
* Created: Thu Apr 23 11:09:58 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
/**
* @file control.h
* Module-independent matcher controller.
*/
#ifndef CONTROL_H
#define CONTROL_H
#include "params.h"
#include "ocrblock.h"
#include "ratngs.h"
#include "statistc.h"
#include "pageres.h"
enum ACCEPTABLE_WERD_TYPE
{
AC_UNACCEPTABLE, ///< Unacceptable word
AC_LOWER_CASE, ///< ALL lower case
AC_UPPER_CASE, ///< ALL upper case
AC_INITIAL_CAP, ///< ALL but initial lc
AC_LC_ABBREV, ///< a.b.c.
AC_UC_ABBREV ///< A.B.C.
};
#endif

View File

@ -0,0 +1,440 @@
/******************************************************************
* File: cube_control.cpp
* Description: Tesseract class methods for invoking cube convolutional
* neural network word recognizer.
* Author: Raquel Romano
* Created: September 2009
*
* (C) Copyright 2009, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
**********************************************************************/
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include "allheaders.h"
#include "cube_object.h"
#include "cube_reco_context.h"
#include "tesseractclass.h"
#include "tesseract_cube_combiner.h"
namespace tesseract {
/**
* @name convert_prob_to_tess_certainty
*
* Normalize a probability in the range [0.0, 1.0] to a tesseract
* certainty in the range [-20.0, 0.0]
*/
static float convert_prob_to_tess_certainty(float prob) {
return (prob - 1.0) * 20.0;
}
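// Mapping examples: prob 1.0 -> 0.0, prob 0.5 -> -10.0, prob 0.0 -> -20.0.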
/**
* @name char_box_to_tbox
*
* Create a TBOX from a character bounding box. If nonzero, the
* x_offset accounts for any additional padding of the word box that
* should be taken into account.
*
*/
TBOX char_box_to_tbox(Box* char_box, TBOX word_box, int x_offset) {
l_int32 left;
l_int32 top;
l_int32 width;
l_int32 height;
l_int32 right;
l_int32 bottom;
boxGetGeometry(char_box, &left, &top, &width, &height);
left += word_box.left() - x_offset;
right = left + width;
top = word_box.bottom() + word_box.height() - top;
bottom = top - height;
return TBOX(left, bottom, right, top);
}
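// Worked example (illustrative numbers): word_box left=200, bottom=100,
// height=50; a char_box with leptonica geometry left=5, top=10, width=20,
// height=30 and x_offset=0 yields TBOX left=205, right=225,
// top = 100 + 50 - 10 = 140 and bottom = 140 - 30 = 110.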
/**
* @name extract_cube_state
*
* Extract CharSamp objects and character bounding boxes from the
* CubeObject's state. The caller should free both structures.
*
*/
bool Tesseract::extract_cube_state(CubeObject* cube_obj,
int* num_chars,
Boxa** char_boxes,
CharSamp*** char_samples) {
if (!cube_obj) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (extract_cube_state): Invalid cube object "
"passed to extract_cube_state\n");
}
return false;
}
// Note that the CubeObject accessors return either the deslanted or the
// regular search object / beam search object, whichever was used in the
// last call to Recognize().
CubeSearchObject* cube_search_obj = cube_obj->SrchObj();
if (!cube_search_obj) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
"cube's search object in extract_cube_state.\n");
}
return false;
}
BeamSearch *beam_search_obj = cube_obj->BeamObj();
if (!beam_search_obj) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
"cube's beam search object in extract_cube_state.\n");
}
return false;
}
// Get the character samples and bounding boxes by backtracking
// through the beam search path
int best_node_index = beam_search_obj->BestPresortedNodeIndex();
*char_samples = beam_search_obj->BackTrack(
cube_search_obj, best_node_index, num_chars, NULL, char_boxes);
if (!*char_samples)
return false;
return true;
}
/**
* @name create_cube_box_word
*
* Fill the given BoxWord with boxes from character bounding
* boxes. The char_boxes have local coordinates w.r.t. the
* word bounding box, i.e., the left-most character bbox of each word
* has (0,0) left-top coord, but the BoxWord must be defined in page
* coordinates.
*/
bool Tesseract::create_cube_box_word(Boxa *char_boxes,
int num_chars,
TBOX word_box,
BoxWord* box_word) {
if (!box_word) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (create_cube_box_word): Invalid box_word.\n");
}
return false;
}
// Find the x-coordinate of left-most char_box, which could be
// nonzero if the word image was padded before recognition took place.
int x_offset = -1;
for (int i = 0; i < num_chars; ++i) {
Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
if (x_offset < 0 || char_box->x < x_offset) {
x_offset = char_box->x;
}
boxDestroy(&char_box);
}
for (int i = 0; i < num_chars; ++i) {
Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
TBOX tbox = char_box_to_tbox(char_box, word_box, x_offset);
boxDestroy(&char_box);
box_word->InsertBox(i, tbox);
}
return true;
}
/**
* @name init_cube_objects
*
* Instantiates Tesseract object's CubeRecoContext and TesseractCubeCombiner.
* Returns false if cube context could not be created or if load_combiner is
* true, but the combiner could not be loaded.
*/
bool Tesseract::init_cube_objects(bool load_combiner,
TessdataManager *tessdata_manager) {
ASSERT_HOST(cube_cntxt_ == NULL);
ASSERT_HOST(tess_cube_combiner_ == NULL);
// Create the cube context object
cube_cntxt_ = CubeRecoContext::Create(this, tessdata_manager, &unicharset);
if (cube_cntxt_ == NULL) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (Tesseract::init_cube_objects()): Failed to "
"instantiate CubeRecoContext\n");
}
return false;
}
// Create the combiner object and load the combiner net for target languages.
if (load_combiner) {
tess_cube_combiner_ = new tesseract::TesseractCubeCombiner(cube_cntxt_);
if (!tess_cube_combiner_->LoadCombinerNet()) {
delete cube_cntxt_;
cube_cntxt_ = NULL;
delete tess_cube_combiner_;
tess_cube_combiner_ = NULL;
if (cube_debug_level > 0)
tprintf("Cube ERROR (Failed to instantiate TesseractCubeCombiner\n");
return false;
}
}
return true;
}
/**
* @name run_cube_combiner
*
* Iterates through tesseract's results and calls cube on each word,
* combining the results with the existing tesseract result.
*/
void Tesseract::run_cube_combiner(PAGE_RES *page_res) {
if (page_res == NULL || tess_cube_combiner_ == NULL)
return;
PAGE_RES_IT page_res_it(page_res);
// Iterate through the word results and call cube on each word.
for (page_res_it.restart_page(); page_res_it.word() != NULL;
page_res_it.forward()) {
BLOCK* block = page_res_it.block()->block;
if (block->poly_block() != NULL && !block->poly_block()->IsText())
continue; // Don't deal with non-text blocks.
WERD_RES* word = page_res_it.word();
// Skip cube entirely if tesseract's certainty is greater than threshold.
int combiner_run_thresh = convert_prob_to_tess_certainty(
cube_cntxt_->Params()->CombinerRunThresh());
if (word->best_choice->certainty() >= combiner_run_thresh) {
continue;
}
// Use the same language as Tesseract used for the word.
Tesseract* lang_tess = word->tesseract;
// Setup a trial WERD_RES in which to classify with cube.
WERD_RES cube_word;
cube_word.InitForRetryRecognition(*word);
cube_word.SetupForRecognition(lang_tess->unicharset, this, BestPix(),
OEM_CUBE_ONLY,
NULL, false, false, false,
page_res_it.row()->row,
page_res_it.block()->block);
CubeObject *cube_obj = lang_tess->cube_recognize_word(
page_res_it.block()->block, &cube_word);
if (cube_obj != NULL)
lang_tess->cube_combine_word(cube_obj, &cube_word, word);
delete cube_obj;
}
}
/**
* @name cube_word_pass1
*
* Recognizes a single word using (only) cube. Compatible with
* Tesseract's classify_word_pass1/classify_word_pass2.
*/
void Tesseract::cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) {
CubeObject *cube_obj = cube_recognize_word(block, word);
delete cube_obj;
}
/**
* @name cube_recognize_word
*
* Cube recognizer to recognize a single word as with classify_word_pass1
* but also returns the cube object in case the combiner is needed.
*/
CubeObject* Tesseract::cube_recognize_word(BLOCK* block, WERD_RES* word) {
if (!cube_binary_ || !cube_cntxt_) {
if (cube_debug_level > 0 && !cube_binary_)
tprintf("Tesseract::run_cube(): NULL binary image.\n");
word->SetupFake(unicharset);
return NULL;
}
TBOX word_box = word->word->bounding_box();
if (block != NULL && (block->re_rotation().x() != 1.0f ||
block->re_rotation().y() != 0.0f)) {
// TODO(rays) We have to rotate the bounding box to get the true coords.
// This will be achieved in the future via DENORM.
// In the mean time, cube can't process this word.
if (cube_debug_level > 0) {
tprintf("Cube can't process rotated word at:");
word_box.print();
}
word->SetupFake(unicharset);
return NULL;
}
CubeObject* cube_obj = new tesseract::CubeObject(
cube_cntxt_, cube_binary_, word_box.left(),
pixGetHeight(cube_binary_) - word_box.top(),
word_box.width(), word_box.height());
if (!cube_recognize(cube_obj, block, word)) {
delete cube_obj;
return NULL;
}
return cube_obj;
}
/**
* @name cube_combine_word
*
* Combines the cube and tesseract results for a single word, leaving the
* result in tess_word.
*/
void Tesseract::cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word,
WERD_RES* tess_word) {
float combiner_prob = tess_cube_combiner_->CombineResults(tess_word,
cube_obj);
// If combiner probability is greater than tess/cube combiner
// classifier threshold, i.e. tesseract wins, then just return the
// tesseract result unchanged, as the combiner knows nothing about how
// correct the answer is. If cube and tesseract agree, then improve the
// scores before returning.
WERD_CHOICE* tess_best = tess_word->best_choice;
WERD_CHOICE* cube_best = cube_word->best_choice;
if (cube_debug_level || classify_debug_level) {
tprintf("Combiner prob = %g vs threshold %g\n",
combiner_prob, cube_cntxt_->Params()->CombinerClassifierThresh());
}
if (combiner_prob >=
cube_cntxt_->Params()->CombinerClassifierThresh()) {
if (tess_best->unichar_string() == cube_best->unichar_string()) {
// Cube and tess agree, so improve the scores.
tess_best->set_rating(tess_best->rating() / 2);
tess_best->set_certainty(tess_best->certainty() / 2);
}
return;
}
// Cube wins.
// It is better for the language combiner to have all tesseract scores,
// so put them in the cube result.
cube_best->set_rating(tess_best->rating());
cube_best->set_certainty(tess_best->certainty());
if (cube_debug_level || classify_debug_level) {
tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n",
tess_best->unichar_string().string(),
cube_best->unichar_string().string());
}
tess_word->ConsumeWordResults(cube_word);
}
/**
* @name cube_recognize
*
* Call cube on the current word, and write the result to word.
* Sets up a fake result and returns false if something goes wrong.
*/
bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
WERD_RES *word) {
// Run cube
WordAltList *cube_alt_list = cube_obj->RecognizeWord();
if (!cube_alt_list || cube_alt_list->AltCount() <= 0) {
if (cube_debug_level > 0) {
tprintf("Cube returned nothing for word at:");
word->word->bounding_box().print();
}
word->SetupFake(unicharset);
return false;
}
// Get cube's best result and its probability, mapped to tesseract's
// certainty range
char_32 *cube_best_32 = cube_alt_list->Alt(0);
double cube_prob = CubeUtils::Cost2Prob(cube_alt_list->AltCost(0));
float cube_certainty = convert_prob_to_tess_certainty(cube_prob);
string cube_best_str;
CubeUtils::UTF32ToUTF8(cube_best_32, &cube_best_str);
// Retrieve Cube's character bounding boxes and CharSamples,
// corresponding to the most recent call to RecognizeWord().
Boxa *char_boxes = NULL;
CharSamp **char_samples = NULL;
int num_chars;
if (!extract_cube_state(cube_obj, &num_chars, &char_boxes, &char_samples)
&& cube_debug_level > 0) {
tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract "
"cube state.\n");
word->SetupFake(unicharset);
return false;
}
// Convert cube's character bounding boxes to a BoxWord.
BoxWord cube_box_word;
TBOX tess_word_box = word->word->bounding_box();
if (word->denorm.block() != NULL)
tess_word_box.rotate(word->denorm.block()->re_rotation());
bool box_word_success = create_cube_box_word(char_boxes, num_chars,
tess_word_box,
&cube_box_word);
boxaDestroy(&char_boxes);
if (!box_word_success) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
"create cube BoxWord\n");
}
word->SetupFake(unicharset);
return false;
}
// Fill tesseract result's fields with cube results
fill_werd_res(cube_box_word, cube_best_str.c_str(), word);
// Create cube's best choice.
BLOB_CHOICE** choices = new BLOB_CHOICE*[num_chars];
for (int i = 0; i < num_chars; ++i) {
UNICHAR_ID uch_id =
cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel());
choices[i] = new BLOB_CHOICE(uch_id, -cube_certainty, cube_certainty,
-1, 0.0f, 0.0f, 0.0f, BCC_STATIC_CLASSIFIER);
}
word->FakeClassifyWord(num_chars, choices);
// within a word, cube recognizes the word in reading order.
word->best_choice->set_unichars_in_script_order(true);
delete[] choices;
delete[] char_samples;
// Some sanity checks
ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
if (cube_debug_level || classify_debug_level) {
tprintf("Cube result: %s r=%g, c=%g\n",
word->best_choice->unichar_string().string(),
word->best_choice->rating(),
word->best_choice->certainty());
}
return true;
}
/**
* @name fill_werd_res
*
* Fill Tesseract's word result fields with cube's.
*
*/
void Tesseract::fill_werd_res(const BoxWord& cube_box_word,
const char* cube_best_str,
WERD_RES* tess_werd_res) {
delete tess_werd_res->box_word;
tess_werd_res->box_word = new BoxWord(cube_box_word);
tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(),
tess_werd_res->word);
// Fill text and remaining fields
tess_werd_res->word->set_text(cube_best_str);
tess_werd_res->tess_failed = FALSE;
tess_werd_res->tess_accepted = tess_acceptable_word(tess_werd_res);
// There is no output word, so we can't call AdaptableWord, but then I don't
// think we need to. Fudge the result with accepted.
tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted;
// Set word to done, i.e., ignore all of tesseract's tests for rejection
tess_werd_res->done = tess_werd_res->tess_accepted;
}
} // namespace tesseract
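A rough sketch of how the routines above fit together for one word. The CubeObject construction and the surrounding driver loop are not part of this excerpt, so cube_obj, block and word below are assumptions about the caller rather than code from this file.
// Hypothetical caller inside Tesseract, shown only to make the control flow
// explicit; cube_obj wraps the word image, word is the Tesseract result.
if (cube_recognize(cube_obj, block, word)) {
  // word now carries cube's best choice (filled in by fill_werd_res above);
  // the combiner at the top of this excerpt then either halves the Tesseract
  // scores when both engines agree, or lets cube's result replace Tesseract's.
} else {
  // Cube produced nothing usable and word was set up as a fake result.
}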

View File

@ -0,0 +1,184 @@
/**********************************************************************
* File: cube_reco_context.cpp
* Description: Implementation of the Cube Recognition Context Class
* Author: Ahmad Abdulkader
* Created: 2007
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <string>
#include <limits.h>
#include "cube_reco_context.h"
#include "classifier_factory.h"
#include "cube_tuning_params.h"
#include "dict.h"
#include "feature_bmp.h"
#include "tessdatamanager.h"
#include "tesseractclass.h"
#include "tess_lang_model.h"
namespace tesseract {
/**
* Instantiate a CubeRecoContext object using a Tesseract object.
* CubeRecoContext will not take ownership of tess_obj, but will
* record the pointer to it and will make use of various Tesseract
* components (language model, flags, etc). Thus the caller should
* keep tess_obj alive so long as the instantiated CubeRecoContext is used.
*/
CubeRecoContext::CubeRecoContext(Tesseract *tess_obj) {
tess_obj_ = tess_obj;
lang_ = "";
loaded_ = false;
lang_mod_ = NULL;
params_ = NULL;
char_classifier_ = NULL;
char_set_ = NULL;
word_size_model_ = NULL;
char_bigrams_ = NULL;
word_unigrams_ = NULL;
noisy_input_ = false;
size_normalization_ = false;
}
CubeRecoContext::~CubeRecoContext() {
delete char_classifier_;
char_classifier_ = NULL;
delete word_size_model_;
word_size_model_ = NULL;
delete char_set_;
char_set_ = NULL;
delete char_bigrams_;
char_bigrams_ = NULL;
delete word_unigrams_;
word_unigrams_ = NULL;
delete lang_mod_;
lang_mod_ = NULL;
delete params_;
params_ = NULL;
}
/**
* Returns the path of the data files by looking up the TESSDATA_PREFIX
* environment variable and appending a "tessdata" directory to it
*/
bool CubeRecoContext::GetDataFilePath(string *path) const {
*path = tess_obj_->datadir.string();
return true;
}
/**
* The object initialization function that loads all the necessary
* components of a RecoContext. TessdataManager is used to load the
* data from [lang].traineddata file. If TESSDATA_CUBE_UNICHARSET
* component is present, Cube will be instantiated with the unicharset
* specified in this component and the corresponding dictionary
* (TESSDATA_CUBE_SYSTEM_DAWG), and will map Cube's unicharset to
* Tesseract's. Otherwise, TessdataManager will assume that Cube will
* be using Tesseract's unicharset and dawgs, and will load the
* unicharset from the TESSDATA_UNICHARSET component and will load the
* dawgs from TESSDATA_*_DAWG components.
*/
bool CubeRecoContext::Load(TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset) {
ASSERT_HOST(tess_obj_ != NULL);
tess_unicharset_ = tess_unicharset;
string data_file_path;
// Get the data file path.
if (GetDataFilePath(&data_file_path) == false) {
fprintf(stderr, "Unable to get data file path\n");
return false;
}
// Get the language from the Tesseract object.
lang_ = tess_obj_->lang.string();
// Create the char set.
if ((char_set_ =
CharSet::Create(tessdata_manager, tess_unicharset)) == NULL) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
"CharSet\n");
return false;
}
// Create the language model.
string lm_file_name = data_file_path + lang_ + ".cube.lm";
string lm_params;
if (!CubeUtils::ReadFileToString(lm_file_name, &lm_params)) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read cube "
"language model params from %s\n", lm_file_name.c_str());
return false;
}
lang_mod_ = new TessLangModel(lm_params, data_file_path,
tess_obj_->getDict().load_system_dawg,
tessdata_manager, this);
// Create the optional char bigrams object.
char_bigrams_ = CharBigrams::Create(data_file_path, lang_);
// Create the optional word unigrams object.
word_unigrams_ = WordUnigrams::Create(data_file_path, lang_);
// Create the optional size model.
word_size_model_ = WordSizeModel::Create(data_file_path, lang_,
char_set_, Contextual());
// Load tuning params.
params_ = CubeTuningParams::Create(data_file_path, lang_);
if (params_ == NULL) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read "
"CubeTuningParams from %s\n", data_file_path.c_str());
return false;
}
// Create the char classifier.
char_classifier_ = CharClassifierFactory::Create(data_file_path, lang_,
lang_mod_, char_set_,
params_);
if (char_classifier_ == NULL) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
"CharClassifierFactory object from %s\n", data_file_path.c_str());
return false;
}
loaded_ = true;
return true;
}
/** Creates a CubeRecoContext object using a tesseract object */
CubeRecoContext * CubeRecoContext::Create(Tesseract *tess_obj,
TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset) {
// create the object
CubeRecoContext *cntxt = new CubeRecoContext(tess_obj);
// load the necessary components
if (cntxt->Load(tessdata_manager, tess_unicharset) == false) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to init "
"CubeRecoContext object\n");
delete cntxt;
return NULL;
}
// success
return cntxt;
}
} // namespace tesseract
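A minimal construction sketch based only on the Create() signature above; the tess, tessdata_manager and tess_unicharset arguments stand for objects the caller is assumed to have initialized elsewhere.
#include "cube_reco_context.h"
// Hypothetical helper; all three arguments are assumed to be set up already.
tesseract::CubeRecoContext *MakeCubeContext(tesseract::Tesseract *tess,
    tesseract::TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset) {
  tesseract::CubeRecoContext *cube_cntxt =
      tesseract::CubeRecoContext::Create(tess, tessdata_manager, tess_unicharset);
  // On failure Create() has already logged the reason to stderr and returned
  // NULL; on success the caller owns the returned context (but not tess) and
  // must delete it when finished with it.
  return cube_cntxt;
}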

View File

@ -0,0 +1,157 @@
/**********************************************************************
* File: cube_reco_context.h
* Description: Declaration of the Cube Recognition Context Class
* Author: Ahmad Abdulkader
* Created: 2007
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// The CubeRecoContext class abstracts the Cube OCR Engine. Typically a process
// (or a thread) would create one CubeRecoContext object per language.
// The CubeRecoContext object also provides methods to get and set the
// different attributes of the Cube OCR Engine.
#ifndef CUBE_RECO_CONTEXT_H
#define CUBE_RECO_CONTEXT_H
#include <string>
#include "neural_net.h"
#include "lang_model.h"
#include "classifier_base.h"
#include "feature_base.h"
#include "char_set.h"
#include "word_size_model.h"
#include "char_bigrams.h"
#include "word_unigrams.h"
namespace tesseract {
class Tesseract;
class TessdataManager;
class CubeRecoContext {
public:
// Reading order enum type
enum ReadOrder {
L2R,
R2L
};
// Instantiate using a Tesseract object
CubeRecoContext(Tesseract *tess_obj);
~CubeRecoContext();
// accessor functions
inline const string & Lang() const { return lang_; }
inline CharSet *CharacterSet() const { return char_set_; }
const UNICHARSET *TessUnicharset() const { return tess_unicharset_; }
inline CharClassifier *Classifier() const { return char_classifier_; }
inline WordSizeModel *SizeModel() const { return word_size_model_; }
inline CharBigrams *Bigrams() const { return char_bigrams_; }
inline WordUnigrams *WordUnigramsObj() const { return word_unigrams_; }
inline TuningParams *Params() const { return params_; }
inline LangModel *LangMod() const { return lang_mod_; }
// the reading order of the language
inline ReadOrder ReadingOrder() const {
return ((lang_ == "ara") ? R2L : L2R);
}
// does the language support case
inline bool HasCase() const {
return (lang_ != "ara" && lang_ != "hin");
}
inline bool Cursive() const {
return (lang_ == "ara");
}
inline bool HasItalics() const {
return (lang_ != "ara" && lang_ != "hin");
}
inline bool Contextual() const {
return (lang_ == "ara");
}
// RecoContext runtime flags accessor functions
inline bool SizeNormalization() const { return size_normalization_; }
inline bool NoisyInput() const { return noisy_input_; }
inline bool OOD() const { return lang_mod_->OOD(); }
inline bool Numeric() const { return lang_mod_->Numeric(); }
inline bool WordList() const { return lang_mod_->WordList(); }
inline bool Punc() const { return lang_mod_->Punc(); }
inline bool CaseSensitive() const {
return char_classifier_->CaseSensitive();
}
inline void SetSizeNormalization(bool size_normalization) {
size_normalization_ = size_normalization;
}
inline void SetNoisyInput(bool noisy_input) {
noisy_input_ = noisy_input;
}
inline void SetOOD(bool ood_enabled) {
lang_mod_->SetOOD(ood_enabled);
}
inline void SetNumeric(bool numeric_enabled) {
lang_mod_->SetNumeric(numeric_enabled);
}
inline void SetWordList(bool word_list_enabled) {
lang_mod_->SetWordList(word_list_enabled);
}
inline void SetPunc(bool punc_enabled) {
lang_mod_->SetPunc(punc_enabled);
}
inline void SetCaseSensitive(bool case_sensitive) {
char_classifier_->SetCaseSensitive(case_sensitive);
}
inline tesseract::Tesseract *TesseractObject() const {
return tess_obj_;
}
// Returns the path of the data files
bool GetDataFilePath(string *path) const;
// Creates a CubeRecoContext object using a tesseract object. Data
// files are loaded via the tessdata_manager, and the tesseract
// unicharset is provided in order to map Cube's unicharset to
// Tesseract's in the case where the two unicharsets differ.
static CubeRecoContext *Create(Tesseract *tess_obj,
TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset);
private:
bool loaded_;
string lang_;
CharSet *char_set_;
UNICHARSET *tess_unicharset_;
WordSizeModel *word_size_model_;
CharClassifier *char_classifier_;
CharBigrams *char_bigrams_;
WordUnigrams *word_unigrams_;
TuningParams *params_;
LangModel *lang_mod_;
Tesseract *tess_obj_; // CubeRecoContext does not own this pointer
bool size_normalization_;
bool noisy_input_;
// Loads and initializes all the necessary components of a
// CubeRecoContext. See .cpp for more details.
bool Load(TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset);
};
}
#endif // CUBE_RECO_CONTEXT_H
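A short usage sketch for the runtime-flag setters declared above; ConfigureForDigits is a hypothetical helper, and cntxt is assumed to point at a context created and loaded via CubeRecoContext::Create().
#include "cube_reco_context.h"
// Hypothetical helper that only exercises accessors declared in this header.
void ConfigureForDigits(tesseract::CubeRecoContext *cntxt) {
  cntxt->SetNumeric(true);    // enable the language model's numeric mode
  cntxt->SetWordList(false);  // no user-supplied word list
  cntxt->SetPunc(true);       // punctuation stays enabled
  // ReadingOrder() reports R2L only for Arabic, so most callers can assume L2R.
}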

View File

@ -0,0 +1,134 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: cubeclassifier.cpp
// Description: Cube implementation of a ShapeClassifier.
// Author: Ray Smith
// Created: Wed Nov 23 10:39:45 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "cubeclassifier.h"
#include "char_altlist.h"
#include "char_set.h"
#include "cube_object.h"
#include "cube_reco_context.h"
#include "tessclassifier.h"
#include "tesseractclass.h"
#include "trainingsample.h"
#include "unicharset.h"
namespace tesseract {
CubeClassifier::CubeClassifier(tesseract::Tesseract* tesseract)
: cube_cntxt_(tesseract->GetCubeRecoContext()),
shape_table_(*tesseract->shape_table()) {
}
CubeClassifier::~CubeClassifier() {
}
/// Classifies the given [training] sample, writing to results.
/// See ShapeClassifier for a full description.
int CubeClassifier::UnicharClassifySample(
const TrainingSample& sample, Pix* page_pix, int debug,
UNICHAR_ID keep_this, GenericVector<UnicharRating>* results) {
results->clear();
if (page_pix == NULL) return 0;
ASSERT_HOST(cube_cntxt_ != NULL);
const TBOX& char_box = sample.bounding_box();
CubeObject* cube_obj = new tesseract::CubeObject(
cube_cntxt_, page_pix, char_box.left(),
pixGetHeight(page_pix) - char_box.top(),
char_box.width(), char_box.height());
CharAltList* alt_list = cube_obj->RecognizeChar();
if (alt_list != NULL) {
alt_list->Sort();
CharSet* char_set = cube_cntxt_->CharacterSet();
for (int i = 0; i < alt_list->AltCount(); ++i) {
// Convert cube representation to a shape_id.
int alt_id = alt_list->Alt(i);
int unichar_id = char_set->UnicharID(char_set->ClassString(alt_id));
if (unichar_id >= 0)
results->push_back(UnicharRating(unichar_id, alt_list->AltProb(i)));
}
delete alt_list;
}
delete cube_obj;
return results->size();
}
/** Provides access to the ShapeTable that this classifier works with. */
const ShapeTable* CubeClassifier::GetShapeTable() const {
return &shape_table_;
}
CubeTessClassifier::CubeTessClassifier(tesseract::Tesseract* tesseract)
: cube_cntxt_(tesseract->GetCubeRecoContext()),
shape_table_(*tesseract->shape_table()),
pruner_(new TessClassifier(true, tesseract)) {
}
CubeTessClassifier::~CubeTessClassifier() {
delete pruner_;
}
/// Classifies the given [training] sample, writing to results.
/// See ShapeClassifier for a full description.
int CubeTessClassifier::UnicharClassifySample(
const TrainingSample& sample, Pix* page_pix, int debug,
UNICHAR_ID keep_this, GenericVector<UnicharRating>* results) {
int num_results = pruner_->UnicharClassifySample(sample, page_pix, debug,
keep_this, results);
if (page_pix == NULL) return num_results;
ASSERT_HOST(cube_cntxt_ != NULL);
const TBOX& char_box = sample.bounding_box();
CubeObject* cube_obj = new tesseract::CubeObject(
cube_cntxt_, page_pix, char_box.left(),
pixGetHeight(page_pix) - char_box.top(),
char_box.width(), char_box.height());
CharAltList* alt_list = cube_obj->RecognizeChar();
CharSet* char_set = cube_cntxt_->CharacterSet();
if (alt_list != NULL) {
for (int r = 0; r < num_results; ++r) {
// Get the best cube probability of the unichar in the result.
double best_prob = 0.0;
for (int i = 0; i < alt_list->AltCount(); ++i) {
int alt_id = alt_list->Alt(i);
int unichar_id = char_set->UnicharID(char_set->ClassString(alt_id));
if (unichar_id == (*results)[r].unichar_id &&
alt_list->AltProb(i) > best_prob) {
best_prob = alt_list->AltProb(i);
}
}
(*results)[r].rating = best_prob;
}
delete alt_list;
// Re-sort by rating.
results->sort(&UnicharRating::SortDescendingRating);
}
delete cube_obj;
return results->size();
}
/** Provides access to the ShapeTable that this classifier works with. */
const ShapeTable* CubeTessClassifier::GetShapeTable() const {
return &shape_table_;
}
} // namespace tesseract
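A usage sketch for CubeClassifier based on the UnicharClassifySample() signature above; tess, sample and page_pix stand for an initialized Tesseract instance, a TrainingSample and the page image, none of which are set up in this file.
#include "cubeclassifier.h"
// Hypothetical helper; setup of tess, sample and page_pix is assumed.
int RateSampleWithCube(tesseract::Tesseract *tess,
                       const tesseract::TrainingSample &sample, Pix *page_pix,
                       GenericVector<tesseract::UnicharRating> *ratings) {
  tesseract::CubeClassifier classifier(tess);
  // One UnicharRating per cube alternate whose label maps to a known unichar.
  return classifier.UnicharClassifySample(sample, page_pix, /*debug=*/0,
                                          INVALID_UNICHAR_ID, ratings);
}
CubeTessClassifier is used the same way, but it first runs the Tesseract class pruner and then rescores the surviving candidates with cube, as the implementation above shows.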

View File

@ -0,0 +1,81 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: cubeclassifier.h
// Description: Cube implementation of a ShapeClassifier.
// Author: Ray Smith
// Created: Wed Nov 23 10:36:32 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_
#define THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_
#include "shapeclassifier.h"
#include "platform.h"
namespace tesseract {
class Classify;
class CubeRecoContext;
class ShapeTable;
class TessClassifier;
class Tesseract;
class TrainingSample;
struct UnicharRating;
// Cube implementation of a ShapeClassifier.
class TESS_API CubeClassifier : public ShapeClassifier {
public:
explicit CubeClassifier(Tesseract* tesseract);
virtual ~CubeClassifier();
// Classifies the given [training] sample, writing to results.
// See ShapeClassifier for a full description.
virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix,
int debug, UNICHAR_ID keep_this,
GenericVector<UnicharRating>* results);
// Provides access to the ShapeTable that this classifier works with.
virtual const ShapeTable* GetShapeTable() const;
private:
// Cube objects.
CubeRecoContext* cube_cntxt_;
const ShapeTable& shape_table_;
};
// Combination of Tesseract class pruner with scoring by cube.
class TESS_API CubeTessClassifier : public ShapeClassifier {
public:
explicit CubeTessClassifier(Tesseract* tesseract);
virtual ~CubeTessClassifier();
// Classifies the given [training] sample, writing to results.
// See ShapeClassifier for a full description.
virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix,
int debug, UNICHAR_ID keep_this,
GenericVector<UnicharRating>* results);
// Provides access to the ShapeTable that this classifier works with.
virtual const ShapeTable* GetShapeTable() const;
private:
// Cube objects.
CubeRecoContext* cube_cntxt_;
const ShapeTable& shape_table_;
TessClassifier* pruner_;
};
} // namespace tesseract
#endif /* THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_ */

File diff suppressed because it is too large

View File

@ -0,0 +1,35 @@
/******************************************************************
* File: docqual.h (Formerly docqual.h)
* Description: Document Quality Metrics
* Author: Phil Cheatle
* Created: Mon May 9 11:27:28 BST 1994
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef DOCQUAL_H
#define DOCQUAL_H
#include "control.h"
enum GARBAGE_LEVEL
{
G_NEVER_CRUNCH,
G_OK,
G_DODGY,
G_TERRIBLE
};
inT16 word_blob_quality(WERD_RES *word, ROW *row);
void reject_whole_page(PAGE_RES_IT &page_res_it);
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,278 @@
///////////////////////////////////////////////////////////////////////
// File: equationdetect.h
// Description: The equation detection class that inherits equationdetectbase.
// Author: Zongyi (Joe) Liu (joeliu@google.com)
// Created: Fri Aug 31 11:13:01 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H__
#define TESSERACT_CCMAIN_EQUATIONDETECT_H__
#include "blobbox.h"
#include "equationdetectbase.h"
#include "genericvector.h"
#include "tesseractclass.h"
#include "unichar.h"
class BLOBNBOX;
class BLOB_CHOICE;
class BLOB_CHOICE_LIST;
class TO_BLOCK_LIST;
class TBOX;
class UNICHARSET;
namespace tesseract {
class Tesseract;
class ColPartition;
class ColPartitionGrid;
class ColPartitionSet;
class EquationDetect : public EquationDetectBase {
public:
EquationDetect(const char* equ_datapath,
const char* equ_language);
~EquationDetect();
enum IndentType {
NO_INDENT,
LEFT_INDENT,
RIGHT_INDENT,
BOTH_INDENT,
INDENT_TYPE_COUNT
};
// Reset the lang_tesseract_ pointer. This function should be called before we
// do any detector work.
void SetLangTesseract(Tesseract* lang_tesseract);
// Iterate over the blobs inside to_block, and set the blobs that we want to
// process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function
// returns 0 upon success.
int LabelSpecialText(TO_BLOCK* to_block);
// Find possible equation partitions from part_grid. Should be called
// after the special_text_type of blobs are set.
// It returns 0 upon success.
int FindEquationParts(ColPartitionGrid* part_grid,
ColPartitionSet** best_columns);
// Reset the resolution of the processing image. TEST only function.
void SetResolution(const int resolution);
protected:
// Identify the special text type for one blob, and update its field. When
// height_th is set (> 0), we will label the blob as BSTT_NONE if its height
// is less than height_th.
void IdentifySpecialText(BLOBNBOX *blob, const int height_th);
// Estimate the type for one unichar.
BlobSpecialTextType EstimateTypeForUnichar(
const UNICHARSET& unicharset, const UNICHAR_ID id) const;
// Compute special text type for each blob in part_grid_.
void IdentifySpecialText();
// Identify blobs that we want to skip during special blob type
// classification.
void IdentifyBlobsToSkip(ColPartition* part);
// The ColPartitions in part_grid_ may be over-segmented, particularly in the
// block equation regions, so we would like to identify these partitions and merge
// them before we do the searching.
void MergePartsByLocation();
// Starting from the seed center, we do a radius search, and for partitions that
// have large overlaps with seed, we remove them from part_grid_ and add into
// parts_overlap. Note: this function may update the part_grid_, so if the
// caller is also running ColPartitionGridSearch, use the RepositionIterator
// to continue.
void SearchByOverlap(ColPartition* seed,
GenericVector<ColPartition*>* parts_overlap);
// Insert part back into part_grid_, after it absorbs some other parts.
void InsertPartAfterAbsorb(ColPartition* part);
// Identify the colpartitions in part_grid_, label them as PT_EQUATION, and
// save them into cp_seeds_.
void IdentifySeedParts();
// Check the blobs count for a seed region candidate.
bool CheckSeedBlobsCount(ColPartition* part);
// Compute the foreground pixel density for a tbox area.
float ComputeForegroundDensity(const TBOX& tbox);
// Check if part qualifies for the seed2 label: low math density and left
// indented. We use two checks:
// 1. If its left is aligned with any coordinates in indented_texts_left,
// which we assume have been sorted.
// 2. If its foreground density is over foreground_density_th.
bool CheckForSeed2(
const GenericVector<int>& indented_texts_left,
const float foreground_density_th,
ColPartition* part);
// Count the number of values in sorted_vec that is close to val, used to
// check if a partition is aligned with text partitions.
int CountAlignment(
const GenericVector<int>& sorted_vec, const int val) const;
// Check for a seed candidate using the foreground pixel density. We return
// true if the density is below a certain threshold, because characters in
// equation regions are usually spaced apart with more white space.
bool CheckSeedFgDensity(const float density_th, ColPartition* part);
// A light version of SplitCPHor: instead of really doing the part split, we
// simply compute the union bounding box of each split part.
void SplitCPHorLite(ColPartition* part, GenericVector<TBOX>* splitted_boxes);
// Split the part (horizontally), and save the split result into
// parts_splitted. Note that it is the caller's responsibility to release the
// memory owned by parts_splitted. On the other hand, the part is unchanged
// during this process and still owns the blobs, so do NOT call DeleteBoxes
// when freeing the colpartitions in parts_splitted.
void SplitCPHor(ColPartition* part,
GenericVector<ColPartition*>* parts_splitted);
// Check the density for a seed candidate (part) using its math density and
// italic density, returns true if the check passed.
bool CheckSeedDensity(const float math_density_high,
const float math_density_low,
const ColPartition* part) const;
// Check if part is indented.
IndentType IsIndented(ColPartition* part);
// Identify inline partitions from cp_seeds_, and re-label them.
void IdentifyInlineParts();
// Compute the super bounding box for all colpartitions inside part_grid_.
void ComputeCPsSuperBBox();
// Identify inline partitions from cp_seeds_ using the horizontal search.
void IdentifyInlinePartsHorizontal();
// Estimate the line spacing between two text partitions. Returns -1 if not
// enough data.
int EstimateTextPartLineSpacing();
// Identify inline partitions from cp_seeds_ using vertical search.
void IdentifyInlinePartsVertical(const bool top_to_bottom,
const int textPartsLineSpacing);
// Check if part is an inline equation zone. This should be called after we
// identified the seed regions.
bool IsInline(const bool search_bottom,
const int textPartsLineSpacing,
ColPartition* part);
// For a given seed partition, we search the part_grid_ and see if there is
// any partition that can be merged with it. It returns true if the seed has been
// expanded.
bool ExpandSeed(ColPartition* seed);
// Starting from the seed position, we search the part_grid_
// horizontally/vertically, find all partitions that can be
// merged with seed, remove them from part_grid_, and put them into
// parts_to_merge.
void ExpandSeedHorizontal(const bool search_left,
ColPartition* seed,
GenericVector<ColPartition*>* parts_to_merge);
void ExpandSeedVertical(const bool search_bottom,
ColPartition* seed,
GenericVector<ColPartition*>* parts_to_merge);
// Check if a part_box is the small neighbor of seed_box.
bool IsNearSmallNeighbor(const TBOX& seed_box,
const TBOX& part_box) const;
// Perform the density check for part, which we assume is near a seed
// partition. It returns true if the check passed.
bool CheckSeedNeighborDensity(const ColPartition* part) const;
// After identify the math blocks, we do one more scanning on all text
// partitions, and check if any of them is a satellite of a math block:
// here p is a satellite of q if:
// 1. q is the nearest vertical neighbor of p, and
// 2. y_gap(p, q) is less than a threshold, and
// 3. x_overlap(p, q) is over a threshold.
// Note that p can be a satellite of two blocks: its top neighbor and
// bottom neighbor.
void ProcessMathBlockSatelliteParts();
// Check if part is the satellite of one/two math blocks. If it is, we return
// true, and save the blocks into math_blocks.
bool IsMathBlockSatellite(
ColPartition* part, GenericVector<ColPartition*>* math_blocks);
// Search for the nearest neighbor of part in one vertical direction, as
// defined by search_bottom. It returns the neighbor found that has major x
// overlap with part, or NULL when none is found.
ColPartition* SearchNNVertical(const bool search_bottom,
const ColPartition* part);
// Check if the neighbor at vertical distance y_gap is a nearby math block
// partition.
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
// Generate the tiff file name for output/debug file.
void GetOutputTiffName(const char* name, STRING* image_name) const;
// Debugger function that renders ColPartitions on the input image, where:
// parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
// will be painted in green, and other parts will be painted in blue.
void PaintColParts(const STRING& outfile) const;
// Debugger function that renders the blobs in part_grid_ over the input
// image.
void PaintSpecialTexts(const STRING& outfile) const;
// Debugger function that print the math blobs density values for a
// ColPartition object.
void PrintSpecialBlobsDensity(const ColPartition* part) const;
// The tesseract engine initialized from equation training data.
Tesseract equ_tesseract_;
// The tesseract engine used for OCR. This pointer is passed in by the caller,
// so do NOT destroy it in this class.
Tesseract* lang_tesseract_;
// The ColPartitionGrid that we are processing. This pointer is passed in from
// the caller, so do NOT destroy it in the class.
ColPartitionGrid* part_grid_;
// A simple array of pointers to the best assigned column division at
// each grid y coordinate. This pointer is passed in from the caller, so do
// NOT destroy it in the class.
ColPartitionSet** best_columns_;
// The super bounding box of all cps in the part_grid_.
TBOX* cps_super_bbox_;
// The seed ColPartition for equation region.
GenericVector<ColPartition*> cp_seeds_;
// The resolution (dpi) of the processing image.
int resolution_;
// The number of pages we have processed.
int page_count_;
};
} // namespace tesseract
#endif  // TESSERACT_CCMAIN_EQUATIONDETECT_H__
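A driver sketch for the public interface above; equ_datapath, equ_lang, lang_tess, to_block, part_grid and best_columns all come from the caller and the layout-analysis stage, and are not defined in this header.
#include "equationdetect.h"
// Hypothetical driver; every input is assumed to exist already.
bool DetectEquations(const char *equ_datapath, const char *equ_lang,
                     tesseract::Tesseract *lang_tess, TO_BLOCK *to_block,
                     tesseract::ColPartitionGrid *part_grid,
                     tesseract::ColPartitionSet **best_columns) {
  tesseract::EquationDetect detector(equ_datapath, equ_lang);
  detector.SetLangTesseract(lang_tess);  // must precede any detection work
  // Both calls return 0 on success; equation partitions in part_grid are then
  // labeled as (inline) equation regions (see PaintColParts for the debug view).
  return detector.LabelSpecialText(to_block) == 0 &&
         detector.FindEquationParts(part_grid, best_columns) == 0;
}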

View File

@ -0,0 +1,876 @@
/******************************************************************
* File: fixspace.cpp (Formerly fixspace.c)
* Description: Implements a pass over the page res, exploring the alternative
* spacing possibilities, trying to use context to improve the
* word spacing
* Author: Phil Cheatle
* Created: Thu Oct 21 11:38:43 BST 1993
*
* (C) Copyright 1993, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <ctype.h>
#include "reject.h"
#include "statistc.h"
#include "control.h"
#include "fixspace.h"
#include "genblob.h"
#include "tessvars.h"
#include "tessbox.h"
#include "globals.h"
#include "tesseractclass.h"
#define PERFECT_WERDS 999
#define MAXSPACING 128 /*max expected spacing in pix */
namespace tesseract {
/**
* @name fix_fuzzy_spaces()
* Walk over the page finding sequences of words joined by fuzzy spaces. Extract
* them as a sublist, process the sublist to find the optimal arrangement of
* spaces then replace the sublist in the ROW_RES.
*
* @param monitor progress monitor
* @param word_count count of words in doc
* @param[out] page_res
*/
void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
inT32 word_count,
PAGE_RES *page_res) {
BLOCK_RES_IT block_res_it;
ROW_RES_IT row_res_it;
WERD_RES_IT word_res_it_from;
WERD_RES_IT word_res_it_to;
WERD_RES *word_res;
WERD_RES_LIST fuzzy_space_words;
inT16 new_length;
BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds
inT32 word_index; // current word
block_res_it.set_to_list(&page_res->block_res_list);
word_index = 0;
for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
block_res_it.forward()) {
row_res_it.set_to_list(&block_res_it.data()->row_res_list);
for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
row_res_it.forward()) {
word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
while (!word_res_it_from.at_last()) {
word_res = word_res_it_from.data();
while (!word_res_it_from.at_last() &&
!(word_res->combination ||
word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
block_res_it.data()->block);
word_res = word_res_it_from.forward();
word_index++;
if (monitor != NULL) {
monitor->ocr_alive = TRUE;
monitor->progress = 90 + 5 * word_index / word_count;
if (monitor->deadline_exceeded() ||
(monitor->cancel != NULL &&
(*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
return;
}
}
if (!word_res_it_from.at_last()) {
word_res_it_to = word_res_it_from;
prevent_null_wd_fixsp =
word_res->word->cblob_list()->empty();
if (check_debug_pt(word_res, 60))
debug_fix_space_level.set_value(10);
word_res_it_to.forward();
word_index++;
if (monitor != NULL) {
monitor->ocr_alive = TRUE;
monitor->progress = 90 + 5 * word_index / word_count;
if (monitor->deadline_exceeded() ||
(monitor->cancel != NULL &&
(*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
return;
}
while (!word_res_it_to.at_last() &&
(word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
if (check_debug_pt(word_res, 60))
debug_fix_space_level.set_value(10);
if (word_res->word->cblob_list()->empty())
prevent_null_wd_fixsp = TRUE;
word_res = word_res_it_to.forward();
}
if (check_debug_pt(word_res, 60))
debug_fix_space_level.set_value(10);
if (word_res->word->cblob_list()->empty())
prevent_null_wd_fixsp = TRUE;
if (prevent_null_wd_fixsp) {
word_res_it_from = word_res_it_to;
}
else {
fuzzy_space_words.assign_to_sublist(&word_res_it_from,
&word_res_it_to);
fix_fuzzy_space_list(fuzzy_space_words,
row_res_it.data()->row,
block_res_it.data()->block);
new_length = fuzzy_space_words.length();
word_res_it_from.add_list_before(&fuzzy_space_words);
for (;
!word_res_it_from.at_last() && new_length > 0;
new_length--) {
word_res_it_from.forward();
}
}
if (test_pt)
debug_fix_space_level.set_value(0);
}
fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
block_res_it.data()->block);
// Last word in row
}
}
}
}
void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
ROW *row,
BLOCK* block) {
inT16 best_score;
WERD_RES_LIST current_perm;
inT16 current_score;
BOOL8 improved = FALSE;
best_score = eval_word_spacing(best_perm); // default score
dump_words(best_perm, best_score, 1, improved);
if (best_score != PERFECT_WERDS)
initialise_search(best_perm, current_perm);
while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
match_current_words(current_perm, row, block);
current_score = eval_word_spacing(current_perm);
dump_words(current_perm, current_score, 2, improved);
if (current_score > best_score) {
best_perm.clear();
best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
best_score = current_score;
improved = TRUE;
}
if (current_score < PERFECT_WERDS)
transform_to_next_perm(current_perm);
}
dump_words(best_perm, best_score, 3, improved);
}
} // namespace tesseract
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
WERD_RES_IT src_it(&src_list);
WERD_RES_IT new_it(&new_list);
WERD_RES *src_wd;
WERD_RES *new_wd;
for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
src_wd = src_it.data();
if (!src_wd->combination) {
new_wd = WERD_RES::deep_copy(src_wd);
new_wd->combination = FALSE;
new_wd->part_of_combo = FALSE;
new_it.add_after_then_move(new_wd);
}
}
}
namespace tesseract {
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
BLOCK* block) {
WERD_RES_IT word_it(&words);
WERD_RES *word;
// Since we are not using PAGE_RES to iterate over words, we need to update
// prev_word_best_choice_ before calling classify_word_pass2().
prev_word_best_choice_ = NULL;
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
word = word_it.data();
if ((!word->part_of_combo) && (word->box_word == NULL)) {
WordData word_data(block, row, word);
SetupWordPassN(2, &word_data);
classify_word_and_language(2, NULL, &word_data);
}
prev_word_best_choice_ = word->best_choice;
}
}
/**
* @name eval_word_spacing()
* The basic measure is the number of characters in contextually confirmed
 * words (i.e. the word is done).
* If all words are contextually confirmed the evaluation is deemed perfect.
*
* Some fiddles are done to handle "1"s as these are VERY frequent causes of
* fuzzy spaces. The problem with the basic measure is that "561 63" would score
* the same as "56163", though given our knowledge that the space is fuzzy, and
* that there is a "1" next to the fuzzy space, we need to ensure that "56163"
* is preferred.
*
* The solution is to NOT COUNT the score of any word which has a digit at one
 * end and a "1Il" as the character on the other side of the space.
*
 * Conversely, any character next to a "1" within a word is counted as a positive
* score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of
* the "1" joined). "56163" would score 7 - all chars in a numeric word + 2
* sides of a "1" joined.
*
* The joined 1 rule is applied to any word REGARDLESS of contextual
 * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually
 * confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.
*
*/
inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
WERD_RES_IT word_res_it(&word_res_list);
inT16 total_score = 0;
inT16 word_count = 0;
inT16 done_word_count = 0;
inT16 word_len;
inT16 i;
inT16 offset;
WERD_RES *word; // current word
inT16 prev_word_score = 0;
BOOL8 prev_word_done = FALSE;
BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"?
BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0
BOOL8 current_char_1 = FALSE;
BOOL8 current_word_ok_so_far;
STRING punct_chars = "!\"`',.:;";
BOOL8 prev_char_punct = FALSE;
BOOL8 current_char_punct = FALSE;
BOOL8 word_done = FALSE;
do {
word = word_res_it.data();
word_done = fixspace_thinks_word_done(word);
word_count++;
if (word->tess_failed) {
total_score += prev_word_score;
if (prev_word_done)
done_word_count++;
prev_word_score = 0;
prev_char_1 = FALSE;
prev_char_digit = FALSE;
prev_word_done = FALSE;
}
else {
/*
Can we add the prev word score and potentially count this word?
Yes IF it didn't end in a 1 when the first char of this word is a digit
AND it didn't end in a digit when the first char of this word is a 1
*/
word_len = word->reject_map.length();
current_word_ok_so_far = FALSE;
if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
(prev_char_digit && (
(word_done &&
word->best_choice->unichar_lengths().string()[0] == 1 &&
word->best_choice->unichar_string()[0] == '1') ||
(!word_done && STRING(conflict_set_I_l_1).contains(
word->best_choice->unichar_string()[0])))))) {
total_score += prev_word_score;
if (prev_word_done)
done_word_count++;
current_word_ok_so_far = word_done;
}
if (current_word_ok_so_far) {
prev_word_done = TRUE;
prev_word_score = word_len;
}
else {
prev_word_done = FALSE;
prev_word_score = 0;
}
/* Add 1 to total score for every joined 1 regardless of context and
rejection */
for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
current_char_1 = word->best_choice->unichar_string()[i] == '1';
if (prev_char_1 || (current_char_1 && (i > 0)))
total_score++;
prev_char_1 = current_char_1;
}
/* Add 1 to total score for every joined punctuation regardless of context
and rejection */
if (tessedit_prefer_joined_punct) {
for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
offset += word->best_choice->unichar_lengths()[i++]) {
current_char_punct =
punct_chars.contains(word->best_choice->unichar_string()[offset]);
if (prev_char_punct || (current_char_punct && i > 0))
total_score++;
prev_char_punct = current_char_punct;
}
}
prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
for (i = 0, offset = 0; i < word_len - 1;
offset += word->best_choice->unichar_lengths()[i++]);
prev_char_1 =
((word_done && (word->best_choice->unichar_string()[offset] == '1'))
|| (!word_done && STRING(conflict_set_I_l_1).contains(
word->best_choice->unichar_string()[offset])));
}
/* Find next word */
do {
word_res_it.forward();
} while (word_res_it.data()->part_of_combo);
} while (!word_res_it.at_first());
total_score += prev_word_score;
if (prev_word_done)
done_word_count++;
if (done_word_count == word_count)
return PERFECT_WERDS;
else
return total_score;
}
BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
int i;
int offset;
for (i = 0, offset = 0; i < char_position;
offset += word->best_choice->unichar_lengths()[i++]);
return (
word->uch_set->get_isdigit(
word->best_choice->unichar_string().string() + offset,
word->best_choice->unichar_lengths()[i]) ||
(word->best_choice->permuter() == NUMBER_PERM &&
STRING(numeric_punctuation).contains(
word->best_choice->unichar_string().string()[offset])));
}
} // namespace tesseract
/**
* @name transform_to_next_perm()
* Examines the current word list to find the smallest word gap size. Then walks
* the word list closing any gaps of this size by either inserted new
* combination words, or extending existing ones.
*
* The routine COULD be limited to stop it building words longer than N blobs.
*
* If there are no more gaps then it DELETES the entire list and returns the
* empty list to cause termination.
*/
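// Illustrative walk-through (not from the original source): given the word
// list [A] <gap 3> [B] <gap 7> [C], the first scan finds min_gap = 3, so the
// second scan joins A and B into a single combination word and leaves C alone.
// The next call sees only the gap of 7 and joins the combination with C, and
// the call after that finds no remaining gaps and clears the list to stop.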
void transform_to_next_perm(WERD_RES_LIST &words) {
WERD_RES_IT word_it(&words);
WERD_RES_IT prev_word_it(&words);
WERD_RES *word;
WERD_RES *prev_word;
WERD_RES *combo;
WERD *copy_word;
inT16 prev_right = -MAX_INT16;
TBOX box;
inT16 gap;
inT16 min_gap = MAX_INT16;
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
word = word_it.data();
if (!word->part_of_combo) {
box = word->word->bounding_box();
if (prev_right > -MAX_INT16) {
gap = box.left() - prev_right;
if (gap < min_gap)
min_gap = gap;
}
prev_right = box.right();
}
}
if (min_gap < MAX_INT16) {
prev_right = -MAX_INT16; // back to start
word_it.set_to_list(&words);
// Note: we can't use cycle_pt due to inserted combos at start of list.
for (; (prev_right == -MAX_INT16) || !word_it.at_first();
word_it.forward()) {
word = word_it.data();
if (!word->part_of_combo) {
box = word->word->bounding_box();
if (prev_right > -MAX_INT16) {
gap = box.left() - prev_right;
if (gap <= min_gap) {
prev_word = prev_word_it.data();
if (prev_word->combination) {
combo = prev_word;
}
else {
/* Make a new combination and insert before
* the first word being joined. */
copy_word = new WERD;
*copy_word = *(prev_word->word);
// deep copy
combo = new WERD_RES(copy_word);
combo->combination = TRUE;
combo->x_height = prev_word->x_height;
prev_word->part_of_combo = TRUE;
prev_word_it.add_before_then_move(combo);
}
combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
if (word->combination) {
combo->word->join_on(word->word);
// Move blobs to combo
// old combo no longer needed
delete word_it.extract();
}
else {
// Copy current wd to combo
combo->copy_on(word);
word->part_of_combo = TRUE;
}
combo->done = FALSE;
combo->ClearResults();
}
else {
prev_word_it = word_it; // catch up
}
}
prev_right = box.right();
}
}
}
else {
words.clear(); // signal termination
}
}
namespace tesseract {
void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
inT16 mode, BOOL8 improved) {
WERD_RES_IT word_res_it(&perm);
if (debug_fix_space_level > 0) {
if (mode == 1) {
stats_.dump_words_str = "";
for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
word_res_it.forward()) {
if (!word_res_it.data()->part_of_combo) {
stats_.dump_words_str +=
word_res_it.data()->best_choice->unichar_string();
stats_.dump_words_str += ' ';
}
}
}
if (debug_fix_space_level > 1) {
switch (mode) {
case 1:
tprintf("EXTRACTED (%d): \"", score);
break;
case 2:
tprintf("TESTED (%d): \"", score);
break;
case 3:
tprintf("RETURNED (%d): \"", score);
break;
}
for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
word_res_it.forward()) {
if (!word_res_it.data()->part_of_combo) {
tprintf("%s/%1d ",
word_res_it.data()->best_choice->unichar_string().string(),
(int)word_res_it.data()->best_choice->permuter());
}
}
tprintf("\"\n");
}
else if (improved) {
tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
word_res_it.forward()) {
if (!word_res_it.data()->part_of_combo) {
tprintf("%s/%1d ",
word_res_it.data()->best_choice->unichar_string().string(),
(int)word_res_it.data()->best_choice->permuter());
}
}
tprintf("\"\n");
}
}
}
BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
if (word->done)
return TRUE;
/*
Use all the standard pass 2 conditions for mode 5 in set_done() in
reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
CARE WHETHER WE HAVE of/at on/an etc.
*/
if (fixsp_done_mode > 0 &&
(word->tess_accepted ||
(fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
fixsp_done_mode == 3) &&
(strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
(word->best_choice->permuter() == FREQ_DAWG_PERM) ||
(word->best_choice->permuter() == USER_DAWG_PERM) ||
(word->best_choice->permuter() == NUMBER_PERM))) {
return TRUE;
}
else {
return FALSE;
}
}
/**
* @name fix_sp_fp_word()
* Test the current word to see if it can be split by deleting noise blobs. If
* so, do the business.
* Return with the iterator pointing to the same place if the word is unchanged,
* or the last of the replacement words.
*/
void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
BLOCK* block) {
WERD_RES *word_res;
WERD_RES_LIST sub_word_list;
WERD_RES_IT sub_word_list_it(&sub_word_list);
inT16 blob_index;
inT16 new_length;
float junk;
word_res = word_res_it.data();
if (word_res->word->flag(W_REP_CHAR) ||
word_res->combination ||
word_res->part_of_combo ||
!word_res->word->flag(W_DONT_CHOP))
return;
blob_index = worst_noise_blob(word_res, &junk);
if (blob_index < 0)
return;
if (debug_fix_space_level > 1) {
tprintf("FP fixspace working on \"%s\"\n",
word_res->best_choice->unichar_string().string());
}
word_res->word->rej_cblob_list()->sort(c_blob_comparator);
sub_word_list_it.add_after_stay_put(word_res_it.extract());
fix_noisy_space_list(sub_word_list, row, block);
new_length = sub_word_list.length();
word_res_it.add_list_before(&sub_word_list);
for (; !word_res_it.at_last() && new_length > 1; new_length--) {
word_res_it.forward();
}
}
void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
BLOCK* block) {
inT16 best_score;
WERD_RES_IT best_perm_it(&best_perm);
WERD_RES_LIST current_perm;
WERD_RES_IT current_perm_it(&current_perm);
WERD_RES *old_word_res;
inT16 current_score;
BOOL8 improved = FALSE;
best_score = fp_eval_word_spacing(best_perm); // default score
dump_words(best_perm, best_score, 1, improved);
old_word_res = best_perm_it.data();
// Even deep_copy doesn't copy the underlying WERD unless its combination
// flag is true!
old_word_res->combination = TRUE; // Kludge to force deep copy
current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
old_word_res->combination = FALSE; // Undo kludge
break_noisiest_blob_word(current_perm);
while (best_score != PERFECT_WERDS && !current_perm.empty()) {
match_current_words(current_perm, row, block);
current_score = fp_eval_word_spacing(current_perm);
dump_words(current_perm, current_score, 2, improved);
if (current_score > best_score) {
best_perm.clear();
best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
best_score = current_score;
improved = TRUE;
}
if (current_score < PERFECT_WERDS) {
break_noisiest_blob_word(current_perm);
}
}
dump_words(best_perm, best_score, 3, improved);
}
/**
* break_noisiest_blob_word()
* Find the word with the blob which looks like the worst noise.
* Break the word into two, deleting the noise blob.
*/
void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
WERD_RES_IT word_it(&words);
WERD_RES_IT worst_word_it;
float worst_noise_score = 9999;
int worst_blob_index = -1; // Noisiest blob of noisiest wd
int blob_index; // of wds noisiest blob
float noise_score; // of wds noisiest blob
WERD_RES *word_res;
C_BLOB_IT blob_it;
C_BLOB_IT rej_cblob_it;
C_BLOB_LIST new_blob_list;
C_BLOB_IT new_blob_it;
C_BLOB_IT new_rej_cblob_it;
WERD *new_word;
inT16 start_of_noise_blob;
inT16 i;
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
blob_index = worst_noise_blob(word_it.data(), &noise_score);
if (blob_index > -1 && worst_noise_score > noise_score) {
worst_noise_score = noise_score;
worst_blob_index = blob_index;
worst_word_it = word_it;
}
}
if (worst_blob_index < 0) {
words.clear(); // signal termination
return;
}
/* Now split the worst_word_it */
word_res = worst_word_it.data();
/* Move blobs before noise blob to a new bloblist */
new_blob_it.set_to_list(&new_blob_list);
blob_it.set_to_list(word_res->word->cblob_list());
for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
new_blob_it.add_after_then_move(blob_it.extract());
}
start_of_noise_blob = blob_it.data()->bounding_box().left();
delete blob_it.extract(); // throw out noise blob
new_word = new WERD(&new_blob_list, word_res->word);
new_word->set_flag(W_EOL, FALSE);
word_res->word->set_flag(W_BOL, FALSE);
word_res->word->set_blanks(1); // After break
new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
for (;
(!rej_cblob_it.empty() &&
(rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
rej_cblob_it.forward()) {
new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
}
WERD_RES* new_word_res = new WERD_RES(new_word);
new_word_res->combination = TRUE;
worst_word_it.add_before_then_move(new_word_res);
word_res->ClearResults();
}
inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
float *worst_noise_score) {
float noise_score[512];
int i;
int min_noise_blob; // 1st contender
int max_noise_blob; // last contender
int non_noise_count;
int worst_noise_blob; // Worst blob
float small_limit = kBlnXHeight * fixsp_small_outlines_size;
float non_noise_limit = kBlnXHeight * 0.8;
if (word_res->rebuild_word == NULL)
return -1; // Can't handle cube words.
// Normalised.
int blob_count = word_res->box_word->length();
ASSERT_HOST(blob_count <= 512);
if (blob_count < 5)
return -1; // too short to split
/* Get the noise scores for all blobs */
#ifndef SECURE_NAMES
if (debug_fix_space_level > 5)
tprintf("FP fixspace Noise metrics for \"%s\": ",
word_res->best_choice->unichar_string().string());
#endif
for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
TBLOB* blob = word_res->rebuild_word->blobs[i];
if (word_res->reject_map[i].accepted())
noise_score[i] = non_noise_limit;
else
noise_score[i] = blob_noise_score(blob);
if (debug_fix_space_level > 5)
tprintf("%1.1f ", noise_score[i]);
}
if (debug_fix_space_level > 5)
tprintf("\n");
/* Now find the worst one which is far enough away from the end of the word */
non_noise_count = 0;
for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
if (noise_score[i] >= non_noise_limit) {
non_noise_count++;
}
}
if (non_noise_count < fixsp_non_noise_limit)
return -1;
min_noise_blob = i;
non_noise_count = 0;
for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
i--) {
if (noise_score[i] >= non_noise_limit) {
non_noise_count++;
}
}
if (non_noise_count < fixsp_non_noise_limit)
return -1;
max_noise_blob = i;
if (min_noise_blob > max_noise_blob)
return -1;
*worst_noise_score = small_limit;
worst_noise_blob = -1;
for (i = min_noise_blob; i <= max_noise_blob; i++) {
if (noise_score[i] < *worst_noise_score) {
worst_noise_blob = i;
*worst_noise_score = noise_score[i];
}
}
return worst_noise_blob;
}
float Tesseract::blob_noise_score(TBLOB *blob) {
TBOX box; // BB of outline
inT16 outline_count = 0;
inT16 max_dimension;
inT16 largest_outline_dimension = 0;
for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
outline_count++;
box = ol->bounding_box();
if (box.height() > box.width()) {
max_dimension = box.height();
}
else {
max_dimension = box.width();
}
if (largest_outline_dimension < max_dimension)
largest_outline_dimension = max_dimension;
}
if (outline_count > 5) {
// penalise LOTS of blobs
largest_outline_dimension *= 2;
}
box = blob->bounding_box();
if (box.bottom() > kBlnBaselineOffset * 4 ||
box.top() < kBlnBaselineOffset / 2) {
// Be lax if the blob is high or low
largest_outline_dimension /= 2;
}
return largest_outline_dimension;
}
} // namespace tesseract
void fixspace_dbg(WERD_RES *word) {
TBOX box = word->word->bounding_box();
BOOL8 show_map_detail = FALSE;
inT16 i;
box.print();
tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
word->word->cblob_list()->length(),
word->rebuild_word->NumBlobs(),
word->box_word->length());
word->reject_map.print(debug_fp);
tprintf("\n");
if (show_map_detail) {
tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
word->reject_map[i].full_print(debug_fp);
}
}
tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
}
/**
* fp_eval_word_spacing()
* Evaluation function for fixed pitch word lists.
*
* Basically, count the number of "nice" characters - those which are in tess
* acceptable words or in dict words and are not rejected.
* Penalise any potential noise chars
*/
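// Illustrative example (not from the original source): a five-blob word that
// is accepted as a dictionary word gains +1 for each accepted, non-noise blob,
// while a blob that is a space or scores below the small-outline noise limit
// costs -1, so with one noisy blob the word contributes 4 - 1 = 3 rather than 5.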
namespace tesseract {
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
WERD_RES_IT word_it(&word_res_list);
WERD_RES *word;
inT16 score = 0;
inT16 i;
float small_limit = kBlnXHeight * fixsp_small_outlines_size;
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
word = word_it.data();
if (word->rebuild_word == NULL)
continue; // Can't handle cube words.
if (word->done ||
word->tess_accepted ||
word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
word->best_choice->permuter() == USER_DAWG_PERM ||
safe_dict_word(word) > 0) {
int num_blobs = word->rebuild_word->NumBlobs();
UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
TBLOB* blob = word->rebuild_word->blobs[i];
if (word->best_choice->unichar_id(i) == space ||
blob_noise_score(blob) < small_limit) {
score -= 1; // penalise possibly erroneous non-space
}
else if (word->reject_map[i].accepted()) {
score++;
}
}
}
}
if (score < 0)
score = 0;
return score;
}
} // namespace tesseract

View File

@ -0,0 +1,31 @@
/******************************************************************
* File: fixspace.h (Formerly fixspace.h)
* Description: Implements a pass over the page res, exploring the alternative
* spacing possibilities, trying to use context to improve the
 *              word spacing
* Author: Phil Cheatle
* Created: Thu Oct 21 11:38:43 BST 1993
*
* (C) Copyright 1993, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef FIXSPACE_H
#define FIXSPACE_H
#include "pageres.h"
#include "params.h"
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);
void transform_to_next_perm(WERD_RES_LIST &words);
void fixspace_dbg(WERD_RES *word);
#endif

View File

@ -0,0 +1,216 @@
/**********************************************************************
* File: fixxht.cpp (Formerly fixxht.c)
* Description: Improve x_ht and look out for case inconsistencies
* Author: Phil Cheatle
* Created: Thu Aug 5 14:11:08 BST 1993
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <string.h>
#include <ctype.h>
#include "params.h"
#include "float2int.h"
#include "tesseractclass.h"
namespace tesseract {
// Fixxht overview.
// Premise: Initial estimate of x-height is adequate most of the time, but
// occasionally it is incorrect. Most notable causes of failure are:
// 1. Small caps, where the top of the caps is the same as the body text
// xheight. For small caps words the xheight needs to be reduced to correctly
// recognize the caps in the small caps word.
// 2. All xheight lines, such as summer. Here the initial estimate will have
// guessed that the blob tops are caps and will have placed the xheight too low.
// 3. Noise/logos beside words, or changes in font size on a line. Such
// things can blow the statistics and cause an incorrect estimate.
// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
// In this case the x-height is often still correct.
//
// Algorithm.
// Compare the vertical position (top only) of alphanumerics in a word with
// the range of positions in training data (in the unicharset).
// See CountMisfitTops. If any characters disagree sufficiently with the
// initial xheight estimate, then recalculate the xheight, re-run OCR on
// the word, and if the number of vertical misfits goes down, along with
// either the word rating or certainty, then keep the new xheight.
// The new xheight is calculated as follows (see ComputeCompatibleXheight):
// For each alphanumeric character that has a vertically misplaced top
// (a misfit), yet its bottom is within the acceptable range (ie it is not
// likely a sub- or super-script) calculate the range of acceptable xheight
// positions from its range of tops, and give each value in the range a
// number of votes equal to the distance of its top from its acceptance range.
// The x-height position with the median of the votes becomes the new
// x-height. This assumes that most characters will be correctly recognized
// even if the x-height is incorrect. This is not a terrible assumption, but
// it is not great. An improvement would be to use a classifier that does
// not care about vertical position or scaling at all.
// Separately collect stats on shifted baselines and apply the same logic to
// computing a best-fit shift to fix the error. If the baseline needs to be
// shifted, but the x-height is OK, returns the original x-height along with
// the baseline shift to indicate that recognition needs to re-run.
// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
// then the char top cannot be used to judge misfits or suggest a new top.
const int kMaxCharTopRange = 48;
// Returns the number of misfit blob tops in this word.
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
int bad_blobs = 0;
int num_blobs = word_res->rebuild_word->NumBlobs();
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
int top = blob->bounding_box().top();
if (top >= INT_FEAT_RANGE)
top = INT_FEAT_RANGE - 1;
int min_bottom, max_bottom, min_top, max_top;
unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
&min_top, &max_top);
if (max_top - min_top > kMaxCharTopRange)
continue;
bool bad = top < min_top - x_ht_acceptance_tolerance ||
top > max_top + x_ht_acceptance_tolerance;
if (bad)
++bad_blobs;
if (debug_x_ht_level >= 1) {
tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
unicharset.id_to_unichar(class_id),
bad ? "Misfit" : "OK", top, min_top, max_top,
static_cast<int>(x_ht_acceptance_tolerance));
}
}
}
return bad_blobs;
}
// Returns a new x-height maximally compatible with the result in word_res.
// See comment above for overall algorithm.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
float* baseline_shift) {
STATS top_stats(0, MAX_UINT8);
STATS shift_stats(-MAX_UINT8, MAX_UINT8);
int bottom_shift = 0;
int num_blobs = word_res->rebuild_word->NumBlobs();
do {
top_stats.clear();
shift_stats.clear();
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
if (unicharset.get_isalpha(class_id) ||
unicharset.get_isdigit(class_id)) {
int top = blob->bounding_box().top() + bottom_shift;
// Clip the top to the limit of normalized feature space.
if (top >= INT_FEAT_RANGE)
top = INT_FEAT_RANGE - 1;
int bottom = blob->bounding_box().bottom() + bottom_shift;
int min_bottom, max_bottom, min_top, max_top;
unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
&min_top, &max_top);
// Chars with a wild top range would mess up the result so ignore them.
if (max_top - min_top > kMaxCharTopRange)
continue;
int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
top - (max_top + x_ht_acceptance_tolerance));
int height = top - kBlnBaselineOffset;
if (debug_x_ht_level >= 2) {
tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
unicharset.id_to_unichar(class_id),
height, min_bottom, max_bottom, min_top, max_top,
bottom, top);
}
// Use only chars that fit in the expected bottom range, and where
// the range of tops is sensibly near the xheight.
if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
bottom - x_ht_acceptance_tolerance <= max_bottom &&
min_top > kBlnBaselineOffset &&
max_top - kBlnBaselineOffset >= kBlnXHeight &&
misfit_dist > 0) {
// Compute the x-height position using proportionality between the
// actual height and expected height.
int min_xht = DivRounded(height * kBlnXHeight,
max_top - kBlnBaselineOffset);
int max_xht = DivRounded(height * kBlnXHeight,
min_top - kBlnBaselineOffset);
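// Worked example (assuming the usual BLN constants kBlnXHeight = 128 and
// kBlnBaselineOffset = 64): a blob whose top sits 116 above the baseline,
// while its expected top range above the baseline is 124..132, yields
// min_xht = 116 * 128 / 132 ~= 112 and max_xht = 116 * 128 / 124 ~= 120,
// i.e. a vote for an x-height slightly below the nominal 128.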
if (debug_x_ht_level >= 2) {
tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
}
// The range of expected heights gets a vote equal to the distance
// of the actual top from the expected top.
for (int y = min_xht; y <= max_xht; ++y)
top_stats.add(y, misfit_dist);
}
else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
bottom - x_ht_acceptance_tolerance > max_bottom) &&
bottom_shift == 0) {
// Get the range of required bottom shift.
int min_shift = min_bottom - bottom;
int max_shift = max_bottom - bottom;
if (debug_x_ht_level >= 2) {
tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
}
// The range of expected shifts gets a vote equal to the min distance
// of the actual bottom from the expected bottom, spread over the
// range of its acceptance.
int misfit_weight = abs(min_shift);
if (max_shift > min_shift)
misfit_weight /= max_shift - min_shift;
for (int y = min_shift; y <= max_shift; ++y)
shift_stats.add(y, misfit_weight);
}
else {
if (bottom_shift == 0) {
// Things with bottoms that are already ok need to say so, on the
// 1st iteration only.
shift_stats.add(0, kBlnBaselineOffset);
}
if (debug_x_ht_level >= 2) {
tprintf(" already OK\n");
}
}
}
}
if (shift_stats.get_total() > top_stats.get_total()) {
bottom_shift = IntCastRounded(shift_stats.median());
if (debug_x_ht_level >= 2) {
tprintf("Applying bottom shift=%d\n", bottom_shift);
}
}
} while (bottom_shift != 0 &&
top_stats.get_total() < shift_stats.get_total());
// Baseline shift is opposite sign to the bottom shift.
*baseline_shift = -bottom_shift / word_res->denorm.y_scale();
if (debug_x_ht_level >= 2) {
tprintf("baseline shift=%g\n", *baseline_shift);
}
if (top_stats.get_total() == 0)
return bottom_shift != 0 ? word_res->x_height : 0.0f;
// The new xheight is just the median vote, which is then scaled out
// of BLN space back to pixel space to get the x-height in pixel space.
float new_xht = top_stats.median();
if (debug_x_ht_level >= 2) {
tprintf("Median xht=%f\n", new_xht);
tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
new_xht, new_xht / word_res->denorm.y_scale());
}
// The xheight must change by at least x_ht_min_change to be used.
if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
return new_xht / word_res->denorm.y_scale();
else
return bottom_shift != 0 ? word_res->x_height : 0.0f;
}
} // namespace tesseract

View File

@ -0,0 +1,390 @@
///////////////////////////////////////////////////////////////////////
// File: ltrresultiterator.cpp
// Description: Iterator for tesseract results in strict left-to-right
// order that avoids using tesseract internal data structures.
// Author: Ray Smith
// Created: Fri Feb 26 14:32:09 PST 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "ltrresultiterator.h"
#include "allheaders.h"
#include "pageres.h"
#include "strngs.h"
#include "tesseractclass.h"
namespace tesseract {
LTRResultIterator::LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
int scale, int scaled_yres,
int rect_left, int rect_top,
int rect_width, int rect_height)
: PageIterator(page_res, tesseract, scale, scaled_yres,
rect_left, rect_top, rect_width, rect_height),
line_separator_("\n"),
paragraph_separator_("\n") {
}
LTRResultIterator::~LTRResultIterator() {
}
// Returns the null terminated UTF-8 encoded text string for the current
// object at the given level. Use delete [] to free after use.
char* LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
if (it_->word() == NULL) return NULL; // Already at the end!
STRING text;
PAGE_RES_IT res_it(*it_);
WERD_CHOICE* best_choice = res_it.word()->best_choice;
ASSERT_HOST(best_choice != NULL);
if (level == RIL_SYMBOL) {
text = res_it.word()->BestUTF8(blob_index_, false);
}
else if (level == RIL_WORD) {
text = best_choice->unichar_string();
}
else {
bool eol = false; // end of line?
bool eop = false; // end of paragraph?
do { // for each paragraph in a block
do { // for each text line in a paragraph
do { // for each word in a text line
best_choice = res_it.word()->best_choice;
ASSERT_HOST(best_choice != NULL);
text += best_choice->unichar_string();
text += " ";
res_it.forward();
eol = res_it.row() != res_it.prev_row();
} while (!eol);
text.truncate_at(text.length() - 1);
text += line_separator_;
eop = res_it.block() != res_it.prev_block() ||
res_it.row()->row->para() != res_it.prev_row()->row->para();
} while (level != RIL_TEXTLINE && !eop);
if (eop) text += paragraph_separator_;
} while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());
}
int length = text.length() + 1;
char* result = new char[length];
strncpy(result, text.string(), length);
return result;
}
// Set the string inserted at the end of each text line. "\n" by default.
void LTRResultIterator::SetLineSeparator(const char *new_line) {
line_separator_ = new_line;
}
// Set the string inserted at the end of each paragraph. "\n" by default.
void LTRResultIterator::SetParagraphSeparator(const char *new_para) {
paragraph_separator_ = new_para;
}
// Returns the mean confidence of the current object at the given level.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
float LTRResultIterator::Confidence(PageIteratorLevel level) const {
if (it_->word() == NULL) return 0.0f; // Already at the end!
float mean_certainty = 0.0f;
int certainty_count = 0;
PAGE_RES_IT res_it(*it_);
WERD_CHOICE* best_choice = res_it.word()->best_choice;
ASSERT_HOST(best_choice != NULL);
switch (level) {
case RIL_BLOCK:
do {
best_choice = res_it.word()->best_choice;
ASSERT_HOST(best_choice != NULL);
mean_certainty += best_choice->certainty();
++certainty_count;
res_it.forward();
} while (res_it.block() == res_it.prev_block());
break;
case RIL_PARA:
do {
best_choice = res_it.word()->best_choice;
ASSERT_HOST(best_choice != NULL);
mean_certainty += best_choice->certainty();
++certainty_count;
res_it.forward();
} while (res_it.block() == res_it.prev_block() &&
res_it.row()->row->para() == res_it.prev_row()->row->para());
break;
case RIL_TEXTLINE:
do {
best_choice = res_it.word()->best_choice;
ASSERT_HOST(best_choice != NULL);
mean_certainty += best_choice->certainty();
++certainty_count;
res_it.forward();
} while (res_it.row() == res_it.prev_row());
break;
case RIL_WORD:
mean_certainty += best_choice->certainty();
++certainty_count;
break;
case RIL_SYMBOL:
mean_certainty += best_choice->certainty(blob_index_);
++certainty_count;
}
if (certainty_count > 0) {
mean_certainty /= certainty_count;
float confidence = 100 + 5 * mean_certainty;
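// For example, a mean certainty of -4 maps to 100 + 5 * (-4) = 80; the
// clamps below keep the result in the documented 0.0f-100.0f range.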
if (confidence < 0.0f) confidence = 0.0f;
if (confidence > 100.0f) confidence = 100.0f;
return confidence;
}
return 0.0f;
}
void LTRResultIterator::RowAttributes(float* row_height, float* descenders,
float* ascenders) const {
*row_height = it_->row()->row->x_height() + it_->row()->row->ascenders() -
it_->row()->row->descenders();
*descenders = it_->row()->row->descenders();
*ascenders = it_->row()->row->ascenders();
}
// Returns the font attributes of the current word. If iterating at a higher
// level object than words, eg textlines, then this will return the
// attributes of the first word in that textline.
// The actual return value is a string representing a font name. It points
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
// the iterator itself, ie rendered invalid by various members of
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
// Pointsize is returned in printers points (1/72 inch.)
const char* LTRResultIterator::WordFontAttributes(bool* is_bold,
bool* is_italic,
bool* is_underlined,
bool* is_monospace,
bool* is_serif,
bool* is_smallcaps,
int* pointsize,
int* font_id) const {
if (it_->word() == NULL) return NULL; // Already at the end!
if (it_->word()->fontinfo == NULL) {
*font_id = -1;
return NULL; // No font information.
}
const FontInfo& font_info = *it_->word()->fontinfo;
*font_id = font_info.universal_id;
*is_bold = font_info.is_bold();
*is_italic = font_info.is_italic();
*is_underlined = false; // TODO(rays) fix this!
*is_monospace = font_info.is_fixed_pitch();
*is_serif = font_info.is_serif();
*is_smallcaps = it_->word()->small_caps;
float row_height = it_->row()->row->x_height() +
it_->row()->row->ascenders() - it_->row()->row->descenders();
// Convert from pixels to printers points.
*pointsize = scaled_yres_ > 0
? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
: 0;
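// For example, a 50-pixel row at an effective resolution of 300 dpi gives
// 50 * 72 / 300 = 12 points (taking kPointsPerInch to be 72, per the
// "printers points (1/72 inch)" note above).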
return font_info.name;
}
// Returns the name of the language used to recognize this word.
const char* LTRResultIterator::WordRecognitionLanguage() const {
if (it_->word() == NULL || it_->word()->tesseract == NULL) return NULL;
return it_->word()->tesseract->lang.string();
}
// Return the overall directionality of this word.
StrongScriptDirection LTRResultIterator::WordDirection() const {
if (it_->word() == NULL) return DIR_NEUTRAL;
bool has_rtl = it_->word()->AnyRtlCharsInWord();
bool has_ltr = it_->word()->AnyLtrCharsInWord();
if (has_rtl && !has_ltr)
return DIR_RIGHT_TO_LEFT;
if (has_ltr && !has_rtl)
return DIR_LEFT_TO_RIGHT;
if (!has_ltr && !has_rtl)
return DIR_NEUTRAL;
return DIR_MIX;
}
// Returns true if the current word was found in a dictionary.
bool LTRResultIterator::WordIsFromDictionary() const {
if (it_->word() == NULL) return false; // Already at the end!
int permuter = it_->word()->best_choice->permuter();
return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
permuter == USER_DAWG_PERM;
}
// Returns true if the current word is numeric.
bool LTRResultIterator::WordIsNumeric() const {
if (it_->word() == NULL) return false; // Already at the end!
int permuter = it_->word()->best_choice->permuter();
return permuter == NUMBER_PERM;
}
// Returns true if the word contains blamer information.
bool LTRResultIterator::HasBlamerInfo() const {
return it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
it_->word()->blamer_bundle->HasDebugInfo();
}
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
// of the current word.
const void *LTRResultIterator::GetParamsTrainingBundle() const {
return (it_->word() != NULL && it_->word()->blamer_bundle != NULL) ?
&(it_->word()->blamer_bundle->params_training_bundle()) : NULL;
}
// Returns the pointer to the string with blamer information for this word.
// Assumes that the word's blamer_bundle is not NULL.
const char *LTRResultIterator::GetBlamerDebug() const {
return it_->word()->blamer_bundle->debug().string();
}
// Returns the pointer to the string with misadaption information for this word.
// Assumes that the word's blamer_bundle is not NULL.
const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
return it_->word()->blamer_bundle->misadaption_debug().string();
}
// Returns true if a truth string was recorded for the current word.
bool LTRResultIterator::HasTruthString() const {
if (it_->word() == NULL) return false; // Already at the end!
if (it_->word()->blamer_bundle == NULL ||
it_->word()->blamer_bundle->NoTruth()) {
return false; // no truth information for this word
}
return true;
}
// Returns true if the given string is equivalent to the truth string for
// the current word.
bool LTRResultIterator::EquivalentToTruth(const char *str) const {
if (!HasTruthString()) return false;
ASSERT_HOST(it_->word()->uch_set != NULL);
WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
}
// Returns the null terminated UTF-8 encoded truth string for the current word.
// Use delete [] to free after use.
char* LTRResultIterator::WordTruthUTF8Text() const {
if (!HasTruthString()) return NULL;
STRING truth_text = it_->word()->blamer_bundle->TruthString();
int length = truth_text.length() + 1;
char* result = new char[length];
strncpy(result, truth_text.string(), length);
return result;
}
// Returns the null terminated UTF-8 encoded normalized OCR string for the
// current word. Use delete [] to free after use.
char* LTRResultIterator::WordNormedUTF8Text() const {
if (it_->word() == NULL) return NULL; // Already at the end!
STRING ocr_text;
WERD_CHOICE* best_choice = it_->word()->best_choice;
const UNICHARSET *unicharset = it_->word()->uch_set;
ASSERT_HOST(best_choice != NULL);
for (int i = 0; i < best_choice->length(); ++i) {
ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
}
int length = ocr_text.length() + 1;
char* result = new char[length];
strncpy(result, ocr_text.string(), length);
return result;
}
// Returns a pointer to serialized choice lattice.
// Fills lattice_size with the number of bytes in lattice data.
const char *LTRResultIterator::WordLattice(int *lattice_size) const {
if (it_->word() == NULL) return NULL; // Already at the end!
if (it_->word()->blamer_bundle == NULL) return NULL;
*lattice_size = it_->word()->blamer_bundle->lattice_size();
return it_->word()->blamer_bundle->lattice_data();
}
// Returns true if the current symbol is a superscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool LTRResultIterator::SymbolIsSuperscript() const {
if (cblob_it_ == NULL && it_->word() != NULL)
return it_->word()->best_choice->BlobPosition(blob_index_) ==
SP_SUPERSCRIPT;
return false;
}
// Returns true if the current symbol is a subscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool LTRResultIterator::SymbolIsSubscript() const {
if (cblob_it_ == NULL && it_->word() != NULL)
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
return false;
}
// Returns true if the current symbol is a dropcap.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool LTRResultIterator::SymbolIsDropcap() const {
if (cblob_it_ == NULL && it_->word() != NULL)
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
return false;
}
ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
ASSERT_HOST(result_it.it_->word() != NULL);
word_res_ = result_it.it_->word();
BLOB_CHOICE_LIST* choices = NULL;
if (word_res_->ratings != NULL)
choices = word_res_->GetBlobChoices(result_it.blob_index_);
if (choices != NULL && !choices->empty()) {
choice_it_ = new BLOB_CHOICE_IT(choices);
choice_it_->mark_cycle_pt();
}
else {
choice_it_ = NULL;
}
}
ChoiceIterator::~ChoiceIterator() {
delete choice_it_;
}
// Moves to the next choice for the symbol and returns false if there
// are none left.
bool ChoiceIterator::Next() {
if (choice_it_ == NULL)
return false;
choice_it_->forward();
return !choice_it_->cycled_list();
}
// Returns the null terminated UTF-8 encoded text string for the current
// choice. Do NOT use delete [] to free after use.
const char* ChoiceIterator::GetUTF8Text() const {
if (choice_it_ == NULL)
return NULL;
UNICHAR_ID id = choice_it_->data()->unichar_id();
return word_res_->uch_set->id_to_unichar_ext(id);
}
// Returns the confidence of the current choice.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
float ChoiceIterator::Confidence() const {
if (choice_it_ == NULL)
return 0.0f;
float confidence = 100 + 5 * choice_it_->data()->certainty();
if (confidence < 0.0f) confidence = 0.0f;
if (confidence > 100.0f) confidence = 100.0f;
return confidence;
}
} // namespace tesseract.

View File

@ -0,0 +1,218 @@
///////////////////////////////////////////////////////////////////////
// File: ltrresultiterator.h
// Description: Iterator for tesseract results in strict left-to-right
// order that avoids using tesseract internal data structures.
// Author: Ray Smith
// Created: Fri Feb 26 11:01:06 PST 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__
#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__
#include "platform.h"
#include "pageiterator.h"
#include "unichar.h"
class BLOB_CHOICE_IT;
class WERD_RES;
namespace tesseract {
class Tesseract;
// Class to iterate over tesseract results, providing access to all levels
// of the page hierarchy, without including any tesseract headers or having
// to handle any tesseract structures.
// WARNING! This class points to data held within the TessBaseAPI class, and
// therefore can only be used while the TessBaseAPI class still exists and
// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
// DetectOS, or anything else that changes the internal PAGE_RES.
// See apitypes.h for the definition of PageIteratorLevel.
// See also base class PageIterator, which contains the bulk of the interface.
// LTRResultIterator adds text-specific methods for access to OCR output.
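// A minimal usage sketch (illustrative only; it assumes "it" was obtained
// from a live TessBaseAPI, e.g. via its GetIterator() accessor, which
// returns a subclass of this iterator):
//   do {
//     char* word = it->GetUTF8Text(RIL_WORD);
//     float conf = it->Confidence(RIL_WORD);
//     // ... use word and conf ...
//     delete [] word;
//   } while (it->Next(RIL_WORD));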
class TESS_API LTRResultIterator : public PageIterator {
friend class ChoiceIterator;
public:
// page_res and tesseract come directly from the BaseAPI.
// The rectangle parameters are copied indirectly from the Thresholder,
// via the BaseAPI. They represent the coordinates of some rectangle in an
// original image (in top-left-origin coordinates) and therefore the top-left
// needs to be added to any output boxes in order to specify coordinates
// in the original image. See TessBaseAPI::SetRectangle.
// The scale and scaled_yres are in case the Thresholder scaled the image
// rectangle prior to thresholding. Any coordinates in tesseract's image
// must be divided by scale before adding (rect_left, rect_top).
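// For example, a point (x, y) in tesseract's (possibly scaled) image maps to
// (x / scale + rect_left, y / scale + rect_top) in the original image.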
// The scaled_yres indicates the effective resolution of the binary image
// that tesseract has been given by the Thresholder.
// After the constructor, Begin has already been called.
LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
int scale, int scaled_yres,
int rect_left, int rect_top,
int rect_width, int rect_height);
virtual ~LTRResultIterator();
// LTRResultIterators may be copied! This makes it possible to iterate over
// all the objects at a lower level, while maintaining an iterator to
// objects at a higher level. These constructors DO NOT CALL Begin, so
// iterations will continue from the location of src.
// TODO: For now the copy constructor and operator= only need the base class
// versions, but if new data members are added, don't forget to add them!
// ============= Moving around within the page ============.
// See PageIterator.
// ============= Accessing data ==============.
// Returns the null terminated UTF-8 encoded text string for the current
// object at the given level. Use delete [] to free after use.
char* GetUTF8Text(PageIteratorLevel level) const;
// Set the string inserted at the end of each text line. "\n" by default.
void SetLineSeparator(const char *new_line);
// Set the string inserted at the end of each paragraph. "\n" by default.
void SetParagraphSeparator(const char *new_para);
// Returns the mean confidence of the current object at the given level.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
float Confidence(PageIteratorLevel level) const;
// Returns the attributes of the current row.
void RowAttributes(float* row_height, float* descenders,
float* ascenders) const;
// ============= Functions that refer to words only ============.
// Returns the font attributes of the current word. If iterating at a higher
// level object than words, eg textlines, then this will return the
// attributes of the first word in that textline.
// The actual return value is a string representing a font name. It points
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
// the iterator itself, ie rendered invalid by various members of
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
// Pointsize is returned in printers points (1/72 inch.)
const char* WordFontAttributes(bool* is_bold,
bool* is_italic,
bool* is_underlined,
bool* is_monospace,
bool* is_serif,
bool* is_smallcaps,
int* pointsize,
int* font_id) const;
// Return the name of the language used to recognize this word.
// On error, NULL. Do not delete this pointer.
const char* WordRecognitionLanguage() const;
// Return the overall directionality of this word.
StrongScriptDirection WordDirection() const;
// Returns true if the current word was found in a dictionary.
bool WordIsFromDictionary() const;
// Returns true if the current word is numeric.
bool WordIsNumeric() const;
// Returns true if the word contains blamer information.
bool HasBlamerInfo() const;
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
// of the current word.
const void *GetParamsTrainingBundle() const;
// Returns a pointer to the string with blamer information for this word.
// Assumes that the word's blamer_bundle is not NULL.
const char *GetBlamerDebug() const;
// Returns a pointer to the string with misadaption information for this word.
// Assumes that the word's blamer_bundle is not NULL.
const char *GetBlamerMisadaptionDebug() const;
// Returns true if a truth string was recorded for the current word.
bool HasTruthString() const;
// Returns true if the given string is equivalent to the truth string for
// the current word.
bool EquivalentToTruth(const char *str) const;
// Returns a null terminated UTF-8 encoded truth string for the current word.
// Use delete [] to free after use.
char* WordTruthUTF8Text() const;
// Returns a null terminated UTF-8 encoded normalized OCR string for the
// current word. Use delete [] to free after use.
char* WordNormedUTF8Text() const;
// Returns a pointer to serialized choice lattice.
// Fills lattice_size with the number of bytes in lattice data.
const char *WordLattice(int *lattice_size) const;
// ============= Functions that refer to symbols only ============.
// Returns true if the current symbol is a superscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsSuperscript() const;
// Returns true if the current symbol is a subscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsSubscript() const;
// Returns true if the current symbol is a dropcap.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsDropcap() const;
protected:
const char *line_separator_;
const char *paragraph_separator_;
};
// Class to iterate over the classifier choices for a single RIL_SYMBOL.
class ChoiceIterator {
public:
// Construction is from a LTRResultIterator that points to the symbol of
// interest. The ChoiceIterator allows a one-shot iteration over the
// choices for this symbol and after that it is useless.
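// A minimal usage sketch (illustrative), with symbol_it being an
// LTRResultIterator positioned on the symbol of interest:
//   ChoiceIterator ci(symbol_it);
//   do {
//     const char* ch = ci.GetUTF8Text();
//     float conf = ci.Confidence();
//     // ... use ch and conf ...
//   } while (ci.Next());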
explicit ChoiceIterator(const LTRResultIterator& result_it);
~ChoiceIterator();
// Moves to the next choice for the symbol and returns false if there
// are none left.
bool Next();
// ============= Accessing data ==============.
// Returns the null terminated UTF-8 encoded text string for the current
// choice.
// NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
// internal structure and should NOT be delete[]ed to free after use.
const char* GetUTF8Text() const;
// Returns the confidence of the current choice.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
float Confidence() const;
private:
// Pointer to the WERD_RES object owned by the API.
WERD_RES* word_res_;
// Iterator over the blob choices.
BLOB_CHOICE_IT* choice_it_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__

View File

@ -0,0 +1,38 @@
///////////////////////////////////////////////////////////////////////
// File: mathfix.h
// Description: Implement missing math functions
// Author: zdenop
// Created: Fri Feb 03 06:45:06 CET 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef VS2008_INCLUDE_MATHFIX_H_
#define VS2008_INCLUDE_MATHFIX_H_
#ifndef _MSC_VER
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif
#include <math.h>
#include <float.h> // for _isnan(), _finite() on VC++
#if _MSC_VER < 1800
#define isnan(x) _isnan(x)
#define isinf(x) (!_finite(x))
#define fmax max //VC++ does not implement all the provisions of C99 Standard
#define round(x) roundf(x)
inline float roundf(float num) { return num > 0 ? floorf(num + 0.5f) : ceilf(num - 0.5f); }
#endif
#endif // VS2008_INCLUDE_MATHFIX_H_

View File

@ -0,0 +1,64 @@
///////////////////////////////////////////////////////////////////////
// File: mutableiterator.h
// Description: Iterator for tesseract results providing access to
// both high-level API and Tesseract internal data structures.
// Author: David Eger
// Created: Thu Feb 24 19:01:06 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H__
#define TESSERACT_CCMAIN_MUTABLEITERATOR_H__
#include "resultiterator.h"
class BLOB_CHOICE_IT;
namespace tesseract {
class Tesseract;
// Class to iterate over tesseract results, providing access to all levels
// of the page hierarchy, without including any tesseract headers or having
// to handle any tesseract structures.
// WARNING! This class points to data held within the TessBaseAPI class, and
// therefore can only be used while the TessBaseAPI class still exists and
// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
// DetectOS, or anything else that changes the internal PAGE_RES.
// See apitypes.h for the definition of PageIteratorLevel.
// See also base class PageIterator, which contains the bulk of the interface.
// ResultIterator adds text-specific methods for access to OCR output.
// MutableIterator adds access to internal data structures.
class MutableIterator : public ResultIterator {
public:
// See argument descriptions in ResultIterator()
MutableIterator(PAGE_RES* page_res, Tesseract* tesseract,
int scale, int scaled_yres,
int rect_left, int rect_top,
int rect_width, int rect_height)
: ResultIterator(
LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,
rect_top, rect_width, rect_height)) {}
virtual ~MutableIterator() {}
// See PageIterator and ResultIterator for most calls.
// Return access to Tesseract internals.
const PAGE_RES_IT *PageResIt() const { return it_; }
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H__

View File

@ -0,0 +1,585 @@
///////////////////////////////////////////////////////////////////////
// File: osdetect.cpp
// Description: Orientation and script detection.
// Author: Samuel Charron
// Ranjith Unnikrishnan
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "osdetect.h"
#include "blobbox.h"
#include "blread.h"
#include "colfind.h"
#include "fontinfo.h"
#include "imagefind.h"
#include "linefind.h"
#include "oldlist.h"
#include "qrsequence.h"
#include "ratngs.h"
#include "strngs.h"
#include "tabvector.h"
#include "tesseractclass.h"
#include "textord.h"
const int kMinCharactersToTry = 20;
const int kMaxCharactersToTry = 5 * kMinCharactersToTry;
const float kSizeRatioToReject = 2.0;
const int kMinAcceptableBlobHeight = 10;
const float kScriptAcceptRatio = 1.3;
const float kHanRatioInKorean = 0.7;
const float kHanRatioInJapanese = 0.3;
const float kNonAmbiguousMargin = 1.0;
// General scripts
static const char* han_script = "Han";
static const char* latin_script = "Latin";
static const char* katakana_script = "Katakana";
static const char* hiragana_script = "Hiragana";
static const char* hangul_script = "Hangul";
// Pseudo-scripts Name
const char* ScriptDetector::korean_script_ = "Korean";
const char* ScriptDetector::japanese_script_ = "Japanese";
const char* ScriptDetector::fraktur_script_ = "Fraktur";
// Minimum believable resolution.
const int kMinCredibleResolution = 70;
void OSResults::update_best_orientation() {
float first = orientations[0];
float second = orientations[1];
best_result.orientation_id = 0;
if (orientations[0] < orientations[1]) {
first = orientations[1];
second = orientations[0];
best_result.orientation_id = 1;
}
for (int i = 2; i < 4; ++i) {
if (orientations[i] > first) {
second = first;
first = orientations[i];
best_result.orientation_id = i;
}
else if (orientations[i] > second) {
second = orientations[i];
}
}
// Store difference of top two orientation scores.
best_result.oconfidence = first - second;
}
void OSResults::set_best_orientation(int orientation_id) {
best_result.orientation_id = orientation_id;
best_result.oconfidence = 0;
}
void OSResults::update_best_script(int orientation) {
// We skip index 0 to ignore the "Common" script.
float first = scripts_na[orientation][1];
float second = scripts_na[orientation][2];
best_result.script_id = 1;
if (scripts_na[orientation][1] < scripts_na[orientation][2]) {
first = scripts_na[orientation][2];
second = scripts_na[orientation][1];
best_result.script_id = 2;
}
for (int i = 3; i < kMaxNumberOfScripts; ++i) {
if (scripts_na[orientation][i] > first) {
best_result.script_id = i;
second = first;
first = scripts_na[orientation][i];
}
else if (scripts_na[orientation][i] > second) {
second = scripts_na[orientation][i];
}
}
best_result.sconfidence =
(first / second - 1.0) / (kScriptAcceptRatio - 1.0);
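// For example, with kScriptAcceptRatio = 1.3 the best script must outscore
// the runner-up by more than a factor of 1.3 before sconfidence exceeds 1,
// the threshold used in ScriptDetector::must_stop.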
}
int OSResults::get_best_script(int orientation_id) const {
int max_id = -1;
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
const char *script = unicharset->get_script_from_script_id(j);
if (strcmp(script, "Common") && strcmp(script, "NULL")) {
if (max_id == -1 ||
scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id])
max_id = j;
}
}
return max_id;
}
// Print the script scores for all possible orientations.
void OSResults::print_scores(void) const {
for (int i = 0; i < 4; ++i) {
tprintf("Orientation id #%d", i);
print_scores(i);
}
}
// Print the script scores for the given candidate orientation.
void OSResults::print_scores(int orientation_id) const {
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
if (scripts_na[orientation_id][j]) {
tprintf("%12s\t: %f\n", unicharset->get_script_from_script_id(j),
scripts_na[orientation_id][j]);
}
}
}
// Accumulate scores with given OSResults instance and update the best script.
void OSResults::accumulate(const OSResults& osr) {
for (int i = 0; i < 4; ++i) {
orientations[i] += osr.orientations[i];
for (int j = 0; j < kMaxNumberOfScripts; ++j)
scripts_na[i][j] += osr.scripts_na[i][j];
}
unicharset = osr.unicharset;
update_best_orientation();
update_best_script(best_result.orientation_id);
}
// Detect and erase horizontal/vertical lines and picture regions from the
// image, so that non-text blobs are removed from consideration.
void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,
TO_BLOCK_LIST *to_blocks) {
Pix *pix = tess->pix_binary();
ASSERT_HOST(pix != NULL);
int vertical_x = 0;
int vertical_y = 1;
tesseract::TabVector_LIST v_lines;
tesseract::TabVector_LIST h_lines;
int resolution;
if (kMinCredibleResolution > pixGetXRes(pix)) {
resolution = kMinCredibleResolution;
tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n",
pixGetXRes(pix), resolution);
}
else {
resolution = pixGetXRes(pix);
}
tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix,
&vertical_x, &vertical_y,
NULL, &v_lines, &h_lines);
Pix* im_pix = tesseract::ImageFind::FindImages(pix);
if (im_pix != NULL) {
pixSubtract(pix, pix, im_pix);
pixDestroy(&im_pix);
}
tess->mutable_textord()->find_components(tess->pix_binary(),
blocks, to_blocks);
}
// Find connected components in the page and process a subset until finished or
// a stopping criterion is met.
// Returns the number of blobs used in making the estimate. 0 implies failure.
int orientation_and_script_detection(STRING& filename,
OSResults* osr,
tesseract::Tesseract* tess) {
STRING name = filename; //truncated name
const char *lastdot; //of name
TBOX page_box;
lastdot = strrchr(name.string(), '.');
if (lastdot != NULL)
name[lastdot - name.string()] = '\0';
ASSERT_HOST(tess->pix_binary() != NULL);
int width = pixGetWidth(tess->pix_binary());
int height = pixGetHeight(tess->pix_binary());
BLOCK_LIST blocks;
if (!read_unlv_file(name, width, height, &blocks))
FullPageBlock(width, height, &blocks);
// Try to remove non-text regions from consideration.
TO_BLOCK_LIST land_blocks, port_blocks;
remove_nontext_regions(tess, &blocks, &port_blocks);
if (port_blocks.empty()) {
// page segmentation did not succeed, so we need to find_components first.
tess->mutable_textord()->find_components(tess->pix_binary(),
&blocks, &port_blocks);
}
else {
page_box.set_left(0);
page_box.set_bottom(0);
page_box.set_right(width);
page_box.set_top(height);
// Filter_blobs sets up the TO_BLOCKs the same as find_components does.
tess->mutable_textord()->filter_blobs(page_box.topright(),
&port_blocks, true);
}
return os_detect(&port_blocks, osr, tess);
}
// Filter and sample the blobs.
// Returns a non-zero number of blobs if the page was successfully processed, or
// zero if the page had too few characters to be reliable
int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
tesseract::Tesseract* tess) {
int blobs_total = 0;
TO_BLOCK_IT block_it;
block_it.set_to_list(port_blocks);
BLOBNBOX_CLIST filtered_list;
BLOBNBOX_C_IT filtered_it(&filtered_list);
for (block_it.mark_cycle_pt(); !block_it.cycled_list();
block_it.forward()) {
TO_BLOCK* to_block = block_it.data();
if (to_block->block->poly_block() &&
!to_block->block->poly_block()->IsText()) continue;
BLOBNBOX_IT bbox_it;
bbox_it.set_to_list(&to_block->blobs);
for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list();
bbox_it.forward()) {
BLOBNBOX* bbox = bbox_it.data();
C_BLOB* blob = bbox->cblob();
TBOX box = blob->bounding_box();
++blobs_total;
float y_x = fabs((box.height() * 1.0) / box.width());
float x_y = 1.0f / y_x;
// Select a >= 1.0 ratio
float ratio = x_y > y_x ? x_y : y_x;
// Blob is ambiguous
if (ratio > kSizeRatioToReject) continue;
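// For example, a blob 5 pixels wide and 20 pixels tall has ratio 4.0, which
// exceeds kSizeRatioToReject (2.0), so it is skipped as ambiguous.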
if (box.height() < kMinAcceptableBlobHeight) continue;
filtered_it.add_to_end(bbox);
}
}
return os_detect_blobs(NULL, &filtered_list, osr, tess);
}
// Detect orientation and script from a list of blobs.
// Returns a non-zero number of blobs if the list was successfully processed, or
// zero if the list had too few characters to be reliable.
// If allowed_scripts is non-null and non-empty, it is a list of scripts that
// constrains both orientation and script detection to consider only scripts
// from the list.
int os_detect_blobs(const GenericVector<int>* allowed_scripts,
BLOBNBOX_CLIST* blob_list, OSResults* osr,
tesseract::Tesseract* tess) {
OSResults osr_;
if (osr == NULL)
osr = &osr_;
osr->unicharset = &tess->unicharset;
OrientationDetector o(allowed_scripts, osr);
ScriptDetector s(allowed_scripts, osr, tess);
BLOBNBOX_C_IT filtered_it(blob_list);
int real_max = MIN(filtered_it.length(), kMaxCharactersToTry);
// tprintf("Total blobs found = %d\n", blobs_total);
// tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
// tprintf("Number of blobs to try = %d\n", real_max);
// If there are too few characters, skip this page entirely.
if (real_max < kMinCharactersToTry / 2) {
tprintf("Too few characters. Skipping this page\n");
return 0;
}
BLOBNBOX** blobs = new BLOBNBOX*[filtered_it.length()];
int number_of_blobs = 0;
for (filtered_it.mark_cycle_pt(); !filtered_it.cycled_list();
filtered_it.forward()) {
blobs[number_of_blobs++] = (BLOBNBOX*)filtered_it.data();
}
QRSequenceGenerator sequence(number_of_blobs);
int num_blobs_evaluated = 0;
for (int i = 0; i < real_max; ++i) {
if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
&& i > kMinCharactersToTry) {
break;
}
++num_blobs_evaluated;
}
delete[] blobs;
// Make sure the best_result is up-to-date
int orientation = o.get_orientation();
osr->update_best_script(orientation);
return num_blobs_evaluated;
}
// Processes a single blob to estimate script and orientation.
// Return true if estimate of orientation and script satisfies stopping
// criteria.
bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
ScriptDetector* s, OSResults* osr,
tesseract::Tesseract* tess) {
tess->tess_cn_matching.set_value(true); // turn it on
tess->tess_bn_matching.set_value(false);
C_BLOB* blob = bbox->cblob();
TBLOB* tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob);
TBOX box = tblob->bounding_box();
FCOORD current_rotation(1.0f, 0.0f);
FCOORD rotation90(0.0f, 1.0f);
BLOB_CHOICE_LIST ratings[4];
// Test the 4 orientations
for (int i = 0; i < 4; ++i) {
// Normalize the blob. Set the origin to the place we want to be the
// bottom-middle after rotation.
// Scaling is to make the rotated height the x-height.
float scaling = static_cast<float>(kBlnXHeight) / box.height();
float x_origin = (box.left() + box.right()) / 2.0f;
float y_origin = (box.bottom() + box.top()) / 2.0f;
if (i == 0 || i == 2) {
// Rotation is 0 or 180.
y_origin = i == 0 ? box.bottom() : box.top();
}
else {
// Rotation is 90 or 270.
scaling = static_cast<float>(kBlnXHeight) / box.width();
x_origin = i == 1 ? box.left() : box.right();
}
TBLOB* rotated_blob = new TBLOB(*tblob);
rotated_blob->Normalize(NULL, &current_rotation, NULL,
x_origin, y_origin, scaling, scaling,
0.0f, static_cast<float>(kBlnBaselineOffset),
false, NULL);
tess->AdaptiveClassifier(rotated_blob, ratings + i);
delete rotated_blob;
current_rotation.rotate(rotation90);
}
delete tblob;
bool stop = o->detect_blob(ratings);
s->detect_blob(ratings);
int orientation = o->get_orientation();
stop = s->must_stop(orientation) && stop;
return stop;
}
OrientationDetector::OrientationDetector(
const GenericVector<int>* allowed_scripts, OSResults* osr) {
osr_ = osr;
allowed_scripts_ = allowed_scripts;
}
// Score the given blob and return true if it is now sure of the orientation
// after adding this block.
bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
float blob_o_score[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
float total_blob_o_score = 0.0f;
for (int i = 0; i < 4; ++i) {
BLOB_CHOICE_IT choice_it(scores + i);
if (!choice_it.empty()) {
BLOB_CHOICE* choice = NULL;
if (allowed_scripts_ != NULL && !allowed_scripts_->empty()) {
// Find the top choice in an allowed script.
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
choice == NULL; choice_it.forward()) {
int choice_script = choice_it.data()->script_id();
int s = 0;
for (s = 0; s < allowed_scripts_->size(); ++s) {
if ((*allowed_scripts_)[s] == choice_script) {
choice = choice_it.data();
break;
}
}
}
}
else {
choice = choice_it.data();
}
if (choice != NULL) {
// The certainty score ranges between [-20,0]. This is converted here to
// [0,1], with 1 indicating best match.
blob_o_score[i] = 1 + 0.05 * choice->certainty();
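// For example, a certainty of -5 maps to 1 + 0.05 * (-5) = 0.75.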
total_blob_o_score += blob_o_score[i];
}
}
}
if (total_blob_o_score == 0.0) return false;
// Fill in any blanks with the worst score of the others. This is better than
// picking an arbitrary probability for it and way better than -inf.
float worst_score = 0.0f;
int num_good_scores = 0;
for (int i = 0; i < 4; ++i) {
if (blob_o_score[i] > 0.0f) {
++num_good_scores;
if (worst_score == 0.0f || blob_o_score[i] < worst_score)
worst_score = blob_o_score[i];
}
}
if (num_good_scores == 1) {
// Lower worst if there is only one.
worst_score /= 2.0f;
}
for (int i = 0; i < 4; ++i) {
if (blob_o_score[i] == 0.0f) {
blob_o_score[i] = worst_score;
total_blob_o_score += worst_score;
}
}
// Normalize the orientation scores for the blob and use them to
// update the aggregated orientation score.
for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {
osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
}
// TODO(ranjith) Add an early exit test, based on min_orientation_margin,
// as used in pagesegmain.cpp.
return false;
}
int OrientationDetector::get_orientation() {
osr_->update_best_orientation();
return osr_->best_result.orientation_id;
}
ScriptDetector::ScriptDetector(const GenericVector<int>* allowed_scripts,
OSResults* osr, tesseract::Tesseract* tess) {
osr_ = osr;
tess_ = tess;
allowed_scripts_ = allowed_scripts;
katakana_id_ = tess_->unicharset.add_script(katakana_script);
hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
han_id_ = tess_->unicharset.add_script(han_script);
hangul_id_ = tess_->unicharset.add_script(hangul_script);
japanese_id_ = tess_->unicharset.add_script(japanese_script_);
korean_id_ = tess_->unicharset.add_script(korean_script_);
latin_id_ = tess_->unicharset.add_script(latin_script);
fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
}
// Score the given blob and return true if it is now sure of the script after
// adding this blob.
void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
bool done[kMaxNumberOfScripts];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < kMaxNumberOfScripts; ++j)
done[j] = false;
BLOB_CHOICE_IT choice_it;
choice_it.set_to_list(scores + i);
float prev_score = -1;
int script_count = 0;
int prev_id = -1;
int prev_fontinfo_id = -1;
const char* prev_unichar = "";
const char* unichar = "";
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
choice_it.forward()) {
BLOB_CHOICE* choice = choice_it.data();
int id = choice->script_id();
if (allowed_scripts_ != NULL && !allowed_scripts_->empty()) {
// Check that the choice is in an allowed script.
int s = 0;
for (s = 0; s < allowed_scripts_->size(); ++s) {
if ((*allowed_scripts_)[s] == id) break;
}
if (s == allowed_scripts_->size()) continue; // Not found in list.
}
// Script already processed before.
if (done[id]) continue;
done[id] = true;
unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
// Save data from the first match
if (prev_score < 0) {
prev_score = -choice->certainty();
script_count = 1;
prev_id = id;
prev_unichar = unichar;
prev_fontinfo_id = choice->fontinfo_id();
}
else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
++script_count;
}
if (strlen(prev_unichar) == 1)
if (unichar[0] >= '0' && unichar[0] <= '9')
break;
// if script_count is >= 2, character is ambiguous, skip other matches
// since they are useless.
if (script_count >= 2)
break;
}
// Character is non ambiguous
if (script_count == 1) {
// Update the score of the winning script
osr_->scripts_na[i][prev_id] += 1.0;
// Workaround for Fraktur
if (prev_id == latin_id_) {
if (prev_fontinfo_id >= 0) {
const tesseract::FontInfo &fi =
tess_->get_fontinfo_table().get(prev_fontinfo_id);
//printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
// fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
// fi.is_serif(), fi.is_fraktur(),
// prev_unichar);
if (fi.is_fraktur()) {
osr_->scripts_na[i][prev_id] -= 1.0;
osr_->scripts_na[i][fraktur_id_] += 1.0;
}
}
}
// Update Japanese / Korean pseudo-scripts
if (prev_id == katakana_id_)
osr_->scripts_na[i][japanese_id_] += 1.0;
if (prev_id == hiragana_id_)
osr_->scripts_na[i][japanese_id_] += 1.0;
if (prev_id == hangul_id_)
osr_->scripts_na[i][korean_id_] += 1.0;
if (prev_id == han_id_) {
osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
}
}
} // iterate over each orientation
}
bool ScriptDetector::must_stop(int orientation) {
osr_->update_best_script(orientation);
return osr_->best_result.sconfidence > 1;
}
// Helper method to convert an orientation index to its value in degrees.
// The value represents the amount of clockwise rotation in degrees that must be
// applied for the text to be upright (readable).
int OrientationIdToValue(const int& id) {
switch (id) {
case 0:
return 0;
case 1:
return 270;
case 2:
return 180;
case 3:
return 90;
default:
return -1;
}
}

View File

@ -0,0 +1,138 @@
///////////////////////////////////////////////////////////////////////
// File: osdetect.h
// Description: Orientation and script detection.
// Author: Samuel Charron
// Ranjith Unnikrishnan
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_OSDETECT_H__
#define TESSERACT_CCMAIN_OSDETECT_H__
#include "strngs.h"
#include "unicharset.h"
class TO_BLOCK_LIST;
class BLOBNBOX;
class BLOB_CHOICE_LIST;
class BLOBNBOX_CLIST;
namespace tesseract {
class Tesseract;
}
// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur
const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;
struct OSBestResult {
OSBestResult() : orientation_id(0), script_id(0), sconfidence(0.0),
oconfidence(0.0) {}
int orientation_id;
int script_id;
float sconfidence;
float oconfidence;
};
struct OSResults {
OSResults() : unicharset(NULL) {
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < kMaxNumberOfScripts; ++j)
scripts_na[i][j] = 0;
orientations[i] = 0;
}
}
void update_best_orientation();
// Set the estimate of the orientation to the given id.
void set_best_orientation(int orientation_id);
// Update/Compute the best estimate of the script assuming the given
// orientation id.
void update_best_script(int orientation_id);
// Return the index of the script with the highest score for this orientation.
TESS_API int get_best_script(int orientation_id) const;
// Accumulate scores with given OSResults instance and update the best script.
void accumulate(const OSResults& osr);
// Print statistics.
void print_scores(void) const;
void print_scores(int orientation_id) const;
// Array holding scores for each orientation id [0,3].
// Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
// page respectively, where the values refer to the amount of clockwise
// rotation to be applied to the page for the text to be upright and readable.
float orientations[4];
// Script confidence scores for each of 4 possible orientations.
float scripts_na[4][kMaxNumberOfScripts];
UNICHARSET* unicharset;
OSBestResult best_result;
};
class OrientationDetector {
public:
OrientationDetector(const GenericVector<int>* allowed_scripts,
OSResults* results);
bool detect_blob(BLOB_CHOICE_LIST* scores);
int get_orientation();
private:
OSResults* osr_;
const GenericVector<int>* allowed_scripts_;
};
class ScriptDetector {
public:
ScriptDetector(const GenericVector<int>* allowed_scripts,
OSResults* osr, tesseract::Tesseract* tess);
void detect_blob(BLOB_CHOICE_LIST* scores);
bool must_stop(int orientation);
private:
OSResults* osr_;
static const char* korean_script_;
static const char* japanese_script_;
static const char* fraktur_script_;
int korean_id_;
int japanese_id_;
int katakana_id_;
int hiragana_id_;
int han_id_;
int hangul_id_;
int latin_id_;
int fraktur_id_;
tesseract::Tesseract* tess_;
const GenericVector<int>* allowed_scripts_;
};
int orientation_and_script_detection(STRING& filename,
OSResults*,
tesseract::Tesseract*);
int os_detect(TO_BLOCK_LIST* port_blocks,
OSResults* osr,
tesseract::Tesseract* tess);
int os_detect_blobs(const GenericVector<int>* allowed_scripts,
BLOBNBOX_CLIST* blob_list,
OSResults* osr,
tesseract::Tesseract* tess);
bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
ScriptDetector* s, OSResults*,
tesseract::Tesseract* tess);
// Helper method to convert an orientation index to its value in degrees.
// The value represents the amount of clockwise rotation in degrees that must be
// applied for the text to be upright (readable).
TESS_API int OrientationIdToValue(const int& id);
#endif // TESSERACT_CCMAIN_OSDETECT_H__
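// Illustrative usage sketch (not part of the original header): combining the
// declarations above to report the detected page orientation. The helper name
// is hypothetical; "osd_tess" is assumed to be a Tesseract instance
// initialized with OSD-capable traineddata, and the return value of
// orientation_and_script_detection is ignored here for brevity.
#if 0
#include "osdetect.h"
#include "tprintf.h"
void ReportOrientation(tesseract::Tesseract* osd_tess, STRING& filename) {
  OSResults osr;
  orientation_and_script_detection(filename, &osr, osd_tess);
  const OSBestResult& best = osr.best_result;
  // orientation_id in [0..3] maps to a clockwise rotation of 0/270/180/90
  // degrees needed to make the text upright.
  tprintf("Rotate clockwise by %d degrees (confidence %.2f)\n",
          OrientationIdToValue(best.orientation_id), best.oconfidence);
}
#endif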

View File

@ -0,0 +1,450 @@
/******************************************************************
* File: output.cpp (Formerly output.c)
* Description: Output pass
* Author: Phil Cheatle
* Created: Thu Aug 4 10:56:08 BST 1994
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
#include <string.h>
#include <ctype.h>
#ifdef __UNIX__
#include <assert.h>
#include <unistd.h>
#include <errno.h>
#endif
#include "helpers.h"
#include "tessvars.h"
#include "control.h"
#include "reject.h"
#include "docqual.h"
#include "output.h"
#include "globals.h"
#include "tesseractclass.h"
#define EPAPER_EXT ".ep"
#define PAGE_YSIZE 3508
#define CTRL_INSET '\024' //dc4=text inset
#define CTRL_FONT '\016' //so=font change
#define CTRL_DEFAULT '\017' //si=default font
#define CTRL_SHIFT '\022' //dc2=x shift
#define CTRL_TAB '\011' //tab
#define CTRL_NEWLINE '\012' //newline
#define CTRL_HARDLINE '\015' //cr
/**********************************************************************
* pixels_to_pts
*
* Convert an integer number of pixels to the nearest integer
* number of points.
**********************************************************************/
inT32 pixels_to_pts( //convert coords
inT32 pixels,
inT32 pix_res //resolution
) {
float pts; //converted value
pts = pixels * 72.0 / pix_res;
return (inT32)(pts + 0.5); //round it
}
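// Worked example (illustrative): at a resolution of 300 pixels per inch,
// 600 pixels convert to 600 * 72.0 / 300 = 144.0, which rounds to 144 pts.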
namespace tesseract {
void Tesseract::output_pass( //Tess output pass //send to api
PAGE_RES_IT &page_res_it,
const TBOX *target_word_box) {
BLOCK_RES *block_of_last_word;
BOOL8 force_eol; //During output
BLOCK *nextblock; //block of next word
WERD *nextword; //next word
page_res_it.restart_page();
block_of_last_word = NULL;
while (page_res_it.word() != NULL) {
check_debug_pt(page_res_it.word(), 120);
if (target_word_box) {
TBOX current_word_box = page_res_it.word()->word->bounding_box();
FCOORD center_pt(
(current_word_box.right() + current_word_box.left()) / 2,
(current_word_box.bottom() + current_word_box.top()) / 2);
if (!target_word_box->contains(center_pt)) {
page_res_it.forward();
continue;
}
}
if (tessedit_write_block_separators &&
block_of_last_word != page_res_it.block()) {
block_of_last_word = page_res_it.block();
}
force_eol = (tessedit_write_block_separators &&
(page_res_it.block() != page_res_it.next_block())) ||
(page_res_it.next_word() == NULL);
if (page_res_it.next_word() != NULL)
nextword = page_res_it.next_word()->word;
else
nextword = NULL;
if (page_res_it.next_block() != NULL)
nextblock = page_res_it.next_block()->block;
else
nextblock = NULL;
//regardless of tilde crunching
write_results(page_res_it,
determine_newline_type(page_res_it.word()->word,
page_res_it.block()->block,
nextword, nextblock), force_eol);
page_res_it.forward();
}
}
/*************************************************************************
* write_results()
*
* All recognition and rejection has now been done. Generate the following:
* .txt file - giving the final best choices with NO highlighting
* .raw file - giving the tesseract top choice output for each word
* .map file - showing how the .txt file has been rejected in the .ep file
* epchoice list - a list of one element per word, containing the text for the
* epaper. Reject strings are inserted.
* inset list - a list of bounding boxes of reject insets - indexed by the
* reject strings in the epchoice text.
*************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
char newline_type, // type of newline
BOOL8 force_eol) { // override tilde crunch?
WERD_RES *word = page_res_it.word();
const UNICHARSET &uchset = *word->uch_set;
int i;
BOOL8 need_reject = FALSE;
UNICHAR_ID space = uchset.unichar_to_id(" ");
if ((word->unlv_crunch_mode != CR_NONE ||
word->best_choice->length() == 0) &&
!tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
if ((word->unlv_crunch_mode != CR_DELETE) &&
(!stats_.tilde_crunch_written ||
((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
(word->word->space() > 0) &&
!word->word->flag(W_FUZZY_NON) &&
!word->word->flag(W_FUZZY_SP)))) {
if (!word->word->flag(W_BOL) &&
(word->word->space() > 0) &&
!word->word->flag(W_FUZZY_NON) &&
!word->word->flag(W_FUZZY_SP)) {
stats_.last_char_was_tilde = false;
}
need_reject = TRUE;
}
if ((need_reject && !stats_.last_char_was_tilde) ||
(force_eol && stats_.write_results_empty_block)) {
/* Write a reject char - mark as rejected unless zero_rejection mode */
stats_.last_char_was_tilde = TRUE;
stats_.tilde_crunch_written = true;
stats_.last_char_was_newline = false;
stats_.write_results_empty_block = false;
}
if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
stats_.tilde_crunch_written = false;
stats_.last_char_was_newline = true;
stats_.last_char_was_tilde = false;
}
if (force_eol)
stats_.write_results_empty_block = true;
return;
}
/* NORMAL PROCESSING of non tilde crunched words */
stats_.tilde_crunch_written = false;
if (newline_type)
stats_.last_char_was_newline = true;
else
stats_.last_char_was_newline = false;
stats_.write_results_empty_block = force_eol; // about to write a real word
if (unlv_tilde_crunching &&
stats_.last_char_was_tilde &&
(word->word->space() == 0) &&
!(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
(word->best_choice->unichar_id(0) == space)) {
/* Prevent adjacent tilde across words - we know that adjacent tildes within
words have been removed */
word->MergeAdjacentBlobs(0);
}
if (newline_type ||
(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes))
stats_.last_char_was_tilde = false;
else {
if (word->reject_map.length() > 0) {
if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
stats_.last_char_was_tilde = true;
else
stats_.last_char_was_tilde = false;
}
else if (word->word->space() > 0)
stats_.last_char_was_tilde = false;
/* else it is unchanged as there are no output chars */
}
ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
set_unlv_suspects(word);
check_debug_pt(word, 120);
if (tessedit_rejection_debug) {
tprintf("Dict word: \"%s\": %d\n",
word->best_choice->debug_string().string(),
dict_word(*(word->best_choice)));
}
if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
if (tessedit_zero_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
for (i = 0; i < word->best_choice->length(); ++i) {
if (word->reject_map[i].rejected())
word->reject_map[i].setrej_minimal_rej_accept();
}
}
if (tessedit_minimal_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
for (i = 0; i < word->best_choice->length(); ++i) {
if ((word->best_choice->unichar_id(i) != space) &&
word->reject_map[i].rejected())
word->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}
} // namespace tesseract
/**********************************************************************
* determine_newline_type
*
* Find whether we have a wrapping or hard newline.
* Return FALSE if not at end of line.
**********************************************************************/
char determine_newline_type( //test line ends
WERD *word, //word to do
BLOCK *block, //current block
WERD *next_word, //next word
BLOCK *next_block //block of next word
) {
inT16 end_gap; //to right edge
inT16 width; //of next word
TBOX word_box; //bounding
TBOX next_box; //next word
TBOX block_box; //block bounding
if (!word->flag(W_EOL))
return FALSE; //not end of line
if (next_word == NULL || next_block == NULL || block != next_block)
return CTRL_NEWLINE;
if (next_word->space() > 0)
return CTRL_HARDLINE; //it is tabbed
word_box = word->bounding_box();
next_box = next_word->bounding_box();
block_box = block->bounding_box();
//gap to eol
end_gap = block_box.right() - word_box.right();
end_gap -= (inT32)block->space();
width = next_box.right() - next_box.left();
// tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
// block_box.right(),word_box.right(),end_gap,
// next_box.right(),next_box.left(),width,
// end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
}
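// Worked example (illustrative), for a word flagged W_EOL whose successor is
// in the same block and not tabbed: with a block whose right edge is at
// x=2000 and an inter-word space of 20, a line-final word ending at x=1500
// leaves end_gap = 2000 - 1500 - 20 = 480. If the next word is 300 pixels
// wide it would have fitted on this line, so the break is reported as
// CTRL_HARDLINE; had the next word been 600 pixels wide, the break would be
// CTRL_NEWLINE.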
/*************************************************************************
* get_rep_char()
* Return the first accepted character from the repetition string. This is the
* character which is repeated - as determined earlier by fix_rep_char()
*************************************************************************/
namespace tesseract {
UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
int i;
for (i = 0; ((i < word->reject_map.length()) &&
(word->reject_map[i].rejected())); ++i);
if (i < word->reject_map.length()) {
return word->best_choice->unichar_id(i);
}
else {
return word->uch_set->unichar_to_id(unrecognised_char.string());
}
}
/*************************************************************************
* SUSPECT LEVELS
*
* 0 - don't reject ANYTHING
* 1,2 - partial rejection
* 3 - BEST
*
* NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
* tessedit_minimal_rejection.
*************************************************************************/
void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
int len = word_res->reject_map.length();
const WERD_CHOICE &word = *(word_res->best_choice);
const UNICHARSET &uchset = *word.unicharset();
int i;
float rating_per_ch;
if (suspect_level == 0) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected())
word_res->reject_map[i].setrej_minimal_rej_accept();
}
return;
}
if (suspect_level >= 3)
return; //Use defaults
/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
if (safe_dict_word(word_res) &&
(count_alphas(word) > suspect_short_words)) {
/* Unreject alphas in dictionary words */
for (i = 0; i < len; ++i) {
if (word_res->reject_map[i].rejected() &&
uchset.get_isalpha(word.unichar_id(i)))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
rating_per_ch = word.rating() / word_res->reject_map.length();
if (rating_per_ch >= suspect_rating_per_ch)
return; // Don't touch bad ratings
if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
for (i = 0; i < len; ++i) {
if (word_res->reject_map[i].rejected() &&
(!uchset.eq(word.unichar_id(i), " ")))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
if (word_res->reject_map[i].flag(R_DOC_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
if (word_res->reject_map[i].flag(R_BLOCK_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
if (word_res->reject_map[i].flag(R_ROW_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
if (suspect_level == 2)
return;
if (!suspect_constrain_1Il ||
(word_res->reject_map.length() <= suspect_short_words)) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
word_res->reject_map[i].flag(R_POSTNN_1IL)))
word_res->reject_map[i].setrej_minimal_rej_accept();
if (!suspect_constrain_1Il &&
word_res->reject_map[i].flag(R_MM_REJECT))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
if (acceptable_word_string(*word_res->uch_set,
word.unichar_string().string(),
word.unichar_lengths().string()) !=
AC_UNACCEPTABLE ||
acceptable_number_string(word.unichar_string().string(),
word.unichar_lengths().string())) {
if (word_res->reject_map.length() > suspect_short_words) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected() &&
(!word_res->reject_map[i].perm_rejected() ||
word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
word_res->reject_map[i].flag(R_POSTNN_1IL) ||
word_res->reject_map[i].flag(R_MM_REJECT))) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}
}
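// Illustrative sketch (an assumption, not part of this file): the SUSPECT
// LEVELS note above can be acted on from the public API by setting the two
// parameters it mentions. The helper name is hypothetical; parameter names
// follow the member variables, and "api" is assumed to be an initialized
// TessBaseAPI.
#if 0
#include "baseapi.h"
void RejectOnlyTessFailures(tesseract::TessBaseAPI* api) {
  api->SetVariable("suspect_level", "3");
  api->SetVariable("tessedit_minimal_rejection", "T");
}
#endif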
inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
int count = 0;
for (int i = 0; i < word.length(); ++i) {
if (word.unicharset()->get_isalpha(word.unichar_id(i)))
count++;
}
return count;
}
inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
int count = 0;
for (int i = 0; i < word.length(); ++i) {
if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
word.unicharset()->get_isdigit(word.unichar_id(i)))
count++;
}
return count;
}
BOOL8 Tesseract::acceptable_number_string(const char *s,
const char *lengths) {
BOOL8 prev_digit = FALSE;
if (*lengths == 1 && *s == '(')
s++;
if (*lengths == 1 &&
((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
s++;
for (; *s != '\0'; s += *(lengths++)) {
if (unicharset.get_isdigit(s, *lengths))
prev_digit = TRUE;
else if (prev_digit &&
(*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
prev_digit = FALSE;
else if (prev_digit && *lengths == 1 &&
(*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
return TRUE;
else if (prev_digit &&
*lengths == 1 && (*s == '%') &&
(*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
(*(s + *lengths + *(lengths + 1)) == '\0'))
return TRUE;
else
return FALSE;
}
return TRUE;
}
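// Illustrative examples (assuming single-byte characters, so every entry in
// lengths is 1): "$1,234.56", "(99%)" and "7.5%" are accepted, while "abc"
// and "1a" are rejected as soon as a character other than a digit or the
// allowed punctuation is met.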
} // namespace tesseract

View File

@ -0,0 +1,33 @@
/******************************************************************
* File: output.h (Formerly output.h)
* Description: Output pass
* Author: Phil Cheatle
* Created: Thu Aug 4 10:56:08 BST 1994
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef OUTPUT_H
#define OUTPUT_H
#include "params.h"
//#include "epapconv.h"
#include "pageres.h"
/** test line ends */
char determine_newline_type(WERD *word, ///< word to do
BLOCK *block, ///< current block
WERD *next_word, ///< next word
BLOCK *next_block ///< block of next word
);
#endif

View File

@ -0,0 +1,631 @@
///////////////////////////////////////////////////////////////////////
// File: pageiterator.cpp
// Description: Iterator for tesseract page structure that avoids using
// tesseract internal data structures.
// Author: Ray Smith
// Created: Fri Feb 26 14:32:09 PST 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "pageiterator.h"
#include "allheaders.h"
#include "helpers.h"
#include "pageres.h"
#include "tesseractclass.h"
namespace tesseract {
PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
int scaled_yres, int rect_left, int rect_top,
int rect_width, int rect_height)
: page_res_(page_res),
tesseract_(tesseract),
word_(NULL),
word_length_(0),
blob_index_(0),
cblob_it_(NULL),
include_upper_dots_(false),
include_lower_dots_(false),
scale_(scale),
scaled_yres_(scaled_yres),
rect_left_(rect_left),
rect_top_(rect_top),
rect_width_(rect_width),
rect_height_(rect_height) {
it_ = new PAGE_RES_IT(page_res);
PageIterator::Begin();
}
PageIterator::~PageIterator() {
delete it_;
delete cblob_it_;
}
/**
* PageIterators may be copied! This makes it possible to iterate over
* all the objects at a lower level, while maintaining an iterator to
* objects at a higher level.
*/
PageIterator::PageIterator(const PageIterator& src)
: page_res_(src.page_res_),
tesseract_(src.tesseract_),
word_(NULL),
word_length_(src.word_length_),
blob_index_(src.blob_index_),
cblob_it_(NULL),
include_upper_dots_(src.include_upper_dots_),
include_lower_dots_(src.include_lower_dots_),
scale_(src.scale_),
scaled_yres_(src.scaled_yres_),
rect_left_(src.rect_left_),
rect_top_(src.rect_top_),
rect_width_(src.rect_width_),
rect_height_(src.rect_height_) {
it_ = new PAGE_RES_IT(*src.it_);
BeginWord(src.blob_index_);
}
const PageIterator& PageIterator::operator=(const PageIterator& src) {
page_res_ = src.page_res_;
tesseract_ = src.tesseract_;
include_upper_dots_ = src.include_upper_dots_;
include_lower_dots_ = src.include_lower_dots_;
scale_ = src.scale_;
scaled_yres_ = src.scaled_yres_;
rect_left_ = src.rect_left_;
rect_top_ = src.rect_top_;
rect_width_ = src.rect_width_;
rect_height_ = src.rect_height_;
delete it_;
it_ = new PAGE_RES_IT(*src.it_);
BeginWord(src.blob_index_);
return *this;
}
bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT* other) const {
return (it_ == NULL && it_ == other) ||
((other != NULL) && (it_ != NULL) && (*it_ == *other));
}
// ============= Moving around within the page ============.
/** Resets the iterator to point to the start of the page. */
void PageIterator::Begin() {
it_->restart_page_with_empties();
BeginWord(0);
}
void PageIterator::RestartParagraph() {
if (it_->block() == NULL) return; // At end of the document.
PAGE_RES_IT para(page_res_);
PAGE_RES_IT next_para(para);
next_para.forward_paragraph();
while (next_para.cmp(*it_) <= 0) {
para = next_para;
next_para.forward_paragraph();
}
*it_ = para;
BeginWord(0);
}
bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
PageIterator p_start(*this);
p_start.RestartParagraph();
return p_start.it_->row() == it_->row();
}
void PageIterator::RestartRow() {
it_->restart_row();
BeginWord(0);
}
/**
* Moves to the start of the next object at the given level in the
* page hierarchy, and returns false if the end of the page was reached.
* NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
* non-text block at least once.
* Think of non text blocks as containing a single para, with at least one
* line, with a single imaginary word, containing a single symbol.
* The bounding boxes mark out any polygonal nature of the block, and
* PTIsTextType(BLockType()) is false for non-text blocks.
* Calls to Next with different levels may be freely intermixed.
* This function iterates words in right-to-left scripts correctly, if
* the appropriate language has been loaded into Tesseract.
*/
bool PageIterator::Next(PageIteratorLevel level) {
if (it_->block() == NULL) return false; // Already at the end!
if (it_->word() == NULL)
level = RIL_BLOCK;
switch (level) {
case RIL_BLOCK:
it_->forward_block();
break;
case RIL_PARA:
it_->forward_paragraph();
break;
case RIL_TEXTLINE:
for (it_->forward_with_empties(); it_->row() == it_->prev_row();
it_->forward_with_empties());
break;
case RIL_WORD:
it_->forward_with_empties();
break;
case RIL_SYMBOL:
if (cblob_it_ != NULL)
cblob_it_->forward();
++blob_index_;
if (blob_index_ >= word_length_)
it_->forward_with_empties();
else
return true;
break;
}
BeginWord(0);
return it_->block() != NULL;
}
/**
* Returns true if the iterator is at the start of an object at the given
* level. Possible uses include determining if a call to Next(RIL_WORD)
* moved to the start of a RIL_PARA.
*/
bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
if (it_->block() == NULL) return false; // Already at the end!
if (it_->word() == NULL) return true; // In an image block.
switch (level) {
case RIL_BLOCK:
return blob_index_ == 0 && it_->block() != it_->prev_block();
case RIL_PARA:
return blob_index_ == 0 &&
(it_->block() != it_->prev_block() ||
it_->row()->row->para() != it_->prev_row()->row->para());
case RIL_TEXTLINE:
return blob_index_ == 0 && it_->row() != it_->prev_row();
case RIL_WORD:
return blob_index_ == 0;
case RIL_SYMBOL:
return true;
}
return false;
}
/**
* Returns whether the iterator is positioned at the last element in a
* given level. (e.g. the last word in a line, the last line in a block)
*/
bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const {
if (Empty(element)) return true; // Already at the end!
// The result is true if we step forward by element and find we are
// at the end of the page or at the beginning of *all* levels in:
// [level, element).
// When there is more than one level difference between element and level,
// we could for instance move forward one symbol and still be at the first
// word on a line, so we also have to be at the first symbol in a word.
PageIterator next(*this);
next.Next(element);
if (next.Empty(element)) return true; // Reached the end of the page.
while (element > level) {
element = static_cast<PageIteratorLevel>(element - 1);
if (!next.IsAtBeginningOf(element))
return false;
}
return true;
}
/**
* Returns whether this iterator is positioned
* before other: -1
* equal to other: 0
* after other: 1
*/
int PageIterator::Cmp(const PageIterator &other) const {
int word_cmp = it_->cmp(*other.it_);
if (word_cmp != 0)
return word_cmp;
if (blob_index_ < other.blob_index_)
return -1;
if (blob_index_ == other.blob_index_)
return 0;
return 1;
}
// ============= Accessing data ==============.
// Coordinate system:
// Integer coordinates are at the cracks between the pixels.
// The top-left corner of the top-left pixel in the image is at (0,0).
// The bottom-right corner of the bottom-right pixel in the image is at
// (width, height).
// Every bounding box goes from the top-left of the top-left contained
// pixel to the bottom-right of the bottom-right contained pixel, so
// the bounding box of the single top-left pixel in the image is:
// (0,0)->(1,1).
// If an image rectangle has been set in the API, then returned coordinates
// relate to the original (full) image, rather than the rectangle.
/**
* Returns the bounding rectangle of the current object at the given level in
* the coordinates of the working image that is pix_binary().
* See comment on coordinate system above.
* Returns false if there is no such object at the current position.
*/
bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
int* left, int* top,
int* right, int* bottom) const {
if (Empty(level))
return false;
TBOX box;
PARA *para = NULL;
switch (level) {
case RIL_BLOCK:
box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_PARA:
para = it_->row()->row->para();
// explicit fall-through.
case RIL_TEXTLINE:
box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_WORD:
box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_SYMBOL:
if (cblob_it_ == NULL)
box = it_->word()->box_word->BlobBox(blob_index_);
else
box = cblob_it_->data()->bounding_box();
}
if (level == RIL_PARA) {
PageIterator other = *this;
other.Begin();
do {
if (other.it_->block() &&
other.it_->block()->block == it_->block()->block &&
other.it_->row() && other.it_->row()->row &&
other.it_->row()->row->para() == para) {
box = box.bounding_union(other.it_->row()->row->bounding_box());
}
} while (other.Next(RIL_TEXTLINE));
}
if (level != RIL_SYMBOL || cblob_it_ != NULL)
box.rotate(it_->block()->block->re_rotation());
// Now that we have a box in tesseract coordinates relative to the image rectangle,
// we have to convert the coords to a top-down system.
const int pix_height = pixGetHeight(tesseract_->pix_binary());
const int pix_width = pixGetWidth(tesseract_->pix_binary());
*left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
*top = ClipToRange(pix_height - box.top(), 0, pix_height);
*right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
*bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
return true;
}
/**
* Returns the bounding rectangle of the current object at the given level in
* coordinates of the original image.
* See comment on coordinate system above.
* Returns false if there is no such object at the current position.
*/
bool PageIterator::BoundingBox(PageIteratorLevel level,
int* left, int* top,
int* right, int* bottom) const {
return BoundingBox(level, 0, left, top, right, bottom);
}
bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,
int* left, int* top,
int* right, int* bottom) const {
if (!BoundingBoxInternal(level, left, top, right, bottom))
return false;
// Convert to the coordinate system of the original image.
*left = ClipToRange(*left / scale_ + rect_left_ - padding,
rect_left_, rect_left_ + rect_width_);
*top = ClipToRange(*top / scale_ + rect_top_ - padding,
rect_top_, rect_top_ + rect_height_);
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,
*left, rect_left_ + rect_width_);
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,
*top, rect_top_ + rect_height_);
return true;
}
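// Worked example (illustrative): with scale_ = 2, rect_left_ = 100,
// rect_top_ = 50 and padding = 0, an internal box with left = 60 and
// top = 30 maps to left = 60 / 2 + 100 = 130 and top = 30 / 2 + 50 = 65 in
// original-image coordinates; right and bottom are rounded up before the same
// shift, so an internal right = 61 becomes (61 + 2 - 1) / 2 + 100 = 131.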
/** Return that there is no such object at a given level. */
bool PageIterator::Empty(PageIteratorLevel level) const {
if (it_->block() == NULL) return true; // Already at the end!
if (it_->word() == NULL && level != RIL_BLOCK) return true; // image block
if (level == RIL_SYMBOL && blob_index_ >= word_length_)
return true; // Zero length word, or already at the end of it.
return false;
}
/** Returns the type of the current block. See apitypes.h for PolyBlockType. */
PolyBlockType PageIterator::BlockType() const {
if (it_->block() == NULL || it_->block()->block == NULL)
return PT_UNKNOWN; // Already at the end!
if (it_->block()->block->poly_block() == NULL)
return PT_FLOWING_TEXT; // No layout analysis used - assume text.
return it_->block()->block->poly_block()->isA();
}
/** Returns the polygon outline of the current block. The returned Pta must
* be ptaDestroy-ed after use. */
Pta* PageIterator::BlockPolygon() const {
if (it_->block() == NULL || it_->block()->block == NULL)
return NULL; // Already at the end!
if (it_->block()->block->poly_block() == NULL)
return NULL; // No layout analysis used - no polygon.
ICOORDELT_IT it(it_->block()->block->poly_block()->points());
Pta* pta = ptaCreate(it.length());
int num_pts = 0;
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++num_pts) {
ICOORD* pt = it.data();
// Convert to top-down coords within the input image.
float x = static_cast<float>(pt->x()) / scale_ + rect_left_;
float y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) / scale_;
ptaAddPt(pta, x, y);
}
return pta;
}
/**
* Returns a binary image of the current object at the given level.
* The position and size match the return from BoundingBoxInternal, and so this
* could be upscaled with respect to the original input image.
* Use pixDestroy to delete the image after use.
* The following methods are used to generate the images:
* RIL_BLOCK: mask the page image with the block polygon.
* RIL_TEXTLINE: Clip the rectangle of the line box from the page image.
* TODO(rays) fix this to generate and use a line polygon.
* RIL_WORD: Clip the rectangle of the word box from the page image.
* RIL_SYMBOL: Render the symbol outline to an image for cblobs (prior
* to recognition) or the bounding box otherwise.
* A reconstruction of the original image (using xor to check for double
* representation) should be reasonably accurate,
* apart from removed noise, at the block level. Below the block level, the
* reconstruction will be missing images and line separators.
* At the symbol level, kerned characters will invade the bounding box
* if rendered after recognition, making an xor reconstruction inaccurate, but
* an or reconstruction better. Before recognition, symbol-level reconstruction
* should be good, even with xor, since the images come from the connected
* components.
*/
Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
int left, top, right, bottom;
if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
return NULL;
if (level == RIL_SYMBOL && cblob_it_ != NULL &&
cblob_it_->data()->area() != 0)
return cblob_it_->data()->render();
Box* box = boxCreate(left, top, right - left, bottom - top);
Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
boxDestroy(&box);
if (level == RIL_BLOCK || level == RIL_PARA) {
// Clip to the block polygon as well.
TBOX mask_box;
Pix* mask = it_->block()->block->render_mask(&mask_box);
int mask_x = left - mask_box.left();
int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
// AND the mask and pix, putting the result in pix.
pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix),
pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x),
MAX(0, mask_y));
pixDestroy(&mask);
}
return pix;
}
/**
* Returns an image of the current object at the given level in greyscale
* if available in the input. To guarantee a binary image use BinaryImage.
* NOTE that in order to give the best possible image, the bounds are
* expanded slightly over the binary connected component, by the supplied
* padding, so the top-left position of the returned image is returned
* in (left,top). These will most likely not match the coordinates
* returned by BoundingBox.
* If you do not supply an original image, you will get a binary one.
* Use pixDestroy to delete the image after use.
*/
Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
Pix* original_img,
int* left, int* top) const {
int right, bottom;
if (!BoundingBox(level, left, top, &right, &bottom))
return NULL;
if (original_img == NULL)
return GetBinaryImage(level);
// Expand the box.
*left = MAX(*left - padding, 0);
*top = MAX(*top - padding, 0);
right = MIN(right + padding, rect_width_);
bottom = MIN(bottom + padding, rect_height_);
Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
Pix* grey_pix = pixClipRectangle(original_img, box, NULL);
boxDestroy(&box);
if (level == RIL_BLOCK || level == RIL_PARA) {
// Clip to the block polygon as well.
TBOX mask_box;
Pix* mask = it_->block()->block->render_mask(&mask_box);
// Copy the mask registered correctly into an image the size of grey_pix.
int mask_x = *left - mask_box.left();
int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
int width = pixGetWidth(grey_pix);
int height = pixGetHeight(grey_pix);
Pix* resized_mask = pixCreate(width, height, 1);
pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height,
PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y));
pixDestroy(&mask);
pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
2 * padding + 1);
pixInvert(resized_mask, resized_mask);
pixSetMasked(grey_pix, resized_mask, MAX_UINT32);
pixDestroy(&resized_mask);
}
return grey_pix;
}
/**
* Returns the baseline of the current object at the given level.
* The baseline is the line that passes through (x1, y1) and (x2, y2).
* WARNING: with vertical text, baselines may be vertical!
*/
bool PageIterator::Baseline(PageIteratorLevel level,
int* x1, int* y1, int* x2, int* y2) const {
if (it_->word() == NULL) return false; // Already at the end!
ROW* row = it_->row()->row;
WERD* word = it_->word()->word;
TBOX box = (level == RIL_WORD || level == RIL_SYMBOL)
? word->bounding_box()
: row->bounding_box();
int left = box.left();
ICOORD startpt(left, static_cast<inT16>(row->base_line(left) + 0.5));
int right = box.right();
ICOORD endpt(right, static_cast<inT16>(row->base_line(right) + 0.5));
// Rotate to image coordinates and convert to global image coords.
startpt.rotate(it_->block()->block->re_rotation());
endpt.rotate(it_->block()->block->re_rotation());
*x1 = startpt.x() / scale_ + rect_left_;
*y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
*x2 = endpt.x() / scale_ + rect_left_;
*y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
return true;
}
void PageIterator::Orientation(tesseract::Orientation *orientation,
tesseract::WritingDirection *writing_direction,
tesseract::TextlineOrder *textline_order,
float *deskew_angle) const {
BLOCK* block = it_->block()->block;
// Orientation
FCOORD up_in_image(0.0, 1.0);
up_in_image.unrotate(block->classify_rotation());
up_in_image.rotate(block->re_rotation());
if (up_in_image.x() == 0.0F) {
if (up_in_image.y() > 0.0F) {
*orientation = ORIENTATION_PAGE_UP;
}
else {
*orientation = ORIENTATION_PAGE_DOWN;
}
}
else if (up_in_image.x() > 0.0F) {
*orientation = ORIENTATION_PAGE_RIGHT;
}
else {
*orientation = ORIENTATION_PAGE_LEFT;
}
// NOTE: this early return leaves the writing direction, textline order and
// deskew angle below uncomputed.
return;
// Writing direction
bool is_vertical_text = (block->classify_rotation().x() == 0.0);
bool right_to_left = block->right_to_left();
*writing_direction =
is_vertical_text
? WRITING_DIRECTION_TOP_TO_BOTTOM
: (right_to_left
? WRITING_DIRECTION_RIGHT_TO_LEFT
: WRITING_DIRECTION_LEFT_TO_RIGHT);
// Textline Order
bool is_mongolian = false; // TODO(eger): fix me
*textline_order = is_vertical_text
? (is_mongolian
? TEXTLINE_ORDER_LEFT_TO_RIGHT
: TEXTLINE_ORDER_RIGHT_TO_LEFT)
: TEXTLINE_ORDER_TOP_TO_BOTTOM;
// Deskew angle
FCOORD skew = block->skew(); // true horizontal for textlines
*deskew_angle = -skew.angle();
}
void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
bool *is_list_item,
bool *is_crown,
int *first_line_indent) const {
*just = tesseract::JUSTIFICATION_UNKNOWN;
if (!it_->row() || !it_->row()->row || !it_->row()->row->para() ||
!it_->row()->row->para()->model)
return;
PARA *para = it_->row()->row->para();
*is_list_item = para->is_list_item;
*is_crown = para->is_very_first_or_continuation;
*first_line_indent = para->model->first_indent() -
para->model->body_indent();
*just = para->model->justification();
}
/**
* Sets up the internal data for iterating the blobs of a new word, then
* moves the iterator to the given offset.
*/
void PageIterator::BeginWord(int offset) {
WERD_RES* word_res = it_->word();
if (word_res == NULL) {
// This is a non-text block, so there is no word.
word_length_ = 0;
blob_index_ = 0;
word_ = NULL;
return;
}
if (word_res->best_choice != NULL) {
// Recognition has been done, so we are using the box_word, which
// is already baseline denormalized.
word_length_ = word_res->best_choice->length();
if (word_res->box_word != NULL) {
if (word_res->box_word->length() != word_length_) {
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
word_length_, word_res->best_choice->unichar_string().string(),
word_res->box_word->length());
word_res->box_word->bounding_box().print();
}
ASSERT_HOST(word_res->box_word->length() == word_length_);
}
word_ = NULL;
// We will be iterating the box_word.
delete cblob_it_;
cblob_it_ = NULL;
}
else {
// No recognition yet, so a "symbol" is a cblob.
word_ = word_res->word;
ASSERT_HOST(word_->cblob_list() != NULL);
word_length_ = word_->cblob_list()->length();
if (cblob_it_ == NULL) cblob_it_ = new C_BLOB_IT;
cblob_it_->set_to_list(word_->cblob_list());
}
for (blob_index_ = 0; blob_index_ < offset; ++blob_index_) {
if (cblob_it_ != NULL)
cblob_it_->forward();
}
}
bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
if (it_->word() != NULL) {
it_->word()->blamer_bundle = blamer_bundle;
return true;
}
else {
return false;
}
}
} // namespace tesseract.

View File

@ -0,0 +1,364 @@
///////////////////////////////////////////////////////////////////////
// File: pageiterator.h
// Description: Iterator for tesseract page structure that avoids using
// tesseract internal data structures.
// Author: Ray Smith
// Created: Fri Feb 26 11:01:06 PST 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__
#define TESSERACT_CCMAIN_PAGEITERATOR_H__
#include "publictypes.h"
#include "platform.h"
struct BlamerBundle;
class C_BLOB_IT;
class PAGE_RES;
class PAGE_RES_IT;
class WERD;
struct Pix;
struct Pta;
namespace tesseract {
class Tesseract;
/**
* Class to iterate over tesseract page structure, providing access to all
* levels of the page hierarchy, without including any tesseract headers or
* having to handle any tesseract structures.
* WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End
* DetectOS, or anything else that changes the internal PAGE_RES.
* See apitypes.h for the definition of PageIteratorLevel.
* See also ResultIterator, derived from PageIterator, which adds in the
* ability to access OCR output with text-specific methods.
*/
class TESS_API PageIterator {
public:
/**
* page_res and tesseract come directly from the BaseAPI.
* The rectangle parameters are copied indirectly from the Thresholder,
* via the BaseAPI. They represent the coordinates of some rectangle in an
* original image (in top-left-origin coordinates) and therefore the top-left
* needs to be added to any output boxes in order to specify coordinates
* in the original image. See TessBaseAPI::SetRectangle.
* The scale and scaled_yres are in case the Thresholder scaled the image
* rectangle prior to thresholding. Any coordinates in tesseract's image
* must be divided by scale before adding (rect_left, rect_top).
* The scaled_yres indicates the effective resolution of the binary image
* that tesseract has been given by the Thresholder.
* After the constructor, Begin has already been called.
*/
PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
int scale, int scaled_yres,
int rect_left, int rect_top,
int rect_width, int rect_height);
virtual ~PageIterator();
/**
* Page/ResultIterators may be copied! This makes it possible to iterate over
* all the objects at a lower level, while maintaining an iterator to
* objects at a higher level. These constructors DO NOT CALL Begin, so
* iterations will continue from the location of src.
*/
PageIterator(const PageIterator& src);
const PageIterator& operator=(const PageIterator& src);
/** Are we positioned at the same location as other? */
bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
// ============= Moving around within the page ============.
/**
* Moves the iterator to point to the start of the page to begin an
* iteration.
*/
virtual void Begin();
/**
* Moves the iterator to the beginning of the paragraph.
* This class implements this functionality by moving it to the zero indexed
* blob of the first (leftmost) word on the first row of the paragraph.
*/
virtual void RestartParagraph();
/**
* Return whether this iterator points anywhere in the first textline of a
* paragraph.
*/
bool IsWithinFirstTextlineOfParagraph() const;
/**
* Moves the iterator to the beginning of the text line.
* This class implements this functionality by moving it to the zero indexed
* blob of the first (leftmost) word of the row.
*/
virtual void RestartRow();
/**
* Moves to the start of the next object at the given level in the
* page hierarchy, and returns false if the end of the page was reached.
* NOTE that RIL_SYMBOL will skip non-text blocks, but all other
* PageIteratorLevel level values will visit each non-text block once.
* Think of non text blocks as containing a single para, with a single line,
* with a single imaginary word.
* Calls to Next with different levels may be freely intermixed.
* This function iterates words in right-to-left scripts correctly, if
* the appropriate language has been loaded into Tesseract.
*/
virtual bool Next(PageIteratorLevel level);
/**
* Returns true if the iterator is at the start of an object at the given
* level.
*
* For instance, suppose an iterator it is pointed to the first symbol of the
* first word of the third line of the second paragraph of the first block in
* a page, then:
* it.IsAtBeginningOf(RIL_BLOCK) = false
* it.IsAtBeginningOf(RIL_PARA) = false
* it.IsAtBeginningOf(RIL_TEXTLINE) = true
* it.IsAtBeginningOf(RIL_WORD) = true
* it.IsAtBeginningOf(RIL_SYMBOL) = true
*/
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
/**
* Returns whether the iterator is positioned at the last element in a
* given level. (e.g. the last word in a line, the last line in a block)
*
* Here's some two-paragraph example
* text. It starts off innocuously
* enough but quickly turns bizarre.
* The author inserts a cornucopia
* of words to guard against confused
* references.
*
* Now take an iterator it pointed to the start of "bizarre."
* it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
* it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
* it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
*/
virtual bool IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const;
/**
* Returns whether this iterator is positioned
* before other: -1
* equal to other: 0
* after other: 1
*/
int Cmp(const PageIterator &other) const;
// ============= Accessing data ==============.
// Coordinate system:
// Integer coordinates are at the cracks between the pixels.
// The top-left corner of the top-left pixel in the image is at (0,0).
// The bottom-right corner of the bottom-right pixel in the image is at
// (width, height).
// Every bounding box goes from the top-left of the top-left contained
// pixel to the bottom-right of the bottom-right contained pixel, so
// the bounding box of the single top-left pixel in the image is:
// (0,0)->(1,1).
// If an image rectangle has been set in the API, then returned coordinates
// relate to the original (full) image, rather than the rectangle.
/**
* Controls what to include in a bounding box. Bounding boxes of all levels
* between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
* Between layout analysis and recognition, it isn't known where all
* diacritics belong, so this control is used to include or exclude some
* diacritics that are above or below the main body of the word. In most cases
* where the placement is obvious, and after recognition, it doesn't make as
* much difference, as the diacritics will already be included in the word.
*/
void SetBoundingBoxComponents(bool include_upper_dots,
bool include_lower_dots) {
include_upper_dots_ = include_upper_dots;
include_lower_dots_ = include_lower_dots;
}
/**
* Returns the bounding rectangle of the current object at the given level.
* See comment on coordinate system above.
* Returns false if there is no such object at the current position.
* The returned bounding box is guaranteed to match the size and position
* of the image returned by GetBinaryImage, but may clip foreground pixels
* from a grey image. The padding argument to GetImage can be used to expand
* the image to include more foreground pixels. See GetImage below.
*/
bool BoundingBox(PageIteratorLevel level,
int* left, int* top, int* right, int* bottom) const;
bool BoundingBox(PageIteratorLevel level, const int padding,
int* left, int* top, int* right, int* bottom) const;
/**
* Returns the bounding rectangle of the object in a coordinate system of the
* working image rectangle having its origin at (rect_left_, rect_top_) with
* respect to the original image and is scaled by a factor scale_.
*/
bool BoundingBoxInternal(PageIteratorLevel level,
int* left, int* top, int* right, int* bottom) const;
/** Returns whether there is no object of a given level. */
bool Empty(PageIteratorLevel level) const;
/**
* Returns the type of the current block. See apitypes.h for
* PolyBlockType.
*/
PolyBlockType BlockType() const;
/**
* Returns the polygon outline of the current block. The returned Pta must
* be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
* of the polygon, and the last edge is the line segment between the last
* point and the first point. NULL will be returned if the iterator is
* at the end of the document or layout analysis was not used.
*/
Pta* BlockPolygon() const;
/**
* Returns a binary image of the current object at the given level.
* The position and size match the return from BoundingBoxInternal, and so
* this could be upscaled with respect to the original input image.
* Use pixDestroy to delete the image after use.
*/
Pix* GetBinaryImage(PageIteratorLevel level) const;
/**
* Returns an image of the current object at the given level in greyscale
* if available in the input. To guarantee a binary image use BinaryImage.
* NOTE that in order to give the best possible image, the bounds are
* expanded slightly over the binary connected component, by the supplied
* padding, so the top-left position of the returned image is returned
* in (left,top). These will most likely not match the coordinates
* returned by BoundingBox.
* If you do not supply an original image, you will get a binary one.
* Use pixDestroy to delete the image after use.
*/
Pix* GetImage(PageIteratorLevel level, int padding, Pix* original_img,
int* left, int* top) const;
/**
* Returns the baseline of the current object at the given level.
* The baseline is the line that passes through (x1, y1) and (x2, y2).
* WARNING: with vertical text, baselines may be vertical!
* Returns false if there is no baseline at the current position.
*/
bool Baseline(PageIteratorLevel level,
int* x1, int* y1, int* x2, int* y2) const;
/**
* Returns orientation for the block the iterator points to.
* orientation, writing_direction, textline_order: see publictypes.h
* deskew_angle: after rotating the block so the text orientation is
* upright, how many radians does one have to rotate the
* block anti-clockwise for it to be level?
* -Pi/4 <= deskew_angle <= Pi/4
*/
void Orientation(tesseract::Orientation *orientation,
tesseract::WritingDirection *writing_direction,
tesseract::TextlineOrder *textline_order,
float *deskew_angle) const;
/**
* Returns information about the current paragraph, if available.
*
* justification -
* LEFT if ragged right, or fully justified and script is left-to-right.
* RIGHT if ragged left, or fully justified and script is right-to-left.
* unknown if it looks like source code or we have very few lines.
* is_list_item -
* true if we believe this is a member of an ordered or unordered list.
* is_crown -
* true if the first line of the paragraph is aligned with the other
* lines of the paragraph even though subsequent paragraphs have first
* line indents. This typically indicates that this is the continuation
* of a previous paragraph or that it is the very first paragraph in
* the chapter.
* first_line_indent -
* For LEFT aligned paragraphs, the first text line of paragraphs of
* this kind are indented this many pixels from the left edge of the
* rest of the paragraph.
* for RIGHT aligned paragraphs, the first text line of paragraphs of
* this kind are indented this many pixels from the right edge of the
* rest of the paragraph.
* NOTE 1: This value may be negative.
* NOTE 2: if *is_crown == true, the first line of this paragraph is
* actually flush, and first_line_indent is set to the "common"
* first_line_indent for subsequent paragraphs in this block
* of text.
*/
void ParagraphInfo(tesseract::ParagraphJustification *justification,
bool *is_list_item,
bool *is_crown,
int *first_line_indent) const;
// If the current WERD_RES (it_->word()) is not NULL, sets the BlamerBundle
// of the current word to the given pointer (takes ownership of the pointer)
// and returns true.
// Can only be used when iterating on the word level.
bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
protected:
/**
* Sets up the internal data for iterating the blobs of a new word, then
* moves the iterator to the given offset.
*/
TESS_LOCAL void BeginWord(int offset);
/** Pointer to the page_res owned by the API. */
PAGE_RES* page_res_;
/** Pointer to the Tesseract object owned by the API. */
Tesseract* tesseract_;
/**
* The iterator to the page_res_. Owned by this ResultIterator.
* A pointer just to avoid dragging in Tesseract includes.
*/
PAGE_RES_IT* it_;
/**
* The current input WERD being iterated. If there is an output from OCR,
* then word_ is NULL. Owned by the API
*/
WERD* word_;
/** The length of the current word_. */
int word_length_;
/** The current blob index within the word. */
int blob_index_;
/**
* Iterator to the blobs within the word. If NULL, then we are iterating
* OCR results in the box_word.
* Owned by this ResultIterator.
*/
C_BLOB_IT* cblob_it_;
/** Control over what to include in bounding boxes. */
bool include_upper_dots_;
bool include_lower_dots_;
/** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
int scale_;
int scaled_yres_;
int rect_left_;
int rect_top_;
int rect_width_;
int rect_height_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_PAGEITERATOR_H__
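// Illustrative usage sketch (not part of the original header): walking the
// page layout block by block. The helper name is hypothetical; it assumes a
// tesseract::TessBaseAPI that has been initialized and given an image, and
// that AnalyseLayout() from the public BaseAPI is used to obtain an iterator
// of this class (NULL on failure).
#if 0
#include <cstdio>
#include "baseapi.h"
void DumpBlockBoxes(tesseract::TessBaseAPI* api) {
  tesseract::PageIterator* it = api->AnalyseLayout();  // layout only, no OCR
  if (it == NULL) return;
  do {
    int left, top, right, bottom;
    if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom)) {
      printf("block type %d: (%d,%d)-(%d,%d)\n",
             static_cast<int>(it->BlockType()), left, top, right, bottom);
    }
  } while (it->Next(tesseract::RIL_BLOCK));
  delete it;
}
#endif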

View File

@ -0,0 +1,434 @@
/**********************************************************************
* File: pagesegmain.cpp
* Description: Top-level page segmenter for Tesseract.
* Author: Ray Smith
* Created: Thu Sep 25 17:12:01 PDT 2008
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _WIN32
#ifndef unlink
#include <io.h>
#endif
#else
#include <unistd.h>
#endif // _WIN32
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include "allheaders.h"
#include "blobbox.h"
#include "blread.h"
#include "colfind.h"
#include "equationdetect.h"
#include "imagefind.h"
#include "linefind.h"
#include "makerow.h"
#include "osdetect.h"
#include "tabvector.h"
#include "tesseractclass.h"
#include "tessvars.h"
#include "textord.h"
#include "tordmain.h"
#include "wordseg.h"
namespace tesseract {
// Max erosions to perform in removing an enclosing circle.
const int kMaxCircleErosions = 8;
// Helper to remove an enclosing circle from an image.
// If there isn't one, then the image will most likely get badly mangled.
// The returned pix must be pixDestroyed after use. NULL may be returned
// if the image doesn't meet the trivial conditions that it uses to determine
// success.
static Pix* RemoveEnclosingCircle(Pix* pixs) {
Pix* pixsi = pixInvert(NULL, pixs);
Pix* pixc = pixCreateTemplate(pixs);
pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
pixSeedfillBinary(pixc, pixc, pixsi, 4);
pixInvert(pixc, pixc);
pixDestroy(&pixsi);
Pix* pixt = pixAnd(NULL, pixs, pixc);
l_int32 max_count;
pixCountConnComp(pixt, 8, &max_count);
// The count has to go up before we start looking for the minimum.
l_int32 min_count = MAX_INT32;
Pix* pixout = NULL;
for (int i = 1; i < kMaxCircleErosions; i++) {
pixDestroy(&pixt);
pixErodeBrick(pixc, pixc, 3, 3);
pixt = pixAnd(NULL, pixs, pixc);
l_int32 count;
pixCountConnComp(pixt, 8, &count);
if (i == 1 || count > max_count) {
max_count = count;
min_count = count;
}
else if (i > 1 && count < min_count) {
min_count = count;
pixDestroy(&pixout);
pixout = pixCopy(NULL, pixt); // Save the best.
}
else if (count >= min_count) {
break; // We have passed by the best.
}
}
pixDestroy(&pixt);
pixDestroy(&pixc);
return pixout;
}
/**
* Segment the page according to the current value of tessedit_pageseg_mode.
* pix_binary_ is used as the source image and should not be NULL.
* On return the blocks list owns all the constructed page layout.
*/
int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
Tesseract* osd_tess, OSResults* osr) {
ASSERT_HOST(pix_binary_ != NULL);
int width = pixGetWidth(pix_binary_);
int height = pixGetHeight(pix_binary_);
// Get page segmentation mode.
PageSegMode pageseg_mode = static_cast<PageSegMode>(
static_cast<int>(tessedit_pageseg_mode));
// If a UNLV zone file can be found, use that instead of segmentation.
if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
input_file != NULL && input_file->length() > 0) {
STRING name = *input_file;
const char* lastdot = strrchr(name.string(), '.');
if (lastdot != NULL)
name[lastdot - name.string()] = '\0';
read_unlv_file(name, width, height, blocks);
}
if (blocks->empty()) {
// No UNLV file present. Work according to the PageSegMode.
// First make a single block covering the whole image.
BLOCK_IT block_it(blocks);
BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
block->set_right_to_left(right_to_left());
block_it.add_to_end(block);
}
else {
// UNLV file present. Use PSM_SINGLE_BLOCK.
pageseg_mode = PSM_SINGLE_BLOCK;
}
// The diacritic_blobs list holds noise blobs that may be diacritics. They
// are separated out on areas of the image that seem noisy and short-circuit
// the layout process, going straight from the initial partition creation
// right through to after word segmentation, where they are added to the
// rej_cblobs list of the most appropriate word. From there classification
// will determine whether they are used.
BLOBNBOX_LIST diacritic_blobs;
int auto_page_seg_ret_val = 0;
TO_BLOCK_LIST to_blocks;
if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
PSM_SPARSE(pageseg_mode)) {
auto_page_seg_ret_val = AutoPageSeg(
pageseg_mode, blocks, &to_blocks,
enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
if (pageseg_mode == PSM_OSD_ONLY)
return auto_page_seg_ret_val;
// To create blobs from the image region bounds uncomment this line:
// to_blocks.clear(); // Uncomment to go back to the old mode.
}
else {
deskew_ = FCOORD(1.0f, 0.0f);
reskew_ = FCOORD(1.0f, 0.0f);
if (pageseg_mode == PSM_CIRCLE_WORD) {
Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
if (pixcleaned != NULL) {
pixDestroy(&pix_binary_);
pix_binary_ = pixcleaned;
}
}
}
if (auto_page_seg_ret_val < 0) {
return -1;
}
if (blocks->empty()) {
if (textord_debug_tabfind)
tprintf("Empty page\n");
return 0; // AutoPageSeg found an empty page.
}
bool splitting =
pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
bool cjk_mode = textord_use_cjk_fp_model;
textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
pix_thresholds_, pix_grey_, splitting || cjk_mode,
&diacritic_blobs, blocks, &to_blocks);
return auto_page_seg_ret_val;
}
// Helper writes a grey image to a file for use by scrollviewer.
// Normally for speed we don't display the image in the layout debug windows.
// If textord_debug_images is true, we draw the image as a background to some
// of the debug windows. printable determines whether these
// images are optimized for printing instead of screen display.
static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
Pix* grey_pix = pixCreate(pixGetWidth(pix_binary),
pixGetHeight(pix_binary), 8);
// Printable images are light grey on white, but for screen display
// they are black on dark grey so the other colors show up well.
if (printable) {
pixSetAll(grey_pix);
pixSetMasked(grey_pix, pix_binary, 192);
}
else {
pixSetAllArbitrary(grey_pix, 64);
pixSetMasked(grey_pix, pix_binary, 0);
}
AlignedBlob::IncrementDebugPix();
pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG);
pixDestroy(&grey_pix);
}
/**
* Auto page segmentation. Divide the page image into blocks of uniform
* text linespacing and images.
*
* Resolution (in ppi) is derived from the input image.
*
* The output goes in the blocks list with corresponding TO_BLOCKs in the
* to_blocks list.
*
* If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
* the image into columns, but multiple blocks are still made if the text is
* of non-uniform linespacing.
*
* If diacritic_blobs is non-null, then diacritics/noise blobs, that would
 * confuse layout analysis by causing textline overlap, are placed there,
* with the expectation that they will be reassigned to words later and
* noise/diacriticness determined via classification.
*
 * If osd (orientation and script detection) is enabled then it is performed
 * as well. If only_osd is true, then only orientation and script detection is
 * performed. If osd is desired (osd or only_osd), then osd_tess must be
 * another Tesseract that was initialized especially for osd, and the results
 * will be output into osr (orientation and script result).
*/
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
TO_BLOCK_LIST* to_blocks,
BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
OSResults* osr) {
if (textord_debug_images) {
WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
}
Pix* photomask_pix = NULL;
Pix* musicmask_pix = NULL;
// The blocks made by the ColumnFinder. Moved to blocks before return.
BLOCK_LIST found_blocks;
TO_BLOCK_LIST temp_blocks;
ColumnFinder* finder = SetupPageSegAndDetectOrientation(
pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
&musicmask_pix);
int result = 0;
if (finder != NULL) {
TO_BLOCK_IT to_block_it(&temp_blocks);
TO_BLOCK* to_block = to_block_it.data();
if (musicmask_pix != NULL) {
// TODO(rays) pass the musicmask_pix into FindBlocks and mark music
// blocks separately. For now combine with photomask_pix.
pixOr(photomask_pix, photomask_pix, musicmask_pix);
}
if (equ_detect_) {
finder->SetEquationDetect(equ_detect_);
}
 /* Skew angle detection disabled (block-finding call commented out):
result = finder->FindBlocks(
pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
if (result >= 0)
finder->GetDeskewVectors(&deskew_, &reskew_);
*/
delete finder;
}
pixDestroy(&photomask_pix);
pixDestroy(&musicmask_pix);
if (result < 0) return result;
blocks->clear();
BLOCK_IT block_it(blocks);
// Move the found blocks to the input/output blocks.
block_it.add_list_after(&found_blocks);
if (textord_debug_images) {
// The debug image is no longer needed so delete it.
unlink(AlignedBlob::textord_debug_pix().string());
}
return result;
}
// Helper adds all the scripts from sid_set converted to ids from osd_set to
// allowed_ids.
static void AddAllScriptsConverted(const UNICHARSET& sid_set,
const UNICHARSET& osd_set,
GenericVector<int>* allowed_ids) {
for (int i = 0; i < sid_set.get_script_table_size(); ++i) {
if (i != sid_set.null_sid()) {
const char* script = sid_set.get_script_from_script_id(i);
allowed_ids->push_back(osd_set.get_script_id_from_name(script));
}
}
}
/**
* Sets up auto page segmentation, determines the orientation, and corrects it.
* Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
* facilitate testing.
* photo_mask_pix is a pointer to a NULL pointer that will be filled on return
* with the leptonica photo mask, which must be pixDestroyed by the caller.
* to_blocks is an empty list that will be filled with (usually a single)
* block that is used during layout analysis. This ugly API is required
 * because of the possibility of a UNLV zone file.
* TODO(rays) clean this up.
* See AutoPageSeg for other arguments.
* The returned ColumnFinder must be deleted after use.
*/
ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
Pix** music_mask_pix) {
int vertical_x = 0;
int vertical_y = 1;
TabVector_LIST v_lines;
TabVector_LIST h_lines;
ICOORD bleft(0, 0);
ASSERT_HOST(pix_binary_ != NULL);
if (tessedit_dump_pageseg_images) {
pixWrite("tessinput.png", pix_binary_, IFF_PNG);
}
// Leptonica is used to find the rule/separator lines in the input.
LineFinder::FindAndRemoveLines(source_resolution_,
textord_tabfind_show_vlines, pix_binary_,
&vertical_x, &vertical_y, music_mask_pix,
&v_lines, &h_lines);
if (tessedit_dump_pageseg_images)
pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
// Leptonica is used to find a mask of the photo regions in the input.
*photo_mask_pix = ImageFind::FindImages(pix_binary_);
if (tessedit_dump_pageseg_images)
pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
// The rest of the algorithm uses the usual connected components.
textord_.find_components(pix_binary_, blocks, to_blocks);
TO_BLOCK_IT to_block_it(to_blocks);
// There must be exactly one input block.
// TODO(rays) handle new textline finding with a UNLV zone file.
ASSERT_HOST(to_blocks->singleton());
TO_BLOCK* to_block = to_block_it.data();
TBOX blkbox = to_block->block->bounding_box();
ColumnFinder* finder = NULL;
if (to_block->line_size >= 2) {
finder = new ColumnFinder(static_cast<int>(to_block->line_size),
blkbox.botleft(), blkbox.topright(),
source_resolution_, textord_use_cjk_fp_model,
textord_tabfind_aligned_gap_fraction,
&v_lines, &h_lines, vertical_x, vertical_y);
finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
if (equ_detect_) {
equ_detect_->LabelSpecialText(to_block);
}
BLOBNBOX_CLIST osd_blobs;
// osd_orientation is the number of 90 degree rotations to make the
// characters upright. (See osdetect.h for precise definition.)
 // We want the text lines horizontal (vertical text indicates vertical
 // textlines), which may conflict (e.g. vertically written CJK).
int osd_orientation = 0;
bool vertical_text = textord_tabfind_force_vertical_text ||
pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
if (!vertical_text && textord_tabfind_vertical_text &&
PSM_ORIENTATION_ENABLED(pageseg_mode)) {
vertical_text =
finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
to_block, &osd_blobs);
}
if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
GenericVector<int> osd_scripts;
if (osd_tess != this) {
// We are running osd as part of layout analysis, so constrain the
// scripts to those allowed by *this.
AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
for (int s = 0; s < sub_langs_.size(); ++s) {
AddAllScriptsConverted(sub_langs_[s]->unicharset,
osd_tess->unicharset, &osd_scripts);
}
}
os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
if (pageseg_mode == PSM_OSD_ONLY) {
delete finder;
return NULL;
}
osd_orientation = osr->best_result.orientation_id;
double osd_score = osr->orientations[osd_orientation];
double osd_margin = min_orientation_margin * 2;
for (int i = 0; i < 4; ++i) {
if (i != osd_orientation &&
osd_score - osr->orientations[i] < osd_margin) {
osd_margin = osd_score - osr->orientations[i];
}
}
int best_script_id = osr->best_result.script_id;
const char* best_script_str =
osd_tess->unicharset.get_script_from_script_id(best_script_id);
bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
best_script_id == osd_tess->unicharset.hiragana_sid() ||
best_script_id == osd_tess->unicharset.katakana_sid() ||
strcmp("Japanese", best_script_str) == 0 ||
strcmp("Korean", best_script_str) == 0 ||
strcmp("Hangul", best_script_str) == 0;
if (cjk) {
finder->set_cjk_script(true);
}
if (osd_margin < min_orientation_margin) {
// The margin is weak.
if (!cjk && !vertical_text && osd_orientation == 2) {
// upside down latin text is improbable with such a weak margin.
tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
"Don't rotate.\n", osd_margin);
osd_orientation = 0;
}
else {
tprintf(
"OSD: Weak margin (%.2f) for %d blob text block, "
"but using orientation anyway: %d\n",
osd_margin, osd_blobs.length(), osd_orientation);
}
}
}
osd_blobs.shallow_clear();
finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
}
return finder;
}
} // namespace tesseract.

View File

@ -0,0 +1,43 @@
/**********************************************************************
* File: pagewalk.cpp (Formerly walkers.c)
* Description: Block list processors
* Author: Phil Cheatle
* Created: Thu Oct 10 16:25:24 BST 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "pageres.h"
#include "tesseractclass.h"
namespace tesseract {
/**
* @name process_selected_words()
*
* Walk the current block list applying the specified word processor function
* to each word that overlaps the selection_box.
*/
void Tesseract::process_selected_words(
PAGE_RES* page_res, // blocks to check
TBOX & selection_box,
BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it)) {
for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
page_res_it.forward()) {
WERD* word = page_res_it.word()->word;
if (word->bounding_box().overlap(selection_box)) {
if (!(this->*word_processor)(&page_res_it))
return;
}
}
}
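// Illustrative sketch (not in the original source) of calling the walker with
// a pointer-to-member word processor; "dump_word" is a hypothetical member
// with signature BOOL8 Tesseract::dump_word(PAGE_RES_IT* pr_it):
//
//   TBOX selection(ICOORD(0, 0), ICOORD(image_width, image_height));
//   tess->process_selected_words(page_res, selection,
//                                &tesseract::Tesseract::dump_word);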
} // namespace tesseract

View File

@ -0,0 +1,69 @@
///////////////////////////////////////////////////////////////////////
// File: par_control.cpp
// Description: Control code for parallel implementation.
// Author: Ray Smith
// Created: Mon Nov 04 13:23:15 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "tesseractclass.h"
namespace tesseract {
struct BlobData {
BlobData() : blob(NULL), choices(NULL) {}
BlobData(int index, Tesseract* tess, const WERD_RES& word)
: blob(word.chopped_word->blobs[index]),
tesseract(tess),
choices(&(*word.ratings)(index, index)) {}
TBLOB* blob;
Tesseract* tesseract;
BLOB_CHOICE_LIST** choices;
};
void Tesseract::PrerecAllWordsPar(const GenericVector<WordData>& words) {
// Prepare all the blobs.
GenericVector<BlobData> blobs;
for (int w = 0; w < words.size(); ++w) {
if (words[w].word->ratings != NULL &&
words[w].word->ratings->get(0, 0) == NULL) {
for (int s = 0; s < words[w].lang_words.size(); ++s) {
Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
const WERD_RES& word = *words[w].lang_words[s];
for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
blobs.push_back(BlobData(b, sub, word));
}
}
}
}
// Pre-classify all the blobs.
if (tessedit_parallelize > 1) {
for (int b = 0; b < blobs.size(); ++b) {
*blobs[b].choices =
blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
}
}
else {
// TODO(AMD) parallelize this.
for (int b = 0; b < blobs.size(); ++b) {
*blobs[b].choices =
blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
}
}
}
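// Both branches of the tessedit_parallelize check above execute the same
// serial loop; the parameter is intended to enable parallel pre-classification
// when raised above 1. A caller could request that, for example, through the
// public API (sketch, assuming a TessBaseAPI instance named "api"):
//
//   api.SetVariable("tessedit_parallelize", "2");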
} // namespace tesseract.

File diff suppressed because it is too large

View File

@ -0,0 +1,108 @@
/**********************************************************************
* File: paragraphs.h
* Description: Paragraph Detection data structures.
* Author: David Eger
* Created: 25 February 2011
*
* (C) Copyright 2011, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
#include "rect.h"
#include "ocrpara.h"
#include "genericvector.h"
#include "strngs.h"
class WERD;
class UNICHARSET;
namespace tesseract {
class MutableIterator;
// This structure captures all information needed about a text line for the
// purposes of paragraph detection. It is meant to be exceedingly light-weight
// so that we can easily test paragraph detection independent of the rest of
// Tesseract.
class RowInfo {
public:
// Constant data derived from Tesseract output.
STRING text; // the full UTF-8 text of the line.
bool ltr; // whether the majority of the text is left-to-right
// TODO(eger) make this more fine-grained.
bool has_leaders; // does the line contain leader dots (.....)?
bool has_drop_cap; // does the line have a drop cap?
int pix_ldistance; // distance to the left pblock boundary in pixels
int pix_rdistance; // distance to the right pblock boundary in pixels
float pix_xheight; // guessed xheight for the line
int average_interword_space; // average space between words in pixels.
int num_words;
TBOX lword_box; // in normalized (horiz text rows) space
TBOX rword_box; // in normalized (horiz text rows) space
STRING lword_text; // the UTF-8 text of the leftmost werd
STRING rword_text; // the UTF-8 text of the rightmost werd
// The text of a paragraph typically starts with the start of an idea and
// ends with the end of an idea. Here we define paragraph as something that
// may have a first line indent and a body indent which may be different.
// Typical words that start an idea are:
// 1. Words in western scripts that start with
// a capital letter, for example "The"
// 2. Bulleted or numbered list items, for
// example "2."
// Typical words which end an idea are words ending in punctuation marks. In
// this vocabulary, each list item is represented as a paragraph.
bool lword_indicates_list_item;
bool lword_likely_starts_idea;
bool lword_likely_ends_idea;
bool rword_indicates_list_item;
bool rword_likely_starts_idea;
bool rword_likely_ends_idea;
};
// Main entry point for Paragraph Detection Algorithm.
//
// Given a set of equally spaced textlines (described by row_infos),
// Split them into paragraphs. See http://goto/paragraphstalk
//
// Output:
// row_owners - one pointer for each row, to the paragraph it belongs to.
// paragraphs - this is the actual list of PARA objects.
// models - the list of paragraph models referenced by the PARA objects.
// caller is responsible for deleting the models.
void DetectParagraphs(int debug_level,
GenericVector<RowInfo> *row_infos,
GenericVector<PARA *> *row_owners,
PARA_LIST *paragraphs,
GenericVector<ParagraphModel *> *models);
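// Illustrative call sketch (not part of the original header); row_infos would
// normally be populated from Tesseract's layout output, and the caller owns
// and must delete the returned models:
//
//   GenericVector<RowInfo> row_infos;        // filled in by the caller
//   GenericVector<PARA *> row_owners;        // one owner pointer per row
//   PARA_LIST paragraphs;
//   GenericVector<ParagraphModel *> models;
//   DetectParagraphs(0, &row_infos, &row_owners, &paragraphs, &models);
//   for (int i = 0; i < models.size(); ++i) delete models[i];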
// Given a MutableIterator to the start of a block, run DetectParagraphs on
// that block and commit the results to the underlying ROW and BLOCK structs,
// saving the ParagraphModels in models. Caller owns the models.
// We use unicharset during the function to answer questions such as "is the
// first letter of this word upper case?"
void DetectParagraphs(int debug_level,
bool after_text_recognition,
const MutableIterator *block_start,
GenericVector<ParagraphModel *> *models);
} // namespace
#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_

View File

@ -0,0 +1,312 @@
/**********************************************************************
 * File: paragraphs_internal.h
* Description: Paragraph Detection internal data structures.
* Author: David Eger
* Created: 11 March 2011
*
* (C) Copyright 2011, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
#include "paragraphs.h"
#ifdef _MSC_VER
#include <string>
#else
#include "strings.h"
#endif
// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
class WERD_CHOICE;
namespace tesseract {
// Return whether the given word is likely to be a list item start word.
bool AsciiLikelyListItem(const STRING &word);
// Return the first Unicode Codepoint from werd[pos].
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
// Set right word attributes given either a unicharset and werd or a utf8
// string.
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
const STRING &utf8,
bool *is_list, bool *starts_idea, bool *ends_idea);
// Set left word attributes given either a unicharset and werd or a utf8 string.
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
const STRING &utf8,
bool *is_list, bool *starts_idea, bool *ends_idea);
enum LineType {
LT_START = 'S', // First line of a paragraph.
LT_BODY = 'C', // Continuation line of a paragraph.
LT_UNKNOWN = 'U', // No clues.
LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
};
// The first paragraph in a page of body text is often un-indented.
// This is a typographic convention which is common to indicate either that:
// (1) The paragraph is the continuation of a previous paragraph, or
// (2) The paragraph is the first paragraph in a chapter.
//
// I refer to such paragraphs as "crown"s, and the output of the paragraph
// detection algorithm attempts to give them the same paragraph model as
// the rest of the body text.
//
// Nonetheless, while building hypotheses, it is useful to mark the lines
// of crown paragraphs temporarily as crowns, either aligned left or right.
extern const ParagraphModel *kCrownLeft;
extern const ParagraphModel *kCrownRight;
inline bool StrongModel(const ParagraphModel *model) {
return model != NULL && model != kCrownLeft && model != kCrownRight;
}
struct LineHypothesis {
LineHypothesis() : ty(LT_UNKNOWN), model(NULL) {}
LineHypothesis(LineType line_type, const ParagraphModel *m)
: ty(line_type), model(m) {}
LineHypothesis(const LineHypothesis &other)
: ty(other.ty), model(other.model) {}
bool operator==(const LineHypothesis &other) const {
return ty == other.ty && model == other.model;
}
LineType ty;
const ParagraphModel *model;
};
class ParagraphTheory; // Forward Declaration
typedef GenericVectorEqEq<const ParagraphModel *> SetOfModels;
// Row Scratch Registers are data generated by the paragraph detection
// algorithm based on a RowInfo input.
class RowScratchRegisters {
public:
// We presume row will outlive us.
void Init(const RowInfo &row);
LineType GetLineType() const;
LineType GetLineType(const ParagraphModel *model) const;
// Mark this as a start line type, sans model. This is useful for the
// initial marking of probable body lines or paragraph start lines.
void SetStartLine();
// Mark this as a body line type, sans model. This is useful for the
 // initial marking of probable body lines or paragraph start lines.
void SetBodyLine();
 // Record that this row fits as a paragraph start line in the given model.
 void AddStartLine(const ParagraphModel *model);
 // Record that this row fits as a paragraph body line in the given model.
 void AddBodyLine(const ParagraphModel *model);
// Clear all hypotheses about this line.
void SetUnknown() { hypotheses_.truncate(0); }
// Append all hypotheses of strong models that match this row as a start.
void StartHypotheses(SetOfModels *models) const;
// Append all hypotheses of strong models matching this row.
void StrongHypotheses(SetOfModels *models) const;
// Append all hypotheses for this row.
void NonNullHypotheses(SetOfModels *models) const;
// Discard any hypotheses whose model is not in the given list.
void DiscardNonMatchingHypotheses(const SetOfModels &models);
// If we have only one hypothesis and that is that this line is a paragraph
// start line of a certain model, return that model. Else return NULL.
const ParagraphModel *UniqueStartHypothesis() const;
// If we have only one hypothesis and that is that this line is a paragraph
// body line of a certain model, return that model. Else return NULL.
const ParagraphModel *UniqueBodyHypothesis() const;
// Return the indentation for the side opposite of the aligned side.
int OffsideIndent(tesseract::ParagraphJustification just) const {
switch (just) {
case tesseract::JUSTIFICATION_RIGHT: return lindent_;
case tesseract::JUSTIFICATION_LEFT: return rindent_;
default: return lindent_ > rindent_ ? lindent_ : rindent_;
}
}
// Return the indentation for the side the text is aligned to.
int AlignsideIndent(tesseract::ParagraphJustification just) const {
switch (just) {
case tesseract::JUSTIFICATION_RIGHT: return rindent_;
case tesseract::JUSTIFICATION_LEFT: return lindent_;
default: return lindent_ > rindent_ ? lindent_ : rindent_;
}
}
// Append header fields to a vector of row headings.
static void AppendDebugHeaderFields(GenericVector<STRING> *header);
// Append data for this row to a vector of debug strings.
void AppendDebugInfo(const ParagraphTheory &theory,
GenericVector<STRING> *dbg) const;
const RowInfo *ri_;
// These four constants form a horizontal box model for the white space
// on the edges of each line. At each point in the algorithm, the following
// shall hold:
// ri_->pix_ldistance = lmargin_ + lindent_
// ri_->pix_rdistance = rindent_ + rmargin_
int lmargin_;
int lindent_;
int rindent_;
int rmargin_;
private:
// Hypotheses of either LT_START or LT_BODY
GenericVectorEqEq<LineHypothesis> hypotheses_;
};
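// Worked example of the box model above (the numbers are illustrative only):
// if a row's left edge sits 50 px inside the block (ri_->pix_ldistance == 50)
// and the estimated common left margin for this run of rows is 30 px, then
// lmargin_ == 30 and lindent_ == 20, preserving
// pix_ldistance == lmargin_ + lindent_. The right-hand side decomposes the
// same way into rindent_ and rmargin_.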
// A collection of convenience functions for wrapping the set of
// Paragraph Models we believe correctly model the paragraphs in the image.
class ParagraphTheory {
public:
// We presume models will outlive us, and that models will take ownership
// of any ParagraphModel *'s we add.
explicit ParagraphTheory(GenericVector<ParagraphModel *> *models)
: models_(models) {}
GenericVector<ParagraphModel *> &models() { return *models_; }
const GenericVector<ParagraphModel *> &models() const { return *models_; }
// Return an existing model if one that is Comparable() can be found.
// Else, allocate a new copy of model to save and return a pointer to it.
const ParagraphModel *AddModel(const ParagraphModel &model);
// Discard any models we've made that are not in the list of used models.
void DiscardUnusedModels(const SetOfModels &used_models);
// Return the set of all non-centered models.
void NonCenteredModels(SetOfModels *models);
// If any of the non-centered paragraph models we know about fit
// rows[start, end), return it. Else NULL.
const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
int start, int end) const;
int IndexOf(const ParagraphModel *model) const;
private:
GenericVector<ParagraphModel *> *models_;
GenericVectorEqEq<ParagraphModel *> models_we_added_;
};
bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
int row, const ParagraphModel *model);
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
int row, const ParagraphModel *model);
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
int a, int b, const ParagraphModel *model);
// A class for smearing Paragraph Model hypotheses to surrounding rows.
// The idea here is that StrongEvidenceClassify first marks only exceedingly
// obvious start and body rows and constructs models of them. Thereafter,
// we may have left over unmarked lines (mostly end-of-paragraph lines) which
// were too short to have much confidence about, but which fit the models we've
// constructed perfectly and which we ought to mark. This class is used to
// "smear" our models over the text.
class ParagraphModelSmearer {
public:
ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
int row_start, int row_end,
ParagraphTheory *theory);
// Smear forward paragraph models from existing row markings to subsequent
// text lines if they fit, and mark any thereafter still unmodeled rows
// with any model in the theory that fits them.
void Smear();
private:
// Record in open_models_ for rows [start_row, end_row) the list of models
// currently open at each row.
// A model is still open in a row if some previous row has said model as a
// start hypothesis, and all rows since (including this row) would fit as
// either a body or start line in that model.
void CalculateOpenModels(int row_start, int row_end);
SetOfModels &OpenModels(int row) {
return open_models_[row - row_start_ + 1];
}
ParagraphTheory *theory_;
GenericVector<RowScratchRegisters> *rows_;
int row_start_;
int row_end_;
// open_models_ corresponds to rows[start_row_ - 1, end_row_]
//
// open_models_: Contains models which there was an active (open) paragraph
// as of the previous line and for which the left and right
// indents admit the possibility that this text line continues
// to fit the same model.
// TODO(eger): Think about whether we can get rid of "Open" models and just
// use the current hypotheses on RowScratchRegisters.
GenericVector<SetOfModels> open_models_;
};
// Clear all hypotheses about lines [start, end) and reset the margins to the
// percentile (0..100) value of the left and right row edges for this run of
// rows.
void RecomputeMarginsAndClearHypotheses(
GenericVector<RowScratchRegisters> *rows, int start, int end,
int percentile);
// Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
int row_start, int row_end);
// Return whether the first word on the after line can fit in the space at
// the end of the before line (knowing which way the text is aligned and read).
bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
const RowScratchRegisters &after,
tesseract::ParagraphJustification justification);
// Return whether the first word on the after line can fit in the space at
// the end of the before line (not knowing the text alignment).
bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
const RowScratchRegisters &after);
// Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
int start, int end, const ParagraphModel *model);
// Do the text and geometry of two rows support a paragraph break between them?
bool LikelyParagraphStart(const RowScratchRegisters &before,
const RowScratchRegisters &after,
tesseract::ParagraphJustification j);
// Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs.
void CanonicalizeDetectionResults(
GenericVector<PARA *> *row_owners,
PARA_LIST *paragraphs);
} // namespace
#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_

View File

@ -0,0 +1,370 @@
///////////////////////////////////////////////////////////////////////
// File: paramsd.cpp
// Description: Tesseract parameter Editor
// Author: Joern Wanke
// Created: Wed Jul 18 10:05:01 PDT 2007
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
//
// The parameters editor is used to edit all the parameters used within
// tesseract from the ui.
#ifdef _WIN32
#else
#include <stdlib.h>
#include <stdio.h>
#endif
#include <map>
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#ifndef GRAPHICS_DISABLED
#include "paramsd.h"
#include "params.h"
#include "scrollview.h"
#include "svmnode.h"
#define VARDIR "configs/" /*parameters files */
#define MAX_ITEMS_IN_SUBMENU 30
// The following variables should remain static globals, since they
// are used by debug editor, which uses a single Tesseract instance.
//
// Contains the mappings from unique VC ids to their actual pointers.
static std::map<int, ParamContent*> vcMap;
static int nrParams = 0;
static int writeCommands[2];
ELISTIZE(ParamContent)
// Constructors for the various ParamTypes.
ParamContent::ParamContent(tesseract::StringParam* it) {
my_id_ = nrParams;
nrParams++;
param_type_ = VT_STRING;
sIt = it;
vcMap[my_id_] = this;
}
// Constructors for the various ParamTypes.
ParamContent::ParamContent(tesseract::IntParam* it) {
my_id_ = nrParams;
nrParams++;
param_type_ = VT_INTEGER;
iIt = it;
vcMap[my_id_] = this;
}
// Constructors for the various ParamTypes.
ParamContent::ParamContent(tesseract::BoolParam* it) {
my_id_ = nrParams;
nrParams++;
param_type_ = VT_BOOLEAN;
bIt = it;
vcMap[my_id_] = this;
}
// Constructors for the various ParamTypes.
ParamContent::ParamContent(tesseract::DoubleParam* it) {
my_id_ = nrParams;
nrParams++;
param_type_ = VT_DOUBLE;
dIt = it;
vcMap[my_id_] = this;
}
// Gets a VC object identified by its ID.
ParamContent* ParamContent::GetParamContentById(int id) {
return vcMap[id];
}
// Copy the first N words from the source string to the target string.
// Words are delimited by "_".
void ParamsEditor::GetFirstWords(
const char *s, // source string
int n, // number of words
char *t // target string
) {
int full_length = strlen(s);
 int reqd_len = 0; // No. of chars required
 const char *next_word = s;
 while ((n > 0) && reqd_len < full_length) {
 reqd_len += strcspn(next_word, "_") + 1;
 next_word = s + reqd_len; // advance to the start of the next word
 n--;
 }
 strncpy(t, s, reqd_len);
 t[reqd_len] = '\0'; // ensure null termination
}
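// Example (mirroring the documentation in paramsd.h): GetFirstWords with
// s == "tesseract_foo_bar" and n == 2 leaves "tesseract_foo_" in t.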
// Getter for the name.
const char* ParamContent::GetName() const {
if (param_type_ == VT_INTEGER) { return iIt->name_str(); }
else if (param_type_ == VT_BOOLEAN) { return bIt->name_str(); }
else if (param_type_ == VT_DOUBLE) { return dIt->name_str(); }
else if (param_type_ == VT_STRING) { return sIt->name_str(); }
else
return "ERROR: ParamContent::GetName()";
}
// Getter for the description.
const char* ParamContent::GetDescription() const {
if (param_type_ == VT_INTEGER) { return iIt->info_str(); }
else if (param_type_ == VT_BOOLEAN) { return bIt->info_str(); }
else if (param_type_ == VT_DOUBLE) { return dIt->info_str(); }
else if (param_type_ == VT_STRING) { return sIt->info_str(); }
else return NULL;
}
// Getter for the value.
STRING ParamContent::GetValue() const {
STRING result;
if (param_type_ == VT_INTEGER) {
result.add_str_int("", *iIt);
}
else if (param_type_ == VT_BOOLEAN) {
result.add_str_int("", *bIt);
}
else if (param_type_ == VT_DOUBLE) {
result.add_str_double("", *dIt);
}
else if (param_type_ == VT_STRING) {
if (((STRING) * (sIt)).string() != NULL) {
result = sIt->string();
}
else {
result = "Null";
}
}
return result;
}
// Setter for the value.
void ParamContent::SetValue(const char* val) {
// TODO (wanke) Test if the values actually are properly converted.
// (Quickly visible impacts?)
changed_ = TRUE;
if (param_type_ == VT_INTEGER) {
iIt->set_value(atoi(val));
}
else if (param_type_ == VT_BOOLEAN) {
bIt->set_value(atoi(val));
}
else if (param_type_ == VT_DOUBLE) {
dIt->set_value(strtod(val, NULL));
}
else if (param_type_ == VT_STRING) {
sIt->set_value(val);
}
}
// Gets up to the first 3 prefixes from s (split by _).
// For example, tesseract_foo_bar will be split into tesseract,foo and bar.
void ParamsEditor::GetPrefixes(const char* s, STRING* level_one,
STRING* level_two,
STRING* level_three) {
char* p = new char[1024];
GetFirstWords(s, 1, p);
*level_one = p;
GetFirstWords(s, 2, p);
*level_two = p;
GetFirstWords(s, 3, p);
*level_three = p;
delete[] p;
}
// Compare two VC objects by their name.
int ParamContent::Compare(const void* v1, const void* v2) {
const ParamContent* one =
*reinterpret_cast<const ParamContent* const *>(v1);
const ParamContent* two =
*reinterpret_cast<const ParamContent* const *>(v2);
return strcmp(one->GetName(), two->GetName());
}
// Find all editable parameters used within tesseract and create a
// SVMenuNode tree from it.
// TODO (wanke): This is actually sort of hackish.
SVMenuNode* ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
SVMenuNode* mr = new SVMenuNode();
ParamContent_LIST vclist;
ParamContent_IT vc_it(&vclist);
// Amount counts the number of entries for a specific char*.
// TODO(rays) get rid of the use of std::map.
std::map<const char*, int> amount;
// Add all parameters to a list.
int v, i;
int num_iterations = (tess->params() == NULL) ? 1 : 2;
for (v = 0; v < num_iterations; ++v) {
tesseract::ParamsVectors *vec = (v == 0) ? GlobalParams() : tess->params();
for (i = 0; i < vec->int_params.size(); ++i) {
vc_it.add_after_then_move(new ParamContent(vec->int_params[i]));
}
for (i = 0; i < vec->bool_params.size(); ++i) {
vc_it.add_after_then_move(new ParamContent(vec->bool_params[i]));
}
for (i = 0; i < vec->string_params.size(); ++i) {
vc_it.add_after_then_move(new ParamContent(vec->string_params[i]));
}
for (i = 0; i < vec->double_params.size(); ++i) {
vc_it.add_after_then_move(new ParamContent(vec->double_params[i]));
}
}
// Count the # of entries starting with a specific prefix.
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
ParamContent* vc = vc_it.data();
STRING tag;
STRING tag2;
STRING tag3;
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
amount[tag.string()]++;
amount[tag2.string()]++;
amount[tag3.string()]++;
}
vclist.sort(ParamContent::Compare); // Sort the list alphabetically.
SVMenuNode* other = mr->AddChild("OTHER");
// go through the list again and this time create the menu structure.
vc_it.move_to_first();
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
ParamContent* vc = vc_it.data();
STRING tag;
STRING tag2;
STRING tag3;
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
if (amount[tag.string()] == 1) {
other->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().string(),
vc->GetDescription());
}
else { // More than one would use this submenu -> create submenu.
SVMenuNode* sv = mr->AddChild(tag.string());
if ((amount[tag.string()] <= MAX_ITEMS_IN_SUBMENU) ||
(amount[tag2.string()] <= 1)) {
sv->AddChild(vc->GetName(), vc->GetId(),
vc->GetValue().string(), vc->GetDescription());
}
else { // Make subsubmenus.
SVMenuNode* sv2 = sv->AddChild(tag2.string());
sv2->AddChild(vc->GetName(), vc->GetId(),
vc->GetValue().string(), vc->GetDescription());
}
}
}
return mr;
}
// Event listener. Waits for SVET_POPUP events and processes them.
void ParamsEditor::Notify(const SVEvent* sve) {
if (sve->type == SVET_POPUP) { // only catch SVET_POPUP!
char* param = sve->parameter;
if (sve->command_id == writeCommands[0]) {
WriteParams(param, false);
}
else if (sve->command_id == writeCommands[1]) {
WriteParams(param, true);
}
else {
ParamContent* vc = ParamContent::GetParamContentById(
sve->command_id);
vc->SetValue(param);
sv_window_->AddMessage("Setting %s to %s",
vc->GetName(), vc->GetValue().string());
}
}
}
// Integrate the parameters editor as popupmenu into the existing scrollview
// window (usually the pg editor). If sv == NULL, create a new empty
// window and attach the parameters editor to that window (ugly).
ParamsEditor::ParamsEditor(tesseract::Tesseract* tess,
ScrollView* sv) {
if (sv == NULL) {
const char* name = "ParamEditorMAIN";
sv = new ScrollView(name, 1, 1, 200, 200, 300, 200);
}
sv_window_ = sv;
//Only one event handler per window.
//sv->AddEventHandler((SVEventHandler*) this);
SVMenuNode* svMenuRoot = BuildListOfAllLeaves(tess);
STRING paramfile;
paramfile = tess->datadir;
paramfile += VARDIR; // parameters dir
paramfile += "edited"; // actual name
SVMenuNode* std_menu = svMenuRoot->AddChild("Build Config File");
writeCommands[0] = nrParams + 1;
std_menu->AddChild("All Parameters", writeCommands[0],
paramfile.string(), "Config file name?");
writeCommands[1] = nrParams + 2;
std_menu->AddChild("changed_ Parameters Only", writeCommands[1],
paramfile.string(), "Config file name?");
svMenuRoot->BuildMenu(sv, false);
}
// Write all (changed_) parameters to a config file.
void ParamsEditor::WriteParams(char *filename,
bool changes_only) {
FILE *fp; // input file
char msg_str[255];
// if file exists
if ((fp = fopen(filename, "rb")) != NULL) {
fclose(fp);
sprintf(msg_str, "Overwrite file " "%s" "? (Y/N)", filename);
int a = sv_window_->ShowYesNoDialog(msg_str);
if (a == 'n') {
return;
} // don't write
}
fp = fopen(filename, "wb"); // can we write to it?
if (fp == NULL) {
sv_window_->AddMessage(
"Can't write to file "
"%s"
"",
filename);
return;
}
for (std::map<int, ParamContent*>::iterator iter = vcMap.begin();
iter != vcMap.end();
++iter) {
ParamContent* cur = iter->second;
if (!changes_only || cur->HasChanged()) {
fprintf(fp, "%-25s %-12s # %s\n",
cur->GetName(), cur->GetValue().string(), cur->GetDescription());
}
}
fclose(fp);
}
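// For reference, each emitted line follows the "%-25s %-12s # %s" format used
// above; for example (parameter name and value are illustrative only):
//
//   some_int_param            42           # Description of the parameter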
#endif

View File

@ -0,0 +1,126 @@
///////////////////////////////////////////////////////////////////////
// File: paramsd.h
// Description: Tesseract parameter editor
// Author: Joern Wanke
// Created: Wed Jul 18 10:05:01 PDT 2007
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
//
// Tesseract parameter editor is used to edit all the parameters used
// within tesseract from the ui.
#ifndef GRAPHICS_DISABLED
#ifndef VARABLED_H
#define VARABLED_H
#include "elst.h"
#ifndef NO_CUBE_BUILD
#include "scrollview.h"
#endif
#include "params.h"
#include "tesseractclass.h"
class SVMenuNode;
// A list of all possible parameter types used.
enum ParamType {
VT_INTEGER,
VT_BOOLEAN,
VT_STRING,
VT_DOUBLE
};
// A rather hackish helper structure which can take any kind of parameter input
// (defined by ParamType) and do a couple of common operations on them, like
// comparison or getting its value. It is used in the context of the
// ParamsEditor as a bridge from the internal tesseract parameters to the
// ones displayed by the ScrollView server.
class ParamContent : public ELIST_LINK {
public:
// Compare two VC objects by their name.
static int Compare(const void* v1, const void* v2);
// Gets a VC object identified by its ID.
static ParamContent* GetParamContentById(int id);
// Constructors for the various ParamTypes.
ParamContent() {
}
explicit ParamContent(tesseract::StringParam* it);
explicit ParamContent(tesseract::IntParam* it);
explicit ParamContent(tesseract::BoolParam* it);
explicit ParamContent(tesseract::DoubleParam* it);
// Getters and Setters.
void SetValue(const char* val);
STRING GetValue() const;
const char* GetName() const;
const char* GetDescription() const;
int GetId() { return my_id_; }
bool HasChanged() { return changed_; }
private:
// The unique ID of this VC object.
int my_id_;
// Whether the parameter was changed_ and thus needs to be rewritten.
bool changed_;
// The actual ParamType of this VC object.
ParamType param_type_;
tesseract::StringParam* sIt;
tesseract::IntParam* iIt;
tesseract::BoolParam* bIt;
tesseract::DoubleParam* dIt;
};
ELISTIZEH(ParamContent)
// The parameters editor enables the user to edit all the parameters used within
// tesseract. It can be invoked on its own, but is supposed to be invoked by
// the program editor.
class ParamsEditor : public SVEventHandler {
public:
// Integrate the parameters editor as popupmenu into the existing scrollview
// window (usually the pg editor). If sv == NULL, create a new empty
// window and attach the parameter editor to that window (ugly).
explicit ParamsEditor(tesseract::Tesseract*, ScrollView* sv = NULL);
// Event listener. Waits for SVET_POPUP events and processes them.
void Notify(const SVEvent* sve);
private:
// Gets up to the first 3 prefixes from s (split by _).
// For example, tesseract_foo_bar will be split into tesseract,foo and bar.
void GetPrefixes(const char* s, STRING* level_one,
STRING* level_two, STRING* level_three);
// Gets the first n words (split by _) and puts them in t.
// For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
void GetFirstWords(const char *s, // source string
int n, // number of words
char *t); // target string
// Find all editable parameters used within tesseract and create a
// SVMenuNode tree from it.
SVMenuNode *BuildListOfAllLeaves(tesseract::Tesseract *tess);
// Write all (changed_) parameters to a config file.
void WriteParams(char* filename, bool changes_only);
ScrollView* sv_window_;
};
#endif
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,87 @@
///////////////////////////////////////////////////////////////////////
// File: pgedit.h
// Description: Page structure file editor
// Author: Joern Wanke
// Created: Wed Jul 18 10:05:01 PDT 2007
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef PGEDIT_H
#define PGEDIT_H
#include "ocrblock.h"
#include "ocrrow.h"
#include "werd.h"
#include "rect.h"
#include "params.h"
#include "tesseractclass.h"
class ScrollView;
class SVMenuNode;
struct SVEvent;
// A small event handler class to process incoming events to
// this window.
class PGEventHandler : public SVEventHandler {
public:
PGEventHandler(tesseract::Tesseract* tess) : tess_(tess) {
}
void Notify(const SVEvent* sve);
private:
tesseract::Tesseract* tess_;
};
extern BLOCK_LIST *current_block_list;
extern STRING_VAR_H(editor_image_win_name, "EditorImage",
"Editor image window name");
extern INT_VAR_H(editor_image_xpos, 590, "Editor image X Pos");
extern INT_VAR_H(editor_image_ypos, 10, "Editor image Y Pos");
extern INT_VAR_H(editor_image_height, 680, "Editor image height");
extern INT_VAR_H(editor_image_width, 655, "Editor image width");
extern INT_VAR_H(editor_image_word_bb_color, BLUE,
"Word bounding box colour");
extern INT_VAR_H(editor_image_blob_bb_color, YELLOW,
"Blob bounding box colour");
extern INT_VAR_H(editor_image_text_color, WHITE, "Correct text colour");
extern STRING_VAR_H(editor_dbwin_name, "EditorDBWin",
"Editor debug window name");
extern INT_VAR_H(editor_dbwin_xpos, 50, "Editor debug window X Pos");
extern INT_VAR_H(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
extern INT_VAR_H(editor_dbwin_height, 24, "Editor debug window height");
extern INT_VAR_H(editor_dbwin_width, 80, "Editor debug window width");
extern STRING_VAR_H(editor_word_name, "BlnWords",
"BL normalised word window");
extern INT_VAR_H(editor_word_xpos, 60, "Word window X Pos");
extern INT_VAR_H(editor_word_ypos, 510, "Word window Y Pos");
extern INT_VAR_H(editor_word_height, 240, "Word window height");
extern INT_VAR_H(editor_word_width, 655, "Word window width");
extern double_VAR_H(editor_smd_scale_factor, 1.0, "Scaling for smd image");
ScrollView* bln_word_window_handle(); //return handle
void build_image_window(int width, int height);
void display_bln_lines(ScrollView window,
ScrollView::Color colour,
float scale_factor,
float y_offset,
float minx,
float maxx);
//function to call
void pgeditor_msg( //message display
const char *msg);
void pgeditor_show_point( //display coords
SVEvent *event);
//put bln word in box
void show_point(PAGE_RES* page_res, float x, float y);
#endif

View File

@ -0,0 +1,233 @@
///////////////////////////////////////////////////////////////////////
// File: recogtraining.cpp
// Description: Functions for ambiguity and parameter training.
// Author: Daria Antonova
// Created: Mon Aug 13 11:26:43 PDT 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "tesseractclass.h"
#include "boxread.h"
#include "control.h"
#include "cutil.h"
#include "host.h"
#include "ratngs.h"
#include "reject.h"
#include "stopper.h"
namespace tesseract {
const inT16 kMaxBoxEdgeDiff = 2;
// Sets flags necessary for recognition in the training mode.
// Opens and returns the pointer to the output file.
FILE *Tesseract::init_recog_training(const STRING &fname) {
if (tessedit_ambigs_training) {
tessedit_tess_adaption_mode.set_value(0); // turn off adaption
tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
// Explore all segmentations.
getDict().stopper_no_acceptable_choices.set_value(1);
}
STRING output_fname = fname;
const char *lastdot = strrchr(output_fname.string(), '.');
if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
output_fname += ".txt";
FILE *output_file = open_file(output_fname.string(), "a+");
return output_file;
}
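// For example, an fname of "foo.tif" yields an output file named "foo.txt",
// opened for appending ("a+").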
// Copies the bounding box from page_res_it->word() to the given TBOX.
bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
while (page_res_it->block() != NULL && page_res_it->word() == NULL)
page_res_it->forward();
if (page_res_it->word() != NULL) {
*tbox = page_res_it->word()->word->bounding_box();
// If tbox->left() is negative, the training image has vertical text and
// all the coordinates of bounding boxes of page_res are rotated by 90
// degrees in a counterclockwise direction. We need to rotate the TBOX back
// in order to compare with the TBOXes of box files.
if (tbox->left() < 0) {
tbox->rotate(FCOORD(0.0, -1.0));
}
return true;
}
else {
return false;
}
}
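// For reference: each line of the box file consumed below by ReadNextBox()
// describes one labeled bounding box as
//   <symbol> <left> <bottom> <right> <top> <page>
// with pixel coordinates measured from the bottom-left corner of the image,
// e.g. "s 26 32 42 60 0".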
// This function takes a tif/box pair of files and runs recognition on the
// image, while making sure that the word bounds that tesseract identified
// roughly match those specified by the input box file. For each word (ngram in
// a single bounding box from the input box file) it outputs the OCRed result,
// the correct label, rating and certainty.
void Tesseract::recog_training_segmented(const STRING &fname,
PAGE_RES *page_res,
volatile ETEXT_DESC *monitor,
FILE *output_file) {
STRING box_fname = fname;
const char *lastdot = strrchr(box_fname.string(), '.');
if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
box_fname += ".box";
// ReadNextBox() will close box_file
FILE *box_file = open_file(box_fname.string(), "r");
PAGE_RES_IT page_res_it;
page_res_it.page_res = page_res;
page_res_it.restart_page();
STRING label;
// Process all the words on this page.
TBOX tbox; // tesseract-identified box
TBOX bbox; // box from the box file
bool keep_going;
int line_number = 0;
int examined_words = 0;
do {
keep_going = read_t(&page_res_it, &tbox);
keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
&bbox);
// Align bottom left points of the TBOXes.
while (keep_going &&
!NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
if (bbox.bottom() < tbox.bottom()) {
page_res_it.forward();
keep_going = read_t(&page_res_it, &tbox);
}
else {
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
&bbox);
}
}
while (keep_going &&
!NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
if (bbox.left() > tbox.left()) {
page_res_it.forward();
keep_going = read_t(&page_res_it, &tbox);
}
else {
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
&bbox);
}
}
// OCR the word if top right points of the TBOXes are similar.
if (keep_going &&
NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
ambigs_classify_and_output(label.string(), &page_res_it, output_file);
examined_words++;
}
page_res_it.forward();
} while (keep_going);
// Set up scripts on all of the words that did not get sent to
// ambigs_classify_and_output. They all should have, but if all the
// werd_res's don't get uch_sets, tesseract will crash when you try
// to iterate over them. :-(
int total_words = 0;
for (page_res_it.restart_page(); page_res_it.block() != NULL;
page_res_it.forward()) {
if (page_res_it.word()) {
if (page_res_it.word()->uch_set == NULL)
page_res_it.word()->SetupFake(unicharset);
total_words++;
}
}
if (examined_words < 0.85 * total_words) {
tprintf("TODO(antonova): clean up recog_training_segmented; "
" It examined only a small fraction of the ambigs image.\n");
}
tprintf("recog_training_segmented: examined %d / %d words.\n",
examined_words, total_words);
}
// Helper prints the given set of blob choices.
static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
const UNICHARSET& unicharset,
const char *label, FILE *output_file) {
float rating = 0.0f;
float certainty = 0.0f;
for (int i = 0; i < length; ++i) {
const BLOB_CHOICE* blob_choice = blob_choices[i];
fprintf(output_file, "%s",
unicharset.id_to_unichar(blob_choice->unichar_id()));
rating += blob_choice->rating();
if (certainty > blob_choice->certainty())
certainty = blob_choice->certainty();
}
fprintf(output_file, "\t%s\t%.4f\t%.4f\n",
label, rating, certainty);
}
// Helper recursively prints all paths through the ratings matrix, starting
// at column col.
static void PrintMatrixPaths(int col, int dim,
const MATRIX& ratings,
int length, const BLOB_CHOICE** blob_choices,
const UNICHARSET& unicharset,
const char *label, FILE *output_file) {
for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
if (ratings.get(col, row) != NOT_CLASSIFIED) {
BLOB_CHOICE_IT bc_it(ratings.get(col, row));
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
blob_choices[length] = bc_it.data();
if (row + 1 < dim) {
PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
unicharset, label, output_file);
}
else {
PrintPath(length + 1, blob_choices, unicharset, label, output_file);
}
}
}
}
}
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
// raw choice as a result of the classification. For words labeled with a
// single unichar also outputs all alternatives from blob_choices of the
// best choice.
void Tesseract::ambigs_classify_and_output(const char *label,
PAGE_RES_IT* pr_it,
FILE *output_file) {
// Classify word.
fflush(stdout);
WordData word_data(*pr_it);
SetupWordPassN(1, &word_data);
classify_word_and_language(1, pr_it, &word_data);
WERD_RES* werd_res = word_data.word;
WERD_CHOICE *best_choice = werd_res->best_choice;
ASSERT_HOST(best_choice != NULL);
// Compute the number of unichars in the label.
GenericVector<UNICHAR_ID> encoding;
if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
tprintf("Not outputting illegal unichar %s\n", label);
return;
}
// Dump all paths through the ratings matrix (which is normally small).
int dim = werd_res->ratings->dimension();
const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
unicharset, label, output_file);
delete[] blob_choices;
}
} // namespace tesseract

View File

@ -0,0 +1,798 @@
/**********************************************************************
* File: reject.cpp (Formerly reject.c)
* Description: Rejection functions used in tessedit
* Author: Phil Cheatle
* Created: Wed Sep 23 16:50:21 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#pragma warning(disable:4305) // int/float warnings
#endif
#include "tessvars.h"
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#endif
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "globaloc.h" // For err_exit.
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
CLISTIZEH(STRING) CLISTIZE(STRING)
/*************************************************************************
* set_done()
*
* Set the done flag based on the word acceptability criteria
*************************************************************************/
namespace tesseract {
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
word->done = word->tess_accepted &&
(strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
bool word_is_ambig = word->best_choice->dangerous_ambig_found();
bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
word->best_choice->permuter() == USER_DAWG_PERM;
if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
one_ell_conflict(word, FALSE)) {
if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
word->done = FALSE;
}
if (word->done && ((!word_from_dict &&
word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
word->done = FALSE;
}
if (tessedit_rejection_debug) {
tprintf("set_done(): done=%d\n", word->done);
word->best_choice->print("");
}
}
/*************************************************************************
* make_reject_map()
*
 * Sets the done flag to indicate whether the result is acceptable.
*
* Sets a reject map for the word.
*************************************************************************/
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
int i;
int offset;
flip_0O(word);
check_debug_pt(word, -1); // For trap only
set_done(word, pass); // Set acceptance
word->reject_map.initialise(word->best_choice->unichar_lengths().length());
reject_blanks(word);
/*
 0: Ray's original heuristic - the baseline
*/
if (tessedit_reject_mode == 0) {
if (!word->done)
reject_poor_matches(word);
}
else if (tessedit_reject_mode == 5) {
/*
5: Reject I/1/l from words where there is no strong contextual confirmation;
the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
and the whole of any words which are very small
*/
if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
word->reject_map.rej_word_small_xht();
}
else {
one_ell_conflict(word, TRUE);
/*
Originally the code here just used the done flag. Now I have duplicated
and unpacked the conditions for setting the done flag so that each
mechanism can be turned on or off independently. This works WITHOUT
affecting the done flag setting.
*/
if (rej_use_tess_accepted && !word->tess_accepted)
word->reject_map.rej_word_not_tess_accepted();
if (rej_use_tess_blanks &&
(strchr(word->best_choice->unichar_string().string(), ' ') != NULL))
word->reject_map.rej_word_contains_blanks();
WERD_CHOICE* best_choice = word->best_choice;
if (rej_use_good_perm) {
if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
best_choice->permuter() == FREQ_DAWG_PERM ||
best_choice->permuter() == USER_DAWG_PERM) &&
(!rej_use_sensible_wd ||
acceptable_word_string(*word->uch_set,
best_choice->unichar_string().string(),
best_choice->unichar_lengths().string()) !=
AC_UNACCEPTABLE)) {
// PASSED TEST
}
else if (best_choice->permuter() == NUMBER_PERM) {
if (rej_alphas_in_number_perm) {
for (i = 0, offset = 0;
best_choice->unichar_string()[offset] != '\0';
offset += best_choice->unichar_lengths()[i++]) {
if (word->reject_map[i].accepted() &&
word->uch_set->get_isalpha(
best_choice->unichar_string().string() + offset,
best_choice->unichar_lengths()[i]))
word->reject_map[i].setrej_bad_permuter();
// rej alpha
}
}
}
else {
word->reject_map.rej_word_bad_permuter();
}
}
/* Ambig word rejection was here once !!*/
}
}
else {
tprintf("BAD tessedit_reject_mode\n");
err_exit();
}
if (tessedit_image_border > -1)
reject_edge_blobs(word);
check_debug_pt(word, 10);
if (tessedit_rejection_debug) {
tprintf("Permuter Type = %d\n", word->best_choice->permuter());
tprintf("Certainty: %f Rating: %f\n",
word->best_choice->certainty(), word->best_choice->rating());
tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
}
flip_hyphens(word);
check_debug_pt(word, 20);
}
} // namespace tesseract
void reject_blanks(WERD_RES *word) {
inT16 i;
inT16 offset;
for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
offset += word->best_choice->unichar_lengths()[i], i += 1) {
if (word->best_choice->unichar_string()[offset] == ' ')
//rej unrecognised blobs
word->reject_map[i].setrej_tess_failure();
}
}
namespace tesseract {
void Tesseract::reject_I_1_L(WERD_RES *word) {
inT16 i;
inT16 offset;
for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
offset += word->best_choice->unichar_lengths()[i], i += 1) {
if (STRING(conflict_set_I_l_1).
contains(word->best_choice->unichar_string()[offset])) {
//rej 1Il conflict
word->reject_map[i].setrej_1Il_conflict();
}
}
}
} // namespace tesseract
void reject_poor_matches(WERD_RES *word) {
float threshold = compute_reject_threshold(word->best_choice);
for (int i = 0; i < word->best_choice->length(); ++i) {
if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
word->reject_map[i].setrej_tess_failure();
else if (word->best_choice->certainty(i) < threshold)
word->reject_map[i].setrej_poor_match();
}
}
/**********************************************************************
* compute_reject_threshold
*
* Set a rejection threshold for this word.
* Initially this is a trivial function which looks for the largest
* gap in the certainty value.
**********************************************************************/
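// Illustrative worked example (hypothetical numbers, not from the original
// source): for blob certainties {-0.5, -1.0, -6.0, -0.8} the sorted values are
// {-6.0, -1.0, -0.8, -0.5}; the largest gap (5.0) starts at -6.0, so the
// threshold becomes -6.0 + 5.0 / 2 = -3.5, and only the blob with certainty
// -6.0 falls below it and is rejected by reject_poor_matches().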
float compute_reject_threshold(WERD_CHOICE* word) {
float threshold; // rejection threshold
float bestgap = 0.0f; // biggest gap
float gapstart; // bottom of gap
// super iterator
BLOB_CHOICE_IT choice_it; // real iterator
int blob_count = word->length();
GenericVector<float> ratings;
ratings.resize_no_init(blob_count);
for (int i = 0; i < blob_count; ++i) {
ratings[i] = word->certainty(i);
}
ratings.sort();
gapstart = ratings[0] - 1; // all reject if none better
if (blob_count >= 3) {
for (int index = 0; index < blob_count - 1; index++) {
if (ratings[index + 1] - ratings[index] > bestgap) {
bestgap = ratings[index + 1] - ratings[index];
// find biggest
gapstart = ratings[index];
}
}
}
threshold = gapstart + bestgap / 2;
return threshold;
}
/*************************************************************************
* reject_edge_blobs()
*
* If the word is perilously close to the edge of the image, reject those blobs
* in the word which are too close to the edge as they could be clipped.
*************************************************************************/
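// Hypothetical example (values are illustrative only): with
// tessedit_image_border == 2 and a 1000x800 image, a blob whose bounding box
// has left() == 1 (or right() == 998, bottom() == 1, top() == 798) satisfies
// one of the tests below and is marked with setrej_edge_char().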
namespace tesseract {
void Tesseract::reject_edge_blobs(WERD_RES *word) {
TBOX word_box = word->word->bounding_box();
// Use the box_word as it is already denormed back to image coordinates.
int blobcount = word->box_word->length();
if (word_box.left() < tessedit_image_border ||
word_box.bottom() < tessedit_image_border ||
word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
word_box.top() + tessedit_image_border > ImageHeight() - 1) {
ASSERT_HOST(word->reject_map.length() == blobcount);
for (int blobindex = 0; blobindex < blobcount; blobindex++) {
TBOX blob_box = word->box_word->BlobBox(blobindex);
if (blob_box.left() < tessedit_image_border ||
blob_box.bottom() < tessedit_image_border ||
blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
word->reject_map[blobindex].setrej_edge_char();
// Close to edge
}
}
}
}
/**********************************************************************
* one_ell_conflict()
*
* Identify words where there is a potential I/l/1 error.
* - A bundle of contextual heuristics!
**********************************************************************/
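// Hypothetical example: for a dictionary-accepted word "Ill" the code below
// temporarily flips the leading 'I' to 'l' and asks the dictionary about
// "lll"; if that alternative is not a dictionary word the original reading is
// trusted and FALSE is returned, otherwise the ambiguous leading character is
// rejected as a 1Il conflict.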
BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
const char *word;
const char *lengths;
inT16 word_len; //its length
inT16 first_alphanum_index_;
inT16 first_alphanum_offset_;
inT16 i;
inT16 offset;
BOOL8 non_conflict_set_char; //non conf set a/n?
BOOL8 conflict = FALSE;
BOOL8 allow_1s;
ACCEPTABLE_WERD_TYPE word_type;
BOOL8 dict_perm_type;
BOOL8 dict_word_ok;
int dict_word_type;
word = word_res->best_choice->unichar_string().string();
lengths = word_res->best_choice->unichar_lengths().string();
word_len = strlen(lengths);
/*
If there are no occurrences of the conflict set characters then the word
is OK.
*/
if (strpbrk(word, conflict_set_I_l_1.string()) == NULL)
return FALSE;
/*
There is a conflict if there are NO other (confirmed) alphanumerics apart
from those in the conflict set.
*/
for (i = 0, offset = 0, non_conflict_set_char = FALSE;
(i < word_len) && !non_conflict_set_char; offset += lengths[i++])
non_conflict_set_char =
(word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
!STRING(conflict_set_I_l_1).contains(word[offset]);
if (!non_conflict_set_char) {
if (update_map)
reject_I_1_L(word_res);
return TRUE;
}
/*
If the word is accepted by a dawg permuter, and the first alpha character
is "I" or "l", check to see if the alternative is also a dawg word. If it
is, then there is a potential error otherwise the word is ok.
*/
dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
(word_res->best_choice->permuter() == USER_DAWG_PERM) ||
(rej_trust_doc_dawg &&
(word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
(word_res->best_choice->permuter() == FREQ_DAWG_PERM);
dict_word_type = dict_word(*(word_res->best_choice));
dict_word_ok = (dict_word_type > 0) &&
(rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
if ((rej_1Il_use_dict_word && dict_word_ok) ||
(rej_1Il_trust_permuter_type && dict_perm_type) ||
(dict_perm_type && dict_word_ok)) {
first_alphanum_index_ = first_alphanum_index(word, lengths);
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
if (lengths[first_alphanum_index_] == 1 &&
word[first_alphanum_offset_] == 'I') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (safe_dict_word(word_res) > 0) {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (update_map)
word_res->reject_map[first_alphanum_index_].
setrej_1Il_conflict();
return TRUE;
}
else {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
return FALSE;
}
}
if (lengths[first_alphanum_index_] == 1 &&
word[first_alphanum_offset_] == 'l') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (safe_dict_word(word_res) > 0) {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (update_map)
word_res->reject_map[first_alphanum_index_].
setrej_1Il_conflict();
return TRUE;
}
else {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
return FALSE;
}
}
return FALSE;
}
/*
NEW 1Il code. The old code relied on permuter types too much. In fact,
tess will use TOP_CHOICE permute for good things like "palette".
In this code the string is examined independently to see if it looks like
a well formed word.
*/
/*
REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
dictionary word.
*/
first_alphanum_index_ = first_alphanum_index(word, lengths);
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
if (lengths[first_alphanum_index_] == 1 &&
word[first_alphanum_offset_] == 'l') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (safe_dict_word(word_res) > 0)
return FALSE;
else
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
}
else if (lengths[first_alphanum_index_] == 1 &&
word[first_alphanum_offset_] == 'I') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (safe_dict_word(word_res) > 0)
return FALSE;
else
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
}
/*
For strings containing digits:
If there are no alphas OR the numeric permuter liked the word,
reject any non 1 conflict chs
Else reject all conflict chs
*/
if (word_contains_non_1_digit(word, lengths)) {
allow_1s = (alpha_count(word, lengths) == 0) ||
(word_res->best_choice->permuter() == NUMBER_PERM);
inT16 offset;
conflict = FALSE;
for (i = 0, offset = 0; word[offset] != '\0';
offset += word_res->best_choice->unichar_lengths()[i++]) {
if ((!allow_1s || (word[offset] != '1')) &&
STRING(conflict_set_I_l_1).contains(word[offset])) {
if (update_map)
word_res->reject_map[i].setrej_1Il_conflict();
conflict = TRUE;
}
}
return conflict;
}
/*
For anything else. See if it conforms to an acceptable word type. If so,
treat accordingly.
*/
word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
first_alphanum_index_ = first_alphanum_index(word, lengths);
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
if (STRING(conflict_set_I_l_1).contains(word[first_alphanum_offset_])) {
if (update_map)
word_res->reject_map[first_alphanum_index_].
setrej_1Il_conflict();
return TRUE;
}
else
return FALSE;
}
else if (word_type == AC_UPPER_CASE) {
return FALSE;
}
else {
if (update_map)
reject_I_1_L(word_res);
return TRUE;
}
}
inT16 Tesseract::first_alphanum_index(const char *word,
const char *word_lengths) {
inT16 i;
inT16 offset;
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
unicharset.get_isdigit(word + offset, word_lengths[i]))
return i;
}
return -1;
}
inT16 Tesseract::first_alphanum_offset(const char *word,
const char *word_lengths) {
inT16 i;
inT16 offset;
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
unicharset.get_isdigit(word + offset, word_lengths[i]))
return offset;
}
return -1;
}
inT16 Tesseract::alpha_count(const char *word,
const char *word_lengths) {
inT16 i;
inT16 offset;
inT16 count = 0;
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
if (unicharset.get_isalpha(word + offset, word_lengths[i]))
count++;
}
return count;
}
BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
const char *word_lengths) {
inT16 i;
inT16 offset;
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&
(word_lengths[i] != 1 || word[offset] != '1'))
return TRUE;
}
return FALSE;
}
/*************************************************************************
* dont_allow_1Il()
* Don't unreject LONE accepted 1Il conflict set chars
*************************************************************************/
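// Hypothetical example: if every accepted character of a word is in the
// conflict set (say all three characters of "Ill" are accepted), each of them
// is re-rejected with setrej_postNN_1Il() below; if any other alphanumeric
// (e.g. the 'B' of "Bill") is accepted, the function returns early and the
// word is left alone.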
void Tesseract::dont_allow_1Il(WERD_RES *word) {
int i = 0;
int offset;
int word_len = word->reject_map.length();
const char *s = word->best_choice->unichar_string().string();
const char *lengths = word->best_choice->unichar_lengths().string();
BOOL8 accepted_1Il = FALSE;
for (i = 0, offset = 0; i < word_len;
offset += word->best_choice->unichar_lengths()[i++]) {
if (word->reject_map[i].accepted()) {
if (STRING(conflict_set_I_l_1).contains(s[offset])) {
accepted_1Il = TRUE;
}
else {
if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
word->uch_set->get_isdigit(s + offset, lengths[i]))
return; // >=1 non 1Il ch accepted
}
}
}
if (!accepted_1Il)
return; //Nothing to worry about
for (i = 0, offset = 0; i < word_len;
offset += word->best_choice->unichar_lengths()[i++]) {
if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
word->reject_map[i].accepted())
word->reject_map[i].setrej_postNN_1Il();
}
}
inT16 Tesseract::count_alphanums(WERD_RES *word_res) {
int count = 0;
const WERD_CHOICE *best_choice = word_res->best_choice;
for (int i = 0; i < word_res->reject_map.length(); ++i) {
if ((word_res->reject_map[i].accepted()) &&
(word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
count++;
}
}
return count;
}
// reject all if most rejected.
void Tesseract::reject_mostly_rejects(WERD_RES *word) {
/* Reject the whole of the word if the fraction of rejects exceeds a limit */
if ((float)word->reject_map.reject_count() / word->reject_map.length() >=
rej_whole_of_mostly_reject_word_fract)
word->reject_map.rej_word_mostly_rej();
}
BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
inT16 char_quality;
inT16 accepted_char_quality;
if (word->best_choice->unichar_lengths().length() <= 1)
return FALSE;
if (!STRING(ok_repeated_ch_non_alphanum_wds).
contains(word->best_choice->unichar_string()[0]))
return FALSE;
UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
for (int i = 1; i < word->best_choice->length(); ++i) {
if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
}
word_char_quality(word, row, &char_quality, &accepted_char_quality);
if ((word->best_choice->unichar_lengths().length() == char_quality) &&
(char_quality == accepted_char_quality))
return TRUE;
else
return FALSE;
}
inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
const WERD_CHOICE &word = *werd_res->best_choice;
int dict_word_type = werd_res->tesseract->dict_word(word);
return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
}
// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_hyphens(WERD_RES *word_res) {
WERD_CHOICE *best_choice = word_res->best_choice;
int i;
int prev_right = -9999;
int next_left;
TBOX out_box;
float aspect_ratio;
if (tessedit_lower_flip_hyphen <= 1)
return;
int num_blobs = word_res->rebuild_word->NumBlobs();
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
TBLOB* blob = word_res->rebuild_word->blobs[i];
out_box = blob->bounding_box();
if (i + 1 == num_blobs)
next_left = 9999;
else
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
// Don't touch small or touching blobs - it is too dangerous.
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
aspect_ratio = out_box.width() / (float)out_box.height();
if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
if (aspect_ratio >= tessedit_upper_flip_hyphen &&
word_res->uch_set->contains_unichar_id(unichar_dash) &&
word_res->uch_set->get_enabled(unichar_dash)) {
/* Certain HYPHEN */
best_choice->set_unichar_id(unichar_dash, i);
if (word_res->reject_map[i].rejected())
word_res->reject_map[i].setrej_hyphen_accept();
}
if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
word_res->reject_map[i].accepted())
//Suspected HYPHEN
word_res->reject_map[i].setrej_hyphen();
}
else if (best_choice->unichar_id(i) == unichar_dash) {
if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
(word_res->reject_map[i].rejected()))
word_res->reject_map[i].setrej_hyphen_accept();
//Certain HYPHEN
if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
(word_res->reject_map[i].accepted()))
//Suspected HYPHEN
word_res->reject_map[i].setrej_hyphen();
}
}
prev_right = out_box.right();
}
}
// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_0O(WERD_RES *word_res) {
WERD_CHOICE *best_choice = word_res->best_choice;
int i;
TBOX out_box;
if (!tessedit_flip_0O)
return;
int num_blobs = word_res->rebuild_word->NumBlobs();
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
TBLOB* blob = word_res->rebuild_word->blobs[i];
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
out_box = blob->bounding_box();
if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
(out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
return; //Beware words with sub/superscripts
}
}
UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
if (unichar_0 == INVALID_UNICHAR_ID ||
!word_res->uch_set->get_enabled(unichar_0) ||
unichar_O == INVALID_UNICHAR_ID ||
!word_res->uch_set->get_enabled(unichar_O)) {
return; // 0 or O are not present/enabled in unicharset
}
for (i = 1; i < best_choice->length(); ++i) {
if (best_choice->unichar_id(i) == unichar_0 ||
best_choice->unichar_id(i) == unichar_O) {
/* A0A */
if ((i + 1) < best_choice->length() &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
best_choice->set_unichar_id(unichar_O, i);
}
/* A00A */
if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 1) < best_choice->length() &&
(best_choice->unichar_id(i + 1) == unichar_0 ||
best_choice->unichar_id(i + 1) == unichar_O) &&
(i + 2) < best_choice->length() &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
best_choice->set_unichar_id(unichar_O, i);
i++;
}
/* AA0<non digit or end of word> */
if ((i > 1) &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(((i + 1) < best_choice->length() &&
!word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
!word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
!word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
(i == best_choice->length() - 1))) {
best_choice->set_unichar_id(unichar_O, i);
}
/* 9O9 */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 1) < best_choice->length() &&
non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
best_choice->set_unichar_id(unichar_0, i);
}
/* 9OOO */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 2) < best_choice->length() &&
(best_choice->unichar_id(i + 1) == unichar_0 ||
best_choice->unichar_id(i + 1) == unichar_O) &&
(best_choice->unichar_id(i + 2) == unichar_0 ||
best_choice->unichar_id(i + 2) == unichar_O)) {
best_choice->set_unichar_id(unichar_0, i);
best_choice->set_unichar_id(unichar_0, i + 1);
best_choice->set_unichar_id(unichar_0, i + 2);
i += 2;
}
/* 9OO<non upper> */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 2) < best_choice->length() &&
(best_choice->unichar_id(i + 1) == unichar_0 ||
best_choice->unichar_id(i + 1) == unichar_O) &&
!word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
best_choice->set_unichar_id(unichar_0, i);
best_choice->set_unichar_id(unichar_0, i + 1);
i++;
}
/* 9O<non upper> */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 1) < best_choice->length() &&
!word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
best_choice->set_unichar_id(unichar_0, i);
}
/* 9[.,]OOO.. */
if ((i > 1) &&
(word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
(word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
best_choice->unichar_id(i - 2) == unichar_O)) {
if (best_choice->unichar_id(i - 2) == unichar_O) {
best_choice->set_unichar_id(unichar_0, i - 2);
}
while (i < best_choice->length() &&
(best_choice->unichar_id(i) == unichar_O ||
best_choice->unichar_id(i) == unichar_0)) {
best_choice->set_unichar_id(unichar_0, i);
i++;
}
i--;
}
}
}
}
BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
}
BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
}
} // namespace tesseract

View File

@ -0,0 +1,34 @@
/**********************************************************************
* File: reject.h (Formerly reject.h)
* Description: Rejection functions used in tessedit
* Author: Phil Cheatle
* Created: Wed Sep 23 16:50:21 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef REJECT_H
#define REJECT_H
#include "params.h"
#include "pageres.h"
void reject_blanks(WERD_RES *word);
void reject_poor_matches(WERD_RES *word);
float compute_reject_threshold(WERD_CHOICE* word);
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths);
void dont_allow_1Il(WERD_RES *word);
void flip_hyphens(WERD_RES *word);
void flip_0O(WERD_RES *word);
BOOL8 non_0_digit(const char* str, int length);
#endif

View File

@ -0,0 +1,683 @@
///////////////////////////////////////////////////////////////////////
// File: resultiterator.cpp
// Description: Iterator for tesseract results that is capable of
// iterating in proper reading order over Bi Directional
// (e.g. mixed Hebrew and English) text.
// Author: David Eger
// Created: Fri May 27 13:58:06 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "resultiterator.h"
#include "allheaders.h"
#include "pageres.h"
#include "strngs.h"
#include "tesseractclass.h"
#include "unicharset.h"
#include "unicodes.h"
namespace tesseract {
ResultIterator::ResultIterator(const LTRResultIterator &resit)
: LTRResultIterator(resit) {
in_minor_direction_ = false;
at_beginning_of_minor_run_ = false;
preserve_interword_spaces_ = false;
BoolParam *p = ParamUtils::FindParam<BoolParam>(
"preserve_interword_spaces", GlobalParams()->bool_params,
tesseract_->params()->bool_params);
if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
MoveToLogicalStartOfTextline();
}
ResultIterator *ResultIterator::StartOfParagraph(
const LTRResultIterator &resit) {
return new ResultIterator(resit);
}
bool ResultIterator::ParagraphIsLtr() const {
return current_paragraph_is_ltr_;
}
bool ResultIterator::CurrentParagraphIsLtr() const {
if (!it_->word())
return true; // doesn't matter.
LTRResultIterator it(*this);
it.RestartParagraph();
// Try to figure out the ltr-ness of the paragraph. The rules below
// make more sense in the context of a difficult paragraph example.
// Here we denote {ltr characters, RTL CHARACTERS}:
//
// "don't go in there!" DAIS EH
// EHT OTNI DEPMUJ FELSMIH NEHT DNA
// .GNIDLIUB GNINRUB
//
// On the first line, the left-most word is LTR and the rightmost word
// is RTL. Thus, we are better off taking the majority direction for
// the whole paragraph contents. So instead of "the leftmost word is LTR"
// indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
// would not do: Typically an RTL paragraph would *not* start with an LTR
// word. So our heuristics are as follows:
//
// (1) If the first text line has an RTL word in the left-most position
// it is RTL.
// (2) If the first text line has an LTR word in the right-most position
// it is LTR.
// (3) If neither of the above is true, take the majority count for the
// paragraph -- if there are more rtl words, it is RTL. If there
// are more LTR words, it's LTR.
bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
int num_ltr, num_rtl;
num_rtl = leftmost_rtl ? 1 : 0;
num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
for (it.Next(RIL_WORD);
!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
it.Next(RIL_WORD)) {
StrongScriptDirection dir = it.WordDirection();
rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
num_ltr += rightmost_ltr ? 1 : 0;
}
if (leftmost_rtl)
return false;
if (rightmost_ltr)
return true;
// First line is ambiguous. Take statistics on the whole paragraph.
if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
StrongScriptDirection dir = it.WordDirection();
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
} while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
return num_ltr >= num_rtl;
}
const int ResultIterator::kMinorRunStart = -1;
const int ResultIterator::kMinorRunEnd = -2;
const int ResultIterator::kComplexWord = -3;
void ResultIterator::CalculateBlobOrder(
GenericVector<int> *blob_indices) const {
bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
blob_indices->clear();
if (Empty(RIL_WORD)) return;
if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
// Easy! just return the blobs in order;
for (int i = 0; i < word_length_; i++)
blob_indices->push_back(i);
return;
}
// The blobs are in left-to-right order, but the current reading context
// is right-to-left.
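// Hypothetical example: if the symbol directions of the word come out as
// {RTL, RTL, EuropeanNumber, EuropeanNumber} (letters followed by a two-digit
// number), the algorithm below produces blob_indices {2, 3, 0, 1}: the digits
// keep their visual left-to-right order while the letters are emitted
// right-to-left.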
const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
// Step 1: Scan for and mark European Number sequences
// [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
GenericVector<int> letter_types;
for (int i = 0; i < word_length_; i++) {
letter_types.push_back(it_->word()->SymbolDirection(i));
}
// Convert a single separator sandwiched between two EN's into an EN.
for (int i = 0; i + 2 < word_length_; i++) {
if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
(letter_types[i + 1] == U_EURO_NUM_SEP ||
letter_types[i + 1] == U_COMMON_NUM_SEP)) {
letter_types[i + 1] = U_EURO_NUM;
}
}
// Scan for sequences of European Number Terminators around ENs and convert
// them to ENs.
for (int i = 0; i < word_length_; i++) {
if (letter_types[i] == U_EURO_NUM_TERM) {
int j = i + 1;
while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
// The sequence [i..j] should be converted to all European Numbers.
for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
}
j = i - 1;
while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
if (j > -1 && letter_types[j] == U_EURO_NUM) {
// The sequence [j..i] should be converted to all European Numbers.
for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
}
}
}
// Step 2: Convert all remaining types to either L or R.
// Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
// All other are R.
for (int i = 0; i < word_length_;) {
int ti = letter_types[i];
if (ti == U_LTR || ti == U_EURO_NUM) {
// Left to right sequence; scan to the end of it.
int last_good = i;
for (int j = i + 1; j < word_length_; j++) {
int tj = letter_types[j];
if (tj == U_LTR || tj == U_EURO_NUM) {
last_good = j;
}
else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
// do nothing.
}
else {
break;
}
}
// [i..last_good] is the L sequence
for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
i = last_good + 1;
}
else {
letter_types[i] = U_RTL;
i++;
}
}
// At this point, letter_types is entirely U_LTR or U_RTL.
for (int i = word_length_ - 1; i >= 0;) {
if (letter_types[i] == U_RTL) {
blob_indices->push_back(i);
i--;
}
else {
// left to right sequence. scan to the beginning.
int j = i - 1;
for (; j >= 0 && letter_types[j] != U_RTL; j--) {} // pass
// Now (j, i] is LTR
for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
i = j;
}
}
ASSERT_HOST(blob_indices->size() == word_length_);
}
static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
for (int i = 0; i < dirs.size(); i++) {
switch (dirs[i]) {
case DIR_NEUTRAL: tprintf("N "); break;
case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
case DIR_MIX: tprintf("Z "); break;
default: tprintf("? "); break;
}
}
tprintf("\n");
}
void ResultIterator::CalculateTextlineOrder(
bool paragraph_is_ltr,
const LTRResultIterator &resit,
GenericVectorEqEq<int> *word_indices) const {
GenericVector<StrongScriptDirection> directions;
CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
}
void ResultIterator::CalculateTextlineOrder(
bool paragraph_is_ltr,
const LTRResultIterator &resit,
GenericVector<StrongScriptDirection> *dirs_arg,
GenericVectorEqEq<int> *word_indices) const {
GenericVector<StrongScriptDirection> dirs;
GenericVector<StrongScriptDirection> *directions;
directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
directions->truncate(0);
// A LTRResultIterator goes strictly left-to-right word order.
LTRResultIterator ltr_it(resit);
ltr_it.RestartRow();
if (ltr_it.Empty(RIL_WORD)) return;
do {
directions->push_back(ltr_it.WordDirection());
} while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
word_indices->truncate(0);
CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
}
void ResultIterator::CalculateTextlineOrder(
bool paragraph_is_ltr,
const GenericVector<StrongScriptDirection> &word_dirs,
GenericVectorEqEq<int> *reading_order) {
reading_order->truncate(0);
if (word_dirs.size() == 0) return;
// Take all of the runs of minor direction words and insert them
// in reverse order.
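// Hypothetical example: in an LTR paragraph with word directions
// {LTR, RTL, RTL, LTR}, the reading order produced below is
// {0, kMinorRunStart, 2, 1, kMinorRunEnd, 3}: the RTL run is emitted
// right-to-left and bracketed by the minor-run markers.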
int minor_direction, major_direction, major_step, start, end;
if (paragraph_is_ltr) {
start = 0;
end = word_dirs.size();
major_step = 1;
major_direction = DIR_LEFT_TO_RIGHT;
minor_direction = DIR_RIGHT_TO_LEFT;
}
else {
start = word_dirs.size() - 1;
end = -1;
major_step = -1;
major_direction = DIR_RIGHT_TO_LEFT;
minor_direction = DIR_LEFT_TO_RIGHT;
// Special rule: if there are neutral words at the right most side
// of a line adjacent to a left-to-right word in the middle of the
// line, we interpret the end of the line as a single LTR sequence.
if (word_dirs[start] == DIR_NEUTRAL) {
int neutral_end = start;
while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
neutral_end--;
}
if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
// LTR followed by neutrals.
// Scan for the beginning of the minor left-to-right run.
int left = neutral_end;
for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
}
reading_order->push_back(kMinorRunStart);
for (int i = left; i < word_dirs.size(); i++) {
reading_order->push_back(i);
if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
}
reading_order->push_back(kMinorRunEnd);
start = left - 1;
}
}
}
for (int i = start; i != end;) {
if (word_dirs[i] == minor_direction) {
int j = i;
while (j != end && word_dirs[j] != major_direction)
j += major_step;
if (j == end) j -= major_step;
while (j != i && word_dirs[j] != minor_direction)
j -= major_step;
// [j..i] is a minor direction run.
reading_order->push_back(kMinorRunStart);
for (int k = j; k != i; k -= major_step) {
reading_order->push_back(k);
}
reading_order->push_back(i);
reading_order->push_back(kMinorRunEnd);
i = j + major_step;
}
else {
reading_order->push_back(i);
if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
i += major_step;
}
}
}
int ResultIterator::LTRWordIndex() const {
int this_word_index = 0;
LTRResultIterator textline(*this);
textline.RestartRow();
while (!textline.PositionedAtSameWord(it_)) {
this_word_index++;
textline.Next(RIL_WORD);
}
return this_word_index;
}
void ResultIterator::MoveToLogicalStartOfWord() {
if (word_length_ == 0) {
BeginWord(0);
return;
}
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
if (blob_order.size() == 0 || blob_order[0] == 0) return;
BeginWord(blob_order[0]);
}
bool ResultIterator::IsAtFinalSymbolOfWord() const {
if (!it_->word()) return true;
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
return blob_order.size() == 0 || blob_order.back() == blob_index_;
}
bool ResultIterator::IsAtFirstSymbolOfWord() const {
if (!it_->word()) return true;
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
return blob_order.size() == 0 || blob_order[0] == blob_index_;
}
void ResultIterator::AppendSuffixMarks(STRING *text) const {
if (!it_->word()) return;
bool reading_direction_is_ltr =
current_paragraph_is_ltr_ ^ in_minor_direction_;
// scan forward to see what meta-information the word ordering algorithm
// left us.
// If this word is at the *end* of a minor run, insert the other
// direction's mark; else if this was a complex word, insert the
// current reading order's mark.
GenericVectorEqEq<int> textline_order;
CalculateTextlineOrder(current_paragraph_is_ltr_,
*this, &textline_order);
int this_word_index = LTRWordIndex();
int i = textline_order.get_index(this_word_index);
if (i < 0) return;
int last_non_word_mark = 0;
for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
last_non_word_mark = textline_order[i];
}
if (last_non_word_mark == kComplexWord) {
*text += reading_direction_is_ltr ? kLRM : kRLM;
}
else if (last_non_word_mark == kMinorRunEnd) {
if (current_paragraph_is_ltr_) {
*text += kLRM;
}
else {
*text += kRLM;
}
}
}
void ResultIterator::MoveToLogicalStartOfTextline() {
GenericVectorEqEq<int> word_indices;
RestartRow();
CalculateTextlineOrder(current_paragraph_is_ltr_,
dynamic_cast<const LTRResultIterator&>(*this),
&word_indices);
int i = 0;
for (; i < word_indices.size() && word_indices[i] < 0; i++) {
if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
}
if (in_minor_direction_) at_beginning_of_minor_run_ = true;
if (i >= word_indices.size()) return;
int first_word_index = word_indices[i];
for (int j = 0; j < first_word_index; j++) {
PageIterator::Next(RIL_WORD);
}
MoveToLogicalStartOfWord();
}
void ResultIterator::Begin() {
LTRResultIterator::Begin();
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
in_minor_direction_ = false;
at_beginning_of_minor_run_ = false;
MoveToLogicalStartOfTextline();
}
bool ResultIterator::Next(PageIteratorLevel level) {
if (it_->block() == NULL) return false; // already at end!
switch (level) {
case RIL_BLOCK: // explicit fall-through
case RIL_PARA: // explicit fall-through
case RIL_TEXTLINE:
if (!PageIterator::Next(level)) return false;
if (IsWithinFirstTextlineOfParagraph()) {
// if we've advanced to a new paragraph,
// recalculate current_paragraph_is_ltr_
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
}
in_minor_direction_ = false;
MoveToLogicalStartOfTextline();
return it_->block() != NULL;
case RIL_SYMBOL:
{
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
int next_blob = 0;
while (next_blob < blob_order.size() &&
blob_index_ != blob_order[next_blob])
next_blob++;
next_blob++;
if (next_blob < blob_order.size()) {
// we're in the same word; simply advance one blob.
BeginWord(blob_order[next_blob]);
at_beginning_of_minor_run_ = false;
return true;
}
level = RIL_WORD; // we've fallen through to the next word.
}
case RIL_WORD: // explicit fall-through.
{
if (it_->word() == NULL) return Next(RIL_BLOCK);
GenericVectorEqEq<int> word_indices;
int this_word_index = LTRWordIndex();
CalculateTextlineOrder(current_paragraph_is_ltr_,
*this,
&word_indices);
int final_real_index = word_indices.size() - 1;
while (final_real_index > 0 && word_indices[final_real_index] < 0)
final_real_index--;
for (int i = 0; i < final_real_index; i++) {
if (word_indices[i] == this_word_index) {
int j = i + 1;
for (; j < final_real_index && word_indices[j] < 0; j++) {
if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
}
at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
// awesome, we move to word_indices[j]
if (BidiDebug(3)) {
tprintf("Next(RIL_WORD): %d -> %d\n",
this_word_index, word_indices[j]);
}
PageIterator::RestartRow();
for (int k = 0; k < word_indices[j]; k++) {
PageIterator::Next(RIL_WORD);
}
MoveToLogicalStartOfWord();
return true;
}
}
if (BidiDebug(3)) {
tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
}
// we're going off the end of the text line.
return Next(RIL_TEXTLINE);
}
}
ASSERT_HOST(false); // shouldn't happen.
return false;
}
bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
if (it_->block() == NULL) return false; // Already at the end!
if (it_->word() == NULL) return true; // In an image block.
if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
bool at_word_start = IsAtFirstSymbolOfWord();
if (level == RIL_WORD) return at_word_start;
ResultIterator line_start(*this);
// move to the first word in the line...
line_start.MoveToLogicalStartOfTextline();
bool at_textline_start = at_word_start && *line_start.it_ == *it_;
if (level == RIL_TEXTLINE) return at_textline_start;
// now we move to the left-most word...
line_start.RestartRow();
bool at_block_start = at_textline_start &&
line_start.it_->block() != line_start.it_->prev_block();
if (level == RIL_BLOCK) return at_block_start;
bool at_para_start = at_block_start ||
(at_textline_start &&
line_start.it_->row()->row->para() !=
line_start.it_->prev_row()->row->para());
if (level == RIL_PARA) return at_para_start;
ASSERT_HOST(false); // shouldn't happen.
return false;
}
/**
* NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the
* change that the variable next is now a ResultIterator instead of a
* PageIterator.
*/
bool ResultIterator::IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const {
if (Empty(element)) return true; // Already at the end!
// The result is true if we step forward by element and find we are
// at the end of the page or at beginning of *all* levels in:
// [level, element).
// When there is more than one level difference between element and level,
// we could for instance move forward one symbol and still be at the first
// word on a line, so we also have to be at the first symbol in a word.
ResultIterator next(*this);
next.Next(element);
if (next.Empty(element)) return true; // Reached the end of the page.
while (element > level) {
element = static_cast<PageIteratorLevel>(element - 1);
if (!next.IsAtBeginningOf(element))
return false;
}
return true;
}
/**
* Returns the null terminated UTF-8 encoded text string for the current
* object at the given level. Use delete [] to free after use.
*/
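// A minimal usage sketch (not part of the original source; it assumes an
// already-initialized tesseract::TessBaseAPI instance named "api" on which
// Recognize() has been called):
//
//   tesseract::ResultIterator* it = api.GetIterator();
//   if (it != NULL) {
//     do {
//       char* word = it->GetUTF8Text(tesseract::RIL_WORD);
//       if (word != NULL) {
//         tprintf("%s\n", word);
//         delete [] word;
//       }
//     } while (it->Next(tesseract::RIL_WORD));
//     delete it;
//   }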
char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
if (it_->word() == NULL) return NULL; // Already at the end!
STRING text;
switch (level) {
case RIL_BLOCK:
{
ResultIterator pp(*this);
do {
pp.AppendUTF8ParagraphText(&text);
} while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
}
break;
case RIL_PARA:
AppendUTF8ParagraphText(&text);
break;
case RIL_TEXTLINE:
{
ResultIterator it(*this);
it.MoveToLogicalStartOfTextline();
it.IterateAndAppendUTF8TextlineText(&text);
}
break;
case RIL_WORD:
AppendUTF8WordText(&text);
break;
case RIL_SYMBOL:
{
bool reading_direction_is_ltr =
current_paragraph_is_ltr_ ^ in_minor_direction_;
if (at_beginning_of_minor_run_) {
text += reading_direction_is_ltr ? kLRM : kRLM;
}
text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
}
break;
}
int length = text.length() + 1;
char* result = new char[length];
strncpy(result, text.string(), length);
return result;
}
void ResultIterator::AppendUTF8WordText(STRING *text) const {
if (!it_->word()) return;
ASSERT_HOST(it_->word()->best_choice != NULL);
bool reading_direction_is_ltr =
current_paragraph_is_ltr_ ^ in_minor_direction_;
if (at_beginning_of_minor_run_) {
*text += reading_direction_is_ltr ? kLRM : kRLM;
}
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
for (int i = 0; i < blob_order.size(); i++) {
*text += it_->word()->BestUTF8(blob_order[i], !reading_direction_is_ltr);
}
AppendSuffixMarks(text);
}
void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
if (Empty(RIL_WORD)) {
Next(RIL_WORD);
return;
}
if (BidiDebug(1)) {
GenericVectorEqEq<int> textline_order;
GenericVector<StrongScriptDirection> dirs;
CalculateTextlineOrder(current_paragraph_is_ltr_,
*this, &dirs, &textline_order);
tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
current_paragraph_is_ltr_ ? "ltr" : "rtl");
PrintScriptDirs(dirs);
tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
current_paragraph_is_ltr_ ? "ltr" : "rtl");
for (int i = 0; i < textline_order.size(); i++) {
tprintf("%d ", textline_order[i]);
}
tprintf("\n");
}
int words_appended = 0;
do {
int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
: (words_appended > 0);
for (int i = 0; i < numSpaces; ++i) {
*text += " ";
}
AppendUTF8WordText(text);
words_appended++;
} while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
if (BidiDebug(1)) {
tprintf("%d words printed\n", words_appended);
}
*text += line_separator_;
// If we just finished a paragraph, add an extra newline.
if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
*text += paragraph_separator_;
}
void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
ResultIterator it(*this);
it.RestartParagraph();
it.MoveToLogicalStartOfTextline();
if (it.Empty(RIL_WORD)) return;
do {
it.IterateAndAppendUTF8TextlineText(text);
} while (it.it_->block() != NULL && !it.IsAtBeginningOf(RIL_PARA));
}
bool ResultIterator::BidiDebug(int min_level) const {
int debug_level = 1;
IntParam *p = ParamUtils::FindParam<IntParam>(
"bidi_debug", GlobalParams()->int_params,
tesseract_->params()->int_params);
if (p != NULL) debug_level = (inT32)(*p);
return debug_level >= min_level;
}
} // namespace tesseract.

View File

@ -0,0 +1,244 @@
///////////////////////////////////////////////////////////////////////
// File: resultiterator.h
// Description: Iterator for tesseract results that is capable of
// iterating in proper reading order over Bi Directional
// (e.g. mixed Hebrew and English) text.
// Author: David Eger
// Created: Fri May 27 13:58:06 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H__
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H__
#include "platform.h"
#include "ltrresultiterator.h"
template <typename T> class GenericVector;
template <typename T> class GenericVectorEqEq;
class BLOB_CHOICE_IT;
class WERD_RES;
class STRING;
namespace tesseract {
class Tesseract;
class TESS_API ResultIterator : public LTRResultIterator {
public:
static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
/**
* ResultIterator is copy constructible!
* The default copy constructor works just fine for us.
*/
virtual ~ResultIterator() {}
// ============= Moving around within the page ============.
/**
* Moves the iterator to point to the start of the page to begin
* an iteration.
*/
virtual void Begin();
/**
* Moves to the start of the next object at the given level in the
* page hierarchy in the appropriate reading order and returns false if
* the end of the page was reached.
* NOTE that RIL_SYMBOL will skip non-text blocks, but all other
* PageIteratorLevel level values will visit each non-text block once.
* Think of non text blocks as containing a single para, with a single line,
* with a single imaginary word.
* Calls to Next with different levels may be freely intermixed.
* This function iterates words in right-to-left scripts correctly, if
* the appropriate language has been loaded into Tesseract.
*/
virtual bool Next(PageIteratorLevel level);
/**
* IsAtBeginningOf() returns whether we're at the logical beginning of the
* given level. (as opposed to ResultIterator's left-to-right top-to-bottom
* order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
* For a full description, see pageiterator.h
*/
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
/**
* Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
* For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
* point at the last word in a paragraph. See PageIterator for full comment.
*/
virtual bool IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const;
// ============= Accessing data ==============.
/**
* Returns the null terminated UTF-8 encoded text string for the current
* object at the given level. Use delete [] to free after use.
*/
virtual char* GetUTF8Text(PageIteratorLevel level) const;
/**
* Return whether the current paragraph's dominant reading direction
* is left-to-right (as opposed to right-to-left).
*/
bool ParagraphIsLtr() const;
// ============= Exposed only for testing =============.
/**
* Yields the reading order as a sequence of indices and (optional)
* meta-marks for a set of words (given left-to-right).
* The meta marks are passed as negative values:
* kMinorRunStart Start of minor direction text.
* kMinorRunEnd End of minor direction text.
* kComplexWord The next indexed word contains both left-to-right and
* right-to-left characters and was treated as neutral.
*
* For example, suppose we have five words in a text line,
* indexed [0,1,2,3,4] from the leftmost side of the text line.
* The following are all believable reading_orders:
*
* Left-to-Right (in ltr paragraph):
* { 0, 1, 2, 3, 4 }
* Left-to-Right (in rtl paragraph):
* { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
* Right-to-Left (in rtl paragraph):
* { 4, 3, 2, 1, 0 }
* Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
* { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
*/
static void CalculateTextlineOrder(
bool paragraph_is_ltr,
const GenericVector<StrongScriptDirection> &word_dirs,
GenericVectorEqEq<int> *reading_order);
static const int kMinorRunStart;
static const int kMinorRunEnd;
static const int kComplexWord;
protected:
/**
* We presume the data associated with the given iterator will outlive us.
* NB: This is private because it does something that is non-obvious:
* it resets to the beginning of the paragraph instead of staying wherever
* resit might have pointed.
*/
TESS_LOCAL explicit ResultIterator(const LTRResultIterator &resit);
private:
/**
* Calculates the current paragraph's dominant writing direction.
* Typically, members should use current_paragraph_is_ltr_ instead.
*/
bool CurrentParagraphIsLtr() const;
/**
* Returns word indices as measured from resit->RestartRow() = index 0
* for the reading order of words within a textline given an iterator
* into the middle of the text line.
* In addition to non-negative word indices, the following negative values
* may be inserted:
* kMinorRunStart Start of minor direction text.
* kMinorRunEnd End of minor direction text.
* kComplexWord The previous word contains both left-to-right and
* right-to-left characters and was treated as neutral.
*/
void CalculateTextlineOrder(bool paragraph_is_ltr,
const LTRResultIterator &resit,
GenericVectorEqEq<int> *indices) const;
/** Same as above, but the caller's ssd gets filled in if ssd != NULL. */
void CalculateTextlineOrder(bool paragraph_is_ltr,
const LTRResultIterator &resit,
GenericVector<StrongScriptDirection> *ssd,
GenericVectorEqEq<int> *indices) const;
/**
* What is the index of the current word in a strict left-to-right reading
* of the row?
*/
int LTRWordIndex() const;
/**
* Given an iterator pointing at a word, returns the logical reading order
* of blob indices for the word.
*/
void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
/** Precondition: current_paragraph_is_ltr_ is set. */
void MoveToLogicalStartOfTextline();
/**
* Precondition: current_paragraph_is_ltr_ and in_minor_direction_
* are set.
*/
void MoveToLogicalStartOfWord();
/** Are we pointing at the final (reading order) symbol of the word? */
bool IsAtFinalSymbolOfWord() const;
/** Are we pointing at the first (reading order) symbol of the word? */
bool IsAtFirstSymbolOfWord() const;
/**
* Append any extra marks that should be appended to this word when printed.
* Mostly, these are Unicode BiDi control characters.
*/
void AppendSuffixMarks(STRING *text) const;
/** Appends the current word in reading order to the given buffer.*/
void AppendUTF8WordText(STRING *text) const;
/**
* Appends the text of the current text line, *assuming this iterator is
* positioned at the beginning of the text line*. This function
* updates the iterator to point to the first position past the text line.
* Each textline is terminated in a single newline character.
* If the textline ends a paragraph, it gets a second terminal newline.
*/
void IterateAndAppendUTF8TextlineText(STRING *text);
/**
* Appends the text of the current paragraph in reading order
* to the given buffer.
* Each textline is terminated in a single newline character, and the
* paragraph gets an extra newline at the end.
*/
void AppendUTF8ParagraphText(STRING *text) const;
/** Returns whether the bidi_debug flag is set to at least min_level. */
bool BidiDebug(int min_level) const;
bool current_paragraph_is_ltr_;
/**
* Is the currently pointed-at character at the beginning of
* a minor-direction run?
*/
bool at_beginning_of_minor_run_;
/** Is the currently pointed-at character in a minor-direction sequence? */
bool in_minor_direction_;
/**
* Should detected inter-word spaces be preserved, or "compressed" to a single
* space character (default behavior).
*/
bool preserve_interword_spaces_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H__

View File

@ -0,0 +1,619 @@
/******************************************************************
* File: superscript.cpp
* Description: Correction pass to fix superscripts and subscripts.
* Author: David Eger
* Created: Mon Mar 12 14:05:00 PDT 2012
*
* (C) Copyright 2012, Google, Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "normalis.h"
#include "tesseractclass.h"
static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
int num_chopped = 0;
for (int i = 0; i < num_unichars; i++)
num_chopped += word->best_state[i];
return num_chopped;
}
static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
int num_chopped = 0;
for (int i = 0; i < num_unichars; i++)
num_chopped += word->best_state[word->best_state.size() - 1 - i];
return num_chopped;
}
namespace tesseract {
/**
* Given a recognized blob, see if a contiguous collection of sub-pieces
* (chopped blobs) starting at its left might qualify as being a subscript
* or superscript letter based only on y position. Also do this for the
* right side.
*/
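// Hypothetical example: if a rebuilt blob consists of three chopped pieces
// classified by y position as {normal, superscript, superscript}, the loop
// below reports *num_leading_outliers == 0, *num_trailing_outliers == 2 and
// *trailing_pos == SP_SUPERSCRIPT.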
void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index,
int super_y_bottom, int sub_y_top,
ScriptPos *leading_pos, int *num_leading_outliers,
ScriptPos *trailing_pos, int *num_trailing_outliers) {
ScriptPos sp_unused1, sp_unused2;
int unused1, unused2;
if (!leading_pos) leading_pos = &sp_unused1;
if (!num_leading_outliers) num_leading_outliers = &unused1;
if (!trailing_pos) trailing_pos = &sp_unused2;
if (!num_trailing_outliers) num_trailing_outliers = &unused2;
*num_leading_outliers = *num_trailing_outliers = 0;
*leading_pos = *trailing_pos = SP_NORMAL;
int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
int num_chopped_pieces = word->best_state[rebuilt_blob_index];
ScriptPos last_pos = SP_NORMAL;
int trailing_outliers = 0;
for (int i = 0; i < num_chopped_pieces; i++) {
TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
ScriptPos pos = SP_NORMAL;
if (box.bottom() >= super_y_bottom) {
pos = SP_SUPERSCRIPT;
}
else if (box.top() <= sub_y_top) {
pos = SP_SUBSCRIPT;
}
if (pos == SP_NORMAL) {
if (trailing_outliers == i) {
*num_leading_outliers = trailing_outliers;
*leading_pos = last_pos;
}
trailing_outliers = 0;
}
else {
if (pos == last_pos) {
trailing_outliers++;
}
else {
trailing_outliers = 1;
}
}
last_pos = pos;
}
*num_trailing_outliers = trailing_outliers;
*trailing_pos = last_pos;
}
/**
* Attempt to split off any high (or low) bits at the ends of the word with poor
* certainty and recognize them separately. If the certainty gets much better
* and other sanity checks pass, accept.
*
* This superscript fix is meant to be called in the second pass of recognition
* when we have tried once and already have a preliminary answer for word.
*
* @return Whether we modified the given word.
*/
bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
!word->best_choice) {
return false;
}
int num_leading, num_trailing;
ScriptPos sp_leading, sp_trailing;
float leading_certainty, trailing_certainty;
float avg_certainty, unlikely_threshold;
// Calculate the number of whole suspicious characters at the edges.
GetSubAndSuperscriptCandidates(
word, &num_leading, &sp_leading, &leading_certainty,
&num_trailing, &sp_trailing, &trailing_certainty,
&avg_certainty, &unlikely_threshold);
const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
int num_blobs = word->best_choice->length();
// Calculate the remainder (partial characters) at the edges.
// This accounts for us having classified the best version of
// a word as [speaker?'] when it was instead [speaker.^{21}]
// (that is we accidentally thought the 2 was attached to the period).
int num_remainder_leading = 0, num_remainder_trailing = 0;
if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
int super_y_bottom =
kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
int sub_y_top =
kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
int last_word_char = num_blobs - 1 - num_trailing;
float last_char_certainty = word->best_choice->certainty(last_word_char);
if (word->best_choice->unichar_id(last_word_char) != 0 &&
last_char_certainty <= unlikely_threshold) {
ScriptPos rpos;
YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
NULL, NULL, &rpos, &num_remainder_trailing);
if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
if (num_remainder_trailing > 0 &&
last_char_certainty < trailing_certainty) {
trailing_certainty = last_char_certainty;
}
}
bool another_blob_available = (num_remainder_trailing == 0) ||
num_leading + num_trailing + 1 < num_blobs;
    float first_char_certainty = word->best_choice->certainty(num_leading);
if (another_blob_available &&
word->best_choice->unichar_id(num_leading) != 0 &&
first_char_certainty <= unlikely_threshold) {
ScriptPos lpos;
YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
&lpos, &num_remainder_leading, NULL, NULL);
if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
if (num_remainder_leading > 0 &&
first_char_certainty < leading_certainty) {
leading_certainty = first_char_certainty;
}
}
}
// If nothing to do, bail now.
if (num_leading + num_trailing +
num_remainder_leading + num_remainder_trailing == 0) {
return false;
}
if (superscript_debug >= 1) {
tprintf("Candidate for superscript detection: %s (",
word->best_choice->unichar_string().string());
if (num_leading || num_remainder_leading) {
tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
leading_pos);
}
if (num_trailing || num_remainder_trailing) {
tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
trailing_pos);
}
tprintf(")\n");
}
if (superscript_debug >= 3) {
word->best_choice->print();
}
if (superscript_debug >= 2) {
tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
avg_certainty, unlikely_threshold);
if (num_leading)
tprintf("Orig. leading (min): %.2f ", leading_certainty);
if (num_trailing)
tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
tprintf("\n");
}
// We've now calculated the number of rebuilt blobs we want to carve off.
// However, split_word() works from TBLOBs in chopped_word, so we need to
// convert to those.
int num_chopped_leading =
LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
int num_chopped_trailing =
TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
int retry_leading = 0;
int retry_trailing = 0;
bool is_good = false;
WERD_RES *revised = TrySuperscriptSplits(
num_chopped_leading, leading_certainty, sp_leading,
num_chopped_trailing, trailing_certainty, sp_trailing,
word, &is_good, &retry_leading, &retry_trailing);
if (is_good) {
word->ConsumeWordResults(revised);
}
else if (retry_leading || retry_trailing) {
int retry_chopped_leading =
LeadingUnicharsToChopped(revised, retry_leading);
int retry_chopped_trailing =
TrailingUnicharsToChopped(revised, retry_trailing);
WERD_RES *revised2 = TrySuperscriptSplits(
retry_chopped_leading, leading_certainty, sp_leading,
retry_chopped_trailing, trailing_certainty, sp_trailing,
revised, &is_good, &retry_leading, &retry_trailing);
if (is_good) {
word->ConsumeWordResults(revised2);
}
delete revised2;
}
delete revised;
return is_good;
}
/**
* Determine how many characters (rebuilt blobs) on each end of a given word
* might plausibly be superscripts so SubAndSuperscriptFix can try to
* re-recognize them. Even if we find no whole blobs at either end,
* we will set *unlikely_threshold to a certainty that might be used to
* select "bad enough" outlier characters. If *unlikely_threshold is set to 0,
* though, there's really no hope.
*
* @param[in] word The word to examine.
* @param[out] num_rebuilt_leading the number of rebuilt blobs at the start
* of the word which are all up or down and
* seem badly classified.
* @param[out] leading_pos "super" or "sub" (for debugging)
* @param[out] leading_certainty the worst certainty in the leading blobs.
* @param[out] num_rebuilt_trailing the number of rebuilt blobs at the end
* of the word which are all up or down and
* seem badly classified.
* @param[out] trailing_pos "super" or "sub" (for debugging)
* @param[out] trailing_certainty the worst certainty in the trailing blobs.
* @param[out] avg_certainty the average certainty of "normal" blobs in
* the word.
* @param[out] unlikely_threshold the threshold (on certainty) we used to
* select "bad enough" outlier characters.
*/
void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word,
int *num_rebuilt_leading,
ScriptPos *leading_pos,
float *leading_certainty,
int *num_rebuilt_trailing,
ScriptPos *trailing_pos,
float *trailing_certainty,
float *avg_certainty,
float *unlikely_threshold) {
*avg_certainty = *unlikely_threshold = 0.0f;
*num_rebuilt_leading = *num_rebuilt_trailing = 0;
*leading_certainty = *trailing_certainty = 0.0f;
int super_y_bottom =
kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
int sub_y_top =
kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
// Step one: Get an average certainty for "normally placed" characters.
// Counts here are of blobs in the rebuild_word / unichars in best_choice.
*leading_pos = *trailing_pos = SP_NORMAL;
int leading_outliers = 0;
int trailing_outliers = 0;
int num_normal = 0;
float normal_certainty_total = 0.0f;
float worst_normal_certainty = 0.0f;
ScriptPos last_pos = SP_NORMAL;
int num_blobs = word->rebuild_word->NumBlobs();
for (int b = 0; b < num_blobs; ++b) {
TBOX box = word->rebuild_word->blobs[b]->bounding_box();
ScriptPos pos = SP_NORMAL;
if (box.bottom() >= super_y_bottom) {
pos = SP_SUPERSCRIPT;
}
else if (box.top() <= sub_y_top) {
pos = SP_SUBSCRIPT;
}
if (pos == SP_NORMAL) {
if (word->best_choice->unichar_id(b) != 0) {
float char_certainty = word->best_choice->certainty(b);
if (char_certainty < worst_normal_certainty) {
worst_normal_certainty = char_certainty;
}
num_normal++;
normal_certainty_total += char_certainty;
}
if (trailing_outliers == b) {
leading_outliers = trailing_outliers;
*leading_pos = last_pos;
}
trailing_outliers = 0;
}
else {
if (last_pos == pos) {
trailing_outliers++;
}
else {
trailing_outliers = 1;
}
}
last_pos = pos;
}
*trailing_pos = last_pos;
if (num_normal >= 3) { // throw out the worst as an outlier.
num_normal--;
normal_certainty_total -= worst_normal_certainty;
}
if (num_normal > 0) {
*avg_certainty = normal_certainty_total / num_normal;
*unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
}
if (num_normal == 0 ||
(leading_outliers == 0 && trailing_outliers == 0)) {
return;
}
// Step two: Try to split off bits of the word that are both outliers
// and have much lower certainty than average
// Calculate num_leading and leading_certainty.
for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
*num_rebuilt_leading < leading_outliers;
(*num_rebuilt_leading)++) {
float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
if (char_certainty > *unlikely_threshold) {
break;
}
if (char_certainty < *leading_certainty) {
*leading_certainty = char_certainty;
}
}
// Calculate num_trailing and trailing_certainty.
for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
*num_rebuilt_trailing < trailing_outliers;
(*num_rebuilt_trailing)++) {
int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
float char_certainty = word->best_choice->certainty(blob_idx);
if (char_certainty > *unlikely_threshold) {
break;
}
if (char_certainty < *trailing_certainty) {
*trailing_certainty = char_certainty;
}
}
}
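// A minimal numeric sketch of the threshold computed above, with hypothetical
// certainties: three "normal" blobs at -2.0, -3.0 and -9.0 drop the worst
// (-9.0) as an outlier, giving *avg_certainty == -2.5; assuming
// superscript_worse_certainty == 2.0, *unlikely_threshold becomes -5.0, so
// only edge blobs with certainty <= -5.0 are counted as candidates.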
/**
* Try splitting off the given number of (chopped) blobs from the front and
* back of the given word and recognizing the pieces.
*
* @param[in] num_chopped_leading how many chopped blobs from the left
* end of the word to chop off and try recognizing as a
* superscript (or subscript)
* @param[in] leading_certainty the (minimum) certainty had by the
* characters in the original leading section.
* @param[in] leading_pos "super" or "sub" (for debugging)
* @param[in] num_chopped_trailing how many chopped blobs from the right
* end of the word to chop off and try recognizing as a
* superscript (or subscript)
* @param[in] trailing_certainty the (minimum) certainty had by the
* characters in the original trailing section.
* @param[in] trailing_pos "super" or "sub" (for debugging)
* @param[in] word the word to try to chop up.
* @param[out] is_good do we believe our result?
* @param[out] retry_rebuild_leading, retry_rebuild_trailing
* If non-zero, and !is_good, then the caller may have luck trying
* to split the returned word with this number of (rebuilt) leading
* and trailing blobs / unichars.
* @return A word which is the result of re-recognizing as asked.
*/
WERD_RES *Tesseract::TrySuperscriptSplits(
int num_chopped_leading, float leading_certainty, ScriptPos leading_pos,
int num_chopped_trailing, float trailing_certainty,
ScriptPos trailing_pos,
WERD_RES *word,
bool *is_good,
int *retry_rebuild_leading, int *retry_rebuild_trailing) {
int num_chopped = word->chopped_word->NumBlobs();
*retry_rebuild_leading = *retry_rebuild_trailing = 0;
// Chop apart the word into up to three pieces.
BlamerBundle *bb0 = NULL;
BlamerBundle *bb1 = NULL;
WERD_RES *prefix = NULL;
WERD_RES *core = NULL;
WERD_RES *suffix = NULL;
if (num_chopped_leading > 0) {
prefix = new WERD_RES(*word);
split_word(prefix, num_chopped_leading, &core, &bb0);
}
else {
core = new WERD_RES(*word);
}
if (num_chopped_trailing > 0) {
int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
split_word(core, split_pt, &suffix, &bb1);
}
// Recognize the pieces in turn.
int saved_cp_multiplier = classify_class_pruner_multiplier;
int saved_im_multiplier = classify_integer_matcher_multiplier;
if (prefix) {
// Turn off Tesseract's y-position penalties for the leading superscript.
classify_class_pruner_multiplier.set_value(0);
classify_integer_matcher_multiplier.set_value(0);
// Adjust our expectations about the baseline for this prefix.
if (superscript_debug >= 3) {
tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
}
recog_word_recursive(prefix);
if (superscript_debug >= 2) {
tprintf(" The leading bits look like %s %s\n",
ScriptPosToString(leading_pos),
prefix->best_choice->unichar_string().string());
}
// Restore the normal y-position penalties.
classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
}
if (superscript_debug >= 3) {
tprintf(" recognizing middle %d chopped blobs\n",
num_chopped - num_chopped_leading - num_chopped_trailing);
}
if (suffix) {
// Turn off Tesseract's y-position penalties for the trailing superscript.
classify_class_pruner_multiplier.set_value(0);
classify_integer_matcher_multiplier.set_value(0);
if (superscript_debug >= 3) {
tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
}
recog_word_recursive(suffix);
if (superscript_debug >= 2) {
tprintf(" The trailing bits look like %s %s\n",
ScriptPosToString(trailing_pos),
suffix->best_choice->unichar_string().string());
}
// Restore the normal y-position penalties.
classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
}
// Evaluate whether we think the results are believably better
// than what we already had.
bool good_prefix = !prefix || BelievableSuperscript(
superscript_debug >= 1, *prefix,
superscript_bettered_certainty * leading_certainty,
retry_rebuild_leading, NULL);
bool good_suffix = !suffix || BelievableSuperscript(
superscript_debug >= 1, *suffix,
superscript_bettered_certainty * trailing_certainty,
NULL, retry_rebuild_trailing);
*is_good = good_prefix && good_suffix;
if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
// None of it is any good. Quit now.
delete core;
delete prefix;
delete suffix;
return NULL;
}
recog_word_recursive(core);
// Now paste the results together into core.
if (suffix) {
suffix->SetAllScriptPositions(trailing_pos);
join_words(core, suffix, bb1);
}
if (prefix) {
prefix->SetAllScriptPositions(leading_pos);
join_words(prefix, core, bb0);
core = prefix;
prefix = NULL;
}
if (superscript_debug >= 1) {
tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
core->best_choice->unichar_string().string());
}
return core;
}
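// Sketch of the split performed above, with hypothetical counts: for a
// chopped_word of N blobs, num_chopped_leading == 2 and
// num_chopped_trailing == 1,
//   split_word(prefix, 2, &core, &bb0);      // prefix keeps blobs [0, 1]
//   split_word(core, N - 3, &suffix, &bb1);  // suffix keeps the last blob
// prefix and suffix are recognized with the y-position penalties disabled,
// core is recognized normally, and join_words() reassembles the result.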
/**
* Return whether this is believable superscript or subscript text.
*
* We insist that:
* + there are no punctuation marks.
* + there are no italics.
* + no normal-sized character is smaller than superscript_scaledown_ratio
* of what it ought to be, and
* + each character is at least as certain as certainty_threshold.
*
* @param[in] debug If true, spew debug output
* @param[in] word The word whose best_choice we're evaluating
* @param[in] certainty_threshold If any of the characters have less
* certainty than this, reject.
* @param[out] left_ok How many left-side characters were ok?
* @param[out] right_ok How many right-side characters were ok?
* @return Whether the complete best choice is believable as a superscript.
*/
bool Tesseract::BelievableSuperscript(bool debug,
const WERD_RES &word,
float certainty_threshold,
int *left_ok,
int *right_ok) const {
int initial_ok_run_count = 0;
int ok_run_count = 0;
float worst_certainty = 0.0f;
const WERD_CHOICE &wc = *word.best_choice;
const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
for (int i = 0; i < wc.length(); i++) {
TBLOB *blob = word.rebuild_word->blobs[i];
UNICHAR_ID unichar_id = wc.unichar_id(i);
float char_certainty = wc.certainty(i);
bool bad_certainty = char_certainty < certainty_threshold;
bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
bool is_italic = word.fontinfo && word.fontinfo->is_italic();
BLOB_CHOICE *choice = word.GetBlobChoice(i);
if (choice && fontinfo_table.size() > 0) {
// Get better information from the specific choice, if available.
int font_id1 = choice->fontinfo_id();
bool font1_is_italic = font_id1 >= 0
? fontinfo_table.get(font_id1).is_italic() : false;
int font_id2 = choice->fontinfo_id2();
is_italic = font1_is_italic &&
(font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
}
float height_fraction = 1.0f;
float char_height = blob->bounding_box().height();
float normal_height = char_height;
if (wc.unicharset()->top_bottom_useful()) {
int min_bot, max_bot, min_top, max_top;
wc.unicharset()->get_top_bottom(unichar_id,
&min_bot, &max_bot,
&min_top, &max_top);
float hi_height = max_top - max_bot;
float lo_height = min_top - min_bot;
normal_height = (hi_height + lo_height) / 2;
if (normal_height >= kBlnXHeight) {
// Only ding characters that we have decent information for because
// they're supposed to be normal sized, not tiny specks or dashes.
height_fraction = char_height / normal_height;
}
}
bool bad_height = height_fraction < superscript_scaledown_ratio;
if (debug) {
if (is_italic) {
tprintf(" Rejecting: superscript is italic.\n");
}
if (is_punc) {
tprintf(" Rejecting: punctuation present.\n");
}
const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
if (bad_certainty) {
tprintf(" Rejecting: don't believe character %s with certainty %.2f "
"which is less than threshold %.2f\n", char_str,
char_certainty, certainty_threshold);
}
if (bad_height) {
tprintf(" Rejecting: character %s seems too small @ %.2f versus "
"expected %.2f\n", char_str, char_height, normal_height);
}
}
if (bad_certainty || bad_height || is_punc || is_italic) {
if (ok_run_count == i) {
initial_ok_run_count = ok_run_count;
}
ok_run_count = 0;
}
else {
ok_run_count++;
}
if (char_certainty < worst_certainty) {
worst_certainty = char_certainty;
}
}
bool all_ok = ok_run_count == wc.length();
if (all_ok && debug) {
tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
}
if (!all_ok) {
if (left_ok) *left_ok = initial_ok_run_count;
if (right_ok) *right_ok = ok_run_count;
}
return all_ok;
}
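// A minimal numeric example of the size test above, with hypothetical values:
// if the unicharset expects a normal blob height of 160 bln units and the
// recognized blob is only 70 units tall, height_fraction == 0.4375; with a
// superscript_scaledown_ratio of, say, 0.5, bad_height is true and the
// superscript hypothesis is rejected for that character.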
} // namespace tesseract

View File

@ -0,0 +1,82 @@
/**********************************************************************
* File: tessbox.cpp (Formerly tessbox.c)
* Description: Black boxed Tess for developing a resaljet.
* Author: Ray Smith
* Created: Thu Apr 23 11:03:36 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
#include "mfoutline.h"
#include "tessbox.h"
#include "tesseractclass.h"
#define EXTERN
/**
* @name tess_segment_pass_n
*
* Segment a word using the pass_n conditions of the tess segmenter.
* @param pass_n pass number
* @param word word to do
*/
namespace tesseract {
void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
int saved_enable_assoc = 0;
int saved_chop_enable = 0;
if (word->word->flag(W_DONT_CHOP)) {
saved_enable_assoc = wordrec_enable_assoc;
saved_chop_enable = chop_enable;
wordrec_enable_assoc.set_value(0);
chop_enable.set_value(0);
}
if (pass_n == 1)
set_pass1();
else
set_pass2();
recog_word(word);
if (word->best_choice == NULL)
word->SetupFake(*word->uch_set);
if (word->word->flag(W_DONT_CHOP)) {
wordrec_enable_assoc.set_value(saved_enable_assoc);
chop_enable.set_value(saved_chop_enable);
}
}
/**
* @name tess_acceptable_word
*
* @return true if the word is regarded as "good enough".
* @param word_choice after context
* @param raw_choice before context
*/
bool Tesseract::tess_acceptable_word(WERD_RES* word) {
return getDict().AcceptableResult(word);
}
/**
* @name tess_add_doc_word
*
* Add the given word to the document dictionary
*/
void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
getDict().add_document_word(*word_choice);
}
} // namespace tesseract

View File

@ -0,0 +1,28 @@
/**********************************************************************
* File: tessbox.h (Formerly tessbox.h)
* Description: Black boxed Tess for developing a resaljet.
* Author: Ray Smith
* Created: Thu Apr 23 11:03:36 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSBOX_H
#define TESSBOX_H
#include "ratngs.h"
#include "tesseractclass.h"
// TODO(ocr-team): Delete this along with other empty header files.
#endif

View File

@ -0,0 +1,501 @@
/**********************************************************************
* File: tessedit.cpp (Formerly tessedit.c)
* Description: (Previously) Main program for merge of tess and editor.
* Now just code to load the language model and various
* engine-specific data files.
* Author: Ray Smith
* Created: Tue Jan 07 15:21:46 GMT 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include "stderr.h"
#include "basedir.h"
#include "tessvars.h"
#include "control.h"
#include "reject.h"
#include "pageres.h"
#include "nwmain.h"
#include "pgedit.h"
#include "tprintf.h"
#include "tessedit.h"
#include "stopper.h"
#include "intmatcher.h"
#include "chop.h"
#include "efio.h"
#include "danerror.h"
#include "globals.h"
#include "tesseractclass.h"
#include "params.h"
#define VARDIR "configs/" /*variables files */
// config under api
#define API_CONFIG "configs/api_config"
ETEXT_DESC *global_monitor = NULL; // progress monitor
namespace tesseract {
// Read a "config" file containing a set of variable, value pairs.
// Searches the standard places: tessdata/configs, tessdata/tessconfigs
// and also accepts a relative or absolute path name.
void Tesseract::read_config_file(const char *filename,
SetParamConstraint constraint) {
STRING path = datadir;
path += "configs/";
path += filename;
FILE* fp;
if ((fp = fopen(path.string(), "rb")) != NULL) {
fclose(fp);
}
else {
path = datadir;
path += "tessconfigs/";
path += filename;
if ((fp = fopen(path.string(), "rb")) != NULL) {
fclose(fp);
}
else {
path = filename;
}
}
ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
}
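// A config file is plain text with one "name value" pair per line. For
// example, a hypothetical tessdata/configs/quiet containing
//   debug_file  /dev/null
// would be found by read_config_file("quiet", SET_PARAM_CONSTRAINT_NONE) via
// the first search path (tessdata/configs/), before tessconfigs/ or a literal
// relative/absolute path is tried.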
// Returns false if a unicharset file for the specified language was not found
// or was invalid.
// This function initializes TessdataManager. After TessdataManager is
// no longer needed, TessdataManager::End() should be called.
//
// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
// it is OEM_DEFAULT, in which case the value of the variable will be obtained
// from the language-specific config file (stored in [lang].traineddata), from
// the config files specified on the command line or left as the default
// OEM_TESSERACT_ONLY if none of the configs specify this variable.
bool Tesseract::init_tesseract_lang_data(
const char *arg0, const char *textbase, const char *language,
OcrEngineMode oem, char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params) {
// Set the basename, compute the data directory.
main_setup(arg0, textbase);
// Set the language data path prefix
lang = language != NULL ? language : "eng";
language_data_path_prefix = datadir;
language_data_path_prefix += lang;
language_data_path_prefix += ".";
// Initialize TessdataManager.
//STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
STRING tessdata_path = arg0;
if (!tessdata_manager.Init(tessdata_path.string(),
tessdata_manager_debug_level)) {
return false;
}
// If a language specific config file (lang.config) exists, load it in.
if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
ParamUtils::ReadParamsFromFp(
tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
SET_PARAM_CONSTRAINT_NONE, this->params());
if (tessdata_manager_debug_level) {
tprintf("Loaded language config file\n");
}
}
SetParamConstraint set_params_constraint = set_only_non_debug_params ?
SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
// Load tesseract variables from config files. This is done after loading
// language-specific variables from [lang].traineddata file, so that custom
// config files can override values in [lang].traineddata file.
for (int i = 0; i < configs_size; ++i) {
read_config_file(configs[i], set_params_constraint);
}
// Set params specified in vars_vec (done after setting params from config
// files, so that params in vars_vec can override those from files).
if (vars_vec != NULL && vars_values != NULL) {
for (int i = 0; i < vars_vec->size(); ++i) {
if (!ParamUtils::SetParam((*vars_vec)[i].string(),
(*vars_values)[i].string(),
set_params_constraint, this->params())) {
tprintf("Error setting param %s\n", (*vars_vec)[i].string());
exit(1);
}
}
}
if (((STRING &)tessedit_write_params_to_file).length() > 0) {
FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
if (params_file != NULL) {
ParamUtils::PrintParams(params_file, this->params());
fclose(params_file);
if (tessdata_manager_debug_level > 0) {
tprintf("Wrote parameters to %s\n",
tessedit_write_params_to_file.string());
}
}
else {
tprintf("Failed to open %s for writing params.\n",
tessedit_write_params_to_file.string());
}
}
// Determine which ocr engine(s) should be loaded and used for recognition.
if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
if (tessdata_manager_debug_level) {
tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
static_cast<int>(tessedit_ocr_engine_mode));
}
// If we are only loading the config file (and so not planning on doing any
// recognition) then there's nothing else to do here.
if (tessedit_init_config_only) {
if (tessdata_manager_debug_level) {
tprintf("Returning after loading config file\n");
}
return true;
}
// Load the unicharset
if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
!unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
return false;
}
if (unicharset.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
return false;
}
if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
right_to_left_ = unicharset.major_right_to_left();
// Setup initial unichar ambigs table and read universal ambigs.
UNICHARSET encoder_unicharset;
encoder_unicharset.CopyFrom(unicharset);
unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
if (!tessedit_ambigs_training &&
tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
TFile ambigs_file;
ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1);
unichar_ambigs.LoadUnicharAmbigs(
encoder_unicharset,
&ambigs_file,
ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
}
// The various OcrEngineMode settings (see publictypes.h) determine which
// engine-specific data files need to be loaded. Currently everything needs
// the base tesseract data, which supplies other useful information, but
// alternative engines, such as cube and LSTM are optional.
#ifndef NO_CUBE_BUILD
if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
if (tessdata_manager_debug_level)
tprintf("Loaded Cube w/out combiner\n");
}
else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
if (tessdata_manager_debug_level)
tprintf("Loaded Cube with combiner\n");
}
#endif
// Init ParamsModel.
// Load pass1 and pass2 weights (for now these two sets are the same, but in
// the future separate sets of weights can be generated).
for (int p = ParamsModel::PTRAIN_PASS1;
p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
language_model_->getParamsModel().SetPass(
static_cast<ParamsModel::PassEnum>(p));
if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
if (!language_model_->getParamsModel().LoadFromFp(
lang.string(), tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
return false;
}
}
}
if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();
return true;
}
// Helper returns true if the given string is in the vector of strings.
static bool IsStrInList(const STRING& str,
const GenericVector<STRING>& str_list) {
for (int i = 0; i < str_list.size(); ++i) {
if (str_list[i] == str)
return true;
}
return false;
}
// Parse a string of the form [~]<lang>[+[~]<lang>]*.
// Langs with no prefix get appended to to_load, provided they
// are not in there already.
// Langs with ~ prefix get appended to not_to_load, provided they are not in
// there already.
void Tesseract::ParseLanguageString(const char* lang_str,
GenericVector<STRING>* to_load,
GenericVector<STRING>* not_to_load) {
STRING remains(lang_str);
while (remains.length() > 0) {
// Find the start of the lang code and which vector to add to.
const char* start = remains.string();
while (*start == '+')
++start;
GenericVector<STRING>* target = to_load;
if (*start == '~') {
target = not_to_load;
++start;
}
// Find the index of the end of the lang code in string start.
int end = strlen(start);
const char* plus = strchr(start, '+');
if (plus != NULL && plus - start < end)
end = plus - start;
STRING lang_code(start);
lang_code.truncate_at(end);
STRING next(start + end);
remains = next;
// Check whether lang_code is already in the target vector and add.
if (!IsStrInList(lang_code, *target)) {
if (tessdata_manager_debug_level)
tprintf("Adding language '%s' to list\n", lang_code.string());
target->push_back(lang_code);
}
}
}
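// A minimal worked example of the parsing above: ParseLanguageString on
// "eng+~fra+deu" appends "eng" and "deu" to *to_load and "fra" to
// *not_to_load; a repeated code such as "eng+eng" is added only once.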
// Initialize for potentially a set of languages defined by the language
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
int Tesseract::init_tesseract(
const char *arg0, const char *textbase, const char *language,
OcrEngineMode oem, char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params) {
GenericVector<STRING> langs_to_load;
GenericVector<STRING> langs_not_to_load;
ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
sub_langs_.delete_data_pointers();
sub_langs_.clear();
// Find the first loadable lang and load into this.
// Add any languages that this language requires
bool loaded_primary = false;
// Load the rest into sub_langs_.
for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
const char *lang_str = langs_to_load[lang_index].string();
Tesseract *tess_to_init;
if (!loaded_primary) {
tess_to_init = this;
}
else {
tess_to_init = new Tesseract;
}
int result = tess_to_init->init_tesseract_internal(
arg0, textbase, lang_str, oem, configs, configs_size,
vars_vec, vars_values, set_only_non_debug_params);
if (!loaded_primary) {
if (result < 0) {
tprintf("Failed loading language '%s'\n", lang_str);
}
else {
if (tessdata_manager_debug_level)
tprintf("Loaded language '%s' as main language\n", lang_str);
ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
&langs_to_load, &langs_not_to_load);
loaded_primary = true;
}
}
else {
if (result < 0) {
tprintf("Failed loading language '%s'\n", lang_str);
delete tess_to_init;
}
else {
if (tessdata_manager_debug_level)
tprintf("Loaded language '%s' as secondary language\n", lang_str);
sub_langs_.push_back(tess_to_init);
// Add any languages that this language requires
ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
&langs_to_load, &langs_not_to_load);
}
}
}
}
if (!loaded_primary) {
tprintf("Tesseract couldn't load any languages!\n");
return -1; // Couldn't load any language!
}
if (!sub_langs_.empty()) {
// In multilingual mode word ratings have to be directly comparable,
// so use the same language model weights for all languages:
// use the primary language's params model if
// tessedit_use_primary_params_model is set,
// otherwise use default language model weights.
if (tessedit_use_primary_params_model) {
for (int s = 0; s < sub_langs_.size(); ++s) {
sub_langs_[s]->language_model_->getParamsModel().Copy(
this->language_model_->getParamsModel());
}
tprintf("Using params model of the primary language\n");
if (tessdata_manager_debug_level) {
this->language_model_->getParamsModel().Print();
}
}
else {
this->language_model_->getParamsModel().Clear();
for (int s = 0; s < sub_langs_.size(); ++s) {
sub_langs_[s]->language_model_->getParamsModel().Clear();
}
if (tessdata_manager_debug_level)
tprintf("Using default language params\n");
}
}
SetupUniversalFontIds();
return 0;
}
// Common initialization for a single language.
// arg0 is the datapath for the tessdata directory, which could be the
// path of the tessdata directory with no trailing /, or (if tessdata
// lives in the same directory as the executable) the path of the executable,
// hence the name arg0.
// textbase is an optional output file basename (used only for training)
// language is the language code to load.
// oem controls which engine(s) will operate on the image
// configs (argv) is an array of config filenames to load variables from.
// May be NULL.
// configs_size (argc) is the number of elements in configs.
// vars_vec is an optional vector of variables to set.
// vars_values is an optional corresponding vector of values for the variables
// in vars_vec.
// If set_only_non_debug_params is true, only non-debug parameters will be
// set.
int Tesseract::init_tesseract_internal(
const char *arg0, const char *textbase, const char *language,
OcrEngineMode oem, char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params) {
if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
configs_size, vars_vec, vars_values,
set_only_non_debug_params)) {
return -1;
}
if (tessedit_init_config_only) {
tessdata_manager.End();
return 0;
}
// If only Cube will be used, skip loading Tesseract classifier's
// pre-trained templates.
bool init_tesseract_classifier =
(tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED);
// If only Cube will be used and if it has its own Unicharset,
// skip initializing permuter and loading Tesseract Dawgs.
bool init_dict =
!(tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET));
program_editup(textbase, init_tesseract_classifier, init_dict);
tessdata_manager.End();
return 0; //Normal exit
}
// Helper builds the all_fonts table by adding new fonts from new_fonts.
static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
UnicityTable<FontInfo>* all_fonts) {
for (int i = 0; i < new_fonts.size(); ++i) {
// UnicityTable uniques as we go.
all_fonts->push_back(new_fonts.get(i));
}
}
// Helper assigns an id to lang_fonts using the index in all_fonts table.
static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
UnicityTable<FontInfo>* lang_fonts) {
for (int i = 0; i < lang_fonts->size(); ++i) {
int index = all_fonts.get_id(lang_fonts->get(i));
lang_fonts->get_mutable(i)->universal_id = index;
}
}
// Set the universal_id member of each font to be unique among all
// instances of the same font loaded.
void Tesseract::SetupUniversalFontIds() {
// Note that we can get away with bitwise copying FontInfo in
// all_fonts, as it is a temporary structure and we avoid setting the
// delete callback.
UnicityTable<FontInfo> all_fonts;
all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
// Create the universal ID table.
CollectFonts(get_fontinfo_table(), &all_fonts);
for (int i = 0; i < sub_langs_.size(); ++i) {
CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
}
// Assign ids from the table to each font table.
AssignIds(all_fonts, &get_fontinfo_table());
for (int i = 0; i < sub_langs_.size(); ++i) {
AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
}
font_table_size_ = all_fonts.size();
}
// init the LM component
int Tesseract::init_tesseract_lm(const char *arg0,
const char *textbase,
const char *language) {
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
NULL, 0, NULL, NULL, false))
return -1;
getDict().SetupForLoad(Dict::GlobalDawgCache());
getDict().Load(tessdata_manager.GetDataFileName().string(), lang);
getDict().FinishLoad();
tessdata_manager.End();
return 0;
}
void Tesseract::end_tesseract() {
end_recog();
}
/* Define command type identifiers */
enum CMD_EVENTS
{
ACTION_1_CMD_EVENT,
RECOG_WERDS,
RECOG_PSEUDO,
ACTION_2_CMD_EVENT
};
} // namespace tesseract

View File

@ -0,0 +1,29 @@
/**********************************************************************
* File: tessedit.h (Formerly tessedit.h)
* Description: Main program for merge of tess and editor.
* Author: Ray Smith
* Created: Tue Jan 07 15:21:46 GMT 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSEDIT_H
#define TESSEDIT_H
#include "blobs.h"
#include "pgedit.h"
//progress monitor
extern ETEXT_DESC *global_monitor;
#endif

View File

@ -0,0 +1,305 @@
/**********************************************************************
 * File:        tesseract_cube_combiner.cpp
 * Description: Implementation of the Tesseract & Cube results combiner Class
* Author: Ahmad Abdulkader
* Created: 2008
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// The TesseractCubeCombiner class provides the functionality of combining
// the recognition results of Tesseract and Cube at the word level
#include <algorithm>
#include <wctype.h>
#include "tesseract_cube_combiner.h"
#include "cube_object.h"
#include "cube_reco_context.h"
#include "cube_utils.h"
#include "neural_net.h"
#include "tesseractclass.h"
#include "word_altlist.h"
namespace tesseract {
TesseractCubeCombiner::TesseractCubeCombiner(CubeRecoContext *cube_cntxt) {
cube_cntxt_ = cube_cntxt;
combiner_net_ = NULL;
}
TesseractCubeCombiner::~TesseractCubeCombiner() {
if (combiner_net_ != NULL) {
delete combiner_net_;
combiner_net_ = NULL;
}
}
bool TesseractCubeCombiner::LoadCombinerNet() {
ASSERT_HOST(cube_cntxt_);
// Compute the path of the combiner net
string data_path;
cube_cntxt_->GetDataFilePath(&data_path);
string net_file_name = data_path + cube_cntxt_->Lang() +
".tesseract_cube.nn";
// Return false if file does not exist
FILE *fp = fopen(net_file_name.c_str(), "rb");
if (fp == NULL)
return false;
else
fclose(fp);
// Load and validate net
combiner_net_ = NeuralNet::FromFile(net_file_name);
if (combiner_net_ == NULL) {
tprintf("Could not read combiner net file %s", net_file_name.c_str());
return false;
}
else if (combiner_net_->out_cnt() != 2) {
tprintf("Invalid combiner net file %s! Output count != 2\n",
net_file_name.c_str());
delete combiner_net_;
combiner_net_ = NULL;
return false;
}
return true;
}
// Normalize a UTF-8 string. Converts the UTF-8 string to UTF32 and optionally
// strips punc and/or normalizes case and then converts back
string TesseractCubeCombiner::NormalizeString(const string &str,
bool remove_punc,
bool norm_case) {
// convert to UTF32
string_32 str32;
CubeUtils::UTF8ToUTF32(str.c_str(), &str32);
// strip punc and normalize
string_32 new_str32;
for (int idx = 0; idx < str32.length(); idx++) {
// if no punc removal is required or not a punctuation character
if (!remove_punc || iswpunct(str32[idx]) == 0) {
char_32 norm_char = str32[idx];
// normalize case if required
if (norm_case && iswalpha(norm_char)) {
norm_char = towlower(norm_char);
}
new_str32.push_back(norm_char);
}
}
// convert back to UTF8
string new_str;
CubeUtils::UTF32ToUTF8(new_str32.c_str(), &new_str);
return new_str;
}
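// A minimal example of the normalization above, with a hypothetical input:
// NormalizeString("Foo, Bar!", true, true) strips the comma and exclamation
// mark, lower-cases the letters, keeps the space, and returns "foo bar".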
// Compares 2 strings optionally ignoring punctuation
int TesseractCubeCombiner::CompareStrings(const string &str1,
const string &str2,
bool ignore_punc,
bool ignore_case) {
if (!ignore_punc && !ignore_case) {
return str1.compare(str2);
}
string norm_str1 = NormalizeString(str1, ignore_punc, ignore_case);
string norm_str2 = NormalizeString(str2, ignore_punc, ignore_case);
return norm_str1.compare(norm_str2);
}
// Check if a string is a valid Tess dict word or not
bool TesseractCubeCombiner::ValidWord(const string &str) {
return (cube_cntxt_->TesseractObject()->getDict().valid_word(str.c_str())
> 0);
}
// Public method for computing the combiner features. The agreement
// output parameter will be true if both answers are identical,
// and false otherwise.
bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str,
int tess_confidence,
CubeObject *cube_obj,
WordAltList *cube_alt_list,
vector<double> *features,
bool *agreement) {
features->clear();
*agreement = false;
if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0)
return false;
// Get Cube's best string; return false if empty
char_32 *cube_best_str32 = cube_alt_list->Alt(0);
if (cube_best_str32 == NULL || CubeUtils::StrLen(cube_best_str32) < 1)
return false;
string cube_best_str;
int cube_best_cost = cube_alt_list->AltCost(0);
int cube_best_bigram_cost = 0;
bool cube_best_bigram_cost_valid = true;
if (cube_cntxt_->Bigrams())
cube_best_bigram_cost = cube_cntxt_->Bigrams()->
Cost(cube_best_str32, cube_cntxt_->CharacterSet());
else
cube_best_bigram_cost_valid = false;
CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str);
// Get Tesseract's UTF32 string
string_32 tess_str32;
CubeUtils::UTF8ToUTF32(tess_str.c_str(), &tess_str32);
// Compute agreement flag
*agreement = (tess_str.compare(cube_best_str) == 0);
// Get Cube's second best string; if empty, return false
char_32 *cube_next_best_str32;
string cube_next_best_str;
int cube_next_best_cost = WORST_COST;
if (cube_alt_list->AltCount() > 1) {
cube_next_best_str32 = cube_alt_list->Alt(1);
if (cube_next_best_str32 == NULL ||
CubeUtils::StrLen(cube_next_best_str32) == 0) {
return false;
}
cube_next_best_cost = cube_alt_list->AltCost(1);
CubeUtils::UTF32ToUTF8(cube_next_best_str32, &cube_next_best_str);
}
// Rank of Tesseract's top result in Cube's alternate list
int tess_rank = 0;
for (tess_rank = 0; tess_rank < cube_alt_list->AltCount(); tess_rank++) {
string alt_str;
CubeUtils::UTF32ToUTF8(cube_alt_list->Alt(tess_rank), &alt_str);
if (alt_str == tess_str)
break;
}
// Cube's cost for tesseract's result. Note that this modifies the
// state of cube_obj, including its alternate list by calling RecognizeWord()
int tess_cost = cube_obj->WordCost(tess_str.c_str());
// Cube's bigram cost of Tesseract's string
int tess_bigram_cost = 0;
int tess_bigram_cost_valid = true;
if (cube_cntxt_->Bigrams())
tess_bigram_cost = cube_cntxt_->Bigrams()->
Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet());
else
tess_bigram_cost_valid = false;
// Tesseract confidence
features->push_back(tess_confidence);
// Cube cost of Tesseract string
features->push_back(tess_cost);
// Cube Rank of Tesseract string
features->push_back(tess_rank);
// length of Tesseract OCR string
features->push_back(tess_str.length());
// Tesseract OCR string in dictionary
features->push_back(ValidWord(tess_str));
if (tess_bigram_cost_valid) {
// bigram cost of Tesseract string
features->push_back(tess_bigram_cost);
}
// Cube tess_cost of Cube best string
features->push_back(cube_best_cost);
// Cube tess_cost of Cube next best string
features->push_back(cube_next_best_cost);
// length of Cube string
features->push_back(cube_best_str.length());
// Cube string in dictionary
features->push_back(ValidWord(cube_best_str));
if (cube_best_bigram_cost_valid) {
// bigram cost of Cube string
features->push_back(cube_best_bigram_cost);
}
// case-insensitive string comparison, including punctuation
int compare_nocase_punc = CompareStrings(cube_best_str,
tess_str, false, true);
features->push_back(compare_nocase_punc == 0);
// case-sensitive string comparison, ignoring punctuation
int compare_case_nopunc = CompareStrings(cube_best_str,
tess_str, true, false);
features->push_back(compare_case_nopunc == 0);
// case-insensitive string comparison, ignoring punctuation
int compare_nocase_nopunc = CompareStrings(cube_best_str,
tess_str, true, true);
features->push_back(compare_nocase_nopunc == 0);
return true;
}
// The CubeObject parameter is used for 2 purposes: 1) to retrieve
// cube's alt list, and 2) to compute cube's word cost for the
// tesseract result. The call to CubeObject::WordCost() modifies
// the object's alternate list, so previous state will be lost.
float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res,
CubeObject *cube_obj) {
// If no combiner is loaded or the cube object is undefined,
// tesseract wins with probability 1.0
if (combiner_net_ == NULL || cube_obj == NULL) {
tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
"Cube objects not initialized; defaulting to Tesseract\n");
return 1.0;
}
// Retrieve the alternate list from the CubeObject's current state.
// If the alt list empty, tesseract wins with probability 1.0
WordAltList *cube_alt_list = cube_obj->AlternateList();
if (cube_alt_list == NULL)
cube_alt_list = cube_obj->RecognizeWord();
if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) {
tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
"Cube returned no results; defaulting to Tesseract\n");
return 1.0;
}
return CombineResults(tess_res, cube_obj, cube_alt_list);
}
// The alt_list parameter is expected to have been extracted from the
// CubeObject that recognized the word to be combined. The cube_obj
// parameter passed may be either same instance or a separate instance to
// be used only by the combiner. In both cases, its alternate
// list will be modified by an internal call to RecognizeWord().
float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res,
CubeObject *cube_obj,
WordAltList *cube_alt_list) {
// If no combiner is loaded or the cube object is undefined, or the
// alt list is empty, tesseract wins with probability 1.0
if (combiner_net_ == NULL || cube_obj == NULL ||
cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) {
tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
"Cube result cannot be retrieved; defaulting to Tesseract\n");
return 1.0;
}
// Tesseract result string, tesseract confidence, and cost of
// tesseract result according to cube
string tess_str = tess_res->best_choice->unichar_string().string();
// Map certainty [-20.0, 0.0] to confidence [0, 100]
int tess_confidence = MIN(100, MAX(1, static_cast<int>(
100 + (5 * tess_res->best_choice->certainty()))));
// Compute the combiner features. If feature computation fails or
// answers are identical, tesseract wins with probability 1.0
vector<double> features;
bool agreement;
bool combiner_success = ComputeCombinerFeatures(tess_str, tess_confidence,
cube_obj, cube_alt_list,
&features, &agreement);
if (!combiner_success || agreement)
return 1.0;
// Classify combiner feature vector and return output (probability
// of tesseract class).
double net_out[2];
if (!combiner_net_->FeedForward(&features[0], net_out))
return 1.0;
return net_out[1];
}
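// A minimal numeric example of the certainty-to-confidence mapping above:
// a Tesseract certainty of -8.0 maps to 100 + 5 * (-8.0) == 60, a certainty
// of 0.0 clamps to the ceiling of 100, and certainties near -20.0 clamp to
// the floor of 1.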
}

View File

@ -0,0 +1,93 @@
/**********************************************************************
* File: tesseract_cube_combiner.h
* Description: Declaration of the Tesseract & Cube results combiner Class
* Author: Ahmad Abdulkader
* Created: 2008
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// The TesseractCubeCombiner class provides the functionality of combining
// the recognition results of Tesseract and Cube at the word level
#ifndef TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H
#define TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H
#include <string>
#include <vector>
#include "pageres.h"
namespace tesseract {
class CubeObject;
class NeuralNet;
class CubeRecoContext;
class WordAltList;
class TesseractCubeCombiner {
public:
explicit TesseractCubeCombiner(CubeRecoContext *cube_cntxt);
virtual ~TesseractCubeCombiner();
// There are 2 public methods for combining the results of tesseract
// and cube. Both return the probability that the Tesseract result is
// correct. The difference between the two interfaces is in how the
// passed-in CubeObject is used.
// The CubeObject parameter is used for 2 purposes: 1) to retrieve
// cube's alt list, and 2) to compute cube's word cost for the
// tesseract result. Both uses may modify the state of the
// CubeObject (including the BeamSearch state) with a call to
// RecognizeWord().
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj);
// The alt_list parameter is expected to have been extracted from the
// CubeObject that recognized the word to be combined. The cube_obj
// parameter passed in is a separate instance to be used only by
// the combiner.
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj,
WordAltList *alt_list);
// Public method for computing the combiner features. The agreement
// output parameter will be true if both answers are identical,
// false otherwise. Modifies the cube_alt_list, so no assumptions
// should be made about its state upon return.
bool ComputeCombinerFeatures(const std::string &tess_res,
int tess_confidence,
CubeObject *cube_obj,
WordAltList *cube_alt_list,
std::vector<double> *features,
bool *agreement);
// Is the word valid according to Tesseract's language model
bool ValidWord(const std::string &str);
// Loads the combiner neural network from file, using cube_cntxt_
// to find path.
bool LoadCombinerNet();
private:
// Normalize a UTF-8 string. Converts the UTF-8 string to UTF32 and optionally
// strips punc and/or normalizes case and then converts back
std::string NormalizeString(const std::string &str, bool remove_punc, bool norm_case);
// Compares 2 strings after optionally normalizing them and or stripping
// punctuation
int CompareStrings(const std::string &str1, const std::string &str2, bool ignore_punc,
bool norm_case);
NeuralNet *combiner_net_; // pointer to the combiner NeuralNet object
CubeRecoContext *cube_cntxt_; // used for language ID and data paths
};
}
#endif // TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H

View File

@ -0,0 +1,769 @@
///////////////////////////////////////////////////////////////////////
// File: tesseractclass.cpp
// Description: The Tesseract class. It holds/owns everything needed
// to run Tesseract on a single language, and also a set of
// sub-Tesseracts to run sub-languages. For thread safety, *every*
// variable that was previously global or static (except for
// constant data, and some visual debugging flags) has been moved
// in here, directly, or indirectly.
// This makes it safe to run multiple Tesseracts in different
// threads in parallel, and keeps the different language
// instances separate.
// Some global functions remain, but they are isolated re-entrant
// functions that operate on their arguments. Functions that work
// on variable data have been moved to an appropriate class based
// mostly on the directory hierarchy. For more information see
// slide 6 of "2ArchitectureAndDataStructures" in
// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
// Some global data and related functions still exist in the
// training-related code, but they don't interfere with normal
// recognition operation.
// Author: Ray Smith
// Created: Fri Mar 07 08:17:01 PST 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include "tesseractclass.h"
#include "allheaders.h"
#ifndef NO_CUBE_BUILD
#include "cube_reco_context.h"
#endif
#include "edgblob.h"
#include "equationdetect.h"
#include "globals.h"
#ifndef NO_CUBE_BUILD
#include "tesseract_cube_combiner.h"
#endif
namespace tesseract {
Tesseract::Tesseract()
: BOOL_MEMBER(tessedit_resegment_from_boxes, false,
"Take segmentation and labeling from box file",
this->params()),
BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
"Conversion of word/line box file to char box file",
this->params()),
BOOL_MEMBER(tessedit_train_from_boxes, false,
"Generate training data from boxed chars", this->params()),
BOOL_MEMBER(tessedit_make_boxes_from_boxes, false,
"Generate more boxes from boxed chars", this->params()),
BOOL_MEMBER(tessedit_dump_pageseg_images, false,
"Dump intermediate images made during page segmentation",
this->params()),
// The default for pageseg_mode is the old behaviour, so as not to
// upset anything that relies on that.
INT_MEMBER(
tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
" 5=line, 6=word, 7=char"
" (Values from PageSegMode enum in publictypes.h)",
this->params()),
INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
"Which OCR engine(s) to run (Tesseract, Cube, both)."
" Defaults to loading and running only Tesseract"
" (no Cube,no combiner)."
" Values from OcrEngineMode enum in tesseractclass.h)",
this->params()),
STRING_MEMBER(tessedit_char_blacklist, "",
"Blacklist of chars not to recognize", this->params()),
STRING_MEMBER(tessedit_char_whitelist, "",
"Whitelist of chars to recognize", this->params()),
STRING_MEMBER(tessedit_char_unblacklist, "",
"List of chars to override tessedit_char_blacklist",
this->params()),
BOOL_MEMBER(tessedit_ambigs_training, false,
"Perform training for ambiguities", this->params()),
INT_MEMBER(pageseg_devanagari_split_strategy,
tesseract::ShiroRekhaSplitter::NO_SPLIT,
"Whether to use the top-line splitting process for Devanagari "
"documents while performing page-segmentation.",
this->params()),
INT_MEMBER(ocr_devanagari_split_strategy,
tesseract::ShiroRekhaSplitter::NO_SPLIT,
"Whether to use the top-line splitting process for Devanagari "
"documents while performing ocr.",
this->params()),
STRING_MEMBER(tessedit_write_params_to_file, "",
"Write all parameters to the given file.", this->params()),
BOOL_MEMBER(tessedit_adaption_debug, false,
"Generate and print debug"
" information for adaption",
this->params()),
INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
this->params()),
STRING_MEMBER(applybox_exposure_pattern, ".exp",
"Exposure value follows"
" this pattern in the image filename. The name of the image"
" files are expected to be in the form"
" [lang].[fontname].exp[num].tif",
this->params()),
BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
"Learn both character fragments (as is done in the"
" special low exposure mode) as well as unfragmented"
" characters.",
this->params()),
BOOL_MEMBER(applybox_learn_ngrams_mode, false,
"Each bounding box"
" is assumed to contain ngrams. Only learn the ngrams"
" whose outlines overlap horizontally.",
this->params()),
BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
this->params()),
BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
this->params()),
BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
this->params()),
BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
"Try to improve fuzzy spaces", this->params()),
BOOL_MEMBER(tessedit_unrej_any_wd, false,
"Don't bother with word plausibility", this->params()),
BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
this->params()),
BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
this->params()),
BOOL_MEMBER(tessedit_enable_doc_dict, true,
"Add words to the document dictionary", this->params()),
BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
this->params()),
BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
this->params()),
BOOL_MEMBER(tessedit_enable_bigram_correction, true,
"Enable correction based on the word bigram dictionary.",
this->params()),
BOOL_MEMBER(tessedit_enable_dict_correction, false,
"Enable single word correction based on the dictionary.",
this->params()),
INT_MEMBER(tessedit_bigram_debug, 0,
"Amount of debug output for bigram correction.",
this->params()),
BOOL_MEMBER(enable_noise_removal, true,
"Remove and conditionally reassign small outlines when they"
" confuse layout analysis, determining diacritics vs noise",
this->params()),
INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
this->params()),
      // Worst (min) certainty, for which a diacritic is allowed to make the
      // base character worse and still be included.
double_MEMBER(noise_cert_basechar, -8.0,
"Hingepoint for base char certainty", this->params()),
// Worst (min) certainty, for which a non-overlapping diacritic is allowed
// to make the base character worse and still be included.
double_MEMBER(noise_cert_disjoint, -1.0,
"Hingepoint for disjoint certainty", this->params()),
// Worst (min) certainty, for which a diacritic is allowed to make a new
// stand-alone blob.
double_MEMBER(noise_cert_punc, -3.0,
"Threshold for new punc char certainty", this->params()),
// Factor of certainty margin for adding diacritics to not count as worse.
double_MEMBER(noise_cert_factor, 0.375,
"Scaling on certainty diff from Hingepoint",
this->params()),
INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
this->params()),
INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
this->params()),
INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
this->params()),
STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
this->params()),
STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
this->params()),
STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
this->params()),
double_MEMBER(quality_rej_pc, 0.08,
"good_quality_doc lte rejection limit", this->params()),
double_MEMBER(quality_blob_pc, 0.0,
"good_quality_doc gte good blobs limit", this->params()),
double_MEMBER(quality_outline_pc, 1.0,
"good_quality_doc lte outline error limit", this->params()),
double_MEMBER(quality_char_pc, 0.95,
"good_quality_doc gte good char limit", this->params()),
INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
this->params()),
INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
"Adaptation decision algorithm for tess", this->params()),
BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
"Do minimal rejection on pass 1 output", this->params()),
BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
this->params()),
BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
this->params()),
INT_MEMBER(tessedit_test_adaption_mode, 3,
"Adaptation decision algorithm for tess", this->params()),
BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
this->params()),
BOOL_MEMBER(paragraph_text_based, true,
"Run paragraph detection on the post-text-recognition "
"(more accurate)",
this->params()),
INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
this->params()),
STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
this->params()),
BOOL_MEMBER(docqual_excuse_outline_errs, false,
"Allow outline errs in unrejection?", this->params()),
BOOL_MEMBER(tessedit_good_quality_unrej, true,
"Reduce rejection on good docs", this->params()),
BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
this->params()),
double_MEMBER(tessedit_reject_doc_percent, 65.00,
"%rej allowed before rej whole doc", this->params()),
double_MEMBER(tessedit_reject_block_percent, 45.00,
"%rej allowed before rej whole block", this->params()),
double_MEMBER(tessedit_reject_row_percent, 40.00,
"%rej allowed before rej whole row", this->params()),
double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
"Number of row rejects in whole word rejects"
"which prevents whole row rejection",
this->params()),
BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
"Only rej partially rejected words in block rejection",
this->params()),
BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
"Only rej partially rejected words in row rejection",
this->params()),
BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false,
"Use word segmentation quality metric", this->params()),
BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false,
"Use word segmentation quality metric", this->params()),
INT_MEMBER(tessedit_preserve_min_wd_len, 2,
"Only preserve wds longer than this", this->params()),
BOOL_MEMBER(tessedit_row_rej_good_docs, true,
"Apply row rejection to good docs", this->params()),
double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
"rej good doc wd if more than this fraction rejected",
this->params()),
BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
"Reject all bad quality wds", this->params()),
BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
this->params()),
BOOL_MEMBER(tessedit_debug_quality_metrics, false,
"Output data to debug file", this->params()),
      BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
this->params()),
double_MEMBER(quality_rowrej_pc, 1.1,
"good_quality_doc gte good char limit", this->params()),
BOOL_MEMBER(unlv_tilde_crunching, true,
"Mark v.bad words for tilde crunch", this->params()),
BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
this->params()),
BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
this->params()),
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
"Take out ~^ early?", this->params()),
double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
this->params()),
BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
double_MEMBER(crunch_poor_garbage_cert, -9.0,
"crunch garbage cert lt this", this->params()),
double_MEMBER(crunch_poor_garbage_rate, 60,
"crunch garbage rating lt this", this->params()),
double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
this->params()),
double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
this->params()),
BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
this->params()),
double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
this->params()),
double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
this->params()),
double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
this->params()),
double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
this->params()),
double_MEMBER(crunch_del_min_width, 3.0,
"Del if word width lt xht x this", this->params()),
double_MEMBER(crunch_del_high_word, 1.5,
"Del if word gt xht x this above bl", this->params()),
double_MEMBER(crunch_del_low_word, 0.5,
"Del if word gt xht x this below bl", this->params()),
double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
this->params()),
INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
this->params()),
INT_MEMBER(crunch_pot_indicators, 1,
"How many potential indicators needed", this->params()),
BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
this->params()),
BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
this->params()),
BOOL_MEMBER(crunch_leave_accept_strings, false,
"Don't pot crunch sensible strings", this->params()),
BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
this->params()),
INT_MEMBER(crunch_leave_lc_strings, 4,
"Don't crunch words with long lower case strings",
this->params()),
INT_MEMBER(crunch_leave_uc_strings, 4,
"Don't crunch words with long lower case strings",
this->params()),
INT_MEMBER(crunch_long_repetitions, 3,
"Crunch words with long repetitions", this->params()),
INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
INT_MEMBER(fixsp_non_noise_limit, 1,
"How many non-noise blbs either side?", this->params()),
double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
this->params()),
BOOL_MEMBER(tessedit_prefer_joined_punct, false,
"Reward punctation joins", this->params()),
      INT_MEMBER(fixsp_done_mode, 1, "What constitutes done for spacing",
this->params()),
INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
this->params()),
STRING_MEMBER(numeric_punctuation, ".,",
"Punct. chs expected WITHIN numbers", this->params()),
INT_MEMBER(x_ht_acceptance_tolerance, 8,
"Max allowed deviation of blob top outside of font data",
this->params()),
INT_MEMBER(x_ht_min_change, 8,
"Min change in xht before actually trying it", this->params()),
INT_MEMBER(superscript_debug, 0,
"Debug level for sub & superscript fixer", this->params()),
double_MEMBER(
superscript_worse_certainty, 2.0,
"How many times worse "
"certainty does a superscript position glyph need to be for "
"us to try classifying it as a char with a different "
"baseline?",
this->params()),
double_MEMBER(
superscript_bettered_certainty, 0.97,
"What reduction in "
"badness do we think sufficient to choose a superscript "
"over what we'd thought. For example, a value of 0.6 means "
"we want to reduce badness of certainty by at least 40%",
this->params()),
double_MEMBER(superscript_scaledown_ratio, 0.4,
"A superscript scaled down more than this is unbelievably "
"small. For example, 0.3 means we expect the font size to "
"be no smaller than 30% of the text line font size.",
this->params()),
double_MEMBER(subscript_max_y_top, 0.5,
"Maximum top of a character measured as a multiple of "
"x-height above the baseline for us to reconsider whether "
"it's a subscript.",
this->params()),
double_MEMBER(superscript_min_y_bottom, 0.3,
"Minimum bottom of a character measured as a multiple of "
"x-height above the baseline for us to reconsider whether "
"it's a superscript.",
this->params()),
BOOL_MEMBER(tessedit_write_block_separators, false,
"Write block separators in output", this->params()),
BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
this->params()),
BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
this->params()),
BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
this->params()),
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
this->params()),
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
this->params()),
BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
this->params()),
STRING_MEMBER(unrecognised_char, "|",
"Output char for unidentified blobs", this->params()),
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
INT_MEMBER(suspect_space_level, 100,
"Min suspect level for rejecting spaces", this->params()),
INT_MEMBER(suspect_short_words, 2,
"Don't suspect dict wds longer than this", this->params()),
BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
this->params()),
double_MEMBER(suspect_rating_per_ch, 999.9,
"Don't touch bad rating limit", this->params()),
double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
this->params()),
BOOL_MEMBER(tessedit_minimal_rejection, false,
"Only reject tess failures", this->params()),
BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
this->params()),
BOOL_MEMBER(tessedit_word_for_word, false,
"Make output have exactly one word per WERD", this->params()),
BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
"Don't reject ANYTHING AT ALL", this->params()),
BOOL_MEMBER(tessedit_consistent_reps, true,
"Force all rep chars the same", this->params()),
INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
this->params()),
BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
this->params()),
BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
this->params()),
double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
"Aspect ratio dot/hyphen test", this->params()),
double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
"Aspect ratio dot/hyphen test", this->params()),
BOOL_MEMBER(rej_trust_doc_dawg, false,
"Use DOC dawg in 11l conf. detector", this->params()),
BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
this->params()),
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
this->params()),
BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
this->params()),
BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
this->params()),
BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
this->params()),
BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
this->params()),
BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
this->params()),
double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
"if >this fract", this->params()),
INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
this->params()),
STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
"Allow NN to unrej", this->params()),
STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
this->params()),
INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
this->params()),
BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
this->params()),
INT_MEMBER(tessedit_page_number, -1,
"-1 -> All pages"
" , else specifc page to process",
this->params()),
BOOL_MEMBER(tessedit_write_images, false,
"Capture the image from the IPE", this->params()),
BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
this->params()),
STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
this->params()),
INT_MEMBER(tessdata_manager_debug_level, 0,
"Debug level for"
" TessdataManager functions.",
this->params()),
STRING_MEMBER(tessedit_load_sublangs, "",
"List of languages to load with this one", this->params()),
BOOL_MEMBER(tessedit_use_primary_params_model, false,
"In multilingual mode use params model of the"
" primary language",
this->params()),
double_MEMBER(min_orientation_margin, 7.0,
"Min acceptable orientation margin", this->params()),
BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
this->params()),
BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
this->params()),
BOOL_MEMBER(poly_allow_detailed_fx, false,
"Allow feature extractors to see the original outline",
this->params()),
BOOL_INIT_MEMBER(tessedit_init_config_only, false,
"Only initialize with the config file. Useful if the "
"instance is not going to be used for OCR but say only "
"for layout analysis.",
this->params()),
BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
this->params()),
BOOL_MEMBER(textord_tabfind_vertical_text, true,
"Enable vertical detection", this->params()),
BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
"Force using vertical text page mode", this->params()),
double_MEMBER(
textord_tabfind_vertical_text_ratio, 0.5,
"Fraction of textlines deemed vertical to use vertical page "
"mode",
this->params()),
double_MEMBER(
textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.",
this->params()),
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
this->params()),
BOOL_MEMBER(preserve_interword_spaces, false,
"Preserve multiple interword spaces", this->params()),
BOOL_MEMBER(include_page_breaks, FALSE,
"Include page separator string in output text after each "
"image/page.",
this->params()),
STRING_MEMBER(page_separator, "\f",
"Page separator (default is form feed control character)",
this->params()),
      // The following parameters were deprecated and removed from their
      // original locations. The parameters are temporarily kept here to give
      // Tesseract users a chance to update their [lang].traineddata and config files
// without introducing failures during Tesseract initialization.
// TODO(ocr-team): remove these parameters from the code once we are
// reasonably sure that Tesseract users have updated their data files.
//
// BEGIN DEPRECATED PARAMETERS
BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
"find horizontal lines such as headers in vertical page mode",
this->params()),
INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm",
this->params()),
BOOL_INIT_MEMBER(load_fixed_length_dawgs, true,
"Load fixed length dawgs"
" (e.g. for non-space delimited languages)",
this->params()),
INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
this->params()),
BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
this->params()),
double_MEMBER(bestrate_pruning_factor, 2.0,
"Multiplying factor of"
" current best rate to prune other hypotheses",
this->params()),
BOOL_MEMBER(permute_script_word, 0,
"Turn on word script consistency permuter", this->params()),
BOOL_MEMBER(segment_segcost_rating, 0,
"incorporate segmentation cost in word rating?",
this->params()),
double_MEMBER(segment_reward_script, 0.95,
"Score multipler for script consistency within a word. "
"Being a 'reward' factor, it should be <= 1. "
"Smaller value implies bigger reward.",
this->params()),
BOOL_MEMBER(permute_fixed_length_dawg, 0,
"Turn on fixed-length phrasebook search permuter",
this->params()),
BOOL_MEMBER(permute_chartype_word, 0,
"Turn on character type (property) consistency permuter",
this->params()),
double_MEMBER(segment_reward_chartype, 0.97,
"Score multipler for char type consistency within a word. ",
this->params()),
double_MEMBER(segment_reward_ngram_best_choice, 0.99,
"Score multipler for ngram permuter's best choice"
" (only used in the Han script path).",
this->params()),
BOOL_MEMBER(ngram_permuter_activated, false,
"Activate character-level n-gram-based permuter",
this->params()),
BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
this->params()),
INT_MEMBER(language_model_fixed_length_choices_depth, 3,
"Depth of blob choice lists to explore"
" when fixed length dawgs are on",
this->params()),
BOOL_MEMBER(use_new_state_cost, FALSE,
"use new state cost heuristics for segmentation state"
" evaluation",
this->params()),
double_MEMBER(heuristic_segcost_rating_base, 1.25,
"base factor for adding segmentation cost into word rating."
"It's a multiplying factor, the larger the value above 1, "
"the bigger the effect of segmentation cost.",
this->params()),
double_MEMBER(heuristic_weight_rating, 1.0,
"weight associated with char rating in combined cost of"
"state",
this->params()),
double_MEMBER(heuristic_weight_width, 1000.0,
"weight associated with width evidence in combined cost of"
" state",
this->params()),
double_MEMBER(heuristic_weight_seamcut, 0.0,
"weight associated with seam cut in combined cost of state",
this->params()),
double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
"max char width-to-height ratio allowed in segmentation",
this->params()),
BOOL_MEMBER(enable_new_segsearch, true,
"Enable new segmentation search path.", this->params()),
double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
"Maximum character width-to-height ratio for"
" fixed-pitch fonts",
this->params()),
// END DEPRECATED PARAMETERS
backup_config_file_(NULL),
pix_binary_(NULL),
cube_binary_(NULL),
pix_grey_(NULL),
pix_original_(NULL),
pix_thresholds_(NULL),
source_resolution_(0),
textord_(this),
right_to_left_(false),
scaled_color_(NULL),
scaled_factor_(-1),
deskew_(1.0f, 0.0f),
reskew_(1.0f, 0.0f),
most_recently_used_(this),
font_table_size_(0),
#ifndef NO_CUBE_BUILD
cube_cntxt_(NULL),
tess_cube_combiner_(NULL),
#endif
equ_detect_(NULL) {
}
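// All of the BOOL_/INT_/STRING_/double_MEMBER entries above register run-time
// parameters with this->params(), so they can be changed without recompiling.
// A minimal usage sketch (the image and config file names here are
// hypothetical, and -c assumes a tesseract build that supports that flag):
//
//   # contents of a config file "myconfig": one "name value" pair per line
//   tessedit_create_hocr      1
//   preserve_interword_spaces 1
//
//   $ tesseract page.tif page myconfig
//   $ tesseract page.tif page -c tessedit_create_hocr=1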
Tesseract::~Tesseract() {
Clear();
pixDestroy(&pix_original_);
end_tesseract();
sub_langs_.delete_data_pointers();
#ifndef NO_CUBE_BUILD
// Delete cube objects.
if (cube_cntxt_ != NULL) {
delete cube_cntxt_;
cube_cntxt_ = NULL;
}
if (tess_cube_combiner_ != NULL) {
delete tess_cube_combiner_;
tess_cube_combiner_ = NULL;
}
#endif
}
void Tesseract::Clear() {
pixDestroy(&pix_binary_);
pixDestroy(&cube_binary_);
pixDestroy(&pix_grey_);
pixDestroy(&pix_thresholds_);
pixDestroy(&scaled_color_);
deskew_ = FCOORD(1.0f, 0.0f);
reskew_ = FCOORD(1.0f, 0.0f);
splitter_.Clear();
scaled_factor_ = -1;
for (int i = 0; i < sub_langs_.size(); ++i)
sub_langs_[i]->Clear();
}
void Tesseract::SetEquationDetect(EquationDetect* detector) {
equ_detect_ = detector;
equ_detect_->SetLangTesseract(this);
}
// Clear all memory of adaption for this and all subclassifiers.
void Tesseract::ResetAdaptiveClassifier() {
ResetAdaptiveClassifierInternal();
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->ResetAdaptiveClassifierInternal();
}
}
// Clear the document dictionary for this and all subclassifiers.
void Tesseract::ResetDocumentDictionary() {
getDict().ResetDocumentDictionary();
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->getDict().ResetDocumentDictionary();
}
}
void Tesseract::SetBlackAndWhitelist() {
// Set the white and blacklists (if any)
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
// Black and white lists should apply to all loaded classifiers.
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->unicharset.set_black_and_whitelist(
tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
}
}
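// A minimal usage sketch for the lists above, assuming the standard
// TessBaseAPI wrapper and an installed "eng" traineddata (illustrative only):
//
//   tesseract::TessBaseAPI api;
//   api.Init(NULL, "eng");
//   api.SetVariable("tessedit_char_whitelist", "0123456789");  // digits only
//   api.SetVariable("tessedit_char_blacklist", "");            // nothing banned
//   // SetBlackAndWhitelist() then pushes both lists into the unicharset of
//   // the main language and of every loaded sub-language before recognition.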
// Perform steps to prepare underlying binary image/other data structures for
// page segmentation.
void Tesseract::PrepareForPageseg() {
textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
pixDestroy(&cube_binary_);
cube_binary_ = pixClone(pix_binary());
// Find the max splitter strategy over all langs.
ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy =
static_cast<ShiroRekhaSplitter::SplitStrategy>(
static_cast<inT32>(pageseg_devanagari_split_strategy));
for (int i = 0; i < sub_langs_.size(); ++i) {
ShiroRekhaSplitter::SplitStrategy pageseg_strategy =
static_cast<ShiroRekhaSplitter::SplitStrategy>(
static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
if (pageseg_strategy > max_pageseg_strategy)
max_pageseg_strategy = pageseg_strategy;
// Clone the cube image to all the sub langs too.
pixDestroy(&sub_langs_[i]->cube_binary_);
sub_langs_[i]->cube_binary_ = pixClone(pix_binary());
pixDestroy(&sub_langs_[i]->pix_binary_);
sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
}
// Perform shiro-rekha (top-line) splitting and replace the current image by
// the newly splitted image.
splitter_.set_orig_pix(pix_binary());
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
if (splitter_.Split(true)) {
ASSERT_HOST(splitter_.splitted_image());
pixDestroy(&pix_binary_);
pix_binary_ = pixClone(splitter_.splitted_image());
}
}
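// Note on the splitter above: with the default NO_SPLIT strategy in every
// loaded language, Split(true) performs no top-line splitting and pix_binary_
// keeps the original binarized image. A hedged sketch of enabling it through
// the generic variable mechanism (the numeric value's meaning follows the
// ShiroRekhaSplitter::SplitStrategy enum):
//
//   api.SetVariable("pageseg_devanagari_split_strategy", "1");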
// Perform steps to prepare underlying binary image/other data structures for
// OCR. The current segmentation is required by this method.
// Note that this method resets pix_binary_ to the original binarized image,
// which may be different from the image actually used for OCR depending on the
// value of devanagari_ocr_split_strategy.
void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
Tesseract* osd_tess, OSResults* osr) {
// Find the max splitter strategy over all langs.
ShiroRekhaSplitter::SplitStrategy max_ocr_strategy =
static_cast<ShiroRekhaSplitter::SplitStrategy>(
static_cast<inT32>(ocr_devanagari_split_strategy));
for (int i = 0; i < sub_langs_.size(); ++i) {
ShiroRekhaSplitter::SplitStrategy ocr_strategy =
static_cast<ShiroRekhaSplitter::SplitStrategy>(
static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
if (ocr_strategy > max_ocr_strategy)
max_ocr_strategy = ocr_strategy;
}
// Utilize the segmentation information available.
splitter_.set_segmentation_block_list(block_list);
splitter_.set_ocr_split_strategy(max_ocr_strategy);
// Run the splitter for OCR
bool split_for_ocr = splitter_.Split(false);
// Restore pix_binary to the binarized original pix for future reference.
ASSERT_HOST(splitter_.orig_pix());
pixDestroy(&pix_binary_);
pix_binary_ = pixClone(splitter_.orig_pix());
// If the pageseg and ocr strategies are different, refresh the block list
// (from the last SegmentImage call) with blobs from the real image to be used
// for OCR.
if (splitter_.HasDifferentSplitStrategies()) {
BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
pixGetHeight(pix_binary_));
Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
splitter_.orig_pix();
extract_edges(pix_for_ocr, &block);
splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
}
// The splitter isn't needed any more after this, so save memory by clearing.
splitter_.Clear();
}
} // namespace tesseract

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
/**********************************************************************
* File: tessvars.cpp (Formerly tessvars.c)
* Description: Variables and other globals for tessedit.
* Author: Ray Smith
* Created: Mon Apr 13 13:13:23 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <stdio.h>
#include "tessvars.h"
FILE *debug_fp = stderr; // write debug stuff here

View File

@ -0,0 +1,27 @@
/**********************************************************************
* File: tessvars.h (Formerly tessvars.h)
* Description: Variables and other globals for tessedit.
* Author: Ray Smith
* Created: Mon Apr 13 13:13:23 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSVARS_H
#define TESSVARS_H
#include <stdio.h>
extern FILE *debug_fp; // write debug stuff here
#endif

View File

@ -0,0 +1,330 @@
/**********************************************************************
* File: tfacepp.cpp (Formerly tface++.c)
* Description: C++ side of the C/C++ Tess/Editor interface.
* Author: Ray Smith
* Created: Thu Apr 23 15:39:23 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#pragma warning(disable:4305) // int/float warnings
#pragma warning(disable:4800) // int/bool warnings
#endif
#include <math.h>
#include "blamer.h"
#include "errcode.h"
#include "ratngs.h"
#include "reject.h"
#include "tesseractclass.h"
#include "werd.h"
#define MAX_UNDIVIDED_LENGTH 24
/**********************************************************************
* recog_word
*
* Convert the word to tess form and pass it to the tess segmenter.
* Convert the output back to editor form.
**********************************************************************/
namespace tesseract {
void Tesseract::recog_word(WERD_RES *word) {
if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL ||
word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
if (classify_debug_level) tprintf("No truth for word - skipping\n");
word->tess_failed = true;
return;
}
ASSERT_HOST(!word->chopped_word->blobs.empty());
recog_word_recursive(word);
word->SetupBoxWord();
if (word->best_choice->length() != word->box_word->length()) {
tprintf("recog_word ASSERT FAIL String:\"%s\"; "
"Strlen=%d; #Blobs=%d\n",
word->best_choice->debug_string().string(),
word->best_choice->length(), word->box_word->length());
}
ASSERT_HOST(word->best_choice->length() == word->box_word->length());
// Check that the ratings matrix size matches the sum of all the
// segmentation states.
if (!word->StatesAllValid()) {
tprintf("Not all words have valid states relative to ratings matrix!!");
word->DebugWordChoices(true, NULL);
ASSERT_HOST(word->StatesAllValid());
}
if (tessedit_override_permuter) {
/* Override the permuter type if a straight dictionary check disagrees. */
uinT8 perm_type = word->best_choice->permuter();
if ((perm_type != SYSTEM_DAWG_PERM) &&
(perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
uinT8 real_dict_perm_type = dict_word(*word->best_choice);
if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
(real_dict_perm_type == FREQ_DAWG_PERM) ||
(real_dict_perm_type == USER_DAWG_PERM)) &&
(alpha_count(word->best_choice->unichar_string().string(),
word->best_choice->unichar_lengths().string()) > 0)) {
word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
}
}
if (tessedit_rejection_debug &&
perm_type != word->best_choice->permuter()) {
tprintf("Permuter Type Flipped from %d to %d\n",
perm_type, word->best_choice->permuter());
}
}
// Factored out from control.cpp
ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
if (word->best_choice == NULL || word->best_choice->length() == 0 ||
static_cast<int>(strspn(word->best_choice->unichar_string().string(),
" ")) == word->best_choice->length()) {
word->tess_failed = true;
word->reject_map.initialise(word->box_word->length());
word->reject_map.rej_word_tess_failure();
}
else {
word->tess_failed = false;
}
}
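// Worked example of the permuter override above (illustrative): if the best
// choice comes back with a non-dictionary permuter such as NUMBER_PERM, but a
// direct dict_word() lookup classifies the same string as SYSTEM_DAWG_PERM
// and alpha_count() finds at least one letter, the stored permuter is upgraded
// so that later stages can treat the word as a dictionary match.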
/**********************************************************************
* recog_word_recursive
*
* Convert the word to tess form and pass it to the tess segmenter.
* Convert the output back to editor form.
**********************************************************************/
void Tesseract::recog_word_recursive(WERD_RES *word) {
int word_length = word->chopped_word->NumBlobs(); // no of blobs
if (word_length > MAX_UNDIVIDED_LENGTH) {
return split_and_recog_word(word);
}
cc_recog(word);
word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
// Do sanity checks and minor fixes on best_choice.
if (word->best_choice->length() > word_length) {
word->best_choice->make_bad(); // should never happen
tprintf("recog_word: Discarded long string \"%s\""
" (%d characters vs %d blobs)\n",
word->best_choice->unichar_string().string(),
word->best_choice->length(), word_length);
tprintf("Word is at:");
word->word->bounding_box().print();
}
if (word->best_choice->length() < word_length) {
UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
while (word->best_choice->length() < word_length) {
word->best_choice->append_unichar_id(space_id, 1, 0.0,
word->best_choice->certainty());
}
}
}
/**********************************************************************
* split_and_recog_word
*
* Split the word into 2 smaller pieces at the largest gap.
* Recognize the pieces and stick the results back together.
**********************************************************************/
void Tesseract::split_and_recog_word(WERD_RES *word) {
// Find the biggest blob gap in the chopped_word.
int bestgap = -MAX_INT32;
int split_index = 0;
for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
int gap = blob_box.left() - prev_box.right();
if (gap > bestgap) {
bestgap = gap;
split_index = b;
}
}
ASSERT_HOST(split_index > 0);
WERD_RES *word2 = NULL;
BlamerBundle *orig_bb = NULL;
split_word(word, split_index, &word2, &orig_bb);
// Recognize the first part of the word.
recog_word_recursive(word);
// Recognize the second part of the word.
recog_word_recursive(word2);
join_words(word, word2, orig_bb);
}
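// Worked example of the gap search above (numbers are illustrative): with
// blob bounding boxes spanning x = [10..30], [34..50] and [70..90], the
// candidate gaps are 34 - 30 = 4 and 70 - 50 = 20, so split_index becomes 2
// and the word is divided between the second and third blobs before each
// half is recognized and re-joined.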
/**********************************************************************
* split_word
*
* Split a given WERD_RES in place into two smaller words for recognition.
* split_pt is the index of the first blob to go in the second word.
* The underlying word is left alone, only the TWERD (and subsequent data)
* are split up. orig_blamer_bundle is set to the original blamer bundle,
* and will now be owned by the caller. New blamer bundles are forged for the
* two pieces.
**********************************************************************/
void Tesseract::split_word(WERD_RES *word,
int split_pt,
WERD_RES **right_piece,
BlamerBundle **orig_blamer_bundle) const {
ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
// Save a copy of the blamer bundle so we can try to reconstruct it below.
BlamerBundle *orig_bb =
word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
WERD_RES *word2 = new WERD_RES(*word);
// blow away the copied chopped_word, as we want to work with
// the blobs from the input chopped_word so seam_arrays can be merged.
TWERD *chopped = word->chopped_word;
TWERD *chopped2 = new TWERD;
chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
chopped2->blobs.push_back(chopped->blobs[i]);
}
chopped->blobs.truncate(split_pt);
word->chopped_word = NULL;
delete word2->chopped_word;
word2->chopped_word = NULL;
const UNICHARSET &unicharset = *word->uch_set;
word->ClearResults();
word2->ClearResults();
word->chopped_word = chopped;
word2->chopped_word = chopped2;
word->SetupBasicsFromChoppedWord(unicharset);
word2->SetupBasicsFromChoppedWord(unicharset);
// Try to adjust the blamer bundle.
if (orig_bb != NULL) {
// TODO(rays) Looks like a leak to me.
// orig_bb should take, rather than copy.
word->blamer_bundle = new BlamerBundle();
word2->blamer_bundle = new BlamerBundle();
orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
word2->chopped_word->blobs[0]->bounding_box().left(),
wordrec_debug_blamer,
word->blamer_bundle, word2->blamer_bundle);
}
*right_piece = word2;
*orig_blamer_bundle = orig_bb;
}
/**********************************************************************
* join_words
*
* The opposite of split_word():
* join word2 (including any recognized data / seam array / etc)
* onto the right of word and then delete word2.
* Also, if orig_bb is provided, stitch it back into word.
**********************************************************************/
void Tesseract::join_words(WERD_RES *word,
WERD_RES *word2,
BlamerBundle *orig_bb) const {
TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
// Tack the word2 outputs onto the end of the word outputs.
word->chopped_word->blobs += word2->chopped_word->blobs;
word->rebuild_word->blobs += word2->rebuild_word->blobs;
word2->chopped_word->blobs.clear();
word2->rebuild_word->blobs.clear();
TPOINT split_pt;
split_pt.x = (prev_box.right() + blob_box.left()) / 2;
split_pt.y = (prev_box.top() + prev_box.bottom() +
blob_box.top() + blob_box.bottom()) / 4;
// Move the word2 seams onto the end of the word1 seam_array.
// Since the seam list is one element short, an empty seam marking the
// end of the last blob in the first word is needed first.
word->seam_array.push_back(new SEAM(0.0f, split_pt));
word->seam_array += word2->seam_array;
word2->seam_array.truncate(0);
// Fix widths and gaps.
word->blob_widths += word2->blob_widths;
word->blob_gaps += word2->blob_gaps;
// Fix the ratings matrix.
int rat1 = word->ratings->dimension();
int rat2 = word2->ratings->dimension();
word->ratings->AttachOnCorner(word2->ratings);
ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
word->best_state += word2->best_state;
// Append the word choices.
*word->raw_choice += *word2->raw_choice;
// How many alt choices from each should we try to get?
const int kAltsPerPiece = 2;
// When do we start throwing away extra alt choices?
const int kTooManyAltChoices = 100;
// Construct the cartesian product of the best_choices of word(1) and word2.
WERD_CHOICE_LIST joined_choices;
WERD_CHOICE_IT jc_it(&joined_choices);
WERD_CHOICE_IT bc1_it(&word->best_choices);
WERD_CHOICE_IT bc2_it(&word2->best_choices);
int num_word1_choices = word->best_choices.length();
int total_joined_choices = num_word1_choices;
// Nota Bene: For the main loop here, we operate only on the 2nd and greater
// word2 choices, and put them in the joined_choices list. The 1st word2
// choice gets added to the original word1 choices in-place after we have
// finished with them.
int bc2_index = 1;
for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
if (total_joined_choices >= kTooManyAltChoices &&
bc2_index > kAltsPerPiece)
break;
int bc1_index = 0;
for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
++bc1_index, bc1_it.forward()) {
if (total_joined_choices >= kTooManyAltChoices &&
bc1_index > kAltsPerPiece)
break;
WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
*wc += *bc2_it.data();
jc_it.add_after_then_move(wc);
++total_joined_choices;
}
}
// Now that we've filled in as many alternates as we want, paste the best
// choice for word2 onto the original word alt_choices.
bc1_it.move_to_first();
bc2_it.move_to_first();
for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
*bc1_it.data() += *bc2_it.data();
}
bc1_it.move_to_last();
bc1_it.add_list_after(&joined_choices);
// Restore the pointer to original blamer bundle and combine blamer
// information recorded in the splits.
if (orig_bb != NULL) {
orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
wordrec_debug_blamer);
delete word->blamer_bundle;
word->blamer_bundle = orig_bb;
}
word->SetupBoxWord();
word->reject_map.initialise(word->box_word->length());
delete word2;
}
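// Worked example of the choice merging above (illustrative): if word has
// best_choices {A1, A2, A3} and word2 has {B1, B2}, the loop first builds the
// joined alternates A1+B2, A2+B2, A3+B2 (word2's non-best choices only), then
// appends B1 in place to A1, A2 and A3, and finally splices the joined list
// onto the end, giving {A1+B1, A2+B1, A3+B1, A1+B2, A2+B2, A3+B2}. The
// kTooManyAltChoices / kAltsPerPiece limits only start trimming once 100
// combinations have accumulated.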
} // namespace tesseract

View File

@ -0,0 +1,334 @@
///////////////////////////////////////////////////////////////////////
// File: thresholder.cpp
// Description: Base API for thresholding images in tesseract.
// Author: Ray Smith
// Created: Mon May 12 11:28:15 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "allheaders.h"
#include "thresholder.h"
#include <string.h>
#include "otsuthr.h"
#include "openclwrapper.h"
namespace tesseract {
ImageThresholder::ImageThresholder()
: pix_(NULL),
image_width_(0), image_height_(0),
pix_channels_(0), pix_wpl_(0),
scale_(1), yres_(300), estimated_res_(300) {
SetRectangle(0, 0, 0, 0);
}
ImageThresholder::~ImageThresholder() {
Clear();
}
// Destroy the Pix if there is one, freeing memory.
void ImageThresholder::Clear() {
pixDestroy(&pix_);
}
// Return true if no image has been set.
bool ImageThresholder::IsEmpty() const {
return pix_ == NULL;
}
// SetImage makes a copy of all the image data, so it may be deleted
// immediately after this call.
// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
// Palette color images will not work properly and must be converted to
// 24 bit.
// Binary images of 1 bit per pixel may also be given but they must be
// byte packed with the MSB of the first byte being the first pixel, and a
// pixel value of one is WHITE. For binary images set bytes_per_pixel=0.
void ImageThresholder::SetImage(const unsigned char* imagedata,
int width, int height,
int bytes_per_pixel, int bytes_per_line) {
int bpp = bytes_per_pixel * 8;
if (bpp == 0) bpp = 1;
Pix* pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
l_uint32* data = pixGetData(pix);
int wpl = pixGetWpl(pix);
switch (bpp) {
case 1:
for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
for (int x = 0; x < width; ++x) {
if (imagedata[x / 8] & (0x80 >> (x % 8)))
CLEAR_DATA_BIT(data, x);
else
SET_DATA_BIT(data, x);
}
}
break;
case 8:
// Greyscale just copies the bytes in the right order.
for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
for (int x = 0; x < width; ++x)
SET_DATA_BYTE(data, x, imagedata[x]);
}
break;
case 24:
// Put the colors in the correct places in the line buffer.
for (int y = 0; y < height; ++y, imagedata += bytes_per_line) {
for (int x = 0; x < width; ++x, ++data) {
SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
}
}
break;
case 32:
// Maintain byte order consistency across different endianness.
for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) {
for (int x = 0; x < width; ++x) {
data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |
(imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];
}
}
break;
default:
tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp);
}
pixSetYRes(pix, 300);
SetImage(pix);
pixDestroy(&pix);
}
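// A minimal usage sketch for the raw-buffer path above (the grey buffer and
// its dimensions are hypothetical):
//
//   ImageThresholder thresholder;
//   // 'grey' is a caller-owned width*height buffer, one byte per pixel.
//   thresholder.SetImage(grey, width, height,
//                        /*bytes_per_pixel=*/1, /*bytes_per_line=*/width);
//   thresholder.SetSourceYResolution(300);   // pixels per inch, if known
//   Pix* binary = NULL;
//   thresholder.ThresholdToPix(PSM_AUTO, &binary);
//   // ... use 'binary', then pixDestroy(&binary);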
// Store the coordinates of the rectangle to process for later use.
// Doesn't actually do any thresholding.
void ImageThresholder::SetRectangle(int left, int top, int width, int height) {
rect_left_ = left;
rect_top_ = top;
rect_width_ = width;
rect_height_ = height;
}
// Get enough parameters to be able to rebuild bounding boxes in the
// original image (not just within the rectangle).
// Left and top are enough with top-down coordinates, but
// the height of the rectangle and the image are needed for bottom-up.
void ImageThresholder::GetImageSizes(int* left, int* top,
int* width, int* height,
int* imagewidth, int* imageheight) {
*left = rect_left_;
*top = rect_top_;
*width = rect_width_;
*height = rect_height_;
*imagewidth = image_width_;
*imageheight = image_height_;
}
// Pix vs raw, which to use? Pix is the preferred input for efficiency,
// since raw buffers are copied.
// SetImage for Pix clones its input, so the source pix may be pixDestroyed
// immediately after, but may not go away until after the Thresholder has
// finished with it.
void ImageThresholder::SetImage(const Pix* pix) {
if (pix_ != NULL)
pixDestroy(&pix_);
Pix* src = const_cast<Pix*>(pix);
int depth;
pixGetDimensions(src, &image_width_, &image_height_, &depth);
// Convert the image as necessary so it is one of binary, plain RGB, or
// 8 bit with no colormap. Guarantee that we always end up with our own copy,
// not just a clone of the input.
if (pixGetColormap(src)) {
Pix* tmp = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC);
depth = pixGetDepth(tmp);
if (depth > 1 && depth < 8) {
pix_ = pixConvertTo8(tmp, false);
pixDestroy(&tmp);
}
else {
pix_ = tmp;
}
}
else if (depth > 1 && depth < 8) {
pix_ = pixConvertTo8(src, false);
}
else {
pix_ = pixCopy(NULL, src);
}
depth = pixGetDepth(pix_);
pix_channels_ = depth / 8;
pix_wpl_ = pixGetWpl(pix_);
scale_ = 1;
estimated_res_ = yres_ = pixGetYRes(pix_);
Init();
}
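// Net effect of the conversions above: 1 bpp input stays binary
// (pix_channels_ == 0), 2/4 bpp input is promoted to 8 bpp grey
// (pix_channels_ == 1), colormapped input has its colormap removed (ending up
// as grey or RGB depending on the map), and 32 bpp RGB(A) is copied as-is
// (pix_channels_ == 4). In every case pix_ is a private copy rather than a
// clone of the caller's Pix.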
// Threshold the source image as efficiently as possible to the output Pix.
// Creates a Pix and sets pix to point to the resulting pointer.
// Caller must use pixDestroy to free the created Pix.
void ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Pix** pix) {
if (pix_channels_ == 0) {
// We have a binary image, but it still has to be copied, as this API
// allows the caller to modify the output.
Pix* original = GetPixRect();
*pix = pixCopy(NULL, original);
pixDestroy(&original);
}
else {
OtsuThresholdRectToPix(pix_, pix);
}
}
// Gets a pix that contains an 8 bit threshold value at each pixel. The
// returned pix may be an integer reduction of the binary image such that
// the scale factor may be inferred from the ratio of the sizes, even down
// to the extreme of a 1x1 pixel thresholds image.
// Ideally the 8 bit threshold should be the exact threshold used to generate
// the binary image in ThresholdToPix, but this is not a hard constraint.
// Returns NULL if the input is binary. PixDestroy after use.
Pix* ImageThresholder::GetPixRectThresholds() {
if (IsBinary()) return NULL;
Pix* pix_grey = GetPixRectGrey();
int width = pixGetWidth(pix_grey);
int height = pixGetHeight(pix_grey);
int* thresholds;
int* hi_values;
OtsuThreshold(pix_grey, 0, 0, width, height, &thresholds, &hi_values);
pixDestroy(&pix_grey);
Pix* pix_thresholds = pixCreate(width, height, 8);
int threshold = thresholds[0] > 0 ? thresholds[0] : 128;
pixSetAllArbitrary(pix_thresholds, threshold);
delete[] thresholds;
delete[] hi_values;
return pix_thresholds;
}
// Common initialization shared between SetImage methods.
void ImageThresholder::Init() {
SetRectangle(0, 0, image_width_, image_height_);
}
// Get a clone/copy of the source image rectangle.
// The returned Pix must be pixDestroyed.
// This function will be used in the future by the page layout analysis, and
// the layout analysis that uses it will only be available with Leptonica,
// so there is no raw equivalent.
Pix* ImageThresholder::GetPixRect() {
if (IsFullImage()) {
// Just clone the whole thing.
return pixClone(pix_);
}
else {
// Crop to the given rectangle.
Box* box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);
Pix* cropped = pixClipRectangle(pix_, box, NULL);
boxDestroy(&box);
return cropped;
}
}
// Get a clone/copy of the source image rectangle, reduced to greyscale,
// and at the same resolution as the output binary.
// The returned Pix must be pixDestroyed.
// Provided to the classifier to extract features from the greyscale image.
Pix* ImageThresholder::GetPixRectGrey() {
Pix* pix = GetPixRect(); // May have to be reduced to grey.
int depth = pixGetDepth(pix);
if (depth != 8) {
Pix* result = depth < 8 ? pixConvertTo8(pix, false)
: pixConvertRGBToLuminance(pix);
pixDestroy(&pix);
return result;
}
return pix;
}
// Otsu thresholds the rectangle, taking the rectangle from *this.
void ImageThresholder::OtsuThresholdRectToPix(Pix* src_pix,
Pix** out_pix) const {
PERF_COUNT_START("OtsuThresholdRectToPix")
int* thresholds;
int* hi_values;
int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_,
rect_height_, &thresholds, &hi_values);
// only use opencl if compiled w/ OpenCL and selected device is opencl
#ifdef USE_OPENCL
OpenclDevice od;
if ((num_channels == 4 || num_channels == 1) &&
od.selectedDeviceIsOpenCL() && rect_top_ == 0 && rect_left_ == 0) {
od.ThresholdRectToPixOCL((unsigned char*)pixGetData(src_pix), num_channels,
pixGetWpl(src_pix) * 4, thresholds, hi_values,
out_pix /*pix_OCL*/, rect_height_, rect_width_,
rect_top_, rect_left_);
}
else {
#endif
ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix);
#ifdef USE_OPENCL
}
#endif
delete[] thresholds;
delete[] hi_values;
PERF_COUNT_END
}
/// Threshold the rectangle, taking everything except the src_pix
/// from the class, using thresholds/hi_values to the output pix.
/// NOTE that num_channels is the size of the thresholds and hi_values
/// arrays and also the bytes per pixel in src_pix.
void ImageThresholder::ThresholdRectToPix(Pix* src_pix,
int num_channels,
const int* thresholds,
const int* hi_values,
Pix** pix) const {
PERF_COUNT_START("ThresholdRectToPix")
*pix = pixCreate(rect_width_, rect_height_, 1);
uinT32* pixdata = pixGetData(*pix);
int wpl = pixGetWpl(*pix);
int src_wpl = pixGetWpl(src_pix);
uinT32* srcdata = pixGetData(src_pix);
for (int y = 0; y < rect_height_; ++y) {
const uinT32* linedata = srcdata + (y + rect_top_) * src_wpl;
uinT32* pixline = pixdata + y * wpl;
for (int x = 0; x < rect_width_; ++x) {
bool white_result = true;
for (int ch = 0; ch < num_channels; ++ch) {
int pixel = GET_DATA_BYTE(const_cast<void*>(
reinterpret_cast<const void *>(linedata)),
(x + rect_left_) * num_channels + ch);
if (hi_values[ch] >= 0 &&
(pixel > thresholds[ch]) == (hi_values[ch] == 0)) {
white_result = false;
break;
}
}
if (white_result)
CLEAR_DATA_BIT(pixline, x);
else
SET_DATA_BIT(pixline, x);
}
}
PERF_COUNT_END
}
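// Worked example of the per-pixel test above (values are illustrative): for an
// 8-bit grey rectangle OtsuThreshold typically returns num_channels == 1 with,
// say, thresholds[0] == 128 and hi_values[0] == 1. A pixel of 200 satisfies
// pixel > threshold, which differs from (hi_values[0] == 0), so white_result
// stays true and the output bit is cleared (white background). A pixel of 60
// fails pixel > threshold, which now equals (hi_values[0] == 0), so
// white_result becomes false and the bit is set (black foreground).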
} // namespace tesseract.

View File

@ -0,0 +1,189 @@
///////////////////////////////////////////////////////////////////////
// File: thresholder.h
// Description: Base API for thresholding images in tesseract.
// Author: Ray Smith
// Created: Mon May 12 11:00:15 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_THRESHOLDER_H__
#define TESSERACT_CCMAIN_THRESHOLDER_H__
#include "platform.h"
#include "publictypes.h"
struct Pix;
namespace tesseract {
/// Base class for all tesseract image thresholding classes.
/// Specific classes can add new thresholding methods by
/// overriding ThresholdToPix.
/// Each instance deals with a single image, but the design is intended to
/// be useful for multiple calls to SetRectangle and ThresholdTo* if
/// desired.
class TESS_API ImageThresholder {
public:
ImageThresholder();
virtual ~ImageThresholder();
/// Destroy the Pix if there is one, freeing memory.
virtual void Clear();
/// Return true if no image has been set.
bool IsEmpty() const;
/// SetImage makes a copy of all the image data, so it may be deleted
/// immediately after this call.
/// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
/// Palette color images will not work properly and must be converted to
/// 24 bit.
/// Binary images of 1 bit per pixel may also be given but they must be
/// byte packed with the MSB of the first byte being the first pixel, and a
/// pixel value of one is WHITE. For binary images set bytes_per_pixel=0.
void SetImage(const unsigned char* imagedata, int width, int height,
int bytes_per_pixel, int bytes_per_line);
/// Store the coordinates of the rectangle to process for later use.
/// Doesn't actually do any thresholding.
void SetRectangle(int left, int top, int width, int height);
/// Get enough parameters to be able to rebuild bounding boxes in the
/// original image (not just within the rectangle).
/// Left and top are enough with top-down coordinates, but
/// the height of the rectangle and the image are needed for bottom-up.
virtual void GetImageSizes(int* left, int* top, int* width, int* height,
int* imagewidth, int* imageheight);
/// Return true if the source image is color.
bool IsColor() const {
return pix_channels_ >= 3;
}
/// Returns true if the source image is binary.
bool IsBinary() const {
return pix_channels_ == 0;
}
int GetScaleFactor() const {
return scale_;
}
// Set the resolution of the source image in pixels per inch.
// This should be called right after SetImage(), and will let us return
// appropriate font sizes for the text.
void SetSourceYResolution(int ppi) {
yres_ = ppi;
estimated_res_ = ppi;
}
int GetSourceYResolution() const {
return yres_;
}
int GetScaledYResolution() const {
return scale_ * yres_;
}
// Set the resolution of the source image in pixels per inch, as estimated
// by the thresholder from the text size found during thresholding.
// This value will be used to set internal size thresholds during recognition
// and will not influence the output "point size." The default value is
// the same as the source resolution (yres_).
void SetEstimatedResolution(int ppi) {
estimated_res_ = ppi;
}
// Returns the estimated resolution, including any active scaling.
// This value will be used to set internal size thresholds during recognition.
int GetScaledEstimatedResolution() const {
return scale_ * estimated_res_;
}
/// Pix vs raw, which to use? Pix is the preferred input for efficiency,
/// since raw buffers are copied.
/// SetImage for Pix clones its input, so the source pix may be pixDestroyed
/// immediately after, but may not go away until after the Thresholder has
/// finished with it.
void SetImage(const Pix* pix);
/// Threshold the source image as efficiently as possible to the output Pix.
/// Creates a Pix and sets pix to point to the resulting pointer.
/// Caller must use pixDestroy to free the created Pix.
virtual void ThresholdToPix(PageSegMode pageseg_mode, Pix** pix);
// Gets a pix that contains an 8 bit threshold value at each pixel. The
// returned pix may be an integer reduction of the binary image such that
// the scale factor may be inferred from the ratio of the sizes, even down
// to the extreme of a 1x1 pixel thresholds image.
// Ideally the 8 bit threshold should be the exact threshold used to generate
// the binary image in ThresholdToPix, but this is not a hard constraint.
// Returns NULL if the input is binary. PixDestroy after use.
virtual Pix* GetPixRectThresholds();
/// Get a clone/copy of the source image rectangle.
/// The returned Pix must be pixDestroyed.
/// This function will be used in the future by the page layout analysis, and
/// the layout analysis that uses it will only be available with Leptonica,
/// so there is no raw equivalent.
Pix* GetPixRect();
// Get a clone/copy of the source image rectangle, reduced to greyscale,
// and at the same resolution as the output binary.
// The returned Pix must be pixDestroyed.
// Provided to the classifier to extract features from the greyscale image.
virtual Pix* GetPixRectGrey();
protected:
// ----------------------------------------------------------------------
// Utility functions that may be useful components for other thresholders.
/// Common initialization shared between SetImage methods.
virtual void Init();
/// Return true if we are processing the full image.
bool IsFullImage() const {
return rect_left_ == 0 && rect_top_ == 0 &&
rect_width_ == image_width_ && rect_height_ == image_height_;
}
// Otsu thresholds the rectangle, taking the rectangle from *this.
void OtsuThresholdRectToPix(Pix* src_pix, Pix** out_pix) const;
/// Threshold the rectangle, taking everything except the src_pix
/// from the class, using thresholds/hi_values to the output pix.
/// NOTE that num_channels is the size of the thresholds and hi_values
/// arrays and also the bytes per pixel in src_pix.
void ThresholdRectToPix(Pix* src_pix, int num_channels,
const int* thresholds, const int* hi_values,
Pix** pix) const;
protected:
/// Clone or other copy of the source Pix.
/// The pix will always be PixDestroy()ed on destruction of the class.
Pix* pix_;
int image_width_; //< Width of source pix_.
int image_height_; //< Height of source pix_.
int pix_channels_; //< Number of 8-bit channels in pix_.
int pix_wpl_; //< Words per line of pix_.
int scale_; //< Scale factor from original image.
int yres_; //< y pixels/inch in source image.
int estimated_res_; //< Resolution estimate from text size.
// Limits of image rectangle to be processed.
int rect_left_;
int rect_top_;
int rect_width_;
int rect_height_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_THRESHOLDER_H__
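// ---------------------------------------------------------------------------
// Illustrative usage sketch (editor's note, not part of the original source).
// The concrete class name tesseract::ImageThresholder and the PSM_AUTO page
// segmentation mode are assumed here; the member functions are the ones
// declared above.
//
//   #include "thresholder.h"
//   #include "allheaders.h"  // Leptonica, for pixRead/pixDestroy
//
//   void ThresholdExample(const char* filename) {
//     Pix* src = pixRead(filename);
//     tesseract::ImageThresholder thresholder;
//     thresholder.SetImage(src);                 // clones the input pix
//     pixDestroy(&src);                          // safe: the clone survives
//     thresholder.SetSourceYResolution(300);     // scan resolution in ppi
//     thresholder.SetRectangle(0, 0, 500, 200);  // process a sub-rectangle only
//     Pix* binary = NULL;
//     thresholder.ThresholdToPix(tesseract::PSM_AUTO, &binary);
//     // ...use the binary image...
//     pixDestroy(&binary);                       // caller owns the result
//   }
// ---------------------------------------------------------------------------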

View File

@ -0,0 +1,2 @@
#define GIT_REV "3.05.00dev"

View File

@ -0,0 +1,59 @@
/**********************************************************************
* File: werdit.cpp (Formerly wordit.c)
* Description: An iterator for passing over all the words in a document.
* Author: Ray Smith
* Created: Mon Apr 27 08:51:22 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "werdit.h"
/**********************************************************************
* make_pseudo_word
*
* Make all the blobs inside a selection into a single word.
* The returned PAGE_RES_IT* it points to the new word. After use, call
* it->DeleteCurrentWord() to delete the fake word, and then
* delete it to get rid of the iterator itself.
**********************************************************************/
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
PAGE_RES_IT pr_it(page_res);
C_BLOB_LIST new_blobs; // list of gathered blobs
C_BLOB_IT new_blob_it = &new_blobs; // iterator
for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) {
WERD* word = word_res->word;
if (word->bounding_box().overlap(selection_box)) {
C_BLOB_IT blob_it(word->cblob_list());
for (blob_it.mark_cycle_pt();
!blob_it.cycled_list(); blob_it.forward()) {
C_BLOB* blob = blob_it.data();
if (blob->bounding_box().overlap(selection_box)) {
new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
}
}
if (!new_blobs.empty()) {
WERD* pseudo_word = new WERD(&new_blobs, 1, NULL);
word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
PAGE_RES_IT* it = new PAGE_RES_IT(page_res);
while (it->word() != word_res && it->word() != NULL) it->forward();
ASSERT_HOST(it->word() == word_res);
return it;
}
}
}
return NULL;
}
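// Editor's sketch of the documented call pattern above (page_res and
// selection_box are assumed to exist in the caller):
//
//   PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
//   if (it != NULL) {
//     WERD_RES* pseudo = it->word();  // temporary word covering the selection
//     // ...inspect or recognize the pseudo word...
//     it->DeleteCurrentWord();        // remove the fake word from the page results
//     delete it;                      // then free the iterator itself
//   }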

View File

@ -0,0 +1,27 @@
/**********************************************************************
* File: werdit.h
* Description: An iterator for passing over all the words in a document.
* Author: Ray Smith
* Created: Mon Apr 27 08:51:22 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef WERDIT_H
#define WERDIT_H
#include "pageres.h"
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box);
#endif

View File

@ -0,0 +1,603 @@
///////////////////////////////////////////////////////////////////////
// File: blamer.cpp
// Description: Module allowing precise error causes to be allocated.
// Author: Rike Antonova
// Refactored: Ray Smith
// Created: Mon Feb 04 14:37:01 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "blamer.h"
#include "blobs.h"
#include "matrix.h"
#include "normalis.h"
#include "pageres.h"
// Names for each value of IncorrectResultReason enum. Keep in sync.
const char kBlameCorrect[] = "corr";
const char kBlameClassifier[] = "cl";
const char kBlameChopper[] = "chop";
const char kBlameClassLMTradeoff[] = "cl/LM";
const char kBlamePageLayout[] = "pglt";
const char kBlameSegsearchHeur[] = "ss_heur";
const char kBlameSegsearchPP[] = "ss_pp";
const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
const char kBlameAdaption[] = "adapt";
const char kBlameNoTruthSplit[] = "no_tr_spl";
const char kBlameNoTruth[] = "no_tr";
const char kBlameUnknown[] = "unkn";
const char * const kIncorrectResultReasonNames[] = {
kBlameCorrect,
kBlameClassifier,
kBlameChopper,
kBlameClassLMTradeoff,
kBlamePageLayout,
kBlameSegsearchHeur,
kBlameSegsearchPP,
kBlameClassOldLMTradeoff,
kBlameAdaption,
kBlameNoTruthSplit,
kBlameNoTruth,
kBlameUnknown
};
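// For example, kIncorrectResultReasonNames[IRR_CHOPPER] is "chop"; the array
// must stay index-aligned with the IncorrectResultReason enum in blamer.h.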
const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
return kIncorrectResultReasonNames[irr];
}
const char *BlamerBundle::IncorrectReason() const {
return kIncorrectResultReasonNames[incorrect_result_reason_];
}
// Functions to setup the blamer.
// Whole word string, whole word bounding box.
void BlamerBundle::SetWordTruth(const UNICHARSET& unicharset,
const char* truth_str, const TBOX& word_box) {
truth_word_.InsertBox(0, word_box);
truth_has_char_boxes_ = false;
// Encode the string as UNICHAR_IDs.
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
unicharset.encode_string(truth_str, false, &encoding, &lengths, NULL);
int total_length = 0;
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
STRING uch(truth_str + total_length);
uch.truncate_at(lengths[i] - total_length);
UNICHAR_ID id = encoding[i];
if (id != INVALID_UNICHAR_ID) uch = unicharset.get_normed_unichar(id);
truth_text_.push_back(uch);
}
}
// Single "character" string, "character" bounding box.
// May be called multiple times to indicate the characters in a word.
void BlamerBundle::SetSymbolTruth(const UNICHARSET& unicharset,
const char* char_str, const TBOX& char_box) {
STRING symbol_str(char_str);
UNICHAR_ID id = unicharset.unichar_to_id(char_str);
if (id != INVALID_UNICHAR_ID) {
STRING normed_uch(unicharset.get_normed_unichar(id));
if (normed_uch.length() > 0) symbol_str = normed_uch;
}
int length = truth_word_.length();
truth_text_.push_back(symbol_str);
truth_word_.InsertBox(length, char_box);
if (length == 0)
truth_has_char_boxes_ = true;
else if (truth_word_.BlobBox(length - 1) == char_box)
truth_has_char_boxes_ = false;
}
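// Editor's sketch of the two ways a caller can record ground truth (the
// unicharset, strings and boxes are assumed to come from the caller's data):
//
//   // Whole-word truth: one call, no per-character boxes.
//   bundle.SetWordTruth(unicharset, "word", word_box);
//
//   // Per-character truth: one call per symbol in reading order, giving the
//   // blamer per-character boxes for chopper/segmentation blame.
//   bundle.SetSymbolTruth(unicharset, "w", box_w);
//   bundle.SetSymbolTruth(unicharset, "o", box_o);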
// Marks that there is something wrong with the truth text, like it contains
// reject characters.
void BlamerBundle::SetRejectedTruth() {
incorrect_result_reason_ = IRR_NO_TRUTH;
truth_has_char_boxes_ = false;
}
// Returns true if the provided word_choice is correct.
bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE* word_choice) const {
if (word_choice == NULL) return false;
const UNICHARSET* uni_set = word_choice->unicharset();
STRING normed_choice_str;
for (int i = 0; i < word_choice->length(); ++i) {
normed_choice_str +=
uni_set->get_normed_unichar(word_choice->unichar_id(i));
}
STRING truth_str = TruthString();
return truth_str == normed_choice_str;
}
void BlamerBundle::FillDebugString(const STRING &msg,
const WERD_CHOICE *choice,
STRING *debug) {
(*debug) += "Truth ";
for (int i = 0; i < this->truth_text_.length(); ++i) {
(*debug) += this->truth_text_[i];
}
if (!this->truth_has_char_boxes_) (*debug) += " (no char boxes)";
if (choice != NULL) {
(*debug) += " Choice ";
STRING choice_str;
choice->string_and_lengths(&choice_str, NULL);
(*debug) += choice_str;
}
if (msg.length() > 0) {
(*debug) += "\n";
(*debug) += msg;
}
(*debug) += "\n";
}
// Sets up the norm_truth_word from truth_word using the given DENORM.
void BlamerBundle::SetupNormTruthWord(const DENORM& denorm) {
// TODO(rays) Is this the last use of denorm in WERD_RES and can it go?
norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale();
TPOINT topleft;
TPOINT botright;
TPOINT norm_topleft;
TPOINT norm_botright;
for (int b = 0; b < truth_word_.length(); ++b) {
const TBOX &box = truth_word_.BlobBox(b);
topleft.x = box.left();
topleft.y = box.top();
botright.x = box.right();
botright.y = box.bottom();
denorm.NormTransform(NULL, topleft, &norm_topleft);
denorm.NormTransform(NULL, botright, &norm_botright);
TBOX norm_box(norm_topleft.x, norm_botright.y,
norm_botright.x, norm_topleft.y);
norm_truth_word_.InsertBox(b, norm_box);
}
}
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
// bundles) where the right edge of the left-hand word is word1_right,
// and the left edge of the right-hand word is word2_left.
void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug,
BlamerBundle* bundle1,
BlamerBundle* bundle2) const {
STRING debug_str;
// Find truth boxes that correspond to the split in the blobs.
int b;
int begin2_truth_index = -1;
if (incorrect_result_reason_ != IRR_NO_TRUTH &&
truth_has_char_boxes_) {
debug_str = "Looking for truth split at";
debug_str.add_str_int(" end1_x ", word1_right);
debug_str.add_str_int(" begin2_x ", word2_left);
debug_str += "\nnorm_truth_word boxes:\n";
if (norm_truth_word_.length() > 1) {
norm_truth_word_.BlobBox(0).print_to_str(&debug_str);
for (b = 1; b < norm_truth_word_.length(); ++b) {
norm_truth_word_.BlobBox(b).print_to_str(&debug_str);
if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) <
norm_box_tolerance_) &&
(abs(word2_left - norm_truth_word_.BlobBox(b).left()) <
norm_box_tolerance_)) {
begin2_truth_index = b;
debug_str += "Split found";
break;
}
}
debug_str += '\n';
}
}
// Populate truth information in word and word2 with the first and second
// part of the original truth.
if (begin2_truth_index > 0) {
bundle1->truth_has_char_boxes_ = true;
bundle1->norm_box_tolerance_ = norm_box_tolerance_;
bundle2->truth_has_char_boxes_ = true;
bundle2->norm_box_tolerance_ = norm_box_tolerance_;
BlamerBundle *curr_bb = bundle1;
for (b = 0; b < norm_truth_word_.length(); ++b) {
if (b == begin2_truth_index) curr_bb = bundle2;
curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b));
curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b));
curr_bb->truth_text_.push_back(truth_text_[b]);
}
}
else if (incorrect_result_reason_ == IRR_NO_TRUTH) {
bundle1->incorrect_result_reason_ = IRR_NO_TRUTH;
bundle2->incorrect_result_reason_ = IRR_NO_TRUTH;
}
else {
debug_str += "Truth split not found";
debug_str += truth_has_char_boxes_ ?
"\n" : " (no truth char boxes)\n";
bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug);
bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug);
}
}
// "Joins" the blames from bundle1 and bundle2 into *this.
void BlamerBundle::JoinBlames(const BlamerBundle& bundle1,
const BlamerBundle& bundle2, bool debug) {
STRING debug_str;
IncorrectResultReason irr = incorrect_result_reason_;
if (irr != IRR_NO_TRUTH_SPLIT) debug_str = "";
if (bundle1.incorrect_result_reason_ != IRR_CORRECT &&
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH &&
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
debug_str += "Blame from part 1: ";
debug_str += bundle1.debug_;
irr = bundle1.incorrect_result_reason_;
}
if (bundle2.incorrect_result_reason_ != IRR_CORRECT &&
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH &&
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
debug_str += "Blame from part 2: ";
debug_str += bundle2.debug_;
if (irr == IRR_CORRECT) {
irr = bundle2.incorrect_result_reason_;
}
else if (irr != bundle2.incorrect_result_reason_) {
irr = IRR_UNKNOWN;
}
}
incorrect_result_reason_ = irr;
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
SetBlame(irr, debug_str, NULL, debug);
}
}
// If a blob with the same bounding box as one of the truth character
// bounding boxes is not classified as the corresponding truth character,
// blames the character classifier for the incorrect answer.
void BlamerBundle::BlameClassifier(const UNICHARSET& unicharset,
const TBOX& blob_box,
const BLOB_CHOICE_LIST& choices,
bool debug) {
if (!truth_has_char_boxes_ ||
incorrect_result_reason_ != IRR_CORRECT)
return; // Nothing to do here.
for (int b = 0; b < norm_truth_word_.length(); ++b) {
const TBOX &truth_box = norm_truth_word_.BlobBox(b);
// Note that we are more strict on the bounding box boundaries here
// than in other places (chopper, segmentation search), since we do
// not have the ability to check the previous and next bounding box.
if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_ / 2)) {
bool found = false;
bool incorrect_adapted = false;
UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
const char *truth_str = truth_text_[b].string();
// We promise not to modify the list or its contents, using a
// const BLOB_CHOICE* below.
BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST*>(&choices));
for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
choices_it.forward()) {
const BLOB_CHOICE* choice = choices_it.data();
if (strcmp(truth_str, unicharset.get_normed_unichar(
choice->unichar_id())) == 0) {
found = true;
break;
}
else if (choice->IsAdapted()) {
incorrect_adapted = true;
incorrect_adapted_id = choice->unichar_id();
}
} // end choices_it for loop
if (!found) {
STRING debug_str = "unichar ";
debug_str += truth_str;
debug_str += " not found in classification list";
SetBlame(IRR_CLASSIFIER, debug_str, NULL, debug);
}
else if (incorrect_adapted) {
STRING debug_str = "better rating for adapted ";
debug_str += unicharset.id_to_unichar(incorrect_adapted_id);
debug_str += " than for correct ";
debug_str += truth_str;
SetBlame(IRR_ADAPTION, debug_str, NULL, debug);
}
break;
}
} // end iterating over blamer_bundle->norm_truth_word
}
// Checks whether chops were made at all the character bounding box
// boundaries in word->truth_word. If not, blames the chopper for an
// incorrect answer.
void BlamerBundle::SetChopperBlame(const WERD_RES* word, bool debug) {
if (NoTruth() || !truth_has_char_boxes_ ||
word->chopped_word->blobs.empty()) {
return;
}
STRING debug_str;
bool missing_chop = false;
int num_blobs = word->chopped_word->blobs.size();
int box_index = 0;
int blob_index = 0;
inT16 truth_x = -1;
while (box_index < truth_word_.length() && blob_index < num_blobs) {
truth_x = norm_truth_word_.BlobBox(box_index).right();
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) {
++blob_index;
continue; // encountered an extra chop, keep looking
}
else if (curr_blob->bounding_box().right() >
truth_x + norm_box_tolerance_) {
missing_chop = true;
break;
}
else { // a chop was found at this truth boundary
++blob_index;
++box_index;
}
}
if (missing_chop || box_index < norm_truth_word_.length()) {
STRING debug_str;
if (missing_chop) {
debug_str.add_str_int("Detected missing chop (tolerance=",
norm_box_tolerance_);
debug_str += ") at Bounding Box=";
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
curr_blob->bounding_box().print_to_str(&debug_str);
debug_str.add_str_int("\nNo chop for truth at x=", truth_x);
}
else {
debug_str.add_str_int("Missing chops for last ",
norm_truth_word_.length() - box_index);
debug_str += " truth box(es)";
}
debug_str += "\nMaximally chopped word boxes:\n";
for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
curr_blob->bounding_box().print_to_str(&debug_str);
debug_str += '\n';
}
debug_str += "Truth bounding boxes:\n";
for (box_index = 0; box_index < norm_truth_word_.length(); ++box_index) {
norm_truth_word_.BlobBox(box_index).print_to_str(&debug_str);
debug_str += '\n';
}
SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug);
}
}
// Blames the classifier or the language model if, after running only the
// chopper, best_choice is incorrect and no blame has yet been set.
// Blames the classifier if best_choice is classifier's top choice and is a
// dictionary word (i.e. language model could not have helped).
// Otherwise, blames the language model (formerly permuter word adjustment).
void BlamerBundle::BlameClassifierOrLangModel(
const WERD_RES* word,
const UNICHARSET& unicharset, bool valid_permuter, bool debug) {
if (valid_permuter) {
// Find out whether best choice is a top choice.
best_choice_is_dict_and_top_choice_ = true;
for (int i = 0; i < word->best_choice->length(); ++i) {
BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));
ASSERT_HOST(!blob_choice_it.empty());
BLOB_CHOICE *first_choice = NULL;
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
blob_choice_it.forward()) { // find first non-fragment choice
if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) {
first_choice = blob_choice_it.data();
break;
}
}
ASSERT_HOST(first_choice != NULL);
if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
best_choice_is_dict_and_top_choice_ = false;
break;
}
}
}
STRING debug_str;
if (best_choice_is_dict_and_top_choice_) {
debug_str = "Best choice is: incorrect, top choice, dictionary word";
debug_str += " with permuter ";
debug_str += word->best_choice->permuter_name();
}
else {
debug_str = "Classifier/Old LM tradeoff is to blame";
}
SetBlame(best_choice_is_dict_and_top_choice_ ? IRR_CLASSIFIER
: IRR_CLASS_OLD_LM_TRADEOFF,
debug_str, word->best_choice, debug);
}
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
void BlamerBundle::SetupCorrectSegmentation(const TWERD* word, bool debug) {
params_training_bundle_.StartHypothesisList();
if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_)
return; // Nothing to do here.
STRING debug_str;
debug_str += "Blamer computing correct_segmentation_cols\n";
int curr_box_col = 0;
int next_box_col = 0;
int num_blobs = word->NumBlobs();
if (num_blobs == 0) return; // No blobs to play with.
int blob_index = 0;
inT16 next_box_x = word->blobs[blob_index]->bounding_box().right();
for (int truth_idx = 0; blob_index < num_blobs &&
truth_idx < norm_truth_word_.length();
++blob_index) {
++next_box_col;
inT16 curr_box_x = next_box_x;
if (blob_index + 1 < num_blobs)
next_box_x = word->blobs[blob_index + 1]->bounding_box().right();
inT16 truth_x = norm_truth_word_.BlobBox(truth_idx).right();
debug_str.add_str_int("Box x coord vs. truth: ", curr_box_x);
debug_str.add_str_int(" ", truth_x);
debug_str += "\n";
if (curr_box_x > (truth_x + norm_box_tolerance_)) {
break; // failed to find a matching box
}
else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched
(blob_index + 1 >= num_blobs || // next box can't be included
next_box_x > truth_x + norm_box_tolerance_)) {
correct_segmentation_cols_.push_back(curr_box_col);
correct_segmentation_rows_.push_back(next_box_col - 1);
++truth_idx;
debug_str.add_str_int("col=", curr_box_col);
debug_str.add_str_int(" row=", next_box_col - 1);
debug_str += "\n";
curr_box_col = next_box_col;
}
}
if (blob_index < num_blobs || // trailing blobs
correct_segmentation_cols_.length() != norm_truth_word_.length()) {
debug_str.add_str_int("Blamer failed to find correct segmentation"
" (tolerance=", norm_box_tolerance_);
if (blob_index >= num_blobs) debug_str += " blob == NULL";
debug_str += ")\n";
debug_str.add_str_int(" path length ", correct_segmentation_cols_.length());
debug_str.add_str_int(" vs. truth ", norm_truth_word_.length());
debug_str += "\n";
SetBlame(IRR_UNKNOWN, debug_str, NULL, debug);
correct_segmentation_cols_.clear();
correct_segmentation_rows_.clear();
}
}
// Returns true if a guided segmentation search is needed.
bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const {
return incorrect_result_reason_ == IRR_CORRECT &&
!segsearch_is_looking_for_blame_ &&
truth_has_char_boxes_ &&
!ChoiceIsCorrect(best_choice);
}
// Setup ready to guide the segmentation search to the correct segmentation.
// The callback pp_cb is used to avoid a cyclic dependency.
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
// WERD_RES, and the LMPainPoints itself.
// pp_cb must be a permanent callback, and should be deleted by the caller.
void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice,
MATRIX* ratings, UNICHAR_ID wildcard_id,
bool debug, STRING *debug_str,
TessResultCallback2<bool, int, int>* cb) {
segsearch_is_looking_for_blame_ = true;
if (debug) {
tprintf("segsearch starting to look for blame\n");
}
// Fill pain points for any unclassified blob corresponding to the
// correct segmentation state.
*debug_str += "Correct segmentation:\n";
for (int idx = 0; idx < correct_segmentation_cols_.length(); ++idx) {
debug_str->add_str_int("col=", correct_segmentation_cols_[idx]);
debug_str->add_str_int(" row=", correct_segmentation_rows_[idx]);
*debug_str += "\n";
if (!ratings->Classified(correct_segmentation_cols_[idx],
correct_segmentation_rows_[idx],
wildcard_id) &&
!cb->Run(correct_segmentation_cols_[idx],
correct_segmentation_rows_[idx])) {
segsearch_is_looking_for_blame_ = false;
*debug_str += "\nFailed to insert pain point\n";
SetBlame(IRR_SEGSEARCH_HEUR, *debug_str, best_choice, debug);
break;
}
} // end for blamer_bundle->correct_segmentation_cols/rows
}
// Returns true if the guided segsearch is in progress.
bool BlamerBundle::GuidedSegsearchStillGoing() const {
return segsearch_is_looking_for_blame_;
}
// The segmentation search has ended. Sets the blame appropriately.
void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice,
bool debug, STRING *debug_str) {
// If we are still looking for blame (i.e. best_choice is incorrect, but a
// path representing the correct segmentation could be constructed), we can
// blame segmentation search pain point prioritization if the rating of the
// path corresponding to the correct segmentation is better than that of
// best_choice (i.e. language model would have done the correct thing, but
// because of poor pain point prioritization the correct segmentation was
// never explored). Otherwise we blame the tradeoff between the language model
// and the classifier, since even after exploring the path corresponding to
// the correct segmentation incorrect best_choice would have been chosen.
// One special case when we blame the classifier instead is when best choice
// is incorrect, but it is a dictionary word and the classifier's top choice.
if (segsearch_is_looking_for_blame_) {
segsearch_is_looking_for_blame_ = false;
if (best_choice_is_dict_and_top_choice_) {
*debug_str = "Best choice is: incorrect, top choice, dictionary word";
*debug_str += " with permuter ";
*debug_str += best_choice->permuter_name();
SetBlame(IRR_CLASSIFIER, *debug_str, best_choice, debug);
}
else if (best_correctly_segmented_rating_ <
best_choice->rating()) {
*debug_str += "Correct segmentation state was not explored";
SetBlame(IRR_SEGSEARCH_PP, *debug_str, best_choice, debug);
}
else {
if (best_correctly_segmented_rating_ >=
WERD_CHOICE::kBadRating) {
*debug_str += "Correct segmentation paths were pruned by LM\n";
}
else {
debug_str->add_str_double("Best correct segmentation rating ",
best_correctly_segmented_rating_);
debug_str->add_str_double(" vs. best choice rating ",
best_choice->rating());
}
SetBlame(IRR_CLASS_LM_TRADEOFF, *debug_str, best_choice, debug);
}
}
}
// If the bundle is null or still does not indicate the correct result,
// fix it and use some backup reason for the blame.
void BlamerBundle::LastChanceBlame(bool debug, WERD_RES* word) {
if (word->blamer_bundle == NULL) {
word->blamer_bundle = new BlamerBundle();
word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame",
word->best_choice, debug);
}
else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) {
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
word->best_choice, debug);
}
else {
bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice);
IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_;
if (irr == IRR_CORRECT && !correct) {
STRING debug_str = "Choice is incorrect after recognition";
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice,
debug);
}
else if (irr != IRR_CORRECT && correct) {
if (debug) {
tprintf("Corrected %s\n", word->blamer_bundle->debug_.string());
}
word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT;
word->blamer_bundle->debug_ = "";
}
}
}
// Sets the misadaption debug if this word is incorrect, as this word is
// being adapted to.
void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice,
bool debug) {
if (incorrect_result_reason_ != IRR_NO_TRUTH &&
!ChoiceIsCorrect(best_choice)) {
misadaption_debug_ = "misadapt to word (";
misadaption_debug_ += best_choice->permuter_name();
misadaption_debug_ += "): ";
FillDebugString("", best_choice, &misadaption_debug_);
if (debug) {
tprintf("%s\n", misadaption_debug_.string());
}
}
}

View File

@ -0,0 +1,333 @@
///////////////////////////////////////////////////////////////////////
// File: blamer.h
// Description: Module allowing precise error causes to be allocated.
// Author: Rike Antonova
// Refactored: Ray Smith
// Created: Mon Feb 04 14:37:01 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
#define TESSERACT_CCSTRUCT_BLAMER_H_
#include <stdio.h>
#include "boxword.h"
#include "genericvector.h"
#include "matrix.h"
#include "params_training_featdef.h"
#include "ratngs.h"
#include "strngs.h"
#include "tesscallback.h"
static const inT16 kBlamerBoxTolerance = 5;
// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
// The text recorded in best choice == truth text
IRR_CORRECT,
// Either: Top choice is incorrect and is a dictionary word (language model
// is unlikely to help correct such errors, so blame the classifier).
// Or: the correct unichar was not included in the shortlist produced by the
// classifier at all.
IRR_CLASSIFIER,
// The chopper has not found one or more splits that correspond to the correct
// character bounding boxes recorded in BlamerBundle::truth_word.
IRR_CHOPPER,
// The classifier did include correct unichars for each blob in the correct
// segmentation, however its rating could have been too bad to allow the
// language model to pull out the correct choice. On the other hand, the
// language model might have been too weak to favor the correct answer;
// we call this case a classifier-language model
// tradeoff error.
IRR_CLASS_LM_TRADEOFF,
// Page layout failed to produce the correct bounding box. Blame page layout
// if the truth was not found for the word, which implies that the bounding
// box of the word was incorrect (no truth word had a similar bounding box).
IRR_PAGE_LAYOUT,
// SegSearch heuristic prevented one or more blobs from the correct
// segmentation state from being classified (e.g. the blob was too wide).
IRR_SEGSEARCH_HEUR,
// The correct segmentation state was not explored because of poor SegSearch
// pain point prioritization. We blame SegSearch pain point prioritization
// if the best rating of a choice constructed from correct segmentation is
// better than that of the best choice (i.e. if we got to explore the correct
// segmentation state, language model would have picked the correct choice).
IRR_SEGSEARCH_PP,
// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
// and thus use the old language model (permuters).
// TODO(antonova): integrate the new language model with chopper
IRR_CLASS_OLD_LM_TRADEOFF,
// If there is an incorrect adaptive template match with a better score than
// a correct one (either pre-trained or adapted), mark this as adaption error.
IRR_ADAPTION,
// split_and_recog_word() failed to find a suitable split in truth.
IRR_NO_TRUTH_SPLIT,
// Truth is not available for this word (e.g. when words in corrected content
// file are turned into ~~~~ because an appropriate alignment was not found).
IRR_NO_TRUTH,
// The text recorded in best choice != truth text, but none of the above
// reasons are set.
IRR_UNKNOWN,
IRR_NUM_REASONS
};
// Blamer-related information to determine the source of errors.
struct BlamerBundle {
static const char *IncorrectReasonName(IncorrectResultReason irr);
BlamerBundle() : truth_has_char_boxes_(false),
incorrect_result_reason_(IRR_CORRECT),
lattice_data_(NULL) {
ClearResults();
}
BlamerBundle(const BlamerBundle &other) {
this->CopyTruth(other);
this->CopyResults(other);
}
~BlamerBundle() { delete[] lattice_data_; }
// Accessors.
STRING TruthString() const {
STRING truth_str;
for (int i = 0; i < truth_text_.length(); ++i)
truth_str += truth_text_[i];
return truth_str;
}
IncorrectResultReason incorrect_result_reason() const {
return incorrect_result_reason_;
}
bool NoTruth() const {
return incorrect_result_reason_ == IRR_NO_TRUTH ||
incorrect_result_reason_ == IRR_PAGE_LAYOUT;
}
bool HasDebugInfo() const {
return debug_.length() > 0 || misadaption_debug_.length() > 0;
}
const STRING& debug() const {
return debug_;
}
const STRING& misadaption_debug() const {
return misadaption_debug_;
}
void UpdateBestRating(float rating) {
if (rating < best_correctly_segmented_rating_)
best_correctly_segmented_rating_ = rating;
}
int correct_segmentation_length() const {
return correct_segmentation_cols_.length();
}
// Returns true if the given ratings matrix col,row position is included
// in the correct segmentation path at the given index.
bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
return correct_segmentation_cols_[index] == coord.col &&
correct_segmentation_rows_[index] == coord.row;
}
void set_best_choice_is_dict_and_top_choice(bool value) {
best_choice_is_dict_and_top_choice_ = value;
}
const char* lattice_data() const {
return lattice_data_;
}
int lattice_size() const {
return lattice_size_; // size of lattice_data in bytes
}
void set_lattice_data(const char* data, int size) {
lattice_size_ = size;
delete[] lattice_data_;
lattice_data_ = new char[lattice_size_];
memcpy(lattice_data_, data, lattice_size_);
}
const tesseract::ParamsTrainingBundle& params_training_bundle() const {
return params_training_bundle_;
}
// Adds a new ParamsTrainingHypothesis to the current hypothesis list.
void AddHypothesis(const tesseract::ParamsTrainingHypothesis& hypo) {
params_training_bundle_.AddHypothesis(hypo);
}
// Functions to setup the blamer.
// Whole word string, whole word bounding box.
void SetWordTruth(const UNICHARSET& unicharset,
const char* truth_str, const TBOX& word_box);
// Single "character" string, "character" bounding box.
// May be called multiple times to indicate the characters in a word.
void SetSymbolTruth(const UNICHARSET& unicharset,
const char* char_str, const TBOX& char_box);
// Marks that there is something wrong with the truth text, like it contains
// reject characters.
void SetRejectedTruth();
// Returns true if the provided word_choice is correct.
bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
void ClearResults() {
norm_truth_word_.DeleteAllBoxes();
norm_box_tolerance_ = 0;
if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
debug_ = "";
segsearch_is_looking_for_blame_ = false;
best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
correct_segmentation_cols_.clear();
correct_segmentation_rows_.clear();
best_choice_is_dict_and_top_choice_ = false;
delete[] lattice_data_;
lattice_data_ = NULL;
lattice_size_ = 0;
}
void CopyTruth(const BlamerBundle &other) {
truth_has_char_boxes_ = other.truth_has_char_boxes_;
truth_word_ = other.truth_word_;
truth_text_ = other.truth_text_;
incorrect_result_reason_ =
(other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
}
void CopyResults(const BlamerBundle &other) {
norm_truth_word_ = other.norm_truth_word_;
norm_box_tolerance_ = other.norm_box_tolerance_;
incorrect_result_reason_ = other.incorrect_result_reason_;
segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
correct_segmentation_cols_ = other.correct_segmentation_cols_;
correct_segmentation_rows_ = other.correct_segmentation_rows_;
best_choice_is_dict_and_top_choice_ =
other.best_choice_is_dict_and_top_choice_;
if (other.lattice_data_ != NULL) {
lattice_data_ = new char[other.lattice_size_];
memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
lattice_size_ = other.lattice_size_;
}
else {
lattice_data_ = NULL;
}
}
const char *IncorrectReason() const;
// Appends choice and truth details to the given debug string.
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
STRING *debug);
// Sets up the norm_truth_word from truth_word using the given DENORM.
void SetupNormTruthWord(const DENORM& denorm);
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
// bundles) where the right edge of the left-hand word is word1_right,
// and the left edge of the right-hand word is word2_left.
void SplitBundle(int word1_right, int word2_left, bool debug,
BlamerBundle* bundle1, BlamerBundle* bundle2) const;
// "Joins" the blames from bundle1 and bundle2 into *this.
void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
bool debug);
// If a blob with the same bounding box as one of the truth character
// bounding boxes is not classified as the corresponding truth character,
// blames the character classifier for the incorrect answer.
void BlameClassifier(const UNICHARSET& unicharset,
const TBOX& blob_box,
const BLOB_CHOICE_LIST& choices,
bool debug);
// Checks whether chops were made at all the character bounding box
// boundaries in word->truth_word. If not, blames the chopper for an
// incorrect answer.
void SetChopperBlame(const WERD_RES* word, bool debug);
// Blames the classifier or the language model if, after running only the
// chopper, best_choice is incorrect and no blame has yet been set.
// Blames the classifier if best_choice is classifier's top choice and is a
// dictionary word (i.e. language model could not have helped).
// Otherwise, blames the language model (formerly permuter word adjustment).
void BlameClassifierOrLangModel(
const WERD_RES* word,
const UNICHARSET& unicharset, bool valid_permuter, bool debug);
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
void SetupCorrectSegmentation(const TWERD* word, bool debug);
// Returns true if a guided segmentation search is needed.
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
// Setup ready to guide the segmentation search to the correct segmentation.
// The callback pp_cb is used to avoid a cyclic dependency.
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
// WERD_RES, and the LMPainPoints itself.
// pp_cb must be a permanent callback, and should be deleted by the caller.
void InitForSegSearch(const WERD_CHOICE *best_choice,
MATRIX* ratings, UNICHAR_ID wildcard_id,
bool debug, STRING *debug_str,
TessResultCallback2<bool, int, int>* pp_cb);
// Returns true if the guided segsearch is in progress.
bool GuidedSegsearchStillGoing() const;
// The segmentation search has ended. Sets the blame appropriately.
void FinishSegSearch(const WERD_CHOICE *best_choice,
bool debug, STRING *debug_str);
// If the bundle is null or still does not indicate the correct result,
// fix it and use some backup reason for the blame.
static void LastChanceBlame(bool debug, WERD_RES* word);
// Sets the misadaption debug if this word is incorrect, as this word is
// being adapted to.
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
private:
void SetBlame(IncorrectResultReason irr, const STRING &msg,
const WERD_CHOICE *choice, bool debug) {
incorrect_result_reason_ = irr;
debug_ = IncorrectReason();
debug_ += " to blame: ";
FillDebugString(msg, choice, &debug_);
if (debug) tprintf("SetBlame(): %s", debug_.string());
}
private:
// Set to true when bounding boxes for individual unichars are recorded.
bool truth_has_char_boxes_;
// The truth_word_ (in the original image coordinate space) contains ground
// truth bounding boxes for this WERD_RES.
tesseract::BoxWord truth_word_;
// Same as above, but in normalized coordinates
// (filled in by WERD_RES::SetupForRecognition()).
tesseract::BoxWord norm_truth_word_;
// Tolerance for bounding box comparisons in normalized space.
int norm_box_tolerance_;
// Contains ground truth unichar for each of the bounding boxes in truth_word.
GenericVector<STRING> truth_text_;
// The reason for incorrect OCR result.
IncorrectResultReason incorrect_result_reason_;
// Debug text associated with the blame.
STRING debug_;
// Misadaption debug information (filled in if this word was misadapted to).
STRING misadaption_debug_;
// Variables used by the segmentation search when looking for the blame.
// Set to true while segmentation search is continued after the usual
// termination condition in order to look for the blame.
bool segsearch_is_looking_for_blame_;
// Best rating for correctly segmented path
// (set and used by SegSearch when looking for blame).
float best_correctly_segmented_rating_;
// Vectors populated by SegSearch to indicate column and row indices that
// correspond to blobs with correct bounding boxes.
GenericVector<int> correct_segmentation_cols_;
GenericVector<int> correct_segmentation_rows_;
// Set to true if best choice is a dictionary word and
// classifier's top choice.
bool best_choice_is_dict_and_top_choice_;
// Serialized segmentation search lattice.
char *lattice_data_;
int lattice_size_; // size of lattice_data in bytes
// Information about hypotheses (paths) explored by the segmentation search.
tesseract::ParamsTrainingBundle params_training_bundle_;
};
#endif // TESSERACT_CCSTRUCT_BLAMER_H_
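// ---------------------------------------------------------------------------
// Illustrative sketch of the guided segmentation-search lifecycle (editor's
// note, not part of the original source). The ratings matrix, wildcard id,
// debug flag and pain-point callback are assumed to be supplied by the
// caller's recognition code:
//
//   BlamerBundle* bb = word->blamer_bundle;
//   if (bb != NULL && bb->GuidedSegsearchNeeded(word->best_choice)) {
//     STRING debug_str;
//     bb->InitForSegSearch(word->best_choice, ratings, wildcard_id,
//                          debug, &debug_str, pain_point_cb);
//     // ...the segmentation search keeps exploring while
//     // bb->GuidedSegsearchStillGoing() returns true...
//     bb->FinishSegSearch(word->best_choice, debug, &debug_str);
//   }
//   BlamerBundle::LastChanceBlame(debug, word);  // backup blame if still unset
// ---------------------------------------------------------------------------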

View File

@ -0,0 +1,29 @@
/**********************************************************************
* File: blckerr.h (Formerly blockerr.h)
* Description: Error codes for the page block classes.
* Author: Ray Smith
* Created: Tue Mar 19 17:43:30 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef BLCKERR_H
#define BLCKERR_H
#include "errcode.h"
const ERRCODE BADBLOCKLINE = "Y coordinate in block out of bounds";
const ERRCODE LOSTBLOCKLINE = "Can't find rectangle for line";
const ERRCODE ILLEGAL_GRADIENT = "Gradient wrong side of edge step!";
const ERRCODE WRONG_WORD = "Word doesn't have blobs of that type";
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,847 @@
/**********************************************************************
* File: blobbox.h (Formerly blobnbox.h)
* Description: Code for the textord blob class.
* Author: Ray Smith
* Created: Thu Jul 30 09:08:51 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef BLOBBOX_H
#define BLOBBOX_H
#include "clst.h"
#include "elst2.h"
#include "werd.h"
#include "ocrblock.h"
#include "statistc.h"
enum PITCH_TYPE
{
PITCH_DUNNO, // insufficient data
PITCH_DEF_FIXED, // definitely fixed
PITCH_MAYBE_FIXED, // could be
PITCH_DEF_PROP,
PITCH_MAYBE_PROP,
PITCH_CORR_FIXED,
PITCH_CORR_PROP
};
// The possible tab-stop types of each side of a BLOBNBOX.
// The ordering is important, as it is used for deleting dead-ends in the
// search. ALIGNED, CONFIRMED and VLINE should remain greater than the
// non-aligned, unset, or deleted members.
enum TabType {
TT_NONE, // Not a tab.
TT_DELETED, // Not a tab after detailed analysis.
TT_MAYBE_RAGGED, // Initial designation of a tab-stop candidate.
TT_MAYBE_ALIGNED, // Initial designation of a tab-stop candidate.
TT_CONFIRMED, // Aligned with neighbours.
TT_VLINE // Detected as a vertical line.
};
// The possible region types of a BLOBNBOX.
// Note: keep all the text types > BRT_UNKNOWN and all the image types less.
// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
// *Type static functions below.
enum BlobRegionType {
BRT_NOISE, // Neither text nor image.
BRT_HLINE, // Horizontal separator line.
BRT_VLINE, // Vertical separator line.
BRT_RECTIMAGE, // Rectangular image.
BRT_POLYIMAGE, // Non-rectangular image.
BRT_UNKNOWN, // Not determined yet.
BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.
BRT_TEXT, // Convincing text.
BRT_COUNT // Number of possibilities.
};
// enum for elements of arrays that refer to neighbours.
// NOTE: keep in this order, so ^2 can be used to flip direction.
enum BlobNeighbourDir {
BND_LEFT,
BND_BELOW,
BND_RIGHT,
BND_ABOVE,
BND_COUNT
};
// enum for special type of text characters, such as math symbol or italic.
enum BlobSpecialTextType {
BSTT_NONE, // No special.
BSTT_ITALIC, // Italic style.
BSTT_DIGIT, // Digit symbols.
BSTT_MATH, // Mathematical symbols (not including digits).
BSTT_UNCLEAR, // Characters with low recognition rate.
BSTT_SKIP, // Characters that we skip labeling (usually too small).
BSTT_COUNT
};
inline BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir) {
return static_cast<BlobNeighbourDir>(dir ^ 2);
}
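// With the enum order above (LEFT=0, BELOW=1, RIGHT=2, ABOVE=3), XOR with 2
// flips the direction: DirOtherWay(BND_LEFT) == BND_RIGHT (0 ^ 2 == 2) and
// DirOtherWay(BND_BELOW) == BND_ABOVE (1 ^ 2 == 3).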
// BlobTextFlowType indicates the quality of neighbouring information
// related to a chain of connected components, either horizontally or
// vertically. Also used by ColPartition for the collection of blobs
// within, which should all have the same value in most cases.
enum BlobTextFlowType {
BTFT_NONE, // No text flow set yet.
BTFT_NONTEXT, // Flow too poor to be likely text.
BTFT_NEIGHBOURS, // Neighbours support flow in this direction.
BTFT_CHAIN, // There is a weak chain of text in this direction.
BTFT_STRONG_CHAIN, // There is a strong chain of text in this direction.
BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.
BTFT_LEADER, // Leader dots/dashes etc.
BTFT_COUNT
};
// Returns true if type1 dominates type2 in a merge. Mostly determined by the
// ordering of the enum, LEADER is weak and dominates nothing.
// The function is anti-symmetric (t1 > t2) === !(t2 > t1), except that
// this cannot be true if t1 == t2, so the result is undefined.
inline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {
// LEADER always loses.
if (type1 == BTFT_LEADER) return false;
if (type2 == BTFT_LEADER) return true;
// With those out of the way, the ordering of the enum determines the result.
return type1 >= type2;
}
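// For example, DominatesInMerge(BTFT_STRONG_CHAIN, BTFT_CHAIN) is true and
// DominatesInMerge(BTFT_LEADER, BTFT_NONTEXT) is false: LEADER always loses,
// otherwise the greater (or equal) enum value dominates.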
namespace tesseract {
class ColPartition;
}
class BLOBNBOX;
ELISTIZEH(BLOBNBOX)
class BLOBNBOX :public ELIST_LINK
{
public:
BLOBNBOX() {
ConstructionInit();
}
explicit BLOBNBOX(C_BLOB *srcblob) {
box = srcblob->bounding_box();
ConstructionInit();
cblob_ptr = srcblob;
area = static_cast<int>(srcblob->area());
}
~BLOBNBOX() {
if (owns_cblob_) delete cblob_ptr;
}
static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
C_BLOB* blob = new C_BLOB(outline);
return new BLOBNBOX(blob);
}
// Rotates the box and the underlying blob.
void rotate(FCOORD rotation);
// Methods that act on the box without touching the underlying blob.
// Reflect the box in the y-axis, leaving the underlying blob untouched.
void reflect_box_in_y_axis();
// Rotates the box by the angle given by rotation.
// If the blob is a diacritic, then only small rotations for skew
// correction can be applied.
void rotate_box(FCOORD rotation);
// Moves just the box by the given vector.
void translate_box(ICOORD v) {
if (IsDiacritic()) {
box.move(v);
base_char_top_ += v.y();
base_char_bottom_ += v.y();
}
else {
box.move(v);
set_diacritic_box(box);
}
}
void merge(BLOBNBOX *nextblob);
void really_merge(BLOBNBOX* other);
void chop( // fake chop blob
BLOBNBOX_IT *start_it, // location of this
BLOBNBOX_IT *blob_it, // iterator
FCOORD rotation, // for landscape
float xheight); // line height
void NeighbourGaps(int gaps[BND_COUNT]) const;
void MinMaxGapsClipped(int* h_min, int* h_max,
int* v_min, int* v_max) const;
void CleanNeighbours();
// Returns positive if there is at least one side neighbour that has a
// similar stroke width and is not on the other side of a rule line.
int GoodTextBlob() const;
// Returns the number of side neighbours that are of type BRT_NOISE.
int NoisyNeighbours() const;
// Returns true if the blob is noise and has no owner.
bool DeletableNoise() const {
return owner() == NULL && region_type() == BRT_NOISE;
}
// Returns true, and sets vert_possible/horz_possible if the blob has some
// feature that makes it individually appear to flow one way.
// eg if it has a high aspect ratio, yet has a complex shape, such as a
// joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
bool DefiniteIndividualFlow();
// Returns true if there is no tabstop violation in merging this and other.
bool ConfirmNoTabViolation(const BLOBNBOX& other) const;
// Returns true if other has a similar stroke width to this.
bool MatchingStrokeWidth(const BLOBNBOX& other,
double fractional_tolerance,
double constant_tolerance) const;
// Returns a bounding box of the outline contained within the
// given horizontal range.
TBOX BoundsWithinLimits(int left, int right);
// Estimates and stores the baseline position based on the shape of the
// outline.
void EstimateBaselinePosition();
// Simple accessors.
const TBOX& bounding_box() const {
return box;
}
// Set the bounding box. Use with caution.
// Normally use compute_bounding_box instead.
void set_bounding_box(const TBOX& new_box) {
box = new_box;
base_char_top_ = box.top();
base_char_bottom_ = box.bottom();
}
void compute_bounding_box() {
box = cblob_ptr->bounding_box();
base_char_top_ = box.top();
base_char_bottom_ = box.bottom();
baseline_y_ = box.bottom();
}
const TBOX& reduced_box() const {
return red_box;
}
void set_reduced_box(TBOX new_box) {
red_box = new_box;
reduced = TRUE;
}
inT32 enclosed_area() const {
return area;
}
bool joined_to_prev() const {
return joined != 0;
}
bool red_box_set() const {
return reduced != 0;
}
int repeated_set() const {
return repeated_set_;
}
void set_repeated_set(int set_id) {
repeated_set_ = set_id;
}
C_BLOB *cblob() const {
return cblob_ptr;
}
TabType left_tab_type() const {
return left_tab_type_;
}
void set_left_tab_type(TabType new_type) {
left_tab_type_ = new_type;
}
TabType right_tab_type() const {
return right_tab_type_;
}
void set_right_tab_type(TabType new_type) {
right_tab_type_ = new_type;
}
BlobRegionType region_type() const {
return region_type_;
}
void set_region_type(BlobRegionType new_type) {
region_type_ = new_type;
}
BlobSpecialTextType special_text_type() const {
return spt_type_;
}
void set_special_text_type(BlobSpecialTextType new_type) {
spt_type_ = new_type;
}
BlobTextFlowType flow() const {
return flow_;
}
void set_flow(BlobTextFlowType value) {
flow_ = value;
}
bool vert_possible() const {
return vert_possible_;
}
void set_vert_possible(bool value) {
vert_possible_ = value;
}
bool horz_possible() const {
return horz_possible_;
}
void set_horz_possible(bool value) {
horz_possible_ = value;
}
int left_rule() const {
return left_rule_;
}
void set_left_rule(int new_left) {
left_rule_ = new_left;
}
int right_rule() const {
return right_rule_;
}
void set_right_rule(int new_right) {
right_rule_ = new_right;
}
int left_crossing_rule() const {
return left_crossing_rule_;
}
void set_left_crossing_rule(int new_left) {
left_crossing_rule_ = new_left;
}
int right_crossing_rule() const {
return right_crossing_rule_;
}
void set_right_crossing_rule(int new_right) {
right_crossing_rule_ = new_right;
}
float horz_stroke_width() const {
return horz_stroke_width_;
}
void set_horz_stroke_width(float width) {
horz_stroke_width_ = width;
}
float vert_stroke_width() const {
return vert_stroke_width_;
}
void set_vert_stroke_width(float width) {
vert_stroke_width_ = width;
}
float area_stroke_width() const {
return area_stroke_width_;
}
tesseract::ColPartition* owner() const {
return owner_;
}
void set_owner(tesseract::ColPartition* new_owner) {
owner_ = new_owner;
}
bool leader_on_left() const {
return leader_on_left_;
}
void set_leader_on_left(bool flag) {
leader_on_left_ = flag;
}
bool leader_on_right() const {
return leader_on_right_;
}
void set_leader_on_right(bool flag) {
leader_on_right_ = flag;
}
BLOBNBOX* neighbour(BlobNeighbourDir n) const {
return neighbours_[n];
}
bool good_stroke_neighbour(BlobNeighbourDir n) const {
return good_stroke_neighbours_[n];
}
void set_neighbour(BlobNeighbourDir n, BLOBNBOX* neighbour, bool good) {
neighbours_[n] = neighbour;
good_stroke_neighbours_[n] = good;
}
bool IsDiacritic() const {
return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
}
int base_char_top() const {
return base_char_top_;
}
int base_char_bottom() const {
return base_char_bottom_;
}
int baseline_position() const {
return baseline_y_;
}
int line_crossings() const {
return line_crossings_;
}
void set_line_crossings(int value) {
line_crossings_ = value;
}
void set_diacritic_box(const TBOX& diacritic_box) {
base_char_top_ = diacritic_box.top();
base_char_bottom_ = diacritic_box.bottom();
}
BLOBNBOX* base_char_blob() const {
return base_char_blob_;
}
void set_base_char_blob(BLOBNBOX* blob) {
base_char_blob_ = blob;
}
void set_owns_cblob(bool value) { owns_cblob_ = value; }
bool UniquelyVertical() const {
return vert_possible_ && !horz_possible_;
}
bool UniquelyHorizontal() const {
return horz_possible_ && !vert_possible_;
}
// Returns true if the region type is text.
static bool IsTextType(BlobRegionType type) {
return type == BRT_TEXT || type == BRT_VERT_TEXT;
}
// Returns true if the region type is image.
static bool IsImageType(BlobRegionType type) {
return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;
}
// Returns true if the region type is line.
static bool IsLineType(BlobRegionType type) {
return type == BRT_HLINE || type == BRT_VLINE;
}
// Returns true if the region type cannot be merged.
static bool UnMergeableType(BlobRegionType type) {
return IsLineType(type) || IsImageType(type);
}
// Helper to call CleanNeighbours on all blobs on the list.
static void CleanNeighbours(BLOBNBOX_LIST* blobs);
// Helper to delete all the deletable blobs on the list.
static void DeleteNoiseBlobs(BLOBNBOX_LIST* blobs);
// Helper to compute edge offsets for all the blobs on the list.
// See coutln.h for an explanation of edge offsets.
static void ComputeEdgeOffsets(Pix* thresholds, Pix* grey,
BLOBNBOX_LIST* blobs);
#ifndef GRAPHICS_DISABLED
// Helper to draw all the blobs on the list in the given body_colour,
// with child outlines in the child_colour.
static void PlotBlobs(BLOBNBOX_LIST* list,
ScrollView::Color body_colour,
ScrollView::Color child_colour,
ScrollView* win);
// Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
// given list in the given body_colour, with child outlines in the
// child_colour.
static void PlotNoiseBlobs(BLOBNBOX_LIST* list,
ScrollView::Color body_colour,
ScrollView::Color child_colour,
ScrollView* win);
static ScrollView::Color TextlineColor(BlobRegionType region_type,
BlobTextFlowType flow_type);
// Keep in sync with BlobRegionType.
ScrollView::Color BoxColor() const;
void plot(ScrollView* window, // window to draw in
ScrollView::Color blob_colour, // for outer bits
ScrollView::Color child_colour); // for holes
#endif
// Initializes the bulk of the members to default values for use at
// construction time.
void ConstructionInit() {
cblob_ptr = NULL;
owns_cblob_ = false;
area = 0;
area_stroke_width_ = 0.0f;
horz_stroke_width_ = 0.0f;
vert_stroke_width_ = 0.0f;
ReInit();
}
// Initializes members set by StrokeWidth and beyond, without discarding
// stored area and strokewidth values, which are expensive to calculate.
void ReInit() {
joined = false;
reduced = false;
repeated_set_ = 0;
left_tab_type_ = TT_NONE;
right_tab_type_ = TT_NONE;
region_type_ = BRT_UNKNOWN;
flow_ = BTFT_NONE;
spt_type_ = BSTT_SKIP;
left_rule_ = 0;
right_rule_ = 0;
left_crossing_rule_ = 0;
right_crossing_rule_ = 0;
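// Rough stroke-width estimate: for a long thin stroke of length L and
// width w, area ~= L * w and perimeter ~= 2 * L, so w ~= 2 * area / perimeter.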
if (area_stroke_width_ == 0.0f && area > 0 && cblob() != NULL)
area_stroke_width_ = 2.0f * area / cblob()->perimeter();
owner_ = NULL;
base_char_top_ = box.top();
base_char_bottom_ = box.bottom();
baseline_y_ = box.bottom();
line_crossings_ = 0;
base_char_blob_ = NULL;
horz_possible_ = false;
vert_possible_ = false;
leader_on_left_ = false;
leader_on_right_ = false;
ClearNeighbours();
}
void ClearNeighbours() {
for (int n = 0; n < BND_COUNT; ++n) {
neighbours_[n] = NULL;
good_stroke_neighbours_[n] = false;
}
}
private:
C_BLOB *cblob_ptr; // edgestep blob
TBOX box; // bounding box
TBOX red_box; // bounding box
int area : 30; // enclosed area
int joined : 1; // joined to prev
int reduced : 1; // reduced box set
int repeated_set_; // id of the set of repeated blobs
TabType left_tab_type_; // Indicates tab-stop assessment
TabType right_tab_type_; // Indicates tab-stop assessment
BlobRegionType region_type_; // Type of region this blob belongs to
BlobTextFlowType flow_; // Quality of text flow.
inT16 left_rule_; // x-coord of nearest but not crossing rule line
inT16 right_rule_; // x-coord of nearest but not crossing rule line
inT16 left_crossing_rule_; // x-coord of nearest or crossing rule line
inT16 right_crossing_rule_; // x-coord of nearest or crossing rule line
inT16 base_char_top_; // y-coord of top/bottom of diacritic base,
inT16 base_char_bottom_; // if it exists else top/bottom of this blob.
inT16 baseline_y_; // Estimate of baseline position.
int line_crossings_; // Number of line intersections touched.
BLOBNBOX* base_char_blob_; // The blob that was the base char.
float horz_stroke_width_; // Median horizontal stroke width
float vert_stroke_width_; // Median vertical stroke width
float area_stroke_width_; // Stroke width from area/perimeter ratio.
tesseract::ColPartition* owner_; // Who will delete me when I am not needed
BlobSpecialTextType spt_type_; // Special text type.
BLOBNBOX* neighbours_[BND_COUNT];
bool good_stroke_neighbours_[BND_COUNT];
bool horz_possible_; // Could be part of horizontal flow.
bool vert_possible_; // Could be part of vertical flow.
bool leader_on_left_; // There is a leader to the left.
bool leader_on_right_; // There is a leader to the right.
// Iff true, then the destructor should delete the cblob_ptr.
// TODO(rays) migrate all uses to correctly setting this flag instead of
// deleting the C_BLOB before deleting the BLOBNBOX.
bool owns_cblob_;
};
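// Illustrative sketch, not part of the original header: ReInit() above
// estimates stroke width from the area/perimeter ratio as
// 2 * area / perimeter. For a long, thin stroke approximated by an L x w
// rectangle (L >> w), area = L * w and perimeter = 2 * (L + w) ~= 2 * L,
// so 2 * area / perimeter ~= w. The helper below uses our own name and
// plain floats, purely for illustration.
static inline float EstimateStrokeWidth(float area, float perimeter) {
  return perimeter > 0.0f ? 2.0f * area / perimeter : 0.0f;
}
// Example: a 100 x 3 stroke has area 300 and perimeter 206, giving
// 600 / 206 ~= 2.9, close to the true width of 3.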
class TO_ROW : public ELIST2_LINK
{
public:
static const int kErrorWeight = 3;
TO_ROW() {
clear();
} //empty
TO_ROW( //constructor
BLOBNBOX *blob, //from first blob
float top, //of row
float bottom,
float row_size); //target height
void print() const;
float max_y() const { //access function
return y_max;
}
float min_y() const {
return y_min;
}
float mean_y() const {
return (y_min + y_max) / 2.0f;
}
float initial_min_y() const {
return initial_y_min;
}
float line_m() const { //access to line fit
return m;
}
float line_c() const {
return c;
}
float line_error() const {
return error;
}
float parallel_c() const {
return para_c;
}
float parallel_error() const {
return para_error;
}
float believability() const { //baseline goodness
return credibility;
}
float intercept() const { //real parallel_c
return y_origin;
}
void add_blob( //put in row
BLOBNBOX *blob, //blob to add
float top, //of row
float bottom,
float row_size); //target height
void insert_blob( //put in row in order
BLOBNBOX *blob);
BLOBNBOX_LIST *blob_list() { //get list
return &blobs;
}
void set_line( //set line spec
float new_m, //line to set
float new_c,
float new_error) {
m = new_m;
c = new_c;
error = new_error;
}
void set_parallel_line( //set fixed gradient line
float gradient, //page gradient
float new_c,
float new_error) {
para_c = new_c;
para_error = new_error;
credibility =
(float)(blobs.length() - kErrorWeight * new_error);
y_origin = (float)(new_c / sqrt(1 + gradient * gradient));
//real intercept
}
void set_limits( //set min,max
float new_min, //bottom and
float new_max) { //top of row
y_min = new_min;
y_max = new_max;
}
void compute_vertical_projection();
//get projection
bool rep_chars_marked() const {
return num_repeated_sets_ != -1;
}
void clear_rep_chars_marked() {
num_repeated_sets_ = -1;
}
int num_repeated_sets() const {
return num_repeated_sets_;
}
void set_num_repeated_sets(int num_sets) {
num_repeated_sets_ = num_sets;
}
// true when dead
BOOL8 merged;
BOOL8 all_caps; // had no ascenders
BOOL8 used_dm_model; // in guessing pitch
inT16 projection_left; // start of projection
inT16 projection_right; // start of projection
PITCH_TYPE pitch_decision; // how strong is decision
float fixed_pitch; // pitch or 0
float fp_space; // sp if fixed pitch
float fp_nonsp; // nonsp if fixed pitch
float pr_space; // sp if prop
float pr_nonsp; // non sp if prop
float spacing; // to "next" row
float xheight; // of line
int xheight_evidence; // number of blobs of height xheight
float ascrise; // ascenders
float descdrop; // descenders
float body_size; // of CJK characters. Assumed to be
// xheight+ascrise for non-CJK text.
inT32 min_space; // min size for real space
inT32 max_nonspace; // max size of non-space
inT32 space_threshold; // space vs nonspace
float kern_size; // average non-space
float space_size; // average space
WERD_LIST rep_words; // repeated chars
ICOORDELT_LIST char_cells; // fixed pitch cells
QSPLINE baseline; // curved baseline
STATS projection; // vertical projection
private:
void clear(); // clear all values to reasonable defaults
BLOBNBOX_LIST blobs; //blobs in row
float y_min; //coords
float y_max;
float initial_y_min;
float m, c; //line spec
float error; //line error
float para_c; //constrained fit
float para_error;
float y_origin; //rotated para_c;
float credibility; //baseline believability
int num_repeated_sets_; // number of sets of repeated blobs
// set to -1 if we have not searched
// for repeated blobs in this row yet
};
ELIST2IZEH(TO_ROW)
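// Illustrative sketch, not part of the original header: set_parallel_line()
// above stores y_origin = c / sqrt(1 + gradient^2). For a baseline modelled
// as y = gradient * x + c, this is the intercept measured after rotating the
// page gradient away (the "real intercept" of the comment). It also scores
// the fit as credibility = blob count - kErrorWeight * error. The helper
// below uses our own name, purely for illustration.
#include <math.h>  // for sqrt (illustration only)
static inline float RotatedIntercept(float gradient, float c) {
  return (float)(c / sqrt(1.0 + gradient * gradient));
}
// Example: gradient = 0 gives back c; gradient = 1 and c = 10 give
// 10 / sqrt(2) ~= 7.07.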
class TO_BLOCK :public ELIST_LINK
{
public:
TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
clear();
} //empty
TO_BLOCK( //constructor
BLOCK *src_block); //real block
~TO_BLOCK();
void clear(); // clear all scalar members.
TO_ROW_LIST *get_rows() { //access function
return &row_list;
}
// Rotate all the blobnbox lists and the underlying block. Then update the
// median size statistic from the blobs list.
void rotate(const FCOORD& rotation) {
BLOBNBOX_LIST* blobnbox_list[] = { &blobs, &underlines, &noise_blobs,
&small_blobs, &large_blobs, NULL };
for (BLOBNBOX_LIST** list = blobnbox_list; *list != NULL; ++list) {
BLOBNBOX_IT it(*list);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
it.data()->rotate(rotation);
}
}
// Rotate the block
ASSERT_HOST(block->poly_block() != NULL);
block->rotate(rotation);
// Update the median size statistic from the blobs list.
STATS widths(0, block->bounding_box().width());
STATS heights(0, block->bounding_box().height());
BLOBNBOX_IT blob_it(&blobs);
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
widths.add(blob_it.data()->bounding_box().width(), 1);
heights.add(blob_it.data()->bounding_box().height(), 1);
}
block->set_median_size(static_cast<int>(widths.median() + 0.5),
static_cast<int>(heights.median() + 0.5));
}
void print_rows() { //debug info
TO_ROW_IT row_it = &row_list;
TO_ROW *row;
for (row_it.mark_cycle_pt(); !row_it.cycled_list();
row_it.forward()) {
row = row_it.data();
tprintf("Row range (%g,%g), para_c=%g, blobcount=" INT32FORMAT
"\n", row->min_y(), row->max_y(), row->parallel_c(),
row->blob_list()->length());
}
}
// Reorganizes the blob lists with a different definition of small, medium
// and large, compared to the original definition.
// Height is still the primary filter key, but medium width blobs of small
// height become medium, and very wide blobs of small height stay small.
void ReSetAndReFilterBlobs();
// Deletes noise blobs from all lists where not owned by a ColPartition.
void DeleteUnownedNoise();
// Computes and stores the edge offsets on each blob for use in feature
// extraction, using greyscale if the supplied grey and thresholds pixes
// are 8-bit or otherwise (if NULL or not 8 bit) the original binary
// edge step outlines.
// Thresholds must either be the same size as grey or an integer down-scale
// of grey.
// See coutln.h for an explanation of edge offsets.
void ComputeEdgeOffsets(Pix* thresholds, Pix* grey);
#ifndef GRAPHICS_DISABLED
// Draw the noise blobs from all lists in red.
void plot_noise_blobs(ScrollView* to_win);
// Draw the blobs on the various lists in the block in different colors.
void plot_graded_blobs(ScrollView* to_win);
#endif
BLOBNBOX_LIST blobs; //medium size
BLOBNBOX_LIST underlines; //underline blobs
BLOBNBOX_LIST noise_blobs; //very small
BLOBNBOX_LIST small_blobs; //fairly small
BLOBNBOX_LIST large_blobs; //big blobs
BLOCK *block; //real block
PITCH_TYPE pitch_decision; //how strong is decision
float line_spacing; //estimate
// line_size is a lower-bound estimate of the font size in pixels of
// the text in the block (with ascenders and descenders), being a small
// (1.25) multiple of the median height of filtered blobs.
// In most cases the font size will be bigger, but it will be closer
// if the text is allcaps, or in a no-x-height script.
float line_size; //estimate
float max_blob_size; //line assignment limit
float baseline_offset; //phase shift
float xheight; //median blob size
float fixed_pitch; //pitch or 0
float kern_size; //average non-space
float space_size; //average space
inT32 min_space; //min definite space
inT32 max_nonspace; //max definite
float fp_space; //sp if fixed pitch
float fp_nonsp; //nonsp if fixed pitch
float pr_space; //sp if prop
float pr_nonsp; //non sp if prop
TO_ROW *key_row; //starting row
private:
TO_ROW_LIST row_list; //temporary rows
};
ELISTIZEH(TO_BLOCK)
extern double_VAR_H(textord_error_weight, 3,
"Weighting for error in believability");
void find_cblob_limits( //get y limits
C_BLOB *blob, //blob to search
float leftx, //x limits
float rightx,
FCOORD rotation, //for landscape
float &ymin, //output y limits
float &ymax);
void find_cblob_vlimits( //get y limits
C_BLOB *blob, //blob to search
float leftx, //x limits
float rightx,
float &ymin, //output y limits
float &ymax);
void find_cblob_hlimits( //get x limits
C_BLOB *blob, //blob to search
float bottomy, //y limits
float topy,
float &xmin, //output x limits
float &xmax); //output x limits
C_BLOB *crotate_cblob( //rotate it
C_BLOB *blob, //blob to search
FCOORD rotation //for landscape
);
TBOX box_next( //get bounding box
BLOBNBOX_IT *it //iterator to blobs
);
TBOX box_next_pre_chopped( //get bounding box
BLOBNBOX_IT *it //iterator to blobs
);
void vertical_cblob_projection( //project outlines
C_BLOB *blob, //blob to project
STATS *stats //output
);
void vertical_coutline_projection( //project outlines
C_OUTLINE *outline, //outline to project
STATS *stats //output
);
#ifndef GRAPHICS_DISABLED
void plot_blob_list(ScrollView* win, // window to draw in
BLOBNBOX_LIST *list, // blob list
ScrollView::Color body_colour, // colour to draw
ScrollView::Color child_colour); // colour of child
#endif // GRAPHICS_DISABLED
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,449 @@
/* -*-C-*-
********************************************************************************
*
* File: blobs.h (Formerly blobs.h)
* Description: Blob definition
* Author: Mark Seaman, OCR Technology
* Created: Fri Oct 27 15:39:52 1989
* Modified: Thu Mar 28 15:33:38 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1989, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*********************************************************************************/
#ifndef BLOBS_H
#define BLOBS_H
/*----------------------------------------------------------------------
I n c l u d e s
----------------------------------------------------------------------*/
#include "clst.h"
#include "normalis.h"
#include "publictypes.h"
#include "rect.h"
#include "vecfuncs.h"
class BLOCK;
class C_BLOB;
class C_OUTLINE;
class LLSQ;
class ROW;
class WERD;
/*----------------------------------------------------------------------
T y p e s
----------------------------------------------------------------------*/
#define EDGEPTFLAGS 4 /*concavity,length etc. */
struct TPOINT {
TPOINT(): x(0), y(0) {}
TPOINT(inT16 vx, inT16 vy) : x(vx), y(vy) {}
TPOINT(const ICOORD &ic) : x(ic.x()), y(ic.y()) {}
void operator+=(const TPOINT& other) {
x += other.x;
y += other.y;
}
void operator/=(int divisor) {
x /= divisor;
y /= divisor;
}
bool operator==(const TPOINT& other) const {
return x == other.x && y == other.y;
}
// Returns true when the two line segments cross each other.
// (Moved from outlines.cpp).
static bool IsCrossed(const TPOINT& a0, const TPOINT& a1, const TPOINT& b0,
const TPOINT& b1);
inT16 x; // absolute x coord.
inT16 y; // absolute y coord.
};
typedef TPOINT VECTOR; // structure for coordinates.
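// Illustrative sketch, not part of the original header: TPOINT::IsCrossed()
// declared above reports whether two line segments cross. The standalone
// helpers below (our own names) show the standard orientation test that such
// a predicate is typically built on: segments a0-a1 and b0-b1 cross when the
// endpoints of each segment lie strictly on opposite sides of the other.
// This is a generic sketch, not a copy of the Tesseract implementation.
struct PtI { int x, y; };
static inline int OrientSign(const PtI& a, const PtI& b, const PtI& c) {
  // Sign of the cross product (b - a) x (c - a): >0 left turn, <0 right turn.
  long v = (long)(b.x - a.x) * (c.y - a.y) - (long)(b.y - a.y) * (c.x - a.x);
  return (v > 0) - (v < 0);
}
static inline bool SegmentsCrossExample(const PtI& a0, const PtI& a1,
                                        const PtI& b0, const PtI& b1) {
  return OrientSign(a0, a1, b0) * OrientSign(a0, a1, b1) < 0 &&
         OrientSign(b0, b1, a0) * OrientSign(b0, b1, a1) < 0;
}
// Example: (0,0)-(2,2) crosses (0,2)-(2,0); (0,0)-(1,0) does not cross
// (0,1)-(1,1).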
struct EDGEPT {
EDGEPT()
: next(NULL), prev(NULL), src_outline(NULL), start_step(0), step_count(0) {
memset(flags, 0, EDGEPTFLAGS * sizeof(flags[0]));
}
EDGEPT(const EDGEPT& src) : next(NULL), prev(NULL) {
CopyFrom(src);
}
EDGEPT& operator=(const EDGEPT& src) {
CopyFrom(src);
return *this;
}
// Copies the data elements, but leaves the pointers untouched.
void CopyFrom(const EDGEPT& src) {
pos = src.pos;
vec = src.vec;
memcpy(flags, src.flags, EDGEPTFLAGS * sizeof(flags[0]));
src_outline = src.src_outline;
start_step = src.start_step;
step_count = src.step_count;
}
// Returns the squared distance between the points, with the x-component
// weighted by x_factor.
int WeightedDistance(const EDGEPT& other, int x_factor) const {
int x_dist = pos.x - other.pos.x;
int y_dist = pos.y - other.pos.y;
return x_dist * x_dist * x_factor + y_dist * y_dist;
}
// Returns true if the positions are equal.
bool EqualPos(const EDGEPT& other) const { return pos == other.pos; }
// Returns the bounding box of the outline segment from *this to *end.
// Ignores hidden edge flags.
TBOX SegmentBox(const EDGEPT* end) const {
TBOX box(pos.x, pos.y, pos.x, pos.y);
const EDGEPT* pt = this;
do {
pt = pt->next;
if (pt->pos.x < box.left()) box.set_left(pt->pos.x);
if (pt->pos.x > box.right()) box.set_right(pt->pos.x);
if (pt->pos.y < box.bottom()) box.set_bottom(pt->pos.y);
if (pt->pos.y > box.top()) box.set_top(pt->pos.y);
} while (pt != end && pt != this);
return box;
}
// Returns the area of the outline segment from *this to *end.
// Ignores hidden edge flags.
int SegmentArea(const EDGEPT* end) const {
int area = 0;
const EDGEPT* pt = this->next;
do {
TPOINT origin_vec(pt->pos.x - pos.x, pt->pos.y - pos.y);
area += CROSS(origin_vec, pt->vec);
pt = pt->next;
} while (pt != end && pt != this);
return area;
}
// Returns true if the number of points in the outline segment from *this to
// *end is less than min_points and false if we get back to *this first.
// Ignores hidden edge flags.
bool ShortNonCircularSegment(int min_points, const EDGEPT* end) const {
int count = 0;
const EDGEPT* pt = this;
do {
if (pt == end) return true;
pt = pt->next;
++count;
} while (pt != this && count <= min_points);
return false;
}
// Accessors to hide or reveal a cut edge from feature extractors.
void Hide() {
flags[0] = true;
}
void Reveal() {
flags[0] = false;
}
bool IsHidden() const {
return flags[0] != 0;
}
void MarkChop() {
flags[2] = true;
}
bool IsChopPt() const {
return flags[2] != 0;
}
TPOINT pos; // position
VECTOR vec; // vector to next point
// TODO(rays) Remove flags and replace with
// is_hidden, runlength, dir, and fixed. The only use
// of the flags other than is_hidden is in polyaprx.cpp.
char flags[EDGEPTFLAGS]; // concavity, length etc
EDGEPT* next; // anticlockwise element
EDGEPT* prev; // clockwise element
C_OUTLINE* src_outline; // Outline it came from.
// The following fields are not used if src_outline is NULL.
int start_step; // Location of pos in src_outline.
int step_count; // Number of steps used (may wrap around).
};
// For use in chop and findseam to keep a list of which EDGEPTs were inserted.
CLISTIZEH(EDGEPT);
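// Illustrative sketch, not part of the original header: SegmentArea() above
// accumulates cross products of a position offset with each edge vector,
// which is a shoelace-style area computation. The standalone helper below
// (our own names) shows the plain shoelace formula on a closed polygon:
// summing the cross products of consecutive vertices gives twice the signed
// area, positive for an anticlockwise loop.
struct VtxI { int x, y; };
static inline int TwiceSignedArea(const VtxI* pts, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    const VtxI& a = pts[i];
    const VtxI& b = pts[(i + 1) % n];
    sum += a.x * b.y - a.y * b.x;  // z-component of a x b
  }
  return sum;
}
// Example: the unit square (0,0),(1,0),(1,1),(0,1) gives 2, i.e. area 1.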
struct TESSLINE {
TESSLINE() : is_hole(false), loop(NULL), next(NULL) {}
TESSLINE(const TESSLINE& src) : loop(NULL), next(NULL) {
CopyFrom(src);
}
~TESSLINE() {
Clear();
}
TESSLINE& operator=(const TESSLINE& src) {
CopyFrom(src);
return *this;
}
// Consume the circular list of EDGEPTs to make a TESSLINE.
static TESSLINE* BuildFromOutlineList(EDGEPT* outline);
// Copies the data and the outline, but leaves next untouched.
void CopyFrom(const TESSLINE& src);
// Deletes owned data.
void Clear();
// Normalize in-place using the DENORM.
void Normalize(const DENORM& denorm);
// Rotates by the given rotation in place.
void Rotate(const FCOORD rotation);
// Moves by the given vec in place.
void Move(const ICOORD vec);
// Scales by the given factor in place.
void Scale(float factor);
// Sets up the start and vec members of the loop from the pos members.
void SetupFromPos();
// Recomputes the bounding box from the points in the loop.
void ComputeBoundingBox();
// Computes the min and max cross product of the outline points with the
// given vec and returns the results in min_xp and max_xp. Geometrically
// this is the left and right edge of the outline perpendicular to the
// given direction, but to get the distance units correct, you would
// have to divide by the modulus of vec.
void MinMaxCrossProduct(const TPOINT vec, int* min_xp, int* max_xp) const;
TBOX bounding_box() const;
// Returns true if *this and other have equal bounding boxes.
bool SameBox(const TESSLINE& other) const {
return topleft == other.topleft && botright == other.botright;
}
// Returns true if the given line segment crosses any outline of this blob.
bool SegmentCrosses(const TPOINT& pt1, const TPOINT& pt2) const {
if (Contains(pt1) && Contains(pt2)) {
EDGEPT* pt = loop;
do {
if (TPOINT::IsCrossed(pt1, pt2, pt->pos, pt->next->pos)) return true;
pt = pt->next;
} while (pt != loop);
}
return false;
}
// Returns true if the point is contained within the outline box.
bool Contains(const TPOINT& pt) const {
return topleft.x <= pt.x && pt.x <= botright.x &&
botright.y <= pt.y && pt.y <= topleft.y;
}
#ifndef GRAPHICS_DISABLED
void plot(ScrollView* window, ScrollView::Color color,
ScrollView::Color child_color);
#endif // GRAPHICS_DISABLED
// Returns the first outline point that has a different src_outline to its
// predecessor, or, if all the same, the lowest indexed point.
EDGEPT* FindBestStartPt() const;
int BBArea() const {
return (botright.x - topleft.x) * (topleft.y - botright.y);
}
TPOINT topleft; // Top left of loop.
TPOINT botright; // Bottom right of loop.
TPOINT start; // Start of loop.
bool is_hole; // True if this is a hole/child outline.
EDGEPT *loop; // Edgeloop.
TESSLINE *next; // Next outline in blob.
}; // Outline structure.
struct TBLOB {
TBLOB() : outlines(NULL) {}
TBLOB(const TBLOB& src) : outlines(NULL) {
CopyFrom(src);
}
~TBLOB() {
Clear();
}
TBLOB& operator=(const TBLOB& src) {
CopyFrom(src);
return *this;
}
// Factory to build a TBLOB from a C_BLOB with polygonal approximation along
// the way. If allow_detailed_fx is true, the EDGEPTs in the returned TBLOB
// contain pointers to the input C_OUTLINEs that enable higher-resolution
// feature extraction that does not use the polygonal approximation.
static TBLOB* PolygonalCopy(bool allow_detailed_fx, C_BLOB* src);
// Factory builds a blob with no outlines, but copies the other member data.
static TBLOB* ShallowCopy(const TBLOB& src);
// Normalizes the blob for classification only if needed.
// (Normally this means a non-zero classify rotation.)
// If no Normalization is needed, then NULL is returned, and the input blob
// can be used directly. Otherwise a new TBLOB is returned which must be
// deleted after use.
TBLOB* ClassifyNormalizeIfNeeded() const;
// Copies the data and the outlines, but leaves next untouched.
void CopyFrom(const TBLOB& src);
// Deletes owned data.
void Clear();
// Sets up the built-in DENORM and normalizes the blob in-place.
// For parameters see DENORM::SetupNormalization, plus the inverse flag for
// this blob and the Pix for the full image.
void Normalize(const BLOCK* block,
const FCOORD* rotation,
const DENORM* predecessor,
float x_origin, float y_origin,
float x_scale, float y_scale,
float final_xshift, float final_yshift,
bool inverse, Pix* pix);
// Rotates by the given rotation in place.
void Rotate(const FCOORD rotation);
// Moves by the given vec in place.
void Move(const ICOORD vec);
// Scales by the given factor in place.
void Scale(float factor);
// Recomputes the bounding boxes of the outlines.
void ComputeBoundingBoxes();
// Returns the number of outlines.
int NumOutlines() const;
TBOX bounding_box() const;
// Returns true if the given line segment crosses any outline of this blob.
bool SegmentCrossesOutline(const TPOINT& pt1, const TPOINT& pt2) const {
for (const TESSLINE* outline = outlines; outline != NULL;
outline = outline->next) {
if (outline->SegmentCrosses(pt1, pt2)) return true;
}
return false;
}
// Returns true if the point is contained within any of the outline boxes.
bool Contains(const TPOINT& pt) const {
for (const TESSLINE* outline = outlines; outline != NULL;
outline = outline->next) {
if (outline->Contains(pt)) return true;
}
return false;
}
// Finds and deletes any duplicate outlines in this blob, without deleting
// their EDGEPTs.
void EliminateDuplicateOutlines();
// Swaps the outlines of *this and next if needed to keep the centers in
// increasing x.
void CorrectBlobOrder(TBLOB* next);
const DENORM& denorm() const {
return denorm_;
}
#ifndef GRAPHICS_DISABLED
void plot(ScrollView* window, ScrollView::Color color,
ScrollView::Color child_color);
#endif // GRAPHICS_DISABLED
int BBArea() const {
int total_area = 0;
for (TESSLINE* outline = outlines; outline != NULL; outline = outline->next)
total_area += outline->BBArea();
return total_area;
}
// Computes the center of mass and second moments for the old baseline and
// 2nd moment normalizations. Returns the outline length.
// The input denorm should be the normalizations that have been applied from
// the image to the current state of this TBLOB.
int ComputeMoments(FCOORD* center, FCOORD* second_moments) const;
// Computes the precise bounding box of the coords that are generated by
// GetEdgeCoords. This may be different from the bounding box of the polygon.
void GetPreciseBoundingBox(TBOX* precise_box) const;
// Adds edges to the given vectors.
// For all the edge steps in all the outlines, or polygonal approximation
// where there are no edge steps, collects the steps into x_coords/y_coords.
// x_coords is a collection of the x-coords of vertical edges for each
// y-coord starting at box.bottom().
// y_coords is a collection of the y-coords of horizontal edges for each
// x-coord starting at box.left().
// Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.
// Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.
void GetEdgeCoords(const TBOX& box,
GenericVector<GenericVector<int> >* x_coords,
GenericVector<GenericVector<int> >* y_coords) const;
TESSLINE *outlines; // List of outlines in blob.
private: // TODO(rays) Someday the data members will be private too.
// For all the edge steps in all the outlines, or polygonal approximation
// where there are no edge steps, collects the steps into the bounding_box,
// llsq and/or the x_coords/y_coords. Both are used in different kinds of
// normalization.
// For a description of x_coords, y_coords, see GetEdgeCoords above.
void CollectEdges(const TBOX& box,
TBOX* bounding_box, LLSQ* llsq,
GenericVector<GenericVector<int> >* x_coords,
GenericVector<GenericVector<int> >* y_coords) const;
private:
// DENORM indicating the transformations that this blob has undergone so far.
DENORM denorm_;
}; // Blob structure.
struct TWERD {
TWERD() : latin_script(false) {}
TWERD(const TWERD& src) {
CopyFrom(src);
}
~TWERD() {
Clear();
}
TWERD& operator=(const TWERD& src) {
CopyFrom(src);
return *this;
}
// Factory to build a TWERD from a (C_BLOB) WERD, with polygonal
// approximation along the way.
static TWERD* PolygonalCopy(bool allow_detailed_fx, WERD* src);
// Baseline normalizes the blobs in-place, recording the normalization in the
// DENORMs in the blobs.
void BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, bool inverse,
float x_height, float baseline_shift, bool numeric_mode,
tesseract::OcrEngineMode hint,
const TBOX* norm_box,
DENORM* word_denorm);
// Copies the data and the blobs, but leaves next untouched.
void CopyFrom(const TWERD& src);
// Deletes owned data.
void Clear();
// Recomputes the bounding boxes of the blobs.
void ComputeBoundingBoxes();
// Returns the number of blobs in the word.
int NumBlobs() const {
return blobs.size();
}
TBOX bounding_box() const;
// Merges the blobs from start to end, not including end, and deletes
// the blobs between start and end.
void MergeBlobs(int start, int end);
void plot(ScrollView* window);
GenericVector<TBLOB*> blobs; // Blobs in word.
bool latin_script; // This word is in a latin-based script.
};
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
// TODO(rays) Make divisible_blob and divide_blobs members of TBLOB.
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location);
void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob,
const TPOINT& location);
#endif
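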

View File

@ -0,0 +1,71 @@
/**********************************************************************
* File: blread.cpp (Formerly pdread.c)
* Description: Friend function of BLOCK to read the uscan pd file.
* Author: Ray Smith
* Created: Mon Mar 18 14:39:00 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <stdlib.h>
#ifdef __UNIX__
#include <assert.h>
#endif
#include "scanutils.h"
#include "fileerr.h"
#include "blread.h"
#define UNLV_EXT ".uzn" // unlv zone file
/**********************************************************************
* read_unlv_file
*
* Read a whole unlv zone file to make a list of blocks.
**********************************************************************/
bool read_unlv_file( //read unlv zone file
STRING name, //basename of file
inT32 xsize, //image size
inT32 ysize, //image size
BLOCK_LIST *blocks //output list
) {
FILE *pdfp; //file pointer
BLOCK *block; //current block
int x; //current top-down coords
int y;
int width; //of current block
int height;
BLOCK_IT block_it = blocks; //block iterator
name += UNLV_EXT; //add extension
if ((pdfp = fopen (name.string (), "rb")) == NULL) {
return false; //didn't read one
} else {
while (tfscanf(pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
//make rect block
block = new BLOCK (name.string (), TRUE, 0, 0,
(inT16) x, (inT16) (ysize - y - height),
(inT16) (x + width), (inT16) (ysize - y));
//on end of list
block_it.add_to_end (block);
}
fclose(pdfp);
}
return true;
}
void FullPageBlock(int width, int height, BLOCK_LIST *blocks) {
BLOCK_IT block_it(blocks);
BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
block_it.add_to_end(block);
}
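// Illustrative sketch, not part of the original source: a .uzn line holds
// "x y width height label" with y in top-down coordinates, while BLOCK uses
// bottom-up coordinates, so read_unlv_file() above maps a zone to the box
// (x, ysize - y - height) .. (x + width, ysize - y). The helper below uses
// our own names and isolates that coordinate flip.
struct UznBox { int left, bottom, right, top; };
static UznBox UznToBottomUp(int x, int y, int width, int height, int ysize) {
  UznBox b;
  b.left = x;
  b.bottom = ysize - y - height;
  b.right = x + width;
  b.top = ysize - y;
  return b;
}
// Example: a 100x50 zone at (10, 20) in a 1000-pixel-high image becomes
// left=10, bottom=930, right=110, top=980.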

View File

@ -0,0 +1,33 @@
/**********************************************************************
* File: blread.h (Formerly pdread.h)
* Description: Friend function of BLOCK to read the uscan pd file.
* Author: Ray Smith
* Created: Mon Mar 18 14:39:00 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef BLREAD_H
#define BLREAD_H
#include "params.h"
#include "ocrblock.h"
bool read_unlv_file( //read unlv zone file
STRING name, //basename of file
inT32 xsize, //image size
inT32 ysize, //image size
BLOCK_LIST *blocks //output list
);
void FullPageBlock(int width, int height, BLOCK_LIST *blocks);
#endif

View File

@ -0,0 +1,235 @@
/**********************************************************************
* File: boxread.cpp
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "boxread.h"
#include <string.h>
#include "fileerr.h"
#include "rect.h"
#include "strngs.h"
#include "tprintf.h"
#include "unichar.h"
// Special char code used to identify multi-blob labels.
static const char* kMultiBlobLabelCode = "WordStr";
// Open the boxfile based on the given image filename.
FILE* OpenBoxFile(const STRING& fname) {
STRING filename = BoxFileName(fname);
FILE* box_file = NULL;
if (!(box_file = fopen(filename.string(), "rb"))) {
CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s",
filename.string());
}
return box_file;
}
// Reads all boxes from the given filename.
// Reads a specific target_page number if >= 0, or all pages otherwise.
// Skips blanks if skip_blanks is true.
// The UTF-8 label of the box is put in texts, and the full box definition as
// a string is put in box_texts, with the corresponding page number in pages.
// Each of the output vectors is optional (may be NULL).
// Returns false if no boxes are found.
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages) {
GenericVector<char> box_data;
if (!tesseract::LoadDataFromFile(BoxFileName(filename), &box_data))
return false;
// Convert the array of bytes to a string, so it can be used by the parser.
box_data.push_back('\0');
return ReadMemBoxes(target_page, skip_blanks, &box_data[0], boxes, texts,
box_texts, pages);
}
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages) {
STRING box_str(box_data);
GenericVector<STRING> lines;
box_str.split('\n', &lines);
if (lines.empty()) return false;
int num_boxes = 0;
for (int i = 0; i < lines.size(); ++i) {
int page = 0;
STRING utf8_str;
TBOX box;
if (!ParseBoxFileStr(lines[i].string(), &page, &utf8_str, &box)) {
continue;
}
if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) continue;
if (target_page >= 0 && page != target_page) continue;
if (boxes != NULL) boxes->push_back(box);
if (texts != NULL) texts->push_back(utf8_str);
if (box_texts != NULL) {
STRING full_text;
MakeBoxFileStr(utf8_str.string(), box, target_page, &full_text);
box_texts->push_back(full_text);
}
if (pages != NULL) pages->push_back(page);
++num_boxes;
}
return num_boxes > 0;
}
// Returns the box file name corresponding to the given image_filename.
STRING BoxFileName(const STRING& image_filename) {
STRING box_filename = image_filename;
const char *lastdot = strrchr(box_filename.string(), '.');
if (lastdot != NULL)
box_filename.truncate_at(lastdot - box_filename.string());
box_filename += ".box";
return box_filename;
}
// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
// Box files are used ONLY DURING TRAINING, but by both processes of
// creating tr files with tesseract, and unicharset_extractor.
// ReadNextBox factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret the same way.
// This function returns the next valid box file utf8 string and coords
// and returns true, or false on eof (and closes the file).
// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
// for valid utf-8 and allows space or tab between fields.
// utf8_str is set with the unichar string, and bounding box with the box.
// If there are page numbers in the file, it reads them all.
bool ReadNextBox(int *line_number, FILE* box_file,
STRING* utf8_str, TBOX* bounding_box) {
return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);
}
// As ReadNextBox above, but get a specific page number. (0-based)
// Use -1 to read any page number. Files without page number all
// read as if they are page 0.
bool ReadNextBox(int target_page, int *line_number, FILE* box_file,
STRING* utf8_str, TBOX* bounding_box) {
int page = 0;
char buff[kBoxReadBufSize]; // boxfile read buffer
char *buffptr = buff;
while (fgets(buff, sizeof(buff) - 1, box_file)) {
(*line_number)++;
buffptr = buff;
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
// Check for blank lines in box file
if (*buffptr == '\n' || *buffptr == '\0') continue;
// Skip blank boxes.
if (*buffptr == ' ' || *buffptr == '\t') continue;
if (*buffptr != '\0') {
if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) {
tprintf("Box file format error on line %i; ignored\n", *line_number);
continue;
}
if (target_page >= 0 && target_page != page)
continue; // Not on the appropriate page.
return true; // Successfully read a box.
}
}
fclose(box_file);
return false; // EOF
}
// Parses the given box file string into a page_number, utf8_str, and
// bounding_box. Returns true on a successful parse.
// The box file is assumed to contain box definitions, one per line, of the
// following format for blob-level boxes:
// <UTF8 str> <left> <bottom> <right> <top> <page id>
// and for word/line-level boxes:
// WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
// See applybox.cpp for more information.
bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
STRING* utf8_str, TBOX* bounding_box) {
*bounding_box = TBOX(); // Initialize it to empty.
*utf8_str = "";
char uch[kBoxReadBufSize];
const char *buffptr = boxfile_str;
// Read the unichar without messing up on Tibetan.
// According to issue 253 the utf-8 surrogates 85 and A0 are treated
// as whitespace by sscanf, so it is more reliable to just find
// ascii space and tab.
int uch_len = 0;
// Skip unicode file designation, if present.
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3;
// Allow a single blank as the UTF-8 string. Check for empty string and
// then blindly eat the first character.
if (*buffptr == '\0') return false;
do {
uch[uch_len++] = *buffptr++;
} while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
uch_len < kBoxReadBufSize - 1);
uch[uch_len] = '\0';
if (*buffptr != '\0') ++buffptr;
int x_min, y_min, x_max, y_max;
*page_number = 0;
int count = sscanf(buffptr, "%d %d %d %d %d",
&x_min, &y_min, &x_max, &y_max, page_number);
if (count != 5 && count != 4) {
tprintf("Bad box coordinates in boxfile string! %s\n", ubuf);
return false;
}
// Test for long space-delimited string label.
if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
(buffptr = strchr(buffptr, '#')) != NULL) {
strncpy(uch, buffptr + 1, kBoxReadBufSize - 1);
uch[kBoxReadBufSize - 1] = '\0'; // Prevent buffer overrun.
chomp_string(uch);
uch_len = strlen(uch);
}
// Validate UTF8 by making unichars with it.
int used = 0;
while (used < uch_len) {
UNICHAR ch(uch + used, uch_len - used);
int new_used = ch.utf8_len();
if (new_used == 0) {
tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n",
uch + used, uch[used], used + 1);
return false;
}
used += new_used;
}
*utf8_str = uch;
if (x_min > x_max) Swap(&x_min, &x_max);
if (y_min > y_max) Swap(&y_min, &y_max);
bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
return true; // Successfully read a box.
}
// Creates a box file string from a unichar string, TBOX and page number.
void MakeBoxFileStr(const char* unichar_str, const TBOX& box, int page_num,
STRING* box_str) {
*box_str = unichar_str;
box_str->add_str_int(" ", box.left());
box_str->add_str_int(" ", box.bottom());
box_str->add_str_int(" ", box.right());
box_str->add_str_int(" ", box.top());
box_str->add_str_int(" ", page_num);
}
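// Illustrative usage sketch, not part of the original source: a box file
// line has the form "<UTF8 str> <left> <bottom> <right> <top> <page>",
// e.g. "a 10 20 30 40 0". The function below (our own name, unused and for
// illustration only) round-trips one box through MakeBoxFileStr() and
// ParseBoxFileStr() above and returns true if the values survive intact.
static bool BoxFileStrRoundTripExample() {
  TBOX box(10, 20, 30, 40);            // left, bottom, right, top
  STRING line;
  MakeBoxFileStr("a", box, 0, &line);  // -> "a 10 20 30 40 0"
  int page = -1;
  STRING label;
  TBOX parsed;
  return ParseBoxFileStr(line.string(), &page, &label, &parsed) &&
         page == 0 && strcmp(label.string(), "a") == 0 &&
         parsed.left() == 10 && parsed.bottom() == 20 &&
         parsed.right() == 30 && parsed.top() == 40;
}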

View File

@ -0,0 +1,85 @@
/**********************************************************************
* File: boxread.h
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCUTIL_BOXREAD_H__
#define TESSERACT_CCUTIL_BOXREAD_H__
#include <stdio.h>
#include "genericvector.h"
#include "strngs.h"
class STRING;
class TBOX;
// Size of buffer used to read a line from a box file.
const int kBoxReadBufSize = 1024;
// Open the boxfile based on the given image filename.
// Returns NULL if the box file cannot be opened.
FILE* OpenBoxFile(const STRING& fname);
// Reads all boxes from the given filename.
// Reads a specific target_page number if >= 0, or all pages otherwise.
// Skips blanks if skip_blanks is true.
// The UTF-8 label of the box is put in texts, and the full box definition as
// a string is put in box_texts, with the corresponding page number in pages.
// Each of the output vectors is optional (may be NULL).
// Returns false if no boxes are found.
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages);
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages);
// Returns the box file name corresponding to the given image_filename.
STRING BoxFileName(const STRING& image_filename);
// ReadNextBox factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret the same way.
// This function returns the next valid box file utf8 string and coords
// and returns true, or false on eof (and closes the file).
// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
// for valid utf-8 and allows space or tab between fields.
// utf8_str is set with the unichar string, and bounding box with the box.
// If there are page numbers in the file, it reads them all.
bool ReadNextBox(int *line_number, FILE* box_file,
STRING* utf8_str, TBOX* bounding_box);
// As ReadNextBox above, but get a specific page number. (0-based)
// Use -1 to read any page number. Files without page number all
// read as if they are page 0.
bool ReadNextBox(int target_page, int *line_number, FILE* box_file,
STRING* utf8_str, TBOX* bounding_box);
// Parses the given box file string into a page_number, utf8_str, and
// bounding_box. Returns true on a successful parse.
bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
STRING* utf8_str, TBOX* bounding_box);
// Creates a box file string from a unichar string, TBOX and page number.
void MakeBoxFileStr(const char* unichar_str, const TBOX& box, int page_num,
STRING* box_str);
#endif // TESSERACT_CCUTIL_BOXREAD_H__

View File

@ -0,0 +1,203 @@
///////////////////////////////////////////////////////////////////////
// File: boxword.cpp
// Description: Class to represent the bounding boxes of the output.
// Author: Ray Smith
// Created: Tue May 25 14:18:14 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "blobs.h"
#include "boxword.h"
#include "normalis.h"
#include "ocrblock.h"
#include "pageres.h"
namespace tesseract {
// Clip output boxes to input blob boxes for bounds that are within this
// tolerance. Otherwise, the blob may be chopped and we have to just use
// the word bounding box.
const int kBoxClipTolerance = 2;
BoxWord::BoxWord() : length_(0) {
}
BoxWord::BoxWord(const BoxWord& src) {
CopyFrom(src);
}
BoxWord::~BoxWord() {
}
BoxWord& BoxWord::operator=(const BoxWord& src) {
CopyFrom(src);
return *this;
}
void BoxWord::CopyFrom(const BoxWord& src) {
bbox_ = src.bbox_;
length_ = src.length_;
boxes_.clear();
boxes_.reserve(length_);
for (int i = 0; i < length_; ++i)
boxes_.push_back(src.boxes_[i]);
}
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
// switch back to original image coordinates.
BoxWord* BoxWord::CopyFromNormalized(TWERD* tessword) {
BoxWord* boxword = new BoxWord();
// Count the blobs.
boxword->length_ = tessword->NumBlobs();
// Allocate memory.
boxword->boxes_.reserve(boxword->length_);
for (int b = 0; b < boxword->length_; ++b) {
TBLOB* tblob = tessword->blobs[b];
TBOX blob_box;
for (TESSLINE* outline = tblob->outlines; outline != NULL;
outline = outline->next) {
EDGEPT* edgept = outline->loop;
// Iterate over the edges.
do {
if (!edgept->IsHidden() || !edgept->prev->IsHidden()) {
ICOORD pos(edgept->pos.x, edgept->pos.y);
TPOINT denormed;
tblob->denorm().DenormTransform(NULL, edgept->pos, &denormed);
pos.set_x(denormed.x);
pos.set_y(denormed.y);
TBOX pt_box(pos, pos);
blob_box += pt_box;
}
edgept = edgept->next;
} while (edgept != outline->loop);
}
boxword->boxes_.push_back(blob_box);
}
boxword->ComputeBoundingBox();
return boxword;
}
// Clean up the bounding boxes from the polygonal approximation by
// expanding slightly, then clipping to the blobs from the original_word
// that overlap. If not null, the block provides the inverse rotation.
void BoxWord::ClipToOriginalWord(const BLOCK* block, WERD* original_word) {
for (int i = 0; i < length_; ++i) {
TBOX box = boxes_[i];
// Expand by a single pixel, as the poly approximation error is 1 pixel.
box = TBOX(box.left() - 1, box.bottom() - 1,
box.right() + 1, box.top() + 1);
// Now find the original box that matches.
TBOX original_box;
C_BLOB_IT b_it(original_word->cblob_list());
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
TBOX blob_box = b_it.data()->bounding_box();
if (block != NULL)
blob_box.rotate(block->re_rotation());
if (blob_box.major_overlap(box)) {
original_box += blob_box;
}
}
if (!original_box.null_box()) {
if (NearlyEqual<int>(original_box.left(), box.left(), kBoxClipTolerance))
box.set_left(original_box.left());
if (NearlyEqual<int>(original_box.right(), box.right(),
kBoxClipTolerance))
box.set_right(original_box.right());
if (NearlyEqual<int>(original_box.top(), box.top(), kBoxClipTolerance))
box.set_top(original_box.top());
if (NearlyEqual<int>(original_box.bottom(), box.bottom(),
kBoxClipTolerance))
box.set_bottom(original_box.bottom());
}
original_box = original_word->bounding_box();
if (block != NULL)
original_box.rotate(block->re_rotation());
boxes_[i] = box.intersection(original_box);
}
ComputeBoundingBox();
}
// Merges the boxes from start to end, not including end, and deletes
// the boxes between start and end.
void BoxWord::MergeBoxes(int start, int end) {
start = ClipToRange(start, 0, length_);
end = ClipToRange(end, 0, length_);
if (end <= start + 1)
return;
for (int i = start + 1; i < end; ++i) {
boxes_[start] += boxes_[i];
}
int shrinkage = end - 1 - start;
length_ -= shrinkage;
for (int i = start + 1; i < length_; ++i)
boxes_[i] = boxes_[i + shrinkage];
boxes_.truncate(length_);
}
// Inserts a new box before the given index.
// Recomputes the bounding box.
void BoxWord::InsertBox(int index, const TBOX& box) {
if (index < length_)
boxes_.insert(box, index);
else
boxes_.push_back(box);
length_ = boxes_.size();
ComputeBoundingBox();
}
// Changes the box at the given index to the new box.
// Recomputes the bounding box.
void BoxWord::ChangeBox(int index, const TBOX& box) {
boxes_[index] = box;
ComputeBoundingBox();
}
// Deletes the box with the given index, and shuffles up the rest.
// Recomputes the bounding box.
void BoxWord::DeleteBox(int index) {
ASSERT_HOST(0 <= index && index < length_);
boxes_.remove(index);
--length_;
ComputeBoundingBox();
}
// Deletes all the boxes stored in BoxWord.
void BoxWord::DeleteAllBoxes() {
length_ = 0;
boxes_.clear();
bbox_ = TBOX();
}
// Computes the bounding box of the word.
void BoxWord::ComputeBoundingBox() {
bbox_ = TBOX();
for (int i = 0; i < length_; ++i)
bbox_ += boxes_[i];
}
// This and other are putatively the same word, so call the (permanent)
// callback
// for each blob index where the bounding boxes match.
// The callback is deleted on completion.
void BoxWord::ProcessMatchedBlobs(const TWERD& other,
TessCallback1<int>* cb) const {
for (int i = 0; i < length_ && i < other.NumBlobs(); ++i) {
TBOX blob_box = other.blobs[i]->bounding_box();
if (blob_box == boxes_[i])
cb->Run(i);
}
delete cb;
}
} // namespace tesseract.
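// Illustrative sketch, not part of the original source: ClipToOriginalWord()
// above snaps each edge of the polygonal-approximation box back to the
// matching original blob box edge when the two coordinates are within
// kBoxClipTolerance (2 pixels) of each other. The helper below uses our own
// name and shows that snapping rule for a single coordinate.
static inline int SnapToOriginalEdge(int approx, int original, int tolerance) {
  int diff = approx - original;
  if (diff < 0) diff = -diff;
  return diff <= tolerance ? original : approx;
}
// Example: with tolerance 2, SnapToOriginalEdge(101, 100, 2) == 100, but
// SnapToOriginalEdge(104, 100, 2) leaves 104 unchanged.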

View File

@ -0,0 +1,101 @@
///////////////////////////////////////////////////////////////////////
// File: boxword.h
// Description: Class to represent the bounding boxes of the output.
// Author: Ray Smith
// Created: Tue May 25 14:18:14 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CSTRUCT_BOXWORD_H__
#define TESSERACT_CSTRUCT_BOXWORD_H__
#include "genericvector.h"
#include "rect.h"
#include "unichar.h"
class BLOCK;
class DENORM;
struct TWERD;
class UNICHARSET;
class WERD;
class WERD_CHOICE;
class WERD_RES;
namespace tesseract {
// Class to hold an array of bounding boxes for an output word and
// the bounding box of the whole word.
class BoxWord {
public:
BoxWord();
explicit BoxWord(const BoxWord& src);
~BoxWord();
BoxWord& operator=(const BoxWord& src);
void CopyFrom(const BoxWord& src);
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
// switch back to original image coordinates.
static BoxWord* CopyFromNormalized(TWERD* tessword);
// Clean up the bounding boxes from the polygonal approximation by
// expanding slightly, then clipping to the blobs from the original_word
// that overlap. If not null, the block provides the inverse rotation.
void ClipToOriginalWord(const BLOCK* block, WERD* original_word);
// Merges the boxes from start to end, not including end, and deletes
// the boxes between start and end.
void MergeBoxes(int start, int end);
// Inserts a new box before the given index.
// Recomputes the bounding box.
void InsertBox(int index, const TBOX& box);
// Changes the box at the given index to the new box.
// Recomputes the bounding box.
void ChangeBox(int index, const TBOX& box);
// Deletes the box with the given index, and shuffles up the rest.
// Recomputes the bounding box.
void DeleteBox(int index);
// Deletes all the boxes stored in BoxWord.
void DeleteAllBoxes();
// This and other are putatively the same word, so call the (permanent)
// callback
// for each blob index where the bounding boxes match.
// The callback is deleted on completion.
void ProcessMatchedBlobs(const TWERD& other, TessCallback1<int>* cb) const;
const TBOX& bounding_box() const {
return bbox_;
}
int length() const { return length_; }
const TBOX& BlobBox(int index) const {
return boxes_[index];
}
private:
void ComputeBoundingBox();
TBOX bbox_;
int length_;
GenericVector<TBOX> boxes_;
};
} // namespace tesseract.
#endif // TESSERACT_CSTRUCT_BOXWORD_H__

View File

@ -0,0 +1,36 @@
///////////////////////////////////////////////////////////////////////
// File: ccstruct.cpp
// Description: ccstruct class.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "ccstruct.h"
namespace tesseract {
// APPROXIMATIONS of the fractions of the character cell taken by
// the descenders, ascenders, and x-height.
const double CCStruct::kDescenderFraction = 0.25;
const double CCStruct::kXHeightFraction = 0.5;
const double CCStruct::kAscenderFraction = 0.25;
const double CCStruct::kXHeightCapRatio = CCStruct::kXHeightFraction /
(CCStruct::kXHeightFraction + CCStruct::kAscenderFraction);
CCStruct::CCStruct() {}
CCStruct::~CCStruct() {
}
}
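// Worked example (not part of the original source): with the fractions
// above, the derived ratio is
//   kXHeightCapRatio = 0.5 / (0.5 + 0.25) = 2 / 3,
// i.e. the x-height is modelled as two thirds of the cap height, where the
// cap height is taken to be x-height plus ascender.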

View File

@ -0,0 +1,44 @@
///////////////////////////////////////////////////////////////////////
// File: ccstruct.h
// Description: ccstruct class.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_CCSTRUCT_H__
#define TESSERACT_CCSTRUCT_CCSTRUCT_H__
#include "cutil.h"
namespace tesseract {
class CCStruct : public CUtil {
public:
CCStruct();
~CCStruct();
// Globally accessible constants.
// APPROXIMATIONS of the fractions of the character cell taken by
// the descenders, ascenders, and x-height.
static const double kDescenderFraction; // = 0.25;
static const double kXHeightFraction; // = 0.5;
static const double kAscenderFraction; // = 0.25;
// Derived value giving the x-height as a fraction of cap-height.
static const double kXHeightCapRatio; // = XHeight/(XHeight + Ascender).
};
class Tesseract;
} // namespace tesseract
#endif // TESSERACT_CCSTRUCT_CCSTRUCT_H__

File diff suppressed because it is too large

View File

@ -0,0 +1,291 @@
/**********************************************************************
* File: coutln.h (Formerly: coutline.h)
* Description: Code for the C_OUTLINE class.
* Author: Ray Smith
* Created: Mon Oct 07 16:01:57 BST 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef COUTLN_H
#define COUTLN_H
#include "crakedge.h"
#include "mod128.h"
#include "bits16.h"
#include "rect.h"
#include "blckerr.h"
#include "scrollview.h"
class DENORM;
#define INTERSECTING MAX_INT16//no winding number
//mask to get step
#define STEP_MASK 3
enum C_OUTLINE_FLAGS
{
COUT_INVERSE //White on black blob
};
// Simple struct to hold the 3 values needed to compute a more precise edge
// position and direction. The offset_numerator is the difference between the
// grey threshold and the mean pixel value. pixel_diff is the difference between
// the pixels in the edge. Consider the following row of pixels: p1 p2 p3 p4 p5
// Say the image was thresholded at threshold t, making p1, p2, p3 black
// and p4, p5 white (p1, p2, p3 < t, and p4, p5 >= t), but suppose that
// max(p[i+1] - p[i]) is p3 - p2. Then the extrapolated position of the edge,
// based on the maximum gradient, is at the crack between p2 and p3 plus the
// offset (t - (p2+p3)/2)/(p3 - p2). We store the pixel difference p3-p2
// denominator in pixel_diff and the offset numerator, relative to the original
// binary edge (t - (p2+p3)/2) - (p3 -p2) in offset_numerator.
// The sign of offset_numerator and pixel_diff are manipulated to ensure
// that the pixel_diff, which will be used as a weight, is always positive.
// The direction stores the quantized feature direction for the given step
// computed from the edge gradient. (Using binary_angle_plus_pi.)
// If the pixel_diff is zero, it means that the direction of the gradient
// is in conflict with the step direction, so this step is to be ignored.
struct EdgeOffset {
inT8 offset_numerator;
uinT8 pixel_diff;
uinT8 direction;
};
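// Illustrative sketch, not part of the original header: the comment above
// derives the sub-pixel edge offset (t - (p2 + p3) / 2) / (p3 - p2) relative
// to the crack between p2 and p3. The helper below uses our own name and
// evaluates that expression for one pair of neighbouring pixels, returning 0
// when the pixels are equal and the offset is undefined.
static inline float SubPixelEdgeOffsetExample(int p2, int p3, int threshold) {
  int diff = p3 - p2;
  if (diff == 0) return 0.0f;
  return (threshold - (p2 + p3) / 2.0f) / diff;
}
// Example: p2 = 40, p3 = 200 and threshold t = 100 give
// (100 - 120) / 160 = -0.125, i.e. the edge sits 1/8 of a pixel before the
// binary crack.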
class DLLSYM C_OUTLINE; //forward declaration
struct Pix;
ELISTIZEH (C_OUTLINE)
class DLLSYM C_OUTLINE:public ELIST_LINK {
public:
C_OUTLINE() { //empty constructor
steps = NULL;
offsets = NULL;
}
C_OUTLINE( //constructor
CRACKEDGE *startpt, //from edge detector
ICOORD bot_left, //bounding box
ICOORD top_right,
inT16 length); //length of loop
C_OUTLINE(ICOORD startpt, //start of loop
DIR128 *new_steps, //steps in loop
inT16 length); //length of loop
//outline to copy
C_OUTLINE(C_OUTLINE *srcline, FCOORD rotation); //and rotate
// Build a fake outline, given just a bounding box and append to the list.
static void FakeOutline(const TBOX& box, C_OUTLINE_LIST* outlines);
~C_OUTLINE () { //destructor
if (steps != NULL)
free_mem(steps);
steps = NULL;
delete [] offsets;
}
BOOL8 flag( //test flag
C_OUTLINE_FLAGS mask) const { //flag to test
return flags.bit (mask);
}
void set_flag( //set flag value
C_OUTLINE_FLAGS mask, //flag to test
BOOL8 value) { //value to set
flags.set_bit (mask, value);
}
C_OUTLINE_LIST *child() { //get child list
return &children;
}
//access function
const TBOX &bounding_box() const {
return box;
}
void set_step( //set a step
inT16 stepindex, //index of step
inT8 stepdir) { //chain code
int shift = stepindex%4 * 2;
uinT8 mask = 3 << shift;
steps[stepindex/4] = ((stepdir << shift) & mask) |
(steps[stepindex/4] & ~mask);
//squeeze 4 into byte
}
void set_step( //set a step
inT16 stepindex, //index of step
DIR128 stepdir) { //direction
//clean it
inT8 chaindir = stepdir.get_dir() >> (DIRBITS - 2);
//difference
set_step(stepindex, chaindir);
//squeeze 4 into byte
}
inT32 pathlength() const { //get path length
return stepcount;
}
// Return step at a given index as a DIR128.
DIR128 step_dir(int index) const {
return DIR128((inT16)(((steps[index/4] >> (index%4 * 2)) & STEP_MASK) <<
(DIRBITS - 2)));
}
// Return the step vector for the given outline position.
ICOORD step(int index) const { // index of step
return step_coords[chain_code(index)];
}
// get start position
const ICOORD &start_pos() const {
return start;
}
// Returns the position at the given index on the outline.
// NOT to be used lightly, as it has to iterate the outline to find out.
ICOORD position_at_index(int index) const {
ICOORD pos = start;
for (int i = 0; i < index; ++i)
pos += step(i);
return pos;
}
// Returns the sub-pixel accurate position given the integer position pos
// at the given index on the outline. pos may be a return value of
// position_at_index, or computed by repeatedly adding step to the
// start_pos() in the usual way.
FCOORD sub_pixel_pos_at_index(const ICOORD& pos, int index) const {
const ICOORD& step_to_next(step(index));
FCOORD f_pos(pos.x() + step_to_next.x() / 2.0f,
pos.y() + step_to_next.y() / 2.0f);
if (offsets != NULL && offsets[index].pixel_diff > 0) {
float offset = offsets[index].offset_numerator;
offset /= offsets[index].pixel_diff;
if (step_to_next.x() != 0)
f_pos.set_y(f_pos.y() + offset);
else
f_pos.set_x(f_pos.x() + offset);
}
return f_pos;
}
// Returns the step direction for the given index or -1 if there is none.
int direction_at_index(int index) const {
if (offsets != NULL && offsets[index].pixel_diff > 0)
return offsets[index].direction;
return -1;
}
// Returns the edge strength for the given index.
// If there are no recorded edge strengths, returns 1 (assuming the image
// is binary). Returns 0 if the gradient direction conflicts with the
// step direction, indicating that this position could be skipped.
int edge_strength_at_index(int index) const {
if (offsets != NULL)
return offsets[index].pixel_diff;
return 1;
}
// Return the step as a chain code (0-3) related to the standard feature
// direction of binary_angle_plus_pi by:
// chain_code * 64 = feature direction.
int chain_code(int index) const { // index of step
return (steps[index / 4] >> (index % 4 * 2)) & STEP_MASK;
}
inT32 area() const; // Returns area of self and 1st level children.
inT32 perimeter() const; // Total perimeter of self and 1st level children.
inT32 outer_area() const; // Returns area of self only.
inT32 count_transitions( //count maxima
inT32 threshold); //size threshold
BOOL8 operator< ( //containment test
const C_OUTLINE & other) const;
BOOL8 operator> ( //containment test
C_OUTLINE & other) const
{
return other < *this; //use the < to do it
}
inT16 winding_number( //get winding number
ICOORD testpt) const; //around this point
//get direction
inT16 turn_direction() const;
void reverse(); //reverse direction
void move( // reposition outline
const ICOORD vec); // by vector
// Returns true if *this and its children are legally nested.
// The outer area of a child should have the opposite sign to the
// parent. If not, it means we have discarded an outline in between
// (probably due to excessive length).
bool IsLegallyNested() const;
// If this outline is smaller than the given min_size, delete this and
// remove from its list, via *it, after checking that *it points to this.
// Otherwise, if any children of this are too small, delete them.
// On entry, *it must be an iterator pointing to this. If this gets deleted
// then this is extracted from *it, so an iteration can continue.
void RemoveSmallRecursive(int min_size, C_OUTLINE_IT* it);
// Adds sub-pixel resolution EdgeOffsets for the outline if the supplied
// pix is 8-bit. Does nothing otherwise.
void ComputeEdgeOffsets(int threshold, Pix* pix);
// Adds sub-pixel resolution EdgeOffsets for the outline using only
// a binary image source.
void ComputeBinaryOffsets();
// Renders the outline to the given pix, with left and top being
// the coords of the upper-left corner of the pix.
void render(int left, int top, Pix* pix) const;
// Renders just the outline to the given pix (no fill), with left and top
// being the coords of the upper-left corner of the pix.
void render_outline(int left, int top, Pix* pix) const;
#ifndef GRAPHICS_DISABLED
void plot( //draw one
ScrollView* window, //window to draw in
ScrollView::Color colour) const; //colour to draw it
// Draws the outline in the given colour, normalized using the given denorm,
// making use of sub-pixel accurate information if available.
void plot_normed(const DENORM& denorm, ScrollView::Color colour,
ScrollView* window) const;
#endif // GRAPHICS_DISABLED
C_OUTLINE& operator=(const C_OUTLINE& source);
static C_OUTLINE* deep_copy(const C_OUTLINE* src) {
C_OUTLINE* outline = new C_OUTLINE;
*outline = *src;
return outline;
}
static ICOORD chain_step(int chaindir);
// The maximum length of any outline. The stepcount is stored as 16 bits,
// but it is probably not a good idea to increase this constant by much
// and switch to 32 bits, as it plays an important role in keeping huge
// outlines invisible, which prevents bad speed behavior.
static const int kMaxOutlineLength = 16000;
private:
// Helper for ComputeBinaryOffsets. At step s mod stepcount, increments pos
// by the step, dir_counts by increment, and pos_totals by increment times
// the x position if the step is vertical (else the y position). Used to add
// or subtract the direction and position to/from accumulators of a small
// neighbourhood.
void increment_step(int s, int increment, ICOORD* pos, int* dir_counts,
int* pos_totals) const;
int step_mem() const { return (stepcount+3) / 4; }
TBOX box; // bounding box
ICOORD start; // start coord
inT16 stepcount; // no of steps
BITS16 flags; // flags about outline
uinT8 *steps; // step array
EdgeOffset* offsets; // Higher precision edge.
C_OUTLINE_LIST children; // child elements
static ICOORD step_coords[4];
};
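// Worked example of the step packing above (illustrative only): set_step and
// chain_code keep four 2-bit chain codes per byte, so storing the codes
// 0, 1, 2, 3 at indices 0..3 gives steps[0] == 0xE4 (binary 11 10 01 00,
// index 3 in the top bits), and chain_code(2) == (0xE4 >> 4) & 3 == 2.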
#endif

View File

@ -0,0 +1,37 @@
/**********************************************************************
* File: crakedge.h (Formerly: crkedge.h)
* Description: Structures for the Crack following edge detector.
* Author: Ray Smith
* Created: Fri Mar 22 16:06:38 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef CRAKEDGE_H
#define CRAKEDGE_H
#include "points.h"
#include "mod128.h"
class CRACKEDGE {
public:
CRACKEDGE() {}
ICOORD pos; /*position of crack */
inT8 stepx; //edge step
inT8 stepy;
inT8 stepdir; //chaincode
CRACKEDGE *prev; /*previous point */
CRACKEDGE *next; /*next point */
};
#endif

View File

@ -0,0 +1,295 @@
///////////////////////////////////////////////////////////////////////
// File: detlinefit.cpp
// Description: Deterministic least median squares line fitting.
// Author: Ray Smith
// Created: Thu Feb 28 14:45:01 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "detlinefit.h"
#include "statistc.h"
#include "ndminx.h"
#include "tprintf.h"
namespace tesseract {
// The number of points to consider at each end.
const int kNumEndPoints = 3;
// The minimum number of points at which to switch to number of points
// for badly fitted lines.
// To ensure a sensible error metric, kMinPointsForErrorCount should be at
// least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in
// ComputeUpperQuartileError.
const int kMinPointsForErrorCount = 16;
// The maximum real distance to use before switching to number of
// mis-fitted points, which will get square-rooted for true distance.
const int kMaxRealDistance = 2.0;
DetLineFit::DetLineFit() : square_length_(0.0) {
}
DetLineFit::~DetLineFit() {
}
// Delete all Added points.
void DetLineFit::Clear() {
pts_.clear();
distances_.clear();
}
// Add a new point. Takes a copy - the pt doesn't need to stay in scope.
void DetLineFit::Add(const ICOORD& pt) {
pts_.push_back(PointWidth(pt, 0));
}
// Associates a half-width with the given point. If a point overlaps the
// previous point by more than half the width, and its distance is further
// than the previous point, then the more distant point is ignored in the
// distance calculation. Useful for ignoring i dots and other diacritics.
void DetLineFit::Add(const ICOORD& pt, int halfwidth) {
pts_.push_back(PointWidth(pt, halfwidth));
}
// Fits a line to the points, ignoring the skip_first initial points and the
// skip_last final points, returning the fitted line as a pair of points,
// and the upper quartile error.
double DetLineFit::Fit(int skip_first, int skip_last,
ICOORD* pt1, ICOORD* pt2) {
// Do something sensible with no points.
if (pts_.empty()) {
pt1->set_x(0);
pt1->set_y(0);
*pt2 = *pt1;
return 0.0;
}
// Count the points and find the first and last kNumEndPoints.
int pt_count = pts_.size();
ICOORD* starts[kNumEndPoints];
if (skip_first >= pt_count) skip_first = pt_count - 1;
int start_count = 0;
int end_i = MIN(skip_first + kNumEndPoints, pt_count);
for (int i = skip_first; i < end_i; ++i) {
starts[start_count++] = &pts_[i].pt;
}
ICOORD* ends[kNumEndPoints];
if (skip_last >= pt_count) skip_last = pt_count - 1;
int end_count = 0;
end_i = MAX(0, pt_count - kNumEndPoints - skip_last);
for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {
ends[end_count++] = &pts_[i].pt;
}
// 1 or 2 points need special treatment.
if (pt_count <= 2) {
*pt1 = *starts[0];
if (pt_count > 1)
*pt2 = *ends[0];
else
*pt2 = *pt1;
return 0.0;
}
// Although with between 2 and 2*kNumEndPoints-1 points, there will be
// overlap in the starts, ends sets, this is OK and taken care of by the
// if (*start != *end) test below, which also tests for equal input points.
double best_uq = -1.0;
// Iterate each pair of points and find the best fitting line.
for (int i = 0; i < start_count; ++i) {
ICOORD* start = starts[i];
for (int j = 0; j < end_count; ++j) {
ICOORD* end = ends[j];
if (*start != *end) {
ComputeDistances(*start, *end);
// Compute the upper quartile error from the line.
double dist = EvaluateLineFit();
if (dist < best_uq || best_uq < 0.0) {
best_uq = dist;
*pt1 = *start;
*pt2 = *end;
}
}
}
}
// Finally compute the square root to return the true distance.
return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
}
// Constrained fit with a supplied direction vector. Finds the best line_pt,
// that is one of the supplied points having the median cross product with
// direction, ignoring points that have a cross product outside of the range
// [min_dist, max_dist]. Returns the resulting error metric using the same
// reduced set of points.
// *Makes use of floating point arithmetic*
double DetLineFit::ConstrainedFit(const FCOORD& direction,
double min_dist, double max_dist,
bool debug, ICOORD* line_pt) {
ComputeConstrainedDistances(direction, min_dist, max_dist);
// Do something sensible with no points or computed distances.
if (pts_.empty() || distances_.empty()) {
line_pt->set_x(0);
line_pt->set_y(0);
return 0.0;
}
int median_index = distances_.choose_nth_item(distances_.size() / 2);
*line_pt = distances_[median_index].data;
if (debug) {
tprintf("Constrained fit to dir %g, %g = %d, %d :%d distances:\n",
direction.x(), direction.y(),
line_pt->x(), line_pt->y(), distances_.size());
for (int i = 0; i < distances_.size(); ++i) {
tprintf("%d: %d, %d -> %g\n", i, distances_[i].data.x(),
distances_[i].data.y(), distances_[i].key);
}
tprintf("Result = %d\n", median_index);
}
// Center distances on the fitted point.
double dist_origin = direction * *line_pt;
for (int i = 0; i < distances_.size(); ++i) {
distances_[i].key -= dist_origin;
}
return sqrt(EvaluateLineFit());
}
// Returns true if there were enough points at the last call to Fit or
// ConstrainedFit for the fitted points to be used on a badly fitted line.
bool DetLineFit::SufficientPointsForIndependentFit() const {
return distances_.size() >= kMinPointsForErrorCount;
}
// Backwards compatible fit returning a gradient and constant.
// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
// function in preference to the LMS class.
double DetLineFit::Fit(float* m, float* c) {
ICOORD start, end;
double error = Fit(&start, &end);
if (end.x() != start.x()) {
*m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x());
*c = start.y() - *m * start.x();
} else {
*m = 0.0f;
*c = 0.0f;
}
return error;
}
// Backwards compatible constrained fit with a supplied gradient.
// Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
// to avoid potential difficulties with infinite gradients.
double DetLineFit::ConstrainedFit(double m, float* c) {
// Do something sensible with no points.
if (pts_.empty()) {
*c = 0.0f;
return 0.0;
}
double cos = 1.0 / sqrt(1.0 + m * m);
FCOORD direction(cos, m * cos);
ICOORD line_pt;
double error = ConstrainedFit(direction, -MAX_FLOAT32, MAX_FLOAT32, false,
&line_pt);
*c = line_pt.y() - line_pt.x() * m;
return error;
}
// Computes and returns the squared evaluation metric for a line fit.
double DetLineFit::EvaluateLineFit() {
// Compute the upper quartile error from the line.
double dist = ComputeUpperQuartileError();
if (distances_.size() >= kMinPointsForErrorCount &&
dist > kMaxRealDistance * kMaxRealDistance) {
// Use the number of mis-fitted points as the error metric, as this
// gives a better measure of fit for badly fitted lines where more
// than a quarter are badly fitted.
double threshold = kMaxRealDistance * sqrt(square_length_);
dist = NumberOfMisfittedPoints(threshold);
}
return dist;
}
// Computes the absolute error distances of the points from the line,
// and returns the squared upper-quartile error distance.
double DetLineFit::ComputeUpperQuartileError() {
int num_errors = distances_.size();
if (num_errors == 0) return 0.0;
// Get the absolute values of the errors.
for (int i = 0; i < num_errors; ++i) {
if (distances_[i].key < 0) distances_[i].key = -distances_[i].key;
}
// Now get the upper quartile distance.
int index = distances_.choose_nth_item(3 * num_errors / 4);
double dist = distances_[index].key;
// The true distance is the square root of the dist squared / square_length.
// Don't bother with the square root. Just return the square distance.
return square_length_ > 0.0 ? dist * dist / square_length_ : 0.0;
}
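// Illustrative note on the metric above: the stored keys are cross products,
// i.e. |line_vector||pt_vector|sin(angle), so a point's true perpendicular
// distance is key / |line_vector| and its square is key^2 / square_length_.
// For example, for a line vector (3, 4) (square_length_ = 25) and a key of
// 10, the perpendicular distance is 2 and the returned square is 100/25 = 4.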
// Returns the number of sample points that have an error more than threshold.
int DetLineFit::NumberOfMisfittedPoints(double threshold) const {
int num_misfits = 0;
int num_dists = distances_.size();
// Get the absolute values of the errors.
for (int i = 0; i < num_dists; ++i) {
if (distances_[i].key > threshold)
++num_misfits;
}
return num_misfits;
}
// Computes all the cross product distances of the points from the line,
// storing the actual (signed) cross products in distances_.
// Ignores distances of points that are further away than the previous point
// and overlap the previous point by at least half.
void DetLineFit::ComputeDistances(const ICOORD& start, const ICOORD& end) {
distances_.truncate(0);
ICOORD line_vector = end;
line_vector -= start;
square_length_ = line_vector.sqlength();
int line_length = IntCastRounded(sqrt(square_length_));
// Compute the distance of each point from the line.
int prev_abs_dist = 0;
int prev_dot = 0;
for (int i = 0; i < pts_.size(); ++i) {
ICOORD pt_vector = pts_[i].pt;
pt_vector -= start;
int dot = line_vector % pt_vector;
// Compute |line_vector||pt_vector|sin(angle between)
int dist = line_vector * pt_vector;
int abs_dist = dist < 0 ? -dist : dist;
if (abs_dist > prev_abs_dist && i > 0) {
// Ignore this point if it overlaps the previous one.
int separation = abs(dot - prev_dot);
if (separation < line_length * pts_[i].halfwidth ||
separation < line_length * pts_[i - 1].halfwidth)
continue;
}
distances_.push_back(DistPointPair(dist, pts_[i].pt));
prev_abs_dist = abs_dist;
prev_dot = dot;
}
}
// Computes all the cross product distances of the points perpendicular to
// the given direction, ignoring distances outside of the given distance range,
// storing the actual (signed) cross products in distances_.
void DetLineFit::ComputeConstrainedDistances(const FCOORD& direction,
double min_dist, double max_dist) {
distances_.truncate(0);
square_length_ = direction.sqlength();
// Compute the distance of each point from the line.
for (int i = 0; i < pts_.size(); ++i) {
FCOORD pt_vector = pts_[i].pt;
// Compute |line_vector||pt_vector|sin(angle between)
double dist = direction * pt_vector;
if (min_dist <= dist && dist <= max_dist)
distances_.push_back(DistPointPair(dist, pts_[i].pt));
}
}
} // namespace tesseract.

View File

@ -0,0 +1,164 @@
///////////////////////////////////////////////////////////////////////
// File: detlinefit.h
// Description: Deterministic least upper-quartile squares line fitting.
// Author: Ray Smith
// Created: Thu Feb 28 14:35:01 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_DETLINEFIT_H_
#define TESSERACT_CCSTRUCT_DETLINEFIT_H_
#include "genericvector.h"
#include "kdpair.h"
#include "points.h"
namespace tesseract {
// This class fits a line to a set of ICOORD points.
// There is no restriction on the direction of the line, as it
// uses a vector method, ie no concern over infinite gradients.
// The fitted line has the least upper quartile of squares of perpendicular
// distances of all source points from the line, subject to the constraint
// that the line is made from one of the pairs of [{p1,p2,p3},{pn-2, pn-1, pn}]
// i.e. the 9 combinations of one of the first 3 and last 3 points.
// A fundamental assumption of this algorithm is that one of the first 3 and
// one of the last 3 points are near the best line fit.
// The points must be Added in line order for the algorithm to work properly.
// No floating point calculations are needed* to make an accurate fit,
// and no random numbers are needed** so the algorithm is deterministic,
// architecture-stable, and compiler-stable as well as stable to minor
// changes in the input.
// *A single floating point division is used to compute each line's distance.
// This is unlikely to result in choice of a different line, but if it does,
// it would be easy to replace with a 64 bit integer calculation.
// **Random numbers are used in the nth_item function, but the worst
// non-determinism that can result is picking a different result among equals,
// and that wouldn't make any difference to the end-result distance, so the
// randomness does not affect the determinism of the algorithm. The random
// numbers are only there to guarantee average linear time.
// Fitting time is linear, but with a high constant, as it tries 9 different
// lines and computes the distance of all points each time.
// This class is aimed at replacing the LLSQ (linear least squares) and
// LMS (least median of squares) classes that are currently used for most
// of the line fitting in Tesseract.
class DetLineFit {
public:
DetLineFit();
~DetLineFit();
// Delete all Added points.
void Clear();
// Adds a new point. Takes a copy - the pt doesn't need to stay in scope.
// Add must be called on points in sequence along the line.
void Add(const ICOORD& pt);
// Associates a half-width with the given point. If a point overlaps the
// previous point by more than half the width, and its distance is further
// than the previous point, then the more distant point is ignored in the
// distance calculation. Useful for ignoring i dots and other diacritics.
void Add(const ICOORD& pt, int halfwidth);
// Fits a line to the points, returning the fitted line as a pair of
// points, and the upper quartile error.
double Fit(ICOORD* pt1, ICOORD* pt2) {
return Fit(0, 0, pt1, pt2);
}
// Fits a line to the points, ignoring the skip_first initial points and the
// skip_last final points, returning the fitted line as a pair of points,
// and the upper quartile error.
double Fit(int skip_first, int skip_last, ICOORD* pt1, ICOORD* pt2);
// Constrained fit with a supplied direction vector. Finds the best line_pt,
// that is one of the supplied points having the median cross product with
// direction, ignoring points that have a cross product outside of the range
// [min_dist, max_dist]. Returns the resulting error metric using the same
// reduced set of points.
// *Makes use of floating point arithmetic*
double ConstrainedFit(const FCOORD& direction,
double min_dist, double max_dist,
bool debug, ICOORD* line_pt);
// Returns true if there were enough points at the last call to Fit or
// ConstrainedFit for the fitted points to be used on a badly fitted line.
bool SufficientPointsForIndependentFit() const;
// Backwards compatible fit returning a gradient and constant.
// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
// function in preference to the LMS class.
double Fit(float* m, float* c);
// Backwards compatible constrained fit with a supplied gradient.
// Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
// to avoid potential difficulties with infinite gradients.
double ConstrainedFit(double m, float* c);
private:
// Simple struct to hold an ICOORD point and a halfwidth representing half
// the "width" (supposedly approximately parallel to the direction of the
// line) of each point, such that distant points can be discarded when they
// overlap nearer points. (Think i dot and other diacritics or noise.)
struct PointWidth {
PointWidth() : pt(ICOORD(0, 0)), halfwidth(0) {}
PointWidth(const ICOORD& pt0, int halfwidth0)
: pt(pt0), halfwidth(halfwidth0) {}
ICOORD pt;
int halfwidth;
};
// Type holds the distance of each point from the fitted line and the point
// itself. Use of double allows integer distances from ICOORDs to be stored
// exactly, and also the floating point results from ConstrainedFit.
typedef KDPairInc<double, ICOORD> DistPointPair;
// Computes and returns the squared evaluation metric for a line fit.
double EvaluateLineFit();
// Computes the absolute values of the precomputed distances_,
// and returns the squared upper-quartile error distance.
double ComputeUpperQuartileError();
// Returns the number of sample points that have an error more than threshold.
int NumberOfMisfittedPoints(double threshold) const;
// Computes all the cross product distances of the points from the line,
// storing the actual (signed) cross products in distances_.
// Ignores distances of points that are further away than the previous point
// and overlap the previous point by at least half.
void ComputeDistances(const ICOORD& start, const ICOORD& end);
// Computes all the cross product distances of the points perpendicular to
// the given direction, ignoring distances outside of the given distance range,
// storing the actual (signed) cross products in distances_.
void ComputeConstrainedDistances(const FCOORD& direction,
double min_dist, double max_dist);
// Stores all the source points in the order they were given and their
// halfwidths, if any.
GenericVector<PointWidth> pts_;
// Stores the computed perpendicular distances of (some of) the pts_ from a
// given vector (assuming it goes through the origin, making it a line).
// Since the distances may be a subset of the input points, and get
// re-ordered by the nth_item function, the original point is stored
// alongside the distance.
GenericVector<DistPointPair> distances_; // Distances of points.
// The squared length of the vector used to compute distances_.
double square_length_;
};
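// Skeletal usage sketch (illustrative only; n and points are placeholders):
//   DetLineFit fitter;
//   for (int i = 0; i < n; ++i)
//     fitter.Add(points[i]);               // must be added in line order
//   ICOORD pt1, pt2;
//   double err = fitter.Fit(&pt1, &pt2);   // upper-quartile error of the fit
//   // Or, constrained to a fixed direction (here horizontal):
//   ICOORD line_pt;
//   fitter.ConstrainedFit(FCOORD(1.0f, 0.0f), -100.0, 100.0, false, &line_pt);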
} // namespace tesseract.
#endif // TESSERACT_CCSTRUCT_DETLINEFIT_H_

View File

@ -0,0 +1,98 @@
/**********************************************************************
* File: dppoint.cpp
* Description: Simple generic dynamic programming class.
* Author: Ray Smith
* Created: Wed Mar 25 19:08:01 PDT 2009
*
* (C) Copyright 2009, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "dppoint.h"
#include "tprintf.h"
namespace tesseract {
// Solve the dynamic programming problem for the given array of points, with
// the given size and cost function.
// Steps backwards are limited to being between min_step and max_step
// inclusive.
// The return value is the tail of the best path.
DPPoint* DPPoint::Solve(int min_step, int max_step, bool debug,
CostFunc cost_func, int size, DPPoint* points) {
if (size <= 0 || max_step < min_step || min_step >= size)
return NULL; // Degenerate, but not necessarily an error.
ASSERT_HOST(min_step > 0); // Infinite loop possible if this is not true.
if (debug)
tprintf("min = %d, max=%d\n",
min_step, max_step);
// Evaluate the total cost at each point.
for (int i = 0; i < size; ++i) {
for (int offset = min_step; offset <= max_step; ++offset) {
DPPoint* prev = offset <= i ? points + i - offset : NULL;
inT64 new_cost = (points[i].*cost_func)(prev);
if (points[i].best_prev_ != NULL && offset > min_step * 2 &&
new_cost > points[i].total_cost_)
break; // Find only the first minimum if going over twice the min.
}
points[i].total_cost_ += points[i].local_cost_;
if (debug) {
tprintf("At point %d, local cost=%d, total_cost=%d, steps=%d\n",
i, points[i].local_cost_, points[i].total_cost_,
points[i].total_steps_);
}
}
// Now find the end of the best path and return it.
int best_cost = points[size - 1].total_cost_;
int best_end = size - 1;
for (int end = best_end - 1; end >= size - min_step; --end) {
int cost = points[end].total_cost_;
if (cost < best_cost) {
best_cost = cost;
best_end = end;
}
}
return points + best_end;
}
// A CostFunc that takes the variance of step into account in the cost.
inT64 DPPoint::CostWithVariance(const DPPoint* prev) {
if (prev == NULL || prev == this) {
UpdateIfBetter(0, 1, NULL, 0, 0, 0);
return 0;
}
int delta = this - prev;
inT32 n = prev->n_ + 1;
inT32 sig_x = prev->sig_x_ + delta;
inT64 sig_xsq = prev->sig_xsq_ + delta * delta;
inT64 cost = (sig_xsq - sig_x * sig_x / n) / n;
cost += prev->total_cost_;
UpdateIfBetter(cost, prev->total_steps_ + 1, prev, n, sig_x, sig_xsq);
return cost;
}
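// Illustrative note: up to integer truncation, the cost expression above is
// the single-pass formula for the variance of the step sizes seen so far,
// e.g. for steps 2, 2, 2: n = 3, sig_x = 6, sig_xsq = 12, and
// (12 - 6 * 6 / 3) / 3 == 0, so perfectly regular spacing adds nothing
// beyond prev->total_cost_.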
// Update the other members if the cost is lower.
void DPPoint::UpdateIfBetter(inT64 cost, inT32 steps, const DPPoint* prev,
inT32 n, inT32 sig_x, inT64 sig_xsq) {
if (cost < total_cost_) {
total_cost_ = cost;
total_steps_ = steps;
best_prev_ = prev;
n_ = n;
sig_x_ = sig_x;
sig_xsq_ = sig_xsq;
}
}
} // namespace tesseract.

View File

@ -0,0 +1,102 @@
/**********************************************************************
* File: dppoint.h
* Description: Simple generic dynamic programming class.
* Author: Ray Smith
* Created: Wed Mar 25 18:57:01 PDT 2009
*
* (C) Copyright 2009, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCSTRUCT_DPPOINT_H__
#define TESSERACT_CCSTRUCT_DPPOINT_H__
#include "host.h"
namespace tesseract {
// A simple class to provide a dynamic programming solution to a class of
// 1st-order problems in which the cost is dependent only on the current
// step and the best cost to that step, with a possible special case
// of using the variance of the steps, and only the top choice is required.
// Useful for problems such as finding the optimal cut points in a fixed-pitch
// (vertical or horizontal) situation.
// Skeletal Example:
// DPPoint* array = new DPPoint[width];
// for (int i = 0; i < width; i++) {
// array[i].AddLocalCost(cost_at_i)
// }
// DPPoint* best_end = DPPoint::Solve(..., array);
// while (best_end != NULL) {
// int cut_index = best_end - array;
// best_end = best_end->best_prev();
// }
// delete [] array;
class DPPoint {
public:
// The cost function evaluates the total cost at this point (excluding this
// point's local_cost_) and, if it beats this point's total_cost_, replaces
// the appropriate values in this point.
typedef inT64 (DPPoint::*CostFunc)(const DPPoint* prev);
DPPoint()
: local_cost_(0), total_cost_(MAX_INT32), total_steps_(1), best_prev_(NULL),
n_(0), sig_x_(0), sig_xsq_(0) {
}
// Solve the dynamic programming problem for the given array of points, with
// the given size and cost function.
// Steps backwards are limited to being between min_step and max_step
// inclusive.
// The return value is the tail of the best path.
static DPPoint* Solve(int min_step, int max_step, bool debug,
CostFunc cost_func, int size, DPPoint* points);
// A CostFunc that takes the variance of step into account in the cost.
inT64 CostWithVariance(const DPPoint* prev);
// Accessors.
int total_cost() const {
return total_cost_;
}
int Pathlength() const {
return total_steps_;
}
const DPPoint* best_prev() const {
return best_prev_;
}
void AddLocalCost(int new_cost) {
local_cost_ += new_cost;
}
private:
// Code common to different cost functions.
// Update the other members if the cost is lower.
void UpdateIfBetter(inT64 cost, inT32 steps, const DPPoint* prev,
inT32 n, inT32 sig_x, inT64 sig_xsq);
inT32 local_cost_; // Cost of this point on its own.
inT32 total_cost_; // Sum of all costs in best path to here.
// During cost calculations local_cost is excluded.
inT32 total_steps_; // Number of steps in best path to here.
const DPPoint* best_prev_; // Pointer to prev point in best path from here.
// Information for computing the variance part of the cost.
inT32 n_; // Number of steps in best path to here for variance.
inT32 sig_x_; // Sum of step sizes for computing variance.
inT64 sig_xsq_; // Sum of squares of steps for computing variance.
};
} // namespace tesseract.
#endif // TESSERACT_CCSTRUCT_DPPOINT_H__

View File

@ -0,0 +1,262 @@
///////////////////////////////////////////////////////////////////////
// File: fontinfo.cpp
// Description: Font information classes abstracted from intproto.h/cpp.
// Author: rays@google.com (Ray Smith)
// Created: Wed May 18 10:39:01 PDT 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "fontinfo.h"
#include "bitvector.h"
#include "unicity_table.h"
namespace tesseract {
// Writes to the given file. Returns false in case of error.
bool FontInfo::Serialize(FILE* fp) const {
if (!write_info(fp, *this)) return false;
if (!write_spacing_info(fp, *this)) return false;
return true;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool FontInfo::DeSerialize(bool swap, FILE* fp) {
if (!read_info(fp, this, swap)) return false;
if (!read_spacing_info(fp, this, swap)) return false;
return true;
}
FontInfoTable::FontInfoTable() {
set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
set_clear_callback(NewPermanentTessCallback(FontInfoDeleteCallback));
}
FontInfoTable::~FontInfoTable() {
}
// Writes to the given file. Returns false in case of error.
bool FontInfoTable::Serialize(FILE* fp) const {
return this->SerializeClasses(fp);
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool FontInfoTable::DeSerialize(bool swap, FILE* fp) {
truncate(0);
return this->DeSerializeClasses(swap, fp);
}
// Returns true if the given set of fonts includes one with the same
// properties as font_id.
bool FontInfoTable::SetContainsFontProperties(
int font_id, const GenericVector<ScoredFont>& font_set) const {
uinT32 properties = get(font_id).properties;
for (int f = 0; f < font_set.size(); ++f) {
if (get(font_set[f].fontinfo_id).properties == properties)
return true;
}
return false;
}
// Returns true if the given set of fonts includes multiple properties.
bool FontInfoTable::SetContainsMultipleFontProperties(
const GenericVector<ScoredFont>& font_set) const {
if (font_set.empty()) return false;
int first_font = font_set[0].fontinfo_id;
uinT32 properties = get(first_font).properties;
for (int f = 1; f < font_set.size(); ++f) {
if (get(font_set[f].fontinfo_id).properties != properties)
return true;
}
return false;
}
// Moves any non-empty FontSpacingInfo entries from other to this.
void FontInfoTable::MoveSpacingInfoFrom(FontInfoTable* other) {
set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
set_clear_callback(NewPermanentTessCallback(FontInfoDeleteCallback));
for (int i = 0; i < other->size(); ++i) {
GenericVector<FontSpacingInfo*>* spacing_vec = other->get(i).spacing_vec;
if (spacing_vec != NULL) {
int target_index = get_index(other->get(i));
if (target_index < 0) {
// Bit copy the FontInfo and steal all the pointers.
push_back(other->get(i));
other->get(i).name = NULL;
} else {
delete get(target_index).spacing_vec;
get(target_index).spacing_vec = other->get(i).spacing_vec;
}
other->get(i).spacing_vec = NULL;
}
}
}
// Moves this to the target unicity table.
void FontInfoTable::MoveTo(UnicityTable<FontInfo>* target) {
target->clear();
target->set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
target->set_clear_callback(NewPermanentTessCallback(FontInfoDeleteCallback));
for (int i = 0; i < size(); ++i) {
// Bit copy the FontInfo and steal all the pointers.
target->push_back(get(i));
get(i).name = NULL;
get(i).spacing_vec = NULL;
}
}
// Compare FontInfo structures.
bool CompareFontInfo(const FontInfo& fi1, const FontInfo& fi2) {
// The font properties are required to be the same for two fonts with the same
// name, so there is no need to test them.
// Consequently, querying the table with only its font name as information is
// enough to retrieve its properties.
return strcmp(fi1.name, fi2.name) == 0;
}
// Compare FontSet structures.
bool CompareFontSet(const FontSet& fs1, const FontSet& fs2) {
if (fs1.size != fs2.size)
return false;
for (int i = 0; i < fs1.size; ++i) {
if (fs1.configs[i] != fs2.configs[i])
return false;
}
return true;
}
// Callbacks for GenericVector.
void FontInfoDeleteCallback(FontInfo f) {
if (f.spacing_vec != NULL) {
f.spacing_vec->delete_data_pointers();
delete f.spacing_vec;
}
delete[] f.name;
}
void FontSetDeleteCallback(FontSet fs) {
delete[] fs.configs;
}
/*---------------------------------------------------------------------------*/
// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
bool read_info(FILE* f, FontInfo* fi, bool swap) {
inT32 size;
if (fread(&size, sizeof(size), 1, f) != 1) return false;
if (swap)
Reverse32(&size);
char* font_name = new char[size + 1];
fi->name = font_name;
if (static_cast<int>(fread(font_name, sizeof(*font_name), size, f)) != size)
return false;
font_name[size] = '\0';
if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
if (swap)
Reverse32(&fi->properties);
return true;
}
bool write_info(FILE* f, const FontInfo& fi) {
inT32 size = strlen(fi.name);
if (fwrite(&size, sizeof(size), 1, f) != 1) return false;
if (static_cast<int>(fwrite(fi.name, sizeof(*fi.name), size, f)) != size)
return false;
if (fwrite(&fi.properties, sizeof(fi.properties), 1, f) != 1) return false;
return true;
}
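// Illustrative note: as read_info/write_info above imply, a FontInfo record
// is laid out as
//   inT32 name_length | name bytes (no terminator) | uinT32 properties
// followed by the optional spacing data handled by read_spacing_info and
// write_spacing_info below. The layout is not endian-neutral, hence the
// swap/Reverse32 handling on the read side.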
bool read_spacing_info(FILE *f, FontInfo* fi, bool swap) {
inT32 vec_size, kern_size;
if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
if (swap) Reverse32(&vec_size);
ASSERT_HOST(vec_size >= 0);
if (vec_size == 0) return true;
fi->init_spacing(vec_size);
for (int i = 0; i < vec_size; ++i) {
FontSpacingInfo *fs = new FontSpacingInfo();
if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
delete fs;
return false;
}
if (swap) {
ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
Reverse32(&kern_size);
}
if (kern_size < 0) { // indication of a NULL entry in fi->spacing_vec
delete fs;
continue;
}
if (kern_size > 0 && (!fs->kerned_unichar_ids.DeSerialize(swap, f) ||
!fs->kerned_x_gaps.DeSerialize(swap, f))) {
delete fs;
return false;
}
fi->add_spacing(i, fs);
}
return true;
}
bool write_spacing_info(FILE* f, const FontInfo& fi) {
inT32 vec_size = (fi.spacing_vec == NULL) ? 0 : fi.spacing_vec->size();
if (fwrite(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
inT16 x_gap_invalid = -1;
for (int i = 0; i < vec_size; ++i) {
FontSpacingInfo *fs = fi.spacing_vec->get(i);
inT32 kern_size = (fs == NULL) ? -1 : fs->kerned_x_gaps.size();
if (fs == NULL) {
// Valid to have the identical fwrites. Writing invalid x-gaps.
if (fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
return false;
}
} else {
if (fwrite(&(fs->x_gap_before), sizeof(fs->x_gap_before), 1, f) != 1 ||
fwrite(&(fs->x_gap_after), sizeof(fs->x_gap_after), 1, f) != 1 ||
fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
return false;
}
}
if (kern_size > 0 && (!fs->kerned_unichar_ids.Serialize(f) ||
!fs->kerned_x_gaps.Serialize(f))) {
return false;
}
}
return true;
}
bool read_set(FILE* f, FontSet* fs, bool swap) {
if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
if (swap)
Reverse32(&fs->size);
fs->configs = new int32_t[fs->size];
for (int i = 0; i < fs->size; ++i) {
if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
if (swap)
Reverse32(&fs->configs[i]);
}
return true;
}
bool write_set(FILE* f, const FontSet& fs) {
if (fwrite(&fs.size, sizeof(fs.size), 1, f) != 1) return false;
for (int i = 0; i < fs.size; ++i) {
if (fwrite(&fs.configs[i], sizeof(fs.configs[i]), 1, f) != 1) return false;
}
return true;
}
} // namespace tesseract.

View File

@ -0,0 +1,191 @@
///////////////////////////////////////////////////////////////////////
// File: fontinfo.h
// Description: Font information classes abstracted from intproto.h/cpp.
// Author: rays@google.com (Ray Smith)
// Created: Tue May 17 17:08:01 PDT 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_FONTINFO_H_
#define TESSERACT_CCSTRUCT_FONTINFO_H_
#include "genericvector.h"
#include "host.h"
#include "unichar.h"
#include <stdint.h>
template <typename T> class UnicityTable;
namespace tesseract {
class BitVector;
// Simple struct to hold a font and a score. The scores come from the low-level
// integer matcher, so they are in the uinT16 range. Fonts are an index to
// fontinfo_table.
// These get copied around a lot, so best to keep them small.
struct ScoredFont {
ScoredFont() : fontinfo_id(-1), score(0) {}
ScoredFont(int font_id, uinT16 classifier_score)
: fontinfo_id(font_id), score(classifier_score) {}
// Index into fontinfo table, but inside the classifier, may be a shapetable
// index.
inT32 fontinfo_id;
// Raw score from the low-level classifier.
uinT16 score;
};
// Struct for information about spacing between characters in a particular font.
struct FontSpacingInfo {
inT16 x_gap_before;
inT16 x_gap_after;
GenericVector<UNICHAR_ID> kerned_unichar_ids;
GenericVector<inT16> kerned_x_gaps;
};
/*
* font_properties contains properties about boldness, italicness, fixed pitch,
* serif, fraktur
*/
struct FontInfo {
FontInfo() : name(NULL), properties(0), universal_id(0), spacing_vec(NULL) {}
~FontInfo() {}
// Writes to the given file. Returns false in case of error.
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
// Reserves unicharset_size spots in spacing_vec.
void init_spacing(int unicharset_size) {
spacing_vec = new GenericVector<FontSpacingInfo *>();
spacing_vec->init_to_size(unicharset_size, NULL);
}
// Adds the given pointer to FontSpacingInfo to spacing_vec member
// (FontInfo class takes ownership of the pointer).
// Note: init_spacing should be called before calling this function.
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
ASSERT_HOST(spacing_vec != NULL && spacing_vec->size() > uch_id);
(*spacing_vec)[uch_id] = spacing_info;
}
// Returns the pointer to FontSpacingInfo for the given UNICHAR_ID.
const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
return (spacing_vec == NULL || spacing_vec->size() <= uch_id) ?
NULL : (*spacing_vec)[uch_id];
}
// Fills spacing with the value of the x gap expected between the two given
// UNICHAR_IDs. Returns true on success.
bool get_spacing(UNICHAR_ID prev_uch_id,
UNICHAR_ID uch_id,
int *spacing) const {
const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
const FontSpacingInfo *fsi = this->get_spacing(uch_id);
if (prev_fsi == NULL || fsi == NULL) return false;
int i = 0;
for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
if (prev_fsi->kerned_unichar_ids[i] == uch_id) break;
}
if (i < prev_fsi->kerned_unichar_ids.size()) {
*spacing = prev_fsi->kerned_x_gaps[i];
} else {
*spacing = prev_fsi->x_gap_after + fsi->x_gap_before;
}
return true;
}
bool is_italic() const { return properties & 1; }
bool is_bold() const { return (properties & 2) != 0; }
bool is_fixed_pitch() const { return (properties & 4) != 0; }
bool is_serif() const { return (properties & 8) != 0; }
bool is_fraktur() const { return (properties & 16) != 0; }
char* name;
uinT32 properties;
// The universal_id is a field reserved for the initialization process
// to assign a unique id number to all fonts loaded for the current
// combination of languages. This id will then be returned by
// ResultIterator::WordFontAttributes.
inT32 universal_id;
// Horizontal spacing between characters (indexed by UNICHAR_ID).
GenericVector<FontSpacingInfo *> *spacing_vec;
};
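// Illustrative note: properties is the bit field decoded by the accessors
// above (bit 0 italic, bit 1 bold, bit 2 fixed pitch, bit 3 serif,
// bit 4 fraktur); e.g. properties == 5 (binary 00101) describes an italic,
// fixed-pitch, non-bold font.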
// Every class (character) owns a FontSet that represents all the fonts that can
// render this character.
// Since almost all the characters from the same script share the same set of
// fonts, the sets are shared over multiple classes (see
// Classify::fontset_table_). Thus, a class only stores an id to a set.
// Because some fonts cannot render just one character of a set, there are a
// lot of FontSets that differ only by one font. Rather than storing the
// FontInfo directly in the FontSet structure, it's better to share FontInfos
// among FontSets (Classify::fontinfo_table_).
struct FontSet {
int32_t size;
int32_t* configs; // FontInfo ids
};
// Class that adds a bit of functionality on top of GenericVector to
// implement a table of FontInfo that replaces UnicityTable<FontInfo>.
// TODO(rays) change all references once all existing traineddata files
// are replaced.
class FontInfoTable : public GenericVector<FontInfo> {
public:
FontInfoTable();
~FontInfoTable();
// Writes to the given file. Returns false in case of error.
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
// Returns true if the given set of fonts includes one with the same
// properties as font_id.
bool SetContainsFontProperties(
int font_id, const GenericVector<ScoredFont>& font_set) const;
// Returns true if the given set of fonts includes multiple properties.
bool SetContainsMultipleFontProperties(
const GenericVector<ScoredFont>& font_set) const;
// Moves any non-empty FontSpacingInfo entries from other to this.
void MoveSpacingInfoFrom(FontInfoTable* other);
// Moves this to the target unicity table.
void MoveTo(UnicityTable<FontInfo>* target);
};
// Compare FontInfo structures.
bool CompareFontInfo(const FontInfo& fi1, const FontInfo& fi2);
// Compare FontSet structures.
bool CompareFontSet(const FontSet& fs1, const FontSet& fs2);
// Deletion callbacks for GenericVector.
void FontInfoDeleteCallback(FontInfo f);
void FontSetDeleteCallback(FontSet fs);
// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
bool read_info(FILE* f, FontInfo* fi, bool swap);
bool write_info(FILE* f, const FontInfo& fi);
bool read_spacing_info(FILE *f, FontInfo* fi, bool swap);
bool write_spacing_info(FILE* f, const FontInfo& fi);
bool read_set(FILE* f, FontSet* fs, bool swap);
bool write_set(FILE* f, const FontSet& fs);
} // namespace tesseract.
#endif  // TESSERACT_CCSTRUCT_FONTINFO_H_

View File

@ -0,0 +1,38 @@
/**********************************************************************
* File: genblob.cpp (Formerly gblob.c)
* Description: Generic Blob processing routines
* Author: Phil Cheatle
* Created: Mon Nov 25 10:53:26 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "genblob.h"
#include "stepblob.h"
/**********************************************************************
* c_blob_comparator()
*
* Blob comparator used to sort a blob list so that blobs are in increasing
* order of left edge.
**********************************************************************/
int c_blob_comparator( // sort blobs
const void *blob1p, // ptr to ptr to blob1
const void *blob2p // ptr to ptr to blob2
) {
C_BLOB *blob1 = *(C_BLOB **) blob1p;
C_BLOB *blob2 = *(C_BLOB **) blob2p;
return blob1->bounding_box ().left () - blob2->bounding_box ().left ();
}

View File

@ -0,0 +1,27 @@
/**********************************************************************
* File: genblob.h (Formerly gblob.h)
* Description: Generic Blob processing routines
* Author: Phil Cheatle
* Created: Mon Nov 25 10:53:26 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef GENBLOB_H
#define GENBLOB_H
// Sort function to sort blobs by ascending left edge.
int c_blob_comparator(const void *blob1p, // ptr to ptr to blob1
const void *blob2p);
#endif

View File

@ -0,0 +1,17 @@
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HPDSIZES_H
#define HPDSIZES_H
#define NUM_TEXT_ATTR 10
#define NUM_BLOCK_ATTR 7
#define MAXLENGTH 128
#define NUM_BACKGROUNDS 8
#endif

View File

@ -0,0 +1,699 @@
///////////////////////////////////////////////////////////////////////
// File: imagedata.cpp
// Description: Class to hold information about a single multi-page tiff
// training file and its corresponding boxes or text file.
// Author: Ray Smith
// Created: Tue May 28 08:56:06 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include "imagedata.h"
#include "allheaders.h"
#include "boxread.h"
#include "callcpp.h"
#include "helpers.h"
#include "tprintf.h"
#if defined(__MINGW32__)
# include <unistd.h>
#elif __cplusplus > 199711L // in C++11
# include <thread>
#endif
// Number of documents to read ahead while training. Doesn't need to be very
// large.
const int kMaxReadAhead = 8;
namespace tesseract {
WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
}
WordFeature::WordFeature(const FCOORD& fcoord, uinT8 dir)
: x_(IntCastRounded(fcoord.x())),
y_(ClipToRange(IntCastRounded(fcoord.y()), 0, MAX_UINT8)),
dir_(dir) {
}
// Computes the maximum x and y value in the features.
void WordFeature::ComputeSize(const GenericVector<WordFeature>& features,
int* max_x, int* max_y) {
*max_x = 0;
*max_y = 0;
for (int f = 0; f < features.size(); ++f) {
if (features[f].x_ > *max_x) *max_x = features[f].x_;
if (features[f].y_ > *max_y) *max_y = features[f].y_;
}
}
// Draws the features in the given window.
void WordFeature::Draw(const GenericVector<WordFeature>& features,
ScrollView* window) {
#ifndef GRAPHICS_DISABLED
for (int f = 0; f < features.size(); ++f) {
FCOORD pos(features[f].x_, features[f].y_);
FCOORD dir;
dir.from_direction(features[f].dir_);
dir *= 8.0f;
window->SetCursor(IntCastRounded(pos.x() - dir.x()),
IntCastRounded(pos.y() - dir.y()));
window->DrawTo(IntCastRounded(pos.x() + dir.x()),
IntCastRounded(pos.y() + dir.y()));
}
#endif
}
// Writes to the given file. Returns false in case of error.
bool WordFeature::Serialize(FILE* fp) const {
if (fwrite(&x_, sizeof(x_), 1, fp) != 1) return false;
if (fwrite(&y_, sizeof(y_), 1, fp) != 1) return false;
if (fwrite(&dir_, sizeof(dir_), 1, fp) != 1) return false;
return true;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool WordFeature::DeSerialize(bool swap, FILE* fp) {
if (fread(&x_, sizeof(x_), 1, fp) != 1) return false;
if (swap) ReverseN(&x_, sizeof(x_));
if (fread(&y_, sizeof(y_), 1, fp) != 1) return false;
if (fread(&dir_, sizeof(dir_), 1, fp) != 1) return false;
return true;
}
void FloatWordFeature::FromWordFeatures(
const GenericVector<WordFeature>& word_features,
GenericVector<FloatWordFeature>* float_features) {
for (int i = 0; i < word_features.size(); ++i) {
FloatWordFeature f;
f.x = word_features[i].x();
f.y = word_features[i].y();
f.dir = word_features[i].dir();
f.x_bucket = 0; // Will set it later.
float_features->push_back(f);
}
}
// Sort function to sort first by x-bucket, then by y.
/* static */
int FloatWordFeature::SortByXBucket(const void* v1, const void* v2) {
const FloatWordFeature* f1 = reinterpret_cast<const FloatWordFeature*>(v1);
const FloatWordFeature* f2 = reinterpret_cast<const FloatWordFeature*>(v2);
int x_diff = f1->x_bucket - f2->x_bucket;
if (x_diff == 0) return f1->y - f2->y;
return x_diff;
}
ImageData::ImageData() : page_number_(-1), vertical_text_(false) {
}
// Takes ownership of the pix and destroys it.
ImageData::ImageData(bool vertical, Pix* pix)
: page_number_(0), vertical_text_(vertical) {
SetPix(pix);
}
ImageData::~ImageData() {
}
// Builds and returns an ImageData from the basic data. Note that imagedata,
// truth_text, and box_text are all the actual file data, NOT filenames.
ImageData* ImageData::Build(const char* name, int page_number, const char* lang,
const char* imagedata, int imagedatasize,
const char* truth_text, const char* box_text) {
ImageData* image_data = new ImageData();
image_data->imagefilename_ = name;
image_data->page_number_ = page_number;
image_data->language_ = lang;
// Save the imagedata.
image_data->image_data_.resize_no_init(imagedatasize);
memcpy(&image_data->image_data_[0], imagedata, imagedatasize);
if (!image_data->AddBoxes(box_text)) {
if (truth_text == NULL || truth_text[0] == '\0') {
tprintf("Error: No text corresponding to page %d from image %s!\n",
page_number, name);
delete image_data;
return NULL;
}
image_data->transcription_ = truth_text;
// If we have no boxes, the transcription is in the 0th box_texts_.
image_data->box_texts_.push_back(truth_text);
// We will create a box for the whole image on PreScale, to save unpacking
// the image now.
} else if (truth_text != NULL && truth_text[0] != '\0' &&
image_data->transcription_ != truth_text) {
// Save the truth text as it is present and disagrees with the box text.
image_data->transcription_ = truth_text;
}
return image_data;
}
// Writes to the given file. Returns false in case of error.
bool ImageData::Serialize(TFile* fp) const {
if (!imagefilename_.Serialize(fp)) return false;
if (fp->FWrite(&page_number_, sizeof(page_number_), 1) != 1) return false;
if (!image_data_.Serialize(fp)) return false;
if (!transcription_.Serialize(fp)) return false;
// WARNING: Will not work across different endian machines.
if (!boxes_.Serialize(fp)) return false;
if (!box_texts_.SerializeClasses(fp)) return false;
inT8 vertical = vertical_text_;
if (fp->FWrite(&vertical, sizeof(vertical), 1) != 1) return false;
return true;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool ImageData::DeSerialize(bool swap, TFile* fp) {
if (!imagefilename_.DeSerialize(swap, fp)) return false;
if (fp->FRead(&page_number_, sizeof(page_number_), 1) != 1) return false;
if (swap) ReverseN(&page_number_, sizeof(page_number_));
if (!image_data_.DeSerialize(swap, fp)) return false;
if (!transcription_.DeSerialize(swap, fp)) return false;
// WARNING: Will not work across different endian machines.
if (!boxes_.DeSerialize(swap, fp)) return false;
if (!box_texts_.DeSerializeClasses(swap, fp)) return false;
inT8 vertical = 0;
if (fp->FRead(&vertical, sizeof(vertical), 1) != 1) return false;
vertical_text_ = vertical != 0;
return true;
}
// As DeSerialize, but only seeks past the data - hence a static method.
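// The reads below must mirror DeSerialize field-for-field so that the stream
// ends up positioned just past this ImageData without allocating anything.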
bool ImageData::SkipDeSerialize(bool swap, TFile* fp) {
if (!STRING::SkipDeSerialize(swap, fp)) return false;
inT32 page_number;
if (fp->FRead(&page_number, sizeof(page_number), 1) != 1) return false;
if (!GenericVector<char>::SkipDeSerialize(swap, fp)) return false;
if (!STRING::SkipDeSerialize(swap, fp)) return false;
if (!GenericVector<TBOX>::SkipDeSerialize(swap, fp)) return false;
if (!GenericVector<STRING>::SkipDeSerializeClasses(swap, fp)) return false;
inT8 vertical = 0;
return fp->FRead(&vertical, sizeof(vertical), 1) == 1;
}
// Saves the given Pix as a PNG-encoded string and destroys it.
void ImageData::SetPix(Pix* pix) {
SetPixInternal(pix, &image_data_);
}
// Returns the Pix image for *this. Must be pixDestroyed after use.
Pix* ImageData::GetPix() const {
return GetPixInternal(image_data_);
}
// Gets anything and everything with a non-NULL pointer, prescaled to a
// given target_height (if 0, then the original image height, capped at
// max_height), and aligned.
// Also returns (if not NULL) the width and height of the scaled image.
// The return value is the scaled Pix, which must be pixDestroyed after use,
// and scale_factor (if not NULL) is set to the scale factor that was applied
// to the image to achieve the target_height.
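// For example, a 2000x1000 source with target_height 500 gives im_factor 0.5,
// a scaled image of roughly 1000x500, and every box scaled by the same 0.5.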
Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
int* scaled_width, int* scaled_height,
GenericVector<TBOX>* boxes) const {
int input_width = 0;
int input_height = 0;
Pix* src_pix = GetPix();
ASSERT_HOST(src_pix != NULL);
input_width = pixGetWidth(src_pix);
input_height = pixGetHeight(src_pix);
if (target_height == 0) {
target_height = MIN(input_height, max_height);
}
float im_factor = static_cast<float>(target_height) / input_height;
if (scaled_width != NULL)
*scaled_width = IntCastRounded(im_factor * input_width);
if (scaled_height != NULL)
*scaled_height = target_height;
// Get the scaled image.
Pix* pix = pixScale(src_pix, im_factor, im_factor);
if (pix == NULL) {
tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
input_width, input_height, im_factor);
}
if (scaled_width != NULL) *scaled_width = pixGetWidth(pix);
if (scaled_height != NULL) *scaled_height = pixGetHeight(pix);
pixDestroy(&src_pix);
if (boxes != NULL) {
// Get the boxes.
boxes->truncate(0);
for (int b = 0; b < boxes_.size(); ++b) {
TBOX box = boxes_[b];
box.scale(im_factor);
boxes->push_back(box);
}
if (boxes->empty()) {
// Make a single box for the whole image.
TBOX box(0, 0, im_factor * input_width, target_height);
boxes->push_back(box);
}
}
if (scale_factor != NULL) *scale_factor = im_factor;
return pix;
}
int ImageData::MemoryUsed() const {
return image_data_.size();
}
// Draws the data in a new window.
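// Boxes, if any, are plotted in red with their box_texts_ alongside; with no
// boxes, the full transcription is written in cyan instead.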
void ImageData::Display() const {
#ifndef GRAPHICS_DISABLED
const int kTextSize = 64;
// Draw the image.
Pix* pix = GetPix();
if (pix == NULL) return;
int width = pixGetWidth(pix);
int height = pixGetHeight(pix);
ScrollView* win = new ScrollView("Imagedata", 100, 100,
2 * (width + 2 * kTextSize),
2 * (height + 4 * kTextSize),
width + 10, height + 3 * kTextSize, true);
win->Image(pix, 0, height - 1);
pixDestroy(&pix);
// Draw the boxes.
win->Pen(ScrollView::RED);
win->Brush(ScrollView::NONE);
int text_size = kTextSize;
if (!boxes_.empty() && boxes_[0].height() * 2 < text_size)
text_size = boxes_[0].height() * 2;
win->TextAttributes("Arial", text_size, false, false, false);
if (!boxes_.empty()) {
for (int b = 0; b < boxes_.size(); ++b) {
boxes_[b].plot(win);
win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
}
} else {
// The full transcription.
win->Pen(ScrollView::CYAN);
win->Text(0, height + kTextSize * 2, transcription_.string());
}
win->Update();
window_wait(win);
#endif  // GRAPHICS_DISABLED
}
// Adds the supplied boxes and transcriptions that correspond to the correct
// page number.
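// A negative page_number_ accepts boxes from every page; otherwise only
// entries whose box_pages value matches page_number_ are kept.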
void ImageData::AddBoxes(const GenericVector<TBOX>& boxes,
const GenericVector<STRING>& texts,
const GenericVector<int>& box_pages) {
// Copy the boxes and make the transcription.
for (int i = 0; i < box_pages.size(); ++i) {
if (page_number_ >= 0 && box_pages[i] != page_number_) continue;
transcription_ += texts[i];
boxes_.push_back(boxes[i]);
box_texts_.push_back(texts[i]);
}
}
// Saves the given Pix as a PNG-encoded string and destroys it.
void ImageData::SetPixInternal(Pix* pix, GenericVector<char>* image_data) {
l_uint8* data;
size_t size;
pixWriteMem(&data, &size, pix, IFF_PNG);
pixDestroy(&pix);
image_data->resize_no_init(size);
memcpy(&(*image_data)[0], data, size);
free(data);
}
// Returns the Pix image for the image_data. Must be pixDestroyed after use.
Pix* ImageData::GetPixInternal(const GenericVector<char>& image_data) {
Pix* pix = NULL;
if (!image_data.empty()) {
// Convert the array to an image.
const unsigned char* u_data =
reinterpret_cast<const unsigned char*>(&image_data[0]);
pix = pixReadMem(u_data, image_data.size());
}
return pix;
}
// Parses the text string as a box file and adds any discovered boxes that
// match the page number. Returns false on error.
bool ImageData::AddBoxes(const char* box_text) {
if (box_text != NULL && box_text[0] != '\0') {
GenericVector<TBOX> boxes;
GenericVector<STRING> texts;
GenericVector<int> box_pages;
if (ReadMemBoxes(page_number_, false, box_text, &boxes,
&texts, NULL, &box_pages)) {
AddBoxes(boxes, texts, box_pages);
return true;
} else {
tprintf("Error: No boxes for page %d from image %s!\n",
page_number_, imagefilename_.string());
}
}
return false;
}
// Thread function to call ReCachePages.
void* ReCachePagesFunc(void* data) {
DocumentData* document_data = reinterpret_cast<DocumentData*>(data);
document_data->ReCachePages();
return NULL;
}
DocumentData::DocumentData(const STRING& name)
: document_name_(name),
pages_offset_(-1),
total_pages_(-1),
memory_used_(0),
max_memory_(0),
reader_(NULL) {}
DocumentData::~DocumentData() {
SVAutoLock lock_p(&pages_mutex_);
SVAutoLock lock_g(&general_mutex_);
}
// Reads all the pages in the given lstmf filename to the cache. The reader
// is used to read the file.
bool DocumentData::LoadDocument(const char* filename, const char* lang,
int start_page, inT64 max_memory,
FileReader reader) {
SetDocument(filename, lang, max_memory, reader);
pages_offset_ = start_page;
return ReCachePages();
}
// Sets up the document, without actually loading it.
void DocumentData::SetDocument(const char* filename, const char* lang,
inT64 max_memory, FileReader reader) {
SVAutoLock lock_p(&pages_mutex_);
SVAutoLock lock(&general_mutex_);
document_name_ = filename;
lang_ = lang;
pages_offset_ = -1;
max_memory_ = max_memory;
reader_ = reader;
}
// Writes all the pages to the given filename. Returns false on error.
bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
SVAutoLock lock(&pages_mutex_);
TFile fp;
fp.OpenWrite(NULL);
if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) {
tprintf("Serialize failed: %s\n", filename);
return false;
}
return true;
}
bool DocumentData::SaveToBuffer(GenericVector<char>* buffer) {
SVAutoLock lock(&pages_mutex_);
TFile fp;
fp.OpenWrite(buffer);
return pages_.Serialize(&fp);
}
// Adds the given page data to this document, counting up memory.
void DocumentData::AddPageToDocument(ImageData* page) {
SVAutoLock lock(&pages_mutex_);
pages_.push_back(page);
set_memory_used(memory_used() + page->MemoryUsed());
}
// If the given index is not currently loaded, loads it using a separate
// thread.
void DocumentData::LoadPageInBackground(int index) {
ImageData* page = NULL;
if (IsPageAvailable(index, &page)) return;
SVAutoLock lock(&pages_mutex_);
if (pages_offset_ == index) return;
pages_offset_ = index;
pages_.clear();
#ifndef GRAPHICS_DISABLED
SVSync::StartThread(ReCachePagesFunc, this);
#endif // GRAPHICS_DISABLED
}
// Returns a pointer to the page with the given index, modulo the total
// number of pages. Blocks until the background load is completed.
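// This polls: if the wanted index has no load scheduled it schedules one,
// then sleeps roughly a second before checking availability again.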
const ImageData* DocumentData::GetPage(int index) {
ImageData* page = NULL;
while (!IsPageAvailable(index, &page)) {
// If there is no background load scheduled, schedule one now.
pages_mutex_.Lock();
bool needs_loading = pages_offset_ != index;
pages_mutex_.Unlock();
if (needs_loading) LoadPageInBackground(index);
// We can't directly load the page, or the background load will delete it
// while the caller is using it, so give it a chance to work.
#if __cplusplus > 199711L && !defined(__MINGW32__)
std::this_thread::sleep_for(std::chrono::seconds(1));
#elif _WIN32 // MSVS
Sleep(1000);
#else
sleep(1);
#endif
}
return page;
}
// Returns true if the requested page is available, and provides a pointer,
// which may be NULL if the document is empty. May block, even though it
// doesn't guarantee to return true.
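// It can block because ReCachePages holds pages_mutex_ for the whole of a
// background load, so the lock below waits until that load finishes.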
bool DocumentData::IsPageAvailable(int index, ImageData** page) {
SVAutoLock lock(&pages_mutex_);
int num_pages = NumPages();
if (num_pages == 0 || index < 0) {
*page = NULL; // Empty Document.
return true;
}
if (num_pages > 0) {
index = Modulo(index, num_pages);
if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) {
*page = pages_[index - pages_offset_]; // Page is available already.
return true;
}
}
return false;
}
// Removes all pages from memory and frees the memory, but does not forget
// the document metadata.
inT64 DocumentData::UnCache() {
SVAutoLock lock(&pages_mutex_);
inT64 memory_saved = memory_used();
pages_.clear();
pages_offset_ = -1;
set_total_pages(-1);
set_memory_used(0);
tprintf("Unloaded document %s, freeing %d bytes\n", document_name_.string(),
static_cast<int>(memory_saved));
return memory_saved;
}
// Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
// starting at index pages_offset_.
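// On any read failure pages_ is truncated and false is returned; on success
// total_pages_ records the page count of the whole file, even though only
// the window that fits in max_memory_ is kept in pages_.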
bool DocumentData::ReCachePages() {
SVAutoLock lock(&pages_mutex_);
// Read the file.
set_total_pages(0);
set_memory_used(0);
int loaded_pages = 0;
pages_.truncate(0);
TFile fp;
if (!fp.Open(document_name_, reader_) ||
!PointerVector<ImageData>::DeSerializeSize(false, &fp, &loaded_pages) ||
loaded_pages <= 0) {
tprintf("Deserialize header failed: %s\n", document_name_.string());
return false;
}
pages_offset_ %= loaded_pages;
// Skip pages before the first one we want, and load the rest until max
// memory and skip the rest after that.
int page;
for (page = 0; page < loaded_pages; ++page) {
if (page < pages_offset_ ||
(max_memory_ > 0 && memory_used() > max_memory_)) {
if (!PointerVector<ImageData>::DeSerializeSkip(false, &fp)) break;
} else {
if (!pages_.DeSerializeElement(false, &fp)) break;
ImageData* image_data = pages_.back();
if (image_data->imagefilename().length() == 0) {
image_data->set_imagefilename(document_name_);
image_data->set_page_number(page);
}
image_data->set_language(lang_);
set_memory_used(memory_used() + image_data->MemoryUsed());
}
}
if (page < loaded_pages) {
tprintf("Deserialize failed: %s read %d/%d pages\n",
document_name_.string(), page, loaded_pages);
pages_.truncate(0);
} else {
tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", pages_.size(),
loaded_pages, pages_offset_, pages_offset_ + pages_.size(),
document_name_.string());
}
set_total_pages(loaded_pages);
return !pages_.empty();
}
// A collection of DocumentData that knows roughly how much memory it is using.
DocumentCache::DocumentCache(inT64 max_memory)
: num_pages_per_doc_(0), max_memory_(max_memory) {}
DocumentCache::~DocumentCache() {}
// Adds all the documents in the list of filenames, counting memory.
// The reader is used to read the files.
bool DocumentCache::LoadDocuments(const GenericVector<STRING>& filenames,
const char* lang,
CachingStrategy cache_strategy,
FileReader reader) {
cache_strategy_ = cache_strategy;
inT64 fair_share_memory = 0;
// In the round-robin case, each DocumentData handles restricting its content
// to its fair share of memory. In the sequential case, DocumentCache
// determines which DocumentDatas are held entirely in memory.
if (cache_strategy_ == CS_ROUND_ROBIN)
fair_share_memory = max_memory_ / filenames.size();
for (int arg = 0; arg < filenames.size(); ++arg) {
STRING filename = filenames[arg];
DocumentData* document = new DocumentData(filename);
document->SetDocument(filename.string(), lang, fair_share_memory, reader);
AddToCache(document);
}
if (!documents_.empty()) {
// Try to get the first page now to verify the list of filenames.
if (GetPageBySerial(0) != NULL) return true;
tprintf("Load of page 0 failed!\n");
}
return false;
}
// Adds document to the cache.
bool DocumentCache::AddToCache(DocumentData* data) {
// Memory use is tallied on demand (see GetPageSequential), so there is
// nothing to record here.
documents_.push_back(data);
return true;
}
// Finds and returns a document by name.
DocumentData* DocumentCache::FindDocument(const STRING& document_name) const {
for (int i = 0; i < documents_.size(); ++i) {
if (documents_[i]->document_name() == document_name)
return documents_[i];
}
return NULL;
}
// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
// strategy, could take a long time.
int DocumentCache::TotalPages() {
if (cache_strategy_ == CS_SEQUENTIAL) {
// In sequential mode, we assume each doc has the same number of pages
// whether it is true or not.
if (num_pages_per_doc_ == 0) GetPageSequential(0);
return num_pages_per_doc_ * documents_.size();
}
int total_pages = 0;
int num_docs = documents_.size();
for (int d = 0; d < num_docs; ++d) {
// We have to load a page to make NumPages() valid.
documents_[d]->GetPage(0);
total_pages += documents_[d]->NumPages();
}
return total_pages;
}
// Returns a page by serial number, selecting them in a round-robin fashion
// from all the documents. Highly disk-intensive, but doesn't need samples
// to be shuffled between files to begin with.
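// E.g. with 3 documents, serial 7 maps to documents_[1], page 2 (7 % 3 and
// 7 / 3); the pages for the next few serials are then prefetched in
// background, one per following document.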
const ImageData* DocumentCache::GetPageRoundRobin(int serial) {
int num_docs = documents_.size();
int doc_index = serial % num_docs;
const ImageData* doc = documents_[doc_index]->GetPage(serial / num_docs);
for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
doc_index = (serial + offset) % num_docs;
int page = (serial + offset) / num_docs;
documents_[doc_index]->LoadPageInBackground(page);
}
return doc;
}
// Returns a page by serial number, selecting them in sequence from each file.
// Requires the samples to be shuffled between the files to give a random or
// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
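// E.g. with 4 documents and num_pages_per_doc_ = 100, serial 250 maps to
// documents_[(250 / 100) % 4] = documents_[2], page 250 % 100 = 50.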
const ImageData* DocumentCache::GetPageSequential(int serial) {
int num_docs = documents_.size();
ASSERT_HOST(num_docs > 0);
if (num_pages_per_doc_ == 0) {
// Use the pages in the first doc as the number of pages in each doc.
documents_[0]->GetPage(0);
num_pages_per_doc_ = documents_[0]->NumPages();
if (num_pages_per_doc_ == 0) {
tprintf("First document cannot be empty!!\n");
ASSERT_HOST(num_pages_per_doc_ > 0);
}
// Get rid of zero now if we don't need it.
if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache();
}
int doc_index = serial / num_pages_per_doc_ % num_docs;
const ImageData* doc =
documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
// Count up total memory. Background loading makes it more complicated to
// keep a running count.
inT64 total_memory = 0;
for (int d = 0; d < num_docs; ++d) {
total_memory += documents_[d]->memory_used();
}
if (total_memory >= max_memory_) {
// Find something to un-cache.
// If there are more than 3 in front, then serial is from the back reader
// of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
// we create a hole between them and then un-caching the backmost occupied
// will work for both.
int num_in_front = CountNeighbourDocs(doc_index, 1);
for (int offset = num_in_front - 2;
offset > 1 && total_memory >= max_memory_; --offset) {
int next_index = (doc_index + offset) % num_docs;
total_memory -= documents_[next_index]->UnCache();
}
// If that didn't work, the best solution is to un-cache from the back. If
// we take away the document that a 2nd reader is using, it will put it
// back and make a hole between.
int num_behind = CountNeighbourDocs(doc_index, -1);
for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
++offset) {
int next_index = (doc_index + offset + num_docs) % num_docs;
total_memory -= documents_[next_index]->UnCache();
}
}
int next_index = (doc_index + 1) % num_docs;
if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
documents_[next_index]->LoadPageInBackground(0);
}
return doc;
}
// Helper that counts the number of adjacent cached neighbours of index,
// looking in direction dir, i.e. index+dir, index+2*dir etc.
int DocumentCache::CountNeighbourDocs(int index, int dir) {
int num_docs = documents_.size();
for (int offset = dir; abs(offset) < num_docs; offset += dir) {
int offset_index = (index + offset + num_docs) % num_docs;
if (!documents_[offset_index]->IsCached()) return offset - dir;
}
return num_docs;
}
} // namespace tesseract.

View File

@ -0,0 +1,379 @@
///////////////////////////////////////////////////////////////////////
// File: imagedata.h
// Description: Class to hold information about a single image and its
// corresponding boxes or text file.
// Author: Ray Smith
// Created: Mon Jul 22 14:17:06 PDT 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_IMAGE_IMAGEDATA_H_
#define TESSERACT_IMAGE_IMAGEDATA_H_
#include "genericvector.h"
#include "normalis.h"
#include "rect.h"
#include "strngs.h"
#include "svutil.h"
struct Pix;
namespace tesseract {
// Amount of padding to apply in output pixels in feature mode.
const int kFeaturePadding = 2;
// Number of pixels to pad around text boxes.
const int kImagePadding = 4;
// Enum to determine the caching and data sequencing strategy.
enum CachingStrategy {
// Reads all of one file before moving on to the next. Requires samples to be
// shuffled across files. Uses the count of samples in the first file as
// the count in all the files to achieve high-speed random access. As a
// consequence, if subsequent files are smaller, they get entries used more
// than once, and if subsequent files are larger, some entries are not used.
// Best for larger data sets that don't fit in memory.
CS_SEQUENTIAL,
// Reads one sample from each file in rotation. Does not require shuffled
// samples, but is extremely disk-intensive. Samples in smaller files also
// get used more often than samples in larger files.
// Best for smaller data sets that mostly fit in memory.
CS_ROUND_ROBIN,
};
class WordFeature {
public:
WordFeature();
WordFeature(const FCOORD& fcoord, uinT8 dir);
// Computes the maximum x and y value in the features.
static void ComputeSize(const GenericVector<WordFeature>& features,
int* max_x, int* max_y);
// Draws the features in the given window.
static void Draw(const GenericVector<WordFeature>& features,
ScrollView* window);
// Accessors.
int x() const { return x_; }
int y() const { return y_; }
int dir() const { return dir_; }
// Writes to the given file. Returns false in case of error.
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
private:
inT16 x_;
uinT8 y_;
uinT8 dir_;
};
// A floating-point version of WordFeature, used as an intermediate during
// scaling.
struct FloatWordFeature {
static void FromWordFeatures(const GenericVector<WordFeature>& word_features,
GenericVector<FloatWordFeature>* float_features);
// Sort function to sort first by x-bucket, then by y.
static int SortByXBucket(const void*, const void*);
float x;
float y;
float dir;
int x_bucket;
};
// Class to hold information on a single image:
// Filename, cached image as a Pix*, character boxes, text transcription.
// The text transcription is the ground truth UTF-8 text for the image.
// Character boxes are optional and indicate the desired segmentation of
// the text into recognition units.
class ImageData {
public:
ImageData();
// Takes ownership of the pix.
ImageData(bool vertical, Pix* pix);
~ImageData();
// Builds and returns an ImageData from the basic data. Note that imagedata,
// truth_text, and box_text are all the actual file data, NOT filenames.
static ImageData* Build(const char* name, int page_number, const char* lang,
const char* imagedata, int imagedatasize,
const char* truth_text, const char* box_text);
// Writes to the given file. Returns false in case of error.
bool Serialize(TFile* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, TFile* fp);
// As DeSerialize, but only seeks past the data - hence a static method.
static bool SkipDeSerialize(bool swap, tesseract::TFile* fp);
// Other accessors.
const STRING& imagefilename() const {
return imagefilename_;
}
void set_imagefilename(const STRING& name) {
imagefilename_ = name;
}
int page_number() const {
return page_number_;
}
void set_page_number(int num) {
page_number_ = num;
}
const GenericVector<char>& image_data() const {
return image_data_;
}
const STRING& language() const {
return language_;
}
void set_language(const STRING& lang) {
language_ = lang;
}
const STRING& transcription() const {
return transcription_;
}
const GenericVector<TBOX>& boxes() const {
return boxes_;
}
const GenericVector<STRING>& box_texts() const {
return box_texts_;
}
const STRING& box_text(int index) const {
return box_texts_[index];
}
// Saves the given Pix as a PNG-encoded string and destroys it.
void SetPix(Pix* pix);
// Returns the Pix image for *this. Must be pixDestroyed after use.
Pix* GetPix() const;
// Gets anything and everything with a non-NULL pointer, prescaled to a
// given target_height (if 0, then the original image height, capped at
// max_height), and aligned.
// Also returns (if not NULL) the width and height of the scaled image.
// The return value is the scaled Pix, which must be pixDestroyed after use,
// and scale_factor (if not NULL) is set to the scale factor that was applied
// to the image to achieve the target_height.
Pix* PreScale(int target_height, int max_height, float* scale_factor,
int* scaled_width, int* scaled_height,
GenericVector<TBOX>* boxes) const;
int MemoryUsed() const;
// Draws the data in a new window.
void Display() const;
// Adds the supplied boxes and transcriptions that correspond to the correct
// page number.
void AddBoxes(const GenericVector<TBOX>& boxes,
const GenericVector<STRING>& texts,
const GenericVector<int>& box_pages);
private:
// Saves the given Pix as a PNG-encoded string and destroys it.
static void SetPixInternal(Pix* pix, GenericVector<char>* image_data);
// Returns the Pix image for the image_data. Must be pixDestroyed after use.
static Pix* GetPixInternal(const GenericVector<char>& image_data);
// Parses the text string as a box file and adds any discovered boxes that
// match the page number. Returns false on error.
bool AddBoxes(const char* box_text);
private:
STRING imagefilename_; // File to read image from.
inT32 page_number_; // Page number if multi-page tif or -1.
GenericVector<char> image_data_; // PNG file data.
STRING language_; // Language code for image.
STRING transcription_; // UTF-8 ground truth of image.
GenericVector<TBOX> boxes_; // If non-empty boxes of the image.
GenericVector<STRING> box_texts_; // String for text in each box.
bool vertical_text_; // Image has been rotated from vertical.
};
// A collection of ImageData that knows roughly how much memory it is using.
class DocumentData {
friend void* ReCachePagesFunc(void* data);
public:
explicit DocumentData(const STRING& name);
~DocumentData();
// Reads all the pages in the given lstmf filename to the cache. The reader
// is used to read the file.
bool LoadDocument(const char* filename, const char* lang, int start_page,
inT64 max_memory, FileReader reader);
// Sets up the document, without actually loading it.
void SetDocument(const char* filename, const char* lang, inT64 max_memory,
FileReader reader);
// Writes all the pages to the given filename. Returns false on error.
bool SaveDocument(const char* filename, FileWriter writer);
bool SaveToBuffer(GenericVector<char>* buffer);
// Adds the given page data to this document, counting up memory.
void AddPageToDocument(ImageData* page);
const STRING& document_name() const {
SVAutoLock lock(&general_mutex_);
return document_name_;
}
int NumPages() const {
SVAutoLock lock(&general_mutex_);
return total_pages_;
}
inT64 memory_used() const {
SVAutoLock lock(&general_mutex_);
return memory_used_;
}
// If the given index is not currently loaded, loads it using a separate
// thread. Note: there are 4 cases:
// Document uncached: IsCached() returns false, total_pages_ < 0.
// Required page is available: IsPageAvailable returns true. In this case,
// total_pages_ > 0 and
// pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
// Pages are loaded, but the required one is not.
// The requested page is being loaded by LoadPageInBackground. In this case,
// index == pages_offset_. Once the loading starts, the pages lock is held
// until it completes, at which point IsPageAvailable will unblock and return
// true.
void LoadPageInBackground(int index);
// Returns a pointer to the page with the given index, modulo the total
// number of pages. Blocks until the background load is completed.
const ImageData* GetPage(int index);
// Returns true if the requested page is available, and provides a pointer,
// which may be NULL if the document is empty. May block, even though it
// doesn't guarantee to return true.
bool IsPageAvailable(int index, ImageData** page);
// Takes ownership of the page with the given index; the entry is set to NULL in *this.
ImageData* TakePage(int index) {
SVAutoLock lock(&pages_mutex_);
ImageData* page = pages_[index];
pages_[index] = NULL;
return page;
}
// Returns true if the document is currently loaded or in the process of
// loading.
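// total_pages_ is -1 for an uncached document (constructor and UnCache) and
// is set to 0 at the start of ReCachePages, so NumPages() >= 0 also covers a
// load that is still in progress.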
bool IsCached() const { return NumPages() >= 0; }
// Removes all pages from memory and frees the memory, but does not forget
// the document metadata. Returns the memory saved.
inT64 UnCache();
private:
// Sets the value of total_pages_ behind a mutex.
void set_total_pages(int total) {
SVAutoLock lock(&general_mutex_);
total_pages_ = total;
}
void set_memory_used(inT64 memory_used) {
SVAutoLock lock(&general_mutex_);
memory_used_ = memory_used;
}
// Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
// starting at index pages_offset_.
bool ReCachePages();
private:
// A name for this document.
STRING document_name_;
// The language of this document.
STRING lang_;
// A group of pages that corresponds in some loose way to a document.
PointerVector<ImageData> pages_;
// Page number of the first index in pages_.
int pages_offset_;
// Total number of pages in document (may exceed size of pages_.)
int total_pages_;
// Total of all pix sizes in the document.
inT64 memory_used_;
// Max memory to use at any time.
inT64 max_memory_;
// Saved reader from LoadDocument to allow re-caching.
FileReader reader_;
// Mutex that protects pages_ and pages_offset_ against multiple parallel
// loads, and provides a wait for page.
SVMutex pages_mutex_;
// Mutex that protects other data members that callers want to access without
// waiting for a load operation.
mutable SVMutex general_mutex_;
};
// A collection of DocumentData that knows roughly how much memory it is using.
// Note that while it supports background read-ahead, it assumes that a single
// thread is accessing documents, ie it is not safe for multiple threads to
// access different documents in parallel, as one may de-cache the other's
// content.
class DocumentCache {
public:
explicit DocumentCache(inT64 max_memory);
~DocumentCache();
// Deletes all existing documents from the cache.
void Clear() {
documents_.clear();
num_pages_per_doc_ = 0;
}
// Adds all the documents in the list of filenames, counting memory.
// The reader is used to read the files.
bool LoadDocuments(const GenericVector<STRING>& filenames, const char* lang,
CachingStrategy cache_strategy, FileReader reader);
// Adds document to the cache.
bool AddToCache(DocumentData* data);
// Finds and returns a document by name.
DocumentData* FindDocument(const STRING& document_name) const;
// Returns a page by serial number using the current cache_strategy_ to
// determine the mapping from serial number to page.
const ImageData* GetPageBySerial(int serial) {
if (cache_strategy_ == CS_SEQUENTIAL)
return GetPageSequential(serial);
else
return GetPageRoundRobin(serial);
}
const PointerVector<DocumentData>& documents() const {
return documents_;
}
// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
// strategy, could take a long time.
int TotalPages();
private:
// Returns a page by serial number, selecting them in a round-robin fashion
// from all the documents. Highly disk-intensive, but doesn't need samples
// to be shuffled between files to begin with.
const ImageData* GetPageRoundRobin(int serial);
// Returns a page by serial number, selecting them in sequence from each file.
// Requires the samples to be shuffled between the files to give a random or
// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
const ImageData* GetPageSequential(int serial);
// Helper that counts the number of adjacent cached neighbour documents_ of
// index, looking in direction dir, i.e. index+dir, index+2*dir etc.
int CountNeighbourDocs(int index, int dir);
// A group of pages that corresponds in some loose way to a document.
PointerVector<DocumentData> documents_;
// Strategy to use for caching and serializing data samples.
CachingStrategy cache_strategy_;
// Number of pages in the first document, used as a divisor in
// GetPageSequential to determine the document index.
int num_pages_per_doc_;
// Max memory allowed in this cache.
inT64 max_memory_;
};
} // namespace tesseract
#endif // TESSERACT_IMAGE_IMAGEDATA_H_

Some files were not shown because too many files have changed in this diff.