diff --git a/build/linux/HGImgProc/HGImgProc.cbp b/build/linux/HGImgProc/HGImgProc.cbp index 58319482..2713d21f 100644 --- a/build/linux/HGImgProc/HGImgProc.cbp +++ b/build/linux/HGImgProc/HGImgProc.cbp @@ -29,8 +29,6 @@ - - @@ -61,8 +59,6 @@ - - @@ -495,8 +491,6 @@ - - diff --git a/modules/imgproc/HGOCR.cpp b/modules/imgproc/HGOCR.cpp index 5422b225..bcaf36c1 100644 --- a/modules/imgproc/HGOCR.cpp +++ b/modules/imgproc/HGOCR.cpp @@ -1,7 +1,9 @@ #include "HGOCR.h" #include "HGOCRBase.hpp" #include "HGOCRHanvon.hpp" +#if defined(HG_CMP_MSC) #include "HGOCRTesseract.hpp" +#endif #include "HGOCRRetImpl.hpp" HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr) @@ -13,6 +15,7 @@ HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr) if (HGIMGPROC_OCRALGO_DEFAULT == algo) { +#if defined(HG_CMP_MSC) HGOCRBase* ocrMgrImpl = new HGOCRHanvon; HGResult ret = ocrMgrImpl->Init(); if (HGBASE_ERR_OK != ret) @@ -26,6 +29,15 @@ HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr) return ret; } } +#else + HGOCRBase* ocrMgrImpl = new HGOCRHanvon; + HGResult ret = ocrMgrImpl->Init(); + if (HGBASE_ERR_OK != ret) + { + delete ocrMgrImpl; + return ret; + } +#endif *ocrMgr = (HGOCRMgr)ocrMgrImpl; return HGBASE_ERR_OK; @@ -45,6 +57,7 @@ HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr) } else if (HGIMGPROC_OCRALGO_TESSERACT == algo) { +#if defined(HG_CMP_MSC) HGOCRBase* ocrMgrImpl = new HGOCRTesseract; HGResult ret = ocrMgrImpl->Init(); if (HGBASE_ERR_OK != ret) @@ -55,6 +68,10 @@ HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr) *ocrMgr = (HGOCRMgr)ocrMgrImpl; return HGBASE_ERR_OK; +#else + return HGBASE_ERR_INVALIDARG; +#endif + } return HGBASE_ERR_INVALIDARG; diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/baseapi.h deleted file mode 100644 index 5e1e4830..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/baseapi.h +++ /dev/null @@ -1,812 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: baseapi.h -// Description: Simple API for calling tesseract. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_BASEAPI_H_ -#define TESSERACT_API_BASEAPI_H_ - -#ifdef HAVE_CONFIG_H -# include "config_auto.h" // DISABLED_LEGACY_ENGINE -#endif - -#include "export.h" -#include "pageiterator.h" -#include "publictypes.h" -#include "resultiterator.h" -#include "unichar.h" - -#include "version.h" - -#include -#include // for std::vector - -struct Pix; -struct Pixa; -struct Boxa; - -namespace tesseract { - -class PAGE_RES; -class ParagraphModel; -class BLOCK_LIST; -class ETEXT_DESC; -struct OSResults; -class UNICHARSET; - -class Dawg; -class Dict; -class EquationDetect; -class PageIterator; -class ImageThresholder; -class LTRResultIterator; -class ResultIterator; -class MutableIterator; -class TessResultRenderer; -class Tesseract; - -// Function to read a std::vector from a whole file. -// Returns false on failure. -using FileReader = bool (*)(const char *filename, std::vector *data); - -using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, - bool) const; -using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, - int, const char *, int); - -/** - * Base class for all tesseract APIs. - * Specific classes can add ability to work on different inputs or produce - * different outputs. - * This class is mostly an interface layer on top of the Tesseract instance - * class to hide the data types so that users of this class don't have to - * include any other Tesseract headers. - */ -class TESS_API TessBaseAPI { -public: - TessBaseAPI(); - virtual ~TessBaseAPI(); - // Copy constructor and assignment operator are currently unsupported. - TessBaseAPI(TessBaseAPI const &) = delete; - TessBaseAPI &operator=(TessBaseAPI const &) = delete; - - /** - * Returns the version identifier as a static string. Do not delete. - */ - static const char *Version(); - - /** - * If compiled with OpenCL AND an available OpenCL - * device is deemed faster than serial code, then - * "device" is populated with the cl_device_id - * and returns sizeof(cl_device_id) - * otherwise *device=nullptr and returns 0. - */ - static size_t getOpenCLDevice(void **device); - - /** - * Set the name of the input file. Needed for training and - * reading a UNLV zone file, and for searchable PDF output. - */ - void SetInputName(const char *name); - /** - * These functions are required for searchable PDF output. - * We need our hands on the input file so that we can include - * it in the PDF without transcoding. If that is not possible, - * we need the original image. Finally, resolution metadata - * is stored in the PDF so we need that as well. - */ - const char *GetInputName(); - // Takes ownership of the input pix. - void SetInputImage(Pix *pix); - Pix *GetInputImage(); - int GetSourceYResolution(); - const char *GetDatapath(); - - /** Set the name of the bonus output files. Needed only for debugging. */ - void SetOutputName(const char *name); - - /** - * Set the value of an internal "parameter." - * Supply the name of the parameter and the value as a string, just as - * you would in a config file. - * Returns false if the name lookup failed. - * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. - * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. - * SetVariable may be used before Init, but settings will revert to - * defaults on End(). - * - * Note: Must be called after Init(). Only works for non-init variables - * (init variables should be passed to Init()). - */ - bool SetVariable(const char *name, const char *value); - bool SetDebugVariable(const char *name, const char *value); - - /** - * Returns true if the parameter was found among Tesseract parameters. - * Fills in value with the value of the parameter. - */ - bool GetIntVariable(const char *name, int *value) const; - bool GetBoolVariable(const char *name, bool *value) const; - bool GetDoubleVariable(const char *name, double *value) const; - - /** - * Returns the pointer to the string that represents the value of the - * parameter if it was found among Tesseract parameters. - */ - const char *GetStringVariable(const char *name) const; - -#ifndef DISABLED_LEGACY_ENGINE - - /** - * Print Tesseract fonts table to the given file. - */ - void PrintFontsTable(FILE *fp) const; - -#endif - - /** - * Print Tesseract parameters to the given file. - */ - void PrintVariables(FILE *fp) const; - - /** - * Get value of named variable as a string, if it exists. - */ - bool GetVariableAsString(const char *name, std::string *val) const; - - /** - * Instances are now mostly thread-safe and totally independent, - * but some global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS: - * you use SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your instances. - * - * Start tesseract. Returns zero on success and -1 on failure. - * NOTE that the only members that may be called before Init are those - * listed above here in the class definition. - * - * The datapath must be the name of the tessdata directory. - * The language is (usually) an ISO 639-3 string or nullptr will default to - * eng. It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, or just - * to reset the classifier. - * The language may be a string of the form [~][+[~]]* indicating - * that multiple languages are to be loaded. Eg hin+eng will load Hindi and - * English. Languages may specify internally that they want to be loaded - * with one or more other languages, so the ~ sign is available to override - * that. Eg if hin were set to load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited only by - * memory, with the caveat that loading additional languages will impact - * both speed and accuracy, as there is more work to do to decide on the - * applicable language, and there is more chance of hallucinating incorrect - * words. - * WARNING: On changing languages, all Tesseract parameters are reset - * back to their default values. (Which may vary between languages.) - * If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should explicitly - * call End() and then use SetVariable before Init. This is only a very - * rare use case, since there are very few uses that require any parameters - * to be set before Init. - * - * If set_only_non_debug_params is true, only params that do not contain - * "debug" in the name will be set. - */ - int Init(const char *datapath, const char *language, OcrEngineMode mode, - char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params); - int Init(const char *datapath, const char *language, OcrEngineMode oem) { - return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); - } - int Init(const char *datapath, const char *language) { - return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, - false); - } - // In-memory version reads the traineddata file directly from the given - // data[data_size] array, and/or reads data via a FileReader. - int Init(const char *data, int data_size, const char *language, - OcrEngineMode mode, char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params, FileReader reader); - - /** - * Returns the languages string used in the last valid initialization. - * If the last initialization specified "deu+hin" then that will be - * returned. If hin loaded eng automatically as well, then that will - * not be included in this list. To find the languages actually - * loaded use GetLoadedLanguagesAsVector. - * The returned string should NOT be deleted. - */ - const char *GetInitLanguagesAsString() const; - - /** - * Returns the loaded languages in the vector of std::string. - * Includes all languages loaded by the last Init, including those loaded - * as dependencies of other loaded languages. - */ - void GetLoadedLanguagesAsVector(std::vector *langs) const; - - /** - * Returns the available languages in the sorted vector of std::string. - */ - void GetAvailableLanguagesAsVector(std::vector *langs) const; - - /** - * Init only for page layout analysis. Use only for calls to SetImage and - * AnalysePage. Calls that attempt recognition will generate an error. - */ - void InitForAnalysePage(); - - /** - * Read a "config" file containing a set of param, value pairs. - * Searches the standard places: tessdata/configs, tessdata/tessconfigs - * and also accepts a relative or absolute path name. - * Note: only non-init params will be set (init params are set by Init()). - */ - void ReadConfigFile(const char *filename); - /** Same as above, but only set debug params from the given config file. */ - void ReadDebugConfigFile(const char *filename); - - /** - * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. - * The mode is stored as an IntParam so it can also be modified by - * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). - */ - void SetPageSegMode(PageSegMode mode); - - /** Return the current page segmentation mode. */ - PageSegMode GetPageSegMode() const; - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. - * Currently has no error checking. - * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. - * Palette color images will not work properly and must be converted to - * 24 bit. - * Binary images of 1 bit per pixel may also be given but they must be - * byte packed with the MSB of the first byte being the first pixel, and a - * 1 represents WHITE. For binary images set bytes_per_pixel=0. - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * - * Note that TesseractRect is the simplified convenience interface. - * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, - * and one or more of the Get*Text functions below. - */ - char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, int width, - int height); - - /** - * Call between pages or documents etc to free up memory and forget - * adaptive data. - */ - void ClearAdaptiveClassifier(); - - /** - * @defgroup AdvancedAPI Advanced API - * The following methods break TesseractRect into pieces, so you can - * get hold of the thresholded image, get the text in different formats, - * get bounding boxes, confidences etc. - */ - /* @{ */ - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Copies the image buffer and converts to Pix. - * SetImage clears all recognition results, and sets the rectangle to the - * full image, so it may be followed immediately by a GetUTF8Text, and it - * will automatically perform recognition. - */ - void SetImage(const unsigned char *imagedata, int width, int height, - int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract takes its own copy of the image, so it need not persist until - * after Recognize. - * Pix vs raw, which to use? - * Use Pix where possible. Tesseract uses Pix as its internal representation - * and it is therefore more efficient to provide a Pix directly. - */ - void SetImage(Pix *pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after SetImage(). - */ - void SetSourceResolution(int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after SetImage. - * Each SetRectangle clears the recogntion results so multiple rectangles - * can be recognized with the same image. - */ - void SetRectangle(int left, int top, int width, int height); - - /** - * Get a copy of the internal thresholded image from Tesseract. - * Caller takes ownership of the Pix and must pixDestroy it. - * May be called any time after SetImage, or after TesseractRect. - */ - Pix *GetThresholdedImage(); - - /** - * Get the result of page layout analysis as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetRegions(Pixa **pixa); - - /** - * Get the textlines as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If raw_image is true, then extract from the original image instead of the - * thresholded image and pad by raw_padding pixels. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. If paraids is not - * nullptr, the paragraph-id of each line within its block is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - /* - Helper method to extract from the thresholded image. (most common usage) -*/ - Boxa *GetTextlines(Pixa **pixa, int **blockids) { - return GetTextlines(false, 0, pixa, blockids, nullptr); - } - - /** - * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa - * pair, in reading order. Enables downstream handling of non-rectangular - * regions. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetStrips(Pixa **pixa, int **blockids); - - /** - * Get the words as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetWords(Pixa **pixa); - - /** - * Gets the individual connected (text) components (created - * after pages segmentation step, but before recognition) - * as a leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * Note: the caller is responsible for calling boxaDestroy() - * on the returned Boxa array and pixaDestroy() on cc array. - */ - Boxa *GetConnectedComponents(Pixa **cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. - * If blockids is not nullptr, the paragraph-id of each component with its - * block is also returned as an array of one element per component. delete [] - * after use. If raw_image is true, then portions of the original image are - * extracted instead of the thresholded image and padded with raw_padding. If - * text_only is true, then only text components are returned. - */ - Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, - bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - // Helper function to get binary images with no padding (most common usage). - Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, - Pixa **pixa, int **blockids) { - return GetComponentImages(level, text_only, false, 0, pixa, blockids, - nullptr); - } - - /** - * Returns the scale factor of the thresholded image that would be returned by - * GetThresholdedImage() and the various GetX() methods that call - * GetComponentImages(). - * Returns 0 if no thresholder has been set. - */ - int GetThresholdedImageScaleFactor() const; - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to just - * the page layout results. Returns an iterator to the results. - * If merge_similar_words is true, words are combined where suitable for use - * with a line recognizer. Use if you want to use AnalyseLayout to find the - * textlines, and then want to process textline fragments with an external - * line recognizer. - * Returns nullptr on error or an empty page. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - PageIterator *AnalyseLayout(); - PageIterator *AnalyseLayout(bool merge_similar_words); - - /** - * Recognize the image from SetAndThresholdImage, generating Tesseract - * internal structures. Returns 0 on success. - * Optional. The Get*Text functions below will call Recognize if needed. - * After Recognize, the output is kept internally until the next SetImage. - */ - int Recognize(ETEXT_DESC *monitor); - - /** - * Methods to retrieve information after SetAndThresholdImage(), - * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) - */ - - /** - * Turns images into symbolic text. - * - * filename can point to a single image, a multi-page TIFF, - * or a plain text list of image filenames. - * - * retry_config is useful for debugging. If not nullptr, you can fall - * back to an alternate configuration if a page fails for some - * reason. - * - * timeout_millisec terminates processing if any single page - * takes too long. Set to 0 for unlimited time. - * - * renderer is responible for creating the output. For example, - * use the TessTextRenderer if you want plaintext output, or - * the TessPDFRender to produce searchable PDF. - * - * If tessedit_page_number is non-negative, will only process that - * single page. Works for multi-page tiff file, or filelist. - * - * Returns true if successful, false on error. - */ - bool ProcessPages(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - // Does the real work of ProcessPages. - bool ProcessPagesInternal(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - - /** - * Turn a single image into symbolic text. - * - * The pix is the image processed. filename and page_index are - * metadata used by side-effect processes, such as reading a box - * file or formatting as hOCR. - * - * See ProcessPages for descriptions of other parameters. - */ - bool ProcessPage(Pix *pix, int page_index, const char *filename, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - ResultIterator *GetIterator(); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - MutableIterator *GetMutableIterator(); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - */ - char *GetUTF8Text(); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * monitor can be used to - * cancel the recognition - * receive progress callbacks - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(ETEXT_DESC *monitor, int page_number); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(ETEXT_DESC *monitor, int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(int page_number); - - /** - * Make a TSV-formatted string from the internal data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetTSVText(int page_number); - - /** - * Make a box file for LSTM training from the internal data structures. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetLSTMBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a box file used in training. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a WordStr box file used in training. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetWordStrBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded - * as UNLV format Latin-1 with specific reject and suspect codes. - * Returned string must be freed with the delete [] operator. - */ - char *GetUNLVText(); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in degrees - * (0, 90, 180, 270) - * orient_conf is the confidence (15.0 is reasonably confident) - * script_name is an ASCII string, the name of the script, e.g. "Latin" - * script_conf is confidence level in the script - * Returns true on success and writes values to each parameter as an output - */ - bool DetectOrientationScript(int *orient_deg, float *orient_conf, - const char **script_name, float *script_conf); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * page_number is a 0-based page index that will appear in the osd file. - */ - char *GetOsdText(int page_number); - - /** Returns the (average) confidence value between 0 and 100. */ - int MeanTextConf(); - /** - * Returns all word confidences (between 0 and 100) in an array, terminated - * by -1. The calling function must delete [] after use. - * The number of confidences should correspond to the number of space- - * delimited words in GetUTF8Text. - */ - int *AllWordConfidences(); - -#ifndef DISABLED_LEGACY_ENGINE - /** - * Applies the given word to the adaptive classifier if possible. - * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can - * tell the boundaries of the graphemes. - * Assumes that SetImage/SetRectangle have been used to set the image - * to the given word. The mode arg should be PSM_SINGLE_WORD or - * PSM_CIRCLE_WORD, as that will be used to control layout analysis. - * The currently set PageSegMode is preserved. - * Returns false if adaption was not possible for some reason. - */ - bool AdaptToWordStr(PageSegMode mode, const char *wordstr); -#endif // ndef DISABLED_LEGACY_ENGINE - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or TesseractRect before doing - * any Recognize or Get* operation. - */ - void Clear(); - - /** - * Close down tesseract and free up all memory. End() is equivalent to - * destructing and reconstructing your TessBaseAPI. - * Once End() has been used, none of the other API functions may be used - * other than Init and anything declared above it in the class definition. - */ - void End(); - - /** - * Clear any library-level memory caches. - * There are a variety of expensive-to-load constant data structures (mostly - * language dictionaries) that are cached globally -- surviving the Init() - * and End() of individual TessBaseAPI's. This function allows the clearing - * of these caches. - **/ - static void ClearPersistentCache(); - - /** - * Check whether a word is valid according to Tesseract's language model - * @return 0 if the word is invalid, non-zero if valid. - * @warning temporary! This function will be removed from here and placed - * in a separate API at some future time. - */ - int IsValidWord(const char *word) const; - // Returns true if utf8_character is defined in the UniCharset. - bool IsValidCharacter(const char *utf8_character) const; - - bool GetTextDirection(int *out_offset, float *out_slope); - - /** Sets Dict::letter_is_okay_ function to point to the given function. */ - void SetDictFunc(DictFunc f); - - /** Sets Dict::probability_in_context_ function to point to the given - * function. - */ - void SetProbabilityInContextFunc(ProbabilityInContextFunc f); - - /** - * Estimates the Orientation And Script of the image. - * @return true if the image was processed successfully. - */ - bool DetectOS(OSResults *); - - /** - * Return text orientation of each block as determined by an earlier run - * of layout analysis. - */ - void GetBlockTextOrientations(int **block_orientation, - bool **vertical_writing); - - /** This method returns the string form of the specified unichar. */ - const char *GetUnichar(int unichar_id) const; - - /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ - const Dawg *GetDawg(int i) const; - - /** Return the number of dawgs loaded into tesseract_ object. */ - int NumDawgs() const; - - Tesseract *tesseract() const { - return tesseract_; - } - - OcrEngineMode oem() const { - return last_oem_requested_; - } - - void set_min_orientation_margin(double margin); - /* @} */ - -protected: - /** Common code for setting the image. Returns true if Init has been called. - */ - bool InternalSetImage(); - - /** - * Run the thresholder to make the thresholded image. If pix is not nullptr, - * the source is thresholded to pix instead of the internal IMAGE. - */ - virtual bool Threshold(Pix **pix); - - /** - * Find lines from the image making the BLOCK_LIST. - * @return 0 on success. - */ - int FindLines(); - - /** Delete the pageres and block list ready for a new page. */ - void ClearResults(); - - /** - * Return an LTR Result Iterator -- used only for training, as we really want - * to ignore all BiDi smarts at that point. - * delete once you're done with it. - */ - LTRResultIterator *GetLTRIterator(); - - /** - * Return the length of the output text string, as UTF8, assuming - * one newline per line and one per block, with a terminator, - * and assuming a single character reject marker for each rejected character. - * Also return the number of recognized blobs in blob_count. - */ - int TextLength(int *blob_count) const; - - //// paragraphs.cpp //////////////////////////////////////////////////// - void DetectParagraphs(bool after_text_recognition); - - const PAGE_RES *GetPageRes() const { - return page_res_; - } - -protected: - Tesseract *tesseract_; ///< The underlying data object. - Tesseract *osd_tesseract_; ///< For orientation & script detection. - EquationDetect *equ_detect_; ///< The equation detector. - FileReader reader_; ///< Reads files from any filesystem. - ImageThresholder *thresholder_; ///< Image thresholding module. - std::vector *paragraph_models_; - BLOCK_LIST *block_list_; ///< The page layout. - PAGE_RES *page_res_; ///< The page-level data. - std::string input_file_; ///< Name used by training code. - std::string output_file_; ///< Name used by debug code. - std::string datapath_; ///< Current location of tessdata. - std::string language_; ///< Last initialized language. - OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested. - bool recognition_done_; ///< page_res_ contains recognition data. - - /** - * @defgroup ThresholderParams Thresholder Parameters - * Parameters saved from the Thresholder. Needed to rebuild coordinates. - */ - /* @{ */ - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; - int image_width_; - int image_height_; - /* @} */ - -private: - // A list of image filenames gets special consideration - bool ProcessPagesFileList(FILE *fp, std::string *buf, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); - // TIFF supports multipage so gets special consideration. - bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, - const char *filename, const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); -}; // class TessBaseAPI. - -/** Escape a char string - remove &<>"' with HTML codes. */ -std::string HOcrEscape(const char *text); - -} // namespace tesseract - -#endif // TESSERACT_API_BASEAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/capi.h deleted file mode 100644 index 40f4856a..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/capi.h +++ /dev/null @@ -1,484 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: capi.h -// Description: C-API TessBaseAPI -// -// (C) Copyright 2012, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_CAPI_H_ -#define API_CAPI_H_ - -#include "export.h" - -#ifdef __cplusplus -# include -# include -# include -# include -# include -#endif - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef BOOL -# define BOOL int -# define TRUE 1 -# define FALSE 0 -#endif - -#ifdef __cplusplus -typedef tesseract::TessResultRenderer TessResultRenderer; -typedef tesseract::TessBaseAPI TessBaseAPI; -typedef tesseract::PageIterator TessPageIterator; -typedef tesseract::ResultIterator TessResultIterator; -typedef tesseract::MutableIterator TessMutableIterator; -typedef tesseract::ChoiceIterator TessChoiceIterator; -typedef tesseract::OcrEngineMode TessOcrEngineMode; -typedef tesseract::PageSegMode TessPageSegMode; -typedef tesseract::PageIteratorLevel TessPageIteratorLevel; -typedef tesseract::Orientation TessOrientation; -typedef tesseract::ParagraphJustification TessParagraphJustification; -typedef tesseract::WritingDirection TessWritingDirection; -typedef tesseract::TextlineOrder TessTextlineOrder; -typedef tesseract::PolyBlockType TessPolyBlockType; -typedef tesseract::ETEXT_DESC ETEXT_DESC; -#else -typedef struct TessResultRenderer TessResultRenderer; -typedef struct TessBaseAPI TessBaseAPI; -typedef struct TessPageIterator TessPageIterator; -typedef struct TessResultIterator TessResultIterator; -typedef struct TessMutableIterator TessMutableIterator; -typedef struct TessChoiceIterator TessChoiceIterator; -typedef enum TessOcrEngineMode { - OEM_TESSERACT_ONLY, - OEM_LSTM_ONLY, - OEM_TESSERACT_LSTM_COMBINED, - OEM_DEFAULT -} TessOcrEngineMode; -typedef enum TessPageSegMode { - PSM_OSD_ONLY, - PSM_AUTO_OSD, - PSM_AUTO_ONLY, - PSM_AUTO, - PSM_SINGLE_COLUMN, - PSM_SINGLE_BLOCK_VERT_TEXT, - PSM_SINGLE_BLOCK, - PSM_SINGLE_LINE, - PSM_SINGLE_WORD, - PSM_CIRCLE_WORD, - PSM_SINGLE_CHAR, - PSM_SPARSE_TEXT, - PSM_SPARSE_TEXT_OSD, - PSM_RAW_LINE, - PSM_COUNT -} TessPageSegMode; -typedef enum TessPageIteratorLevel { - RIL_BLOCK, - RIL_PARA, - RIL_TEXTLINE, - RIL_WORD, - RIL_SYMBOL -} TessPageIteratorLevel; -typedef enum TessPolyBlockType { - PT_UNKNOWN, - PT_FLOWING_TEXT, - PT_HEADING_TEXT, - PT_PULLOUT_TEXT, - PT_EQUATION, - PT_INLINE_EQUATION, - PT_TABLE, - PT_VERTICAL_TEXT, - PT_CAPTION_TEXT, - PT_FLOWING_IMAGE, - PT_HEADING_IMAGE, - PT_PULLOUT_IMAGE, - PT_HORZ_LINE, - PT_VERT_LINE, - PT_NOISE, - PT_COUNT -} TessPolyBlockType; -typedef enum TessOrientation { - ORIENTATION_PAGE_UP, - ORIENTATION_PAGE_RIGHT, - ORIENTATION_PAGE_DOWN, - ORIENTATION_PAGE_LEFT -} TessOrientation; -typedef enum TessParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT -} TessParagraphJustification; -typedef enum TessWritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT, - WRITING_DIRECTION_RIGHT_TO_LEFT, - WRITING_DIRECTION_TOP_TO_BOTTOM -} TessWritingDirection; -typedef enum TessTextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT, - TEXTLINE_ORDER_RIGHT_TO_LEFT, - TEXTLINE_ORDER_TOP_TO_BOTTOM -} TessTextlineOrder; -typedef struct ETEXT_DESC ETEXT_DESC; -#endif - -typedef bool (*TessCancelFunc)(void *cancel_this, int words); -typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top, - int bottom); - -struct Pix; -struct Boxa; -struct Pixa; - -/* General free functions */ - -TESS_API const char *TessVersion(); -TESS_API void TessDeleteText(const char *text); -TESS_API void TessDeleteTextArray(char **arr); -TESS_API void TessDeleteIntArray(const int *arr); - -/* Renderer API */ -TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, - BOOL font_info); -TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase, - const char *datadir, - BOOL textonly); -TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessWordStrBoxRendererCreate( - const char *outputbase); - -TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer); -TESS_API void TessResultRendererInsert(TessResultRenderer *renderer, - TessResultRenderer *next); -TESS_API TessResultRenderer *TessResultRendererNext( - TessResultRenderer *renderer); -TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, - const char *title); -TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer, - TessBaseAPI *api); -TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer); - -TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer); -TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer); -TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer); - -/* Base API */ - -TESS_API TessBaseAPI *TessBaseAPICreate(); -TESS_API void TessBaseAPIDelete(TessBaseAPI *handle); - -TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device); - -TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name); -TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix); -TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle); - -TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle); -TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name); - -TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, - const char *value); -TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, - const char *value); - -TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, - const char *name, int *value); -TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, - const char *name, BOOL *value); -TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, - const char *name, double *value); -TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, - const char *name); - -TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp); -TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, - const char *filename); - -TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem, - char **configs, int configs_size); -TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem); -TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, - const char *language); - -TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API const char *TessBaseAPIGetInitLanguagesAsString( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector( - const TessBaseAPI *handle); - -TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle); - -TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle, - const char *filename); -TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, - const char *filename); - -TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle, - TessPageSegMode mode); -TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle); - -TESS_API char *TessBaseAPIRect(TessBaseAPI *handle, - const unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int left, int top, int width, int height); - -TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetImage(TessBaseAPI *handle, - const unsigned char *imagedata, int width, - int height, int bytes_per_pixel, - int bytes_per_line); -TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix); - -TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi); - -TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, - int width, int height); - -TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle); -TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, - BOOL raw_image, int raw_padding, - struct Pixa **pixa, - int **blockids, int **paraids); -TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, - struct Pixa **pixa, int **blockids); -TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, - struct Pixa **cc); -TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, - TessPageIteratorLevel level, - BOOL text_only, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetComponentImages1( - TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only, - BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids, - int **paraids); - -TESS_API int TessBaseAPIGetThresholdedImageScaleFactor( - const TessBaseAPI *handle); - -TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle); - -TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor); - -TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); -TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, - int page_index, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); - -TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle); -TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator( - TessBaseAPI *handle); - -TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle); -TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, - int page_number); - -TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle); -TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle); - -TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE -TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, - TessPageSegMode mode, - const char *wordstr); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPIClear(TessBaseAPI *handle); -TESS_API void TessBaseAPIEnd(TessBaseAPI *handle); - -TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word); -TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, - float *out_slope); - -TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id); - -TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE - -// Call TessDeleteText(*best_script_name) to free memory allocated by this -// function -TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, - int *orient_deg, - float *orient_conf, - const char **script_name, - float *script_conf); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, - double margin); - -TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle); - -TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle); - -TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, - int **block_orientation, - bool **vertical_writing); - -/* Page iterator */ - -TESS_API void TessPageIteratorDelete(TessPageIterator *handle); - -TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle); - -TESS_API void TessPageIteratorBegin(TessPageIterator *handle); - -TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, - TessPageIteratorLevel level, - TessPageIteratorLevel element); - -TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, - TessPageIteratorLevel level, - int *left, int *top, int *right, - int *bottom); - -TESS_API TessPolyBlockType -TessPageIteratorBlockType(const TessPageIterator *handle); - -TESS_API struct Pix *TessPageIteratorGetBinaryImage( - const TessPageIterator *handle, TessPageIteratorLevel level); - -TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, - TessPageIteratorLevel level, - int padding, - struct Pix *original_image, - int *left, int *top); - -TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle, - TessPageIteratorLevel level, int *x1, - int *y1, int *x2, int *y2); - -TESS_API void TessPageIteratorOrientation( - TessPageIterator *handle, TessOrientation *orientation, - TessWritingDirection *writing_direction, TessTextlineOrder *textline_order, - float *deskew_angle); - -TESS_API void TessPageIteratorParagraphInfo( - TessPageIterator *handle, TessParagraphJustification *justification, - BOOL *is_list_item, BOOL *is_crown, int *first_line_indent); - -/* Result iterator */ - -TESS_API void TessResultIteratorDelete(TessResultIterator *handle); -TESS_API TessResultIterator *TessResultIteratorCopy( - const TessResultIterator *handle); -TESS_API TessPageIterator *TessResultIteratorGetPageIterator( - TessResultIterator *handle); -TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst( - const TessResultIterator *handle); -TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator( - const TessResultIterator *handle); - -TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API const char *TessResultIteratorWordRecognitionLanguage( - const TessResultIterator *handle); -TESS_API const char *TessResultIteratorWordFontAttributes( - const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic, - BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps, - int *pointsize, int *font_id); - -TESS_API BOOL -TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle); -TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle); - -TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle); -TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle); -TESS_API const char *TessChoiceIteratorGetUTF8Text( - const TessChoiceIterator *handle); -TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle); - -/* Progress monitor */ - -TESS_API ETEXT_DESC *TessMonitorCreate(); -TESS_API void TessMonitorDelete(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, - TessCancelFunc cancelFunc); -TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis); -TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, - TessProgressFunc progressFunc); -TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline); - -#ifdef __cplusplus -} -#endif - -#endif // API_CAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/export.h deleted file mode 100644 index d238b628..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/export.h +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: export.h -// Description: Place holder -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_PLATFORM_H_ -#define TESSERACT_PLATFORM_H_ - -#ifndef TESS_API -# if defined(_WIN32) || defined(__CYGWIN__) -# if defined(TESS_EXPORTS) -# define TESS_API __declspec(dllexport) -# elif defined(TESS_IMPORTS) -# define TESS_API __declspec(dllimport) -# else -# define TESS_API -# endif -# else -# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS) -# define TESS_API __attribute__((visibility("default"))) -# else -# define TESS_API -# endif -# endif -#endif - -#endif // TESSERACT_PLATFORM_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ltrresultiterator.h deleted file mode 100644 index 6ca0a98e..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ltrresultiterator.h +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: ltrresultiterator.h -// Description: Iterator for tesseract results in strict left-to-right -// order that avoids using tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API -#include "pageiterator.h" // for PageIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -namespace tesseract { - -class BLOB_CHOICE_IT; -class PAGE_RES; -class WERD_RES; - -class Tesseract; - -// Class to iterate over tesseract results, providing access to all levels -// of the page hierarchy, without including any tesseract headers or having -// to handle any tesseract structures. -// WARNING! This class points to data held within the TessBaseAPI class, and -// therefore can only be used while the TessBaseAPI class still exists and -// has not been subjected to a call of Init, SetImage, Recognize, Clear, End -// DetectOS, or anything else that changes the internal PAGE_RES. -// See tesseract/publictypes.h for the definition of PageIteratorLevel. -// See also base class PageIterator, which contains the bulk of the interface. -// LTRResultIterator adds text-specific methods for access to OCR output. - -class TESS_API LTRResultIterator : public PageIterator { - friend class ChoiceIterator; - -public: - // page_res and tesseract come directly from the BaseAPI. - // The rectangle parameters are copied indirectly from the Thresholder, - // via the BaseAPI. They represent the coordinates of some rectangle in an - // original image (in top-left-origin coordinates) and therefore the top-left - // needs to be added to any output boxes in order to specify coordinates - // in the original image. See TessBaseAPI::SetRectangle. - // The scale and scaled_yres are in case the Thresholder scaled the image - // rectangle prior to thresholding. Any coordinates in tesseract's image - // must be divided by scale before adding (rect_left, rect_top). - // The scaled_yres indicates the effective resolution of the binary image - // that tesseract has been given by the Thresholder. - // After the constructor, Begin has already been called. - LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, - int rect_width, int rect_height); - - ~LTRResultIterator() override; - - // LTRResultIterators may be copied! This makes it possible to iterate over - // all the objects at a lower level, while maintaining an iterator to - // objects at a higher level. These constructors DO NOT CALL Begin, so - // iterations will continue from the location of src. - // TODO: For now the copy constructor and operator= only need the base class - // versions, but if new data members are added, don't forget to add them! - - // ============= Moving around within the page ============. - - // See PageIterator. - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // object at the given level. Use delete [] to free after use. - char *GetUTF8Text(PageIteratorLevel level) const; - - // Set the string inserted at the end of each text line. "\n" by default. - void SetLineSeparator(const char *new_line); - - // Set the string inserted at the end of each paragraph. "\n" by default. - void SetParagraphSeparator(const char *new_para); - - // Returns the mean confidence of the current object at the given level. - // The number should be interpreted as a percent probability. (0.0f-100.0f) - float Confidence(PageIteratorLevel level) const; - - // ============= Functions that refer to words only ============. - - // Returns the font attributes of the current word. If iterating at a higher - // level object than words, eg textlines, then this will return the - // attributes of the first word in that textline. - // The actual return value is a string representing a font name. It points - // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as - // the iterator itself, ie rendered invalid by various members of - // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. - // Pointsize is returned in printers points (1/72 inch.) - const char *WordFontAttributes(bool *is_bold, bool *is_italic, - bool *is_underlined, bool *is_monospace, - bool *is_serif, bool *is_smallcaps, - int *pointsize, int *font_id) const; - - // Return the name of the language used to recognize this word. - // On error, nullptr. Do not delete this pointer. - const char *WordRecognitionLanguage() const; - - // Return the overall directionality of this word. - StrongScriptDirection WordDirection() const; - - // Returns true if the current word was found in a dictionary. - bool WordIsFromDictionary() const; - - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // Returns true if the current word is numeric. - bool WordIsNumeric() const; - - // Returns true if the word contains blamer information. - bool HasBlamerInfo() const; - - // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle - // of the current word. - const void *GetParamsTrainingBundle() const; - - // Returns a pointer to the string with blamer information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerDebug() const; - - // Returns a pointer to the string with misadaption information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerMisadaptionDebug() const; - - // Returns true if a truth string was recorded for the current word. - bool HasTruthString() const; - - // Returns true if the given string is equivalent to the truth string for - // the current word. - bool EquivalentToTruth(const char *str) const; - - // Returns a null terminated UTF-8 encoded truth string for the current word. - // Use delete [] to free after use. - char *WordTruthUTF8Text() const; - - // Returns a null terminated UTF-8 encoded normalized OCR string for the - // current word. Use delete [] to free after use. - char *WordNormedUTF8Text() const; - - // Returns a pointer to serialized choice lattice. - // Fills lattice_size with the number of bytes in lattice data. - const char *WordLattice(int *lattice_size) const; - - // ============= Functions that refer to symbols only ============. - - // Returns true if the current symbol is a superscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSuperscript() const; - // Returns true if the current symbol is a subscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSubscript() const; - // Returns true if the current symbol is a dropcap. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsDropcap() const; - -protected: - const char *line_separator_; - const char *paragraph_separator_; -}; - -// Class to iterate over the classifier choices for a single RIL_SYMBOL. -class TESS_API ChoiceIterator { -public: - // Construction is from a LTRResultIterator that points to the symbol of - // interest. The ChoiceIterator allows a one-shot iteration over the - // choices for this symbol and after that it is useless. - explicit ChoiceIterator(const LTRResultIterator &result_it); - ~ChoiceIterator(); - - // Moves to the next choice for the symbol and returns false if there - // are none left. - bool Next(); - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // choice. - // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an - // internal structure and should NOT be delete[]ed to free after use. - const char *GetUTF8Text() const; - - // Returns the confidence of the current choice depending on the used language - // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All - // choices for one symbol should roughly add up to 1.0f. - // If only traineddata of the legacy engine is used, the number should be - // interpreted as a percent probability. (0.0f-100.0f) In this case - // probabilities won't add up to 100. Each one stands on its own. - float Confidence() const; - - // Returns a vector containing all timesteps, which belong to the currently - // selected symbol. A timestep is a vector containing pairs of symbols and - // floating point numbers. The number states the probability for the - // corresponding symbol. - std::vector>> *Timesteps() const; - -private: - // clears the remaining spaces out of the results and adapt the probabilities - void filterSpaces(); - // Pointer to the WERD_RES object owned by the API. - WERD_RES *word_res_; - // Iterator over the blob choices. - BLOB_CHOICE_IT *choice_it_; - std::vector> *LSTM_choices_ = nullptr; - std::vector>::iterator LSTM_choice_it_; - - const int *tstep_index_; - // regulates the rating granularity - double rating_coefficient_; - // leading blanks - int blanks_before_word_; - // true when there is lstm engine related trained data - bool oemLSTM_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ocrclass.h deleted file mode 100644 index a55e6528..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ocrclass.h +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/********************************************************************** - * File: ocrclass.h - * Description: Class definitions and constants for the OCR API. - * Author: Hewlett-Packard Co - * - * (C) Copyright 1996, Hewlett-Packard Co. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -/********************************************************************** - * This file contains typedefs for all the structures used by - * the HP OCR interface. - * The structures are designed to allow them to be used with any - * structure alignment up to 8. - **********************************************************************/ - -#ifndef CCUTIL_OCRCLASS_H_ -#define CCUTIL_OCRCLASS_H_ - -#include -#include - -namespace tesseract { - -/********************************************************************** - * EANYCODE_CHAR - * Description of a single character. The character code is defined by - * the character set of the current font. - * Output text is sent as an array of these structures. - * Spaces and line endings in the output are represented in the - * structures of the surrounding characters. They are not directly - * represented as characters. - * The first character in a word has a positive value of blanks. - * Missing information should be set to the defaults in the comments. - * If word bounds are known, but not character bounds, then the top and - * bottom of each character should be those of the word. The left of the - * first and right of the last char in each word should be set. All other - * lefts and rights should be set to -1. - * If set, the values of right and bottom are left+width and top+height. - * Most of the members come directly from the parameters to ocr_append_char. - * The formatting member uses the enhancement parameter and combines the - * line direction stuff into the top 3 bits. - * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para, - * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what - * the coding is, only that it is backwards compatible with the previous - * version. - **********************************************************************/ - -struct EANYCODE_CHAR { /*single character */ - // It should be noted that the format for char_code for version 2.0 and beyond - // is UTF8 which means that ASCII characters will come out as one structure - // but other characters will be returned in two or more instances of this - // structure with a single byte of the UTF8 code in each, but each will have - // the same bounding box. Programs which want to handle languagues with - // different characters sets will need to handle extended characters - // appropriately, but *all* code needs to be prepared to receive UTF8 coded - // characters for characters such as bullet and fancy quotes. - uint16_t char_code; /*character itself */ - int16_t left; /*of char (-1) */ - int16_t right; /*of char (-1) */ - int16_t top; /*of char (-1) */ - int16_t bottom; /*of char (-1) */ - int16_t font_index; /*what font (0) */ - uint8_t confidence; /*0=perfect, 100=reject (0/100) */ - uint8_t point_size; /*of char, 72=i inch, (10) */ - int8_t blanks; /*no of spaces before this char (1) */ - uint8_t formatting; /*char formatting (0) */ -}; - -/********************************************************************** - * ETEXT_DESC - * Description of the output of the OCR engine. - * This structure is used as both a progress monitor and the final - * output header, since it needs to be a valid progress monitor while - * the OCR engine is storing its output to shared memory. - * During progress, all the buffer info is -1. - * Progress starts at 0 and increases to 100 during OCR. No other constraint. - * Additionally the progress callback contains the bounding box of the word that - * is currently being processed. - * Every progress callback, the OCR engine must set ocr_alive to 1. - * The HP side will set ocr_alive to 0. Repeated failure to reset - * to 1 indicates that the OCR engine is dead. - * If the cancel function is not null then it is called with the number of - * user words found. If it returns true then operation is cancelled. - **********************************************************************/ -class ETEXT_DESC; - -using CANCEL_FUNC = bool (*)(void *, int); -using PROGRESS_FUNC = bool (*)(int, int, int, int, int); -using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int); - -class ETEXT_DESC { // output header -public: - int16_t count{0}; /// chars in this buffer(0) - int16_t progress{0}; /// percent complete increasing (0-100) - /** Progress monitor covers word recognition and it does not cover layout - * analysis. - * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */ - int8_t more_to_come{0}; /// true if not last - volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0 - int8_t err_code{0}; /// for errcode use - CANCEL_FUNC cancel{nullptr}; /// returns true to cancel - PROGRESS_FUNC progress_callback{ - nullptr}; /// called whenever progress increases - PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback - void *cancel_this{nullptr}; /// this or other data for cancel - std::chrono::steady_clock::time_point end_time; - /// Time to stop. Expected to be set only - /// by call to set_deadline_msecs(). - EANYCODE_CHAR text[1]{}; /// character data - - ETEXT_DESC() : progress_callback2(&default_progress_func) { - end_time = std::chrono::time_point(); - } - - // Sets the end time to be deadline_msecs milliseconds from now. - void set_deadline_msecs(int32_t deadline_msecs) { - if (deadline_msecs > 0) { - end_time = std::chrono::steady_clock::now() + - std::chrono::milliseconds(deadline_msecs); - } - } - - // Returns false if we've not passed the end_time, or have not set a deadline. - bool deadline_exceeded() const { - if (end_time.time_since_epoch() == - std::chrono::steady_clock::duration::zero()) { - return false; - } - auto now = std::chrono::steady_clock::now(); - return (now > end_time); - } - -private: - static bool default_progress_func(ETEXT_DESC *ths, int left, int right, - int top, int bottom) { - if (ths->progress_callback != nullptr) { - return (*(ths->progress_callback))(ths->progress, left, right, top, - bottom); - } - return true; - } -}; - -} // namespace tesseract - -#endif // CCUTIL_OCRCLASS_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/osdetect.h deleted file mode 100644 index 34bfb557..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/osdetect.h +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: osdetect.h -// Description: Orientation and script detection. -// Author: Samuel Charron -// Ranjith Unnikrishnan -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_OSDETECT_H_ -#define TESSERACT_CCMAIN_OSDETECT_H_ - -#include "export.h" // for TESS_API - -#include // for std::vector - -namespace tesseract { - -class BLOBNBOX; -class BLOBNBOX_CLIST; -class BLOB_CHOICE_LIST; -class TO_BLOCK_LIST; -class UNICHARSET; - -class Tesseract; - -// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur -const int kMaxNumberOfScripts = 116 + 1 + 2 + 1; - -struct OSBestResult { - OSBestResult() - : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {} - int orientation_id; - int script_id; - float sconfidence; - float oconfidence; -}; - -struct OSResults { - OSResults() : unicharset(nullptr) { - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < kMaxNumberOfScripts; ++j) { - scripts_na[i][j] = 0; - } - orientations[i] = 0; - } - } - void update_best_orientation(); - // Set the estimate of the orientation to the given id. - void set_best_orientation(int orientation_id); - // Update/Compute the best estimate of the script assuming the given - // orientation id. - void update_best_script(int orientation_id); - // Return the index of the script with the highest score for this orientation. - TESS_API int get_best_script(int orientation_id) const; - // Accumulate scores with given OSResults instance and update the best script. - void accumulate(const OSResults &osr); - - // Print statistics. - void print_scores(void) const; - void print_scores(int orientation_id) const; - - // Array holding scores for each orientation id [0,3]. - // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the - // page respectively, where the values refer to the amount of clockwise - // rotation to be applied to the page for the text to be upright and readable. - float orientations[4]; - // Script confidence scores for each of 4 possible orientations. - float scripts_na[4][kMaxNumberOfScripts]; - - UNICHARSET *unicharset; - OSBestResult best_result; -}; - -class OrientationDetector { -public: - OrientationDetector(const std::vector *allowed_scripts, - OSResults *results); - bool detect_blob(BLOB_CHOICE_LIST *scores); - int get_orientation(); - -private: - OSResults *osr_; - const std::vector *allowed_scripts_; -}; - -class ScriptDetector { -public: - ScriptDetector(const std::vector *allowed_scripts, OSResults *osr, - tesseract::Tesseract *tess); - void detect_blob(BLOB_CHOICE_LIST *scores); - bool must_stop(int orientation) const; - -private: - OSResults *osr_; - static const char *korean_script_; - static const char *japanese_script_; - static const char *fraktur_script_; - int korean_id_; - int japanese_id_; - int katakana_id_; - int hiragana_id_; - int han_id_; - int hangul_id_; - int latin_id_; - int fraktur_id_; - tesseract::Tesseract *tess_; - const std::vector *allowed_scripts_; -}; - -int orientation_and_script_detection(const char *filename, OSResults *, - tesseract::Tesseract *); - -int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, - tesseract::Tesseract *tess); - -int os_detect_blobs(const std::vector *allowed_scripts, - BLOBNBOX_CLIST *blob_list, OSResults *osr, - tesseract::Tesseract *tess); - -bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, - OSResults *, tesseract::Tesseract *tess); - -// Helper method to convert an orientation index to its value in degrees. -// The value represents the amount of clockwise rotation in degrees that must be -// applied for the text to be upright (readable). -TESS_API int OrientationIdToValue(const int &id); - -} // namespace tesseract - -#endif // TESSERACT_CCMAIN_OSDETECT_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/pageiterator.h deleted file mode 100644 index 68739715..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/pageiterator.h +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: pageiterator.h -// Description: Iterator for tesseract page structure that avoids using -// tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ -#define TESSERACT_CCMAIN_PAGEITERATOR_H_ - -#include "export.h" -#include "publictypes.h" - -struct Pix; -struct Pta; - -namespace tesseract { - -struct BlamerBundle; -class C_BLOB_IT; -class PAGE_RES; -class PAGE_RES_IT; -class WERD; - -class Tesseract; - -/** - * Class to iterate over tesseract page structure, providing access to all - * levels of the page hierarchy, without including any tesseract headers or - * having to handle any tesseract structures. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - * See tesseract/publictypes.h for the definition of PageIteratorLevel. - * See also ResultIterator, derived from PageIterator, which adds in the - * ability to access OCR output with text-specific methods. - */ - -class TESS_API PageIterator { -public: - /** - * page_res and tesseract come directly from the BaseAPI. - * The rectangle parameters are copied indirectly from the Thresholder, - * via the BaseAPI. They represent the coordinates of some rectangle in an - * original image (in top-left-origin coordinates) and therefore the top-left - * needs to be added to any output boxes in order to specify coordinates - * in the original image. See TessBaseAPI::SetRectangle. - * The scale and scaled_yres are in case the Thresholder scaled the image - * rectangle prior to thresholding. Any coordinates in tesseract's image - * must be divided by scale before adding (rect_left, rect_top). - * The scaled_yres indicates the effective resolution of the binary image - * that tesseract has been given by the Thresholder. - * After the constructor, Begin has already been called. - */ - PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, int rect_width, - int rect_height); - virtual ~PageIterator(); - - /** - * Page/ResultIterators may be copied! This makes it possible to iterate over - * all the objects at a lower level, while maintaining an iterator to - * objects at a higher level. These constructors DO NOT CALL Begin, so - * iterations will continue from the location of src. - */ - PageIterator(const PageIterator &src); - const PageIterator &operator=(const PageIterator &src); - - /** Are we positioned at the same location as other? */ - bool PositionedAtSameWord(const PAGE_RES_IT *other) const; - - // ============= Moving around within the page ============. - - /** - * Moves the iterator to point to the start of the page to begin an - * iteration. - */ - virtual void Begin(); - - /** - * Moves the iterator to the beginning of the paragraph. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word on the first row of the paragraph. - */ - virtual void RestartParagraph(); - - /** - * Return whether this iterator points anywhere in the first textline of a - * paragraph. - */ - bool IsWithinFirstTextlineOfParagraph() const; - - /** - * Moves the iterator to the beginning of the text line. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word of the row. - */ - virtual void RestartRow(); - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy, and returns false if the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - virtual bool Next(PageIteratorLevel level); - - /** - * Returns true if the iterator is at the start of an object at the given - * level. - * - * For instance, suppose an iterator it is pointed to the first symbol of the - * first word of the third line of the second paragraph of the first block in - * a page, then: - * it.IsAtBeginningOf(RIL_BLOCK) = false - * it.IsAtBeginningOf(RIL_PARA) = false - * it.IsAtBeginningOf(RIL_TEXTLINE) = true - * it.IsAtBeginningOf(RIL_WORD) = true - * it.IsAtBeginningOf(RIL_SYMBOL) = true - */ - virtual bool IsAtBeginningOf(PageIteratorLevel level) const; - - /** - * Returns whether the iterator is positioned at the last element in a - * given level. (e.g. the last word in a line, the last line in a block) - * - * Here's some two-paragraph example - * text. It starts off innocuously - * enough but quickly turns bizarre. - * The author inserts a cornucopia - * of words to guard against confused - * references. - * - * Now take an iterator it pointed to the start of "bizarre." - * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false - * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true - * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false - */ - virtual bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const; - - /** - * Returns whether this iterator is positioned - * before other: -1 - * equal to other: 0 - * after other: 1 - */ - int Cmp(const PageIterator &other) const; - - // ============= Accessing data ==============. - // Coordinate system: - // Integer coordinates are at the cracks between the pixels. - // The top-left corner of the top-left pixel in the image is at (0,0). - // The bottom-right corner of the bottom-right pixel in the image is at - // (width, height). - // Every bounding box goes from the top-left of the top-left contained - // pixel to the bottom-right of the bottom-right contained pixel, so - // the bounding box of the single top-left pixel in the image is: - // (0,0)->(1,1). - // If an image rectangle has been set in the API, then returned coordinates - // relate to the original (full) image, rather than the rectangle. - - /** - * Controls what to include in a bounding box. Bounding boxes of all levels - * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. - * Between layout analysis and recognition, it isn't known where all - * diacritics belong, so this control is used to include or exclude some - * diacritics that are above or below the main body of the word. In most cases - * where the placement is obvious, and after recognition, it doesn't make as - * much difference, as the diacritics will already be included in the word. - */ - void SetBoundingBoxComponents(bool include_upper_dots, - bool include_lower_dots) { - include_upper_dots_ = include_upper_dots; - include_lower_dots_ = include_lower_dots; - } - - /** - * Returns the bounding rectangle of the current object at the given level. - * See comment on coordinate system above. - * Returns false if there is no such object at the current position. - * The returned bounding box is guaranteed to match the size and position - * of the image returned by GetBinaryImage, but may clip foreground pixels - * from a grey image. The padding argument to GetImage can be used to expand - * the image to include more foreground pixels. See GetImage below. - */ - bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, - int *bottom) const; - bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, - int *right, int *bottom) const; - /** - * Returns the bounding rectangle of the object in a coordinate system of the - * working image rectangle having its origin at (rect_left_, rect_top_) with - * respect to the original image and is scaled by a factor scale_. - */ - bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, - int *right, int *bottom) const; - - /** Returns whether there is no object of a given level. */ - bool Empty(PageIteratorLevel level) const; - - /** - * Returns the type of the current block. - * See tesseract/publictypes.h for PolyBlockType. - */ - PolyBlockType BlockType() const; - - /** - * Returns the polygon outline of the current block. The returned Pta must - * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices - * of the polygon, and the last edge is the line segment between the last - * point and the first point. nullptr will be returned if the iterator is - * at the end of the document or layout analysis was not used. - */ - Pta *BlockPolygon() const; - - /** - * Returns a binary image of the current object at the given level. - * The position and size match the return from BoundingBoxInternal, and so - * this could be upscaled with respect to the original input image. - * Use pixDestroy to delete the image after use. - */ - Pix *GetBinaryImage(PageIteratorLevel level) const; - - /** - * Returns an image of the current object at the given level in greyscale - * if available in the input. To guarantee a binary image use BinaryImage. - * NOTE that in order to give the best possible image, the bounds are - * expanded slightly over the binary connected component, by the supplied - * padding, so the top-left position of the returned image is returned - * in (left,top). These will most likely not match the coordinates - * returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. - * Use pixDestroy to delete the image after use. - */ - Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, - int *left, int *top) const; - - /** - * Returns the baseline of the current object at the given level. - * The baseline is the line that passes through (x1, y1) and (x2, y2). - * WARNING: with vertical text, baselines may be vertical! - * Returns false if there is no baseline at the current position. - */ - bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, - int *y2) const; - - // Returns the attributes of the current row. - void RowAttributes(float *row_height, float *descenders, - float *ascenders) const; - - /** - * Returns orientation for the block the iterator points to. - * orientation, writing_direction, textline_order: see publictypes.h - * deskew_angle: after rotating the block so the text orientation is - * upright, how many radians does one have to rotate the - * block anti-clockwise for it to be level? - * -Pi/4 <= deskew_angle <= Pi/4 - */ - void Orientation(tesseract::Orientation *orientation, - tesseract::WritingDirection *writing_direction, - tesseract::TextlineOrder *textline_order, - float *deskew_angle) const; - - /** - * Returns information about the current paragraph, if available. - * - * justification - - * LEFT if ragged right, or fully justified and script is left-to-right. - * RIGHT if ragged left, or fully justified and script is right-to-left. - * unknown if it looks like source code or we have very few lines. - * is_list_item - - * true if we believe this is a member of an ordered or unordered list. - * is_crown - - * true if the first line of the paragraph is aligned with the other - * lines of the paragraph even though subsequent paragraphs have first - * line indents. This typically indicates that this is the continuation - * of a previous paragraph or that it is the very first paragraph in - * the chapter. - * first_line_indent - - * For LEFT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the left edge of the - * rest of the paragraph. - * for RIGHT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the right edge of the - * rest of the paragraph. - * NOTE 1: This value may be negative. - * NOTE 2: if *is_crown == true, the first line of this paragraph is - * actually flush, and first_line_indent is set to the "common" - * first_line_indent for subsequent paragraphs in this block - * of text. - */ - void ParagraphInfo(tesseract::ParagraphJustification *justification, - bool *is_list_item, bool *is_crown, - int *first_line_indent) const; - - // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle - // of the current word to the given pointer (takes ownership of the pointer) - // and returns true. - // Can only be used when iterating on the word level. - bool SetWordBlamerBundle(BlamerBundle *blamer_bundle); - -protected: - /** - * Sets up the internal data for iterating the blobs of a new word, then - * moves the iterator to the given offset. - */ - void BeginWord(int offset); - - /** Pointer to the page_res owned by the API. */ - PAGE_RES *page_res_; - /** Pointer to the Tesseract object owned by the API. */ - Tesseract *tesseract_; - /** - * The iterator to the page_res_. Owned by this ResultIterator. - * A pointer just to avoid dragging in Tesseract includes. - */ - PAGE_RES_IT *it_; - /** - * The current input WERD being iterated. If there is an output from OCR, - * then word_ is nullptr. Owned by the API - */ - WERD *word_; - /** The length of the current word_. */ - int word_length_; - /** The current blob index within the word. */ - int blob_index_; - /** - * Iterator to the blobs within the word. If nullptr, then we are iterating - * OCR results in the box_word. - * Owned by this ResultIterator. - */ - C_BLOB_IT *cblob_it_; - /** Control over what to include in bounding boxes. */ - bool include_upper_dots_; - bool include_lower_dots_; - /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ - int scale_; - int scaled_yres_; - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/publictypes.h deleted file mode 100644 index 0069cf28..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/publictypes.h +++ /dev/null @@ -1,281 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: publictypes.h -// Description: Types used in both the API and internally -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_ -#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_ - -namespace tesseract { - -// This file contains types that are used both by the API and internally -// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic -// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT. -// Restated: It is OK for low-level Tesseract files to include publictypes.h, -// but not for the low-level tesseract code to include top-level API code. -// This file should not use other Tesseract types, as that would drag -// their includes into the API-level. - -/** Number of printers' points in an inch. The unit of the pointsize return. */ -constexpr int kPointsPerInch = 72; -/** - * Minimum believable resolution. Used as a default if there is no other - * information, as it is safer to under-estimate than over-estimate. - */ -constexpr int kMinCredibleResolution = 70; -/** Maximum believable resolution. */ -constexpr int kMaxCredibleResolution = 2400; -/** - * Ratio between median blob size and likely resolution. Used to estimate - * resolution when none is provided. This is basically 1/usual text size in - * inches. */ -constexpr int kResolutionEstimationFactor = 10; - -/** - * Possible types for a POLY_BLOCK or ColPartition. - * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions - * below, as well as kPolyBlockNames in layout_test.cc. - * Used extensively by ColPartition, and POLY_BLOCK. - */ -enum PolyBlockType { - PT_UNKNOWN, // Type is not yet known. Keep as the first element. - PT_FLOWING_TEXT, // Text that lives inside a column. - PT_HEADING_TEXT, // Text that spans more than one column. - PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region. - PT_EQUATION, // Partition belonging to an equation region. - PT_INLINE_EQUATION, // Partition has inline equation. - PT_TABLE, // Partition belonging to a table region. - PT_VERTICAL_TEXT, // Text-line runs vertically. - PT_CAPTION_TEXT, // Text that belongs to an image. - PT_FLOWING_IMAGE, // Image that lives inside a column. - PT_HEADING_IMAGE, // Image that spans more than one column. - PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region. - PT_HORZ_LINE, // Horizontal Line. - PT_VERT_LINE, // Vertical Line. - PT_NOISE, // Lies outside of any column. - PT_COUNT -}; - -/** Returns true if PolyBlockType is of horizontal line type */ -inline bool PTIsLineType(PolyBlockType type) { - return type == PT_HORZ_LINE || type == PT_VERT_LINE; -} -/** Returns true if PolyBlockType is of image type */ -inline bool PTIsImageType(PolyBlockType type) { - return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE || - type == PT_PULLOUT_IMAGE; -} -/** Returns true if PolyBlockType is of text type */ -inline bool PTIsTextType(PolyBlockType type) { - return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT || - type == PT_PULLOUT_TEXT || type == PT_TABLE || - type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT || - type == PT_INLINE_EQUATION; -} -// Returns true if PolyBlockType is of pullout(inter-column) type -inline bool PTIsPulloutType(PolyBlockType type) { - return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT; -} - -/** - * +------------------+ Orientation Example: - * | 1 Aaaa Aaaa Aaaa | ==================== - * | Aaa aa aaa aa | To left is a diagram of some (1) English and - * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit. - * | 2 | - * | ####### c c C | Upright Latin characters are represented as A and a. - * | ####### c c c | '<' represents a latin character rotated - * | < ####### c c c | anti-clockwise 90 degrees. - * | < ####### c c | - * | < ####### . c | Upright Chinese characters are represented C and c. - * | 3 ####### c | - * +------------------+ NOTA BENE: enum values here should match goodoc.proto - - * If you orient your head so that "up" aligns with Orientation, - * then the characters will appear "right side up" and readable. - * - * In the example above, both the English and Chinese paragraphs are oriented - * so their "up" is the top of the page (page up). The photo credit is read - * with one's head turned leftward ("up" is to page left). - * - * The values of this enum match the convention of Tesseract's osdetect.h -*/ -enum Orientation { - ORIENTATION_PAGE_UP = 0, - ORIENTATION_PAGE_RIGHT = 1, - ORIENTATION_PAGE_DOWN = 2, - ORIENTATION_PAGE_LEFT = 3, -}; - -/** - * The grapheme clusters within a line of text are laid out logically - * in this direction, judged when looking at the text line rotated so that - * its Orientation is "page up". - * - * For English text, the writing direction is left-to-right. For the - * Chinese text in the above example, the writing direction is top-to-bottom. - */ -enum WritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT = 0, - WRITING_DIRECTION_RIGHT_TO_LEFT = 1, - WRITING_DIRECTION_TOP_TO_BOTTOM = 2, -}; - -/** - * The text lines are read in the given sequence. - * - * In English, the order is top-to-bottom. - * In Chinese, vertical text lines are read right-to-left. Mongolian is - * written in vertical columns top to bottom like Chinese, but the lines - * order left-to right. - * - * Note that only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM - */ -enum TextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, - TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, - TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, -}; - -/** - * Possible modes for page layout analysis. These *must* be kept in order - * of decreasing amount of layout analysis to be done, except for OSD_ONLY, - * so that the inequality test macros below work. - */ -enum PageSegMode { - PSM_OSD_ONLY = 0, ///< Orientation and script detection only. - PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and - ///< script detection. (OSD) - PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR. - PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD. - PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes. - PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of - ///< vertically aligned text. - PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.) - PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line. - PSM_SINGLE_WORD = 8, ///< Treat the image as a single word. - PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle. - PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character. - PSM_SPARSE_TEXT = - 11, ///< Find as much text as possible in no particular order. - PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det. - PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing - ///< hacks that are Tesseract-specific. - - PSM_COUNT ///< Number of enum entries. -}; - -/** - * Inline functions that act on a PageSegMode to determine whether components of - * layout analysis are enabled. - * *Depend critically on the order of elements of PageSegMode.* - * NOTE that arg is an int for compatibility with INT_PARAM. - */ -inline bool PSM_OSD_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO; -} -inline bool PSM_SPARSE(int pageseg_mode) { - return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN; -} -inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK; -} -inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) { - return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) || - pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} - -/** - * enum of the elements of the page hierarchy, used in ResultIterator - * to provide functions that operate on each level without having to - * have 5x as many functions. - */ -enum PageIteratorLevel { - RIL_BLOCK, // Block of text/image/separator line. - RIL_PARA, // Paragraph within a block. - RIL_TEXTLINE, // Line within a paragraph. - RIL_WORD, // Word within a textline. - RIL_SYMBOL // Symbol/character within a word. -}; - -/** - * JUSTIFICATION_UNKNOWN - * The alignment is not clearly one of the other options. This could happen - * for example if there are only one or two lines of text or the text looks - * like source code or poetry. - * - * NOTA BENE: Fully justified paragraphs (text aligned to both left and right - * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text - * is written with a left-to-right script and with JUSTIFICATION_RIGHT if - * their text is written in a right-to-left script. - * - * Interpretation for text read in vertical lines: - * "Left" is wherever the starting reading position is. - * - * JUSTIFICATION_LEFT - * Each line, except possibly the first, is flush to the same left tab stop. - * - * JUSTIFICATION_CENTER - * The text lines of the paragraph are centered about a line going - * down through their middle of the text lines. - * - * JUSTIFICATION_RIGHT - * Each line, except possibly the first, is flush to the same right tab stop. - */ -enum ParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT, -}; - -/** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the combiner. - * The preference of which engine to use is stored in tessedit_ocr_engine_mode. - * - * ATTENTION: When modifying this enum, please make sure to make the - * appropriate changes to all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ -enum OcrEngineMode { - OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated - OEM_LSTM_ONLY, // Run just the LSTM line recognizer. - OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback - // to Tesseract when things get difficult. - // deprecated - OEM_DEFAULT, // Specify this mode when calling init_*(), - // to indicate that any of the above modes - // should be automatically inferred from the - // variables in the language-specific config, - // command-line configs, or if not specified - // in any of the above should be set to the - // default OEM_TESSERACT_ONLY. - OEM_COUNT // Number of OEMs -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/renderer.h deleted file mode 100644 index 6f405233..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/renderer.h +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: renderer.h -// Description: Rendering interface to inject into TessBaseAPI -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_RENDERER_H_ -#define TESSERACT_API_RENDERER_H_ - -#include "export.h" - -// To avoid collision with other typenames include the ABSOLUTE MINIMUM -// complexity of includes here. Use forward declarations wherever possible -// and hide includes of complex types in baseapi.cpp. -#include -#include // for std::string -#include // for std::vector - -struct Pix; - -namespace tesseract { - -class TessBaseAPI; - -/** - * Interface for rendering tesseract results into a document, such as text, - * HOCR or pdf. This class is abstract. Specific classes handle individual - * formats. This interface is then used to inject the renderer class into - * tesseract when processing images. - * - * For simplicity implementing this with tesseract version 3.01, - * the renderer contains document state that is cleared from document - * to document just as the TessBaseAPI is. This way the base API can just - * delegate its rendering functionality to injected renderers, and the - * renderers can manage the associated state needed for the specific formats - * in addition to the heuristics for producing it. - */ -class TESS_API TessResultRenderer { -public: - virtual ~TessResultRenderer(); - - // Takes ownership of pointer so must be new'd instance. - // Renderers aren't ordered, but appends the sequences of next parameter - // and existing next(). The renderers should be unique across both lists. - void insert(TessResultRenderer *next); - - // Returns the next renderer or nullptr. - TessResultRenderer *next() { - return next_; - } - - /** - * Starts a new document with the given title. - * This clears the contents of the output data. - * Title should use UTF-8 encoding. - */ - bool BeginDocument(const char *title); - - /** - * Adds the recognized text from the source image to the current document. - * Invalid if BeginDocument not yet called. - * - * Note that this API is a bit weird but is designed to fit into the - * current TessBaseAPI implementation where the api has lots of state - * information that we might want to add in. - */ - bool AddImage(TessBaseAPI *api); - - /** - * Finishes the document and finalizes the output data - * Invalid if BeginDocument not yet called. - */ - bool EndDocument(); - - const char *file_extension() const { - return file_extension_; - } - const char *title() const { - return title_.c_str(); - } - - // Is everything fine? Otherwise something went wrong. - bool happy() const { - return happy_; - } - - /** - * Returns the index of the last image given to AddImage - * (i.e. images are incremented whether the image succeeded or not) - * - * This is always defined. It means either the number of the - * current image, the last image ended, or in the completed document - * depending on when in the document lifecycle you are looking at it. - * Will return -1 if a document was never started. - */ - int imagenum() const { - return imagenum_; - } - -protected: - /** - * Called by concrete classes. - * - * outputbase is the name of the output file excluding - * extension. For example, "/path/to/chocolate-chip-cookie-recipe" - * - * extension indicates the file extension to be used for output - * files. For example "pdf" will produce a .pdf file, and "hocr" - * will produce .hocr files. - */ - TessResultRenderer(const char *outputbase, const char *extension); - - // Hook for specialized handling in BeginDocument() - virtual bool BeginDocumentHandler(); - - // This must be overridden to render the OCR'd results - virtual bool AddImageHandler(TessBaseAPI *api) = 0; - - // Hook for specialized handling in EndDocument() - virtual bool EndDocumentHandler(); - - // Renderers can call this to append '\0' terminated strings into - // the output string returned by GetOutput. - // This method will grow the output buffer if needed. - void AppendString(const char *s); - - // Renderers can call this to append binary byte sequences into - // the output string returned by GetOutput. Note that s is not necessarily - // '\0' terminated (and can contain '\0' within it). - // This method will grow the output buffer if needed. - void AppendData(const char *s, int len); - -private: - TessResultRenderer *next_; // Can link multiple renderers together - FILE *fout_; // output file pointer - const char *file_extension_; // standard extension for generated output - std::string title_; // title of document being rendered - int imagenum_; // index of last image added - bool happy_; // I get grumpy when the disk fills up, etc. -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessTextRenderer : public TessResultRenderer { -public: - explicit TessTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into an hocr text string - */ -class TESS_API TessHOcrRenderer : public TessResultRenderer { -public: - explicit TessHOcrRenderer(const char *outputbase, bool font_info); - explicit TessHOcrRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into an alto text string - */ -class TESS_API TessAltoRenderer : public TessResultRenderer { -public: - explicit TessAltoRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool begin_document; -}; - -/** - * Renders Tesseract output into a TSV string - */ -class TESS_API TessTsvRenderer : public TessResultRenderer { -public: - explicit TessTsvRenderer(const char *outputbase, bool font_info); - explicit TessTsvRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into searchable PDF - */ -class TESS_API TessPDFRenderer : public TessResultRenderer { -public: - // datadir is the location of the TESSDATA. We need it because - // we load a custom PDF font from this location. - TessPDFRenderer(const char *outputbase, const char *datadir, - bool textonly = false); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - // We don't want to have every image in memory at once, - // so we store some metadata as we go along producing - // PDFs one page at a time. At the end, that metadata is - // used to make everything that isn't easily handled in a - // streaming fashion. - long int obj_; // counter for PDF objects - std::vector offsets_; // offset of every PDF object in bytes - std::vector pages_; // object number for every /Page object - std::string datadir_; // where to find the custom font - bool textonly_; // skip images if set - // Bookkeeping only. DIY = Do It Yourself. - void AppendPDFObjectDIY(size_t objectsize); - // Bookkeeping + emit data. - void AppendPDFObject(const char *data); - // Create the /Contents object for an entire page. - char *GetPDFTextObjects(TessBaseAPI *api, double width, double height); - // Turn an image into a PDF object. Only transcode if we have to. - static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum, - char **pdf_object, long int *pdf_object_size, - int jpg_quality); -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessUnlvRenderer : public TessResultRenderer { -public: - explicit TessUnlvRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string for LSTMBox - */ -class TESS_API TessLSTMBoxRenderer : public TessResultRenderer { -public: - explicit TessLSTMBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessBoxTextRenderer : public TessResultRenderer { -public: - explicit TessBoxTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string in WordStr format - */ -class TESS_API TessWordStrBoxRenderer : public TessResultRenderer { -public: - explicit TessWordStrBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#ifndef DISABLED_LEGACY_ENGINE - -/** - * Renders tesseract output into an osd text string - */ -class TESS_API TessOsdRenderer : public TessResultRenderer { -public: - explicit TessOsdRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#endif // ndef DISABLED_LEGACY_ENGINE - -} // namespace tesseract. - -#endif // TESSERACT_API_RENDERER_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/resultiterator.h deleted file mode 100644 index 3e4d5807..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/resultiterator.h +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: resultiterator.h -// Description: Iterator for tesseract results that is capable of -// iterating in proper reading order over Bi Directional -// (e.g. mixed Hebrew and English) text. -// Author: David Eger -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API, TESS_LOCAL -#include "ltrresultiterator.h" // for LTRResultIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -#include // for std::pair -#include // for std::vector - -namespace tesseract { - -class TESS_API ResultIterator : public LTRResultIterator { -public: - static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); - - /** - * ResultIterator is copy constructible! - * The default copy constructor works just fine for us. - */ - ~ResultIterator() override = default; - - // ============= Moving around within the page ============. - /** - * Moves the iterator to point to the start of the page to begin - * an iteration. - */ - void Begin() override; - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy in the appropriate reading order and returns false if - * the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - bool Next(PageIteratorLevel level) override; - - /** - * IsAtBeginningOf() returns whether we're at the logical beginning of the - * given level. (as opposed to ResultIterator's left-to-right top-to-bottom - * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). - * For a full description, see pageiterator.h - */ - bool IsAtBeginningOf(PageIteratorLevel level) const override; - - /** - * Implement PageIterator's IsAtFinalElement correctly in a BiDi context. - * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we - * point at the last word in a paragraph. See PageIterator for full comment. - */ - bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const override; - - // ============= Functions that refer to words only ============. - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // ============= Accessing data ==============. - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - */ - virtual char *GetUTF8Text(PageIteratorLevel level) const; - - /** - * Returns the LSTM choices for every LSTM timestep for the current word. - */ - virtual std::vector>>> - *GetRawLSTMTimesteps() const; - virtual std::vector>> - *GetBestLSTMSymbolChoices() const; - - /** - * Return whether the current paragraph's dominant reading direction - * is left-to-right (as opposed to right-to-left). - */ - bool ParagraphIsLtr() const; - - // ============= Exposed only for testing =============. - - /** - * Yields the reading order as a sequence of indices and (optional) - * meta-marks for a set of words (given left-to-right). - * The meta marks are passed as negative values: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The next indexed word contains both left-to-right and - * right-to-left characters and was treated as neutral. - * - * For example, suppose we have five words in a text line, - * indexed [0,1,2,3,4] from the leftmost side of the text line. - * The following are all believable reading_orders: - * - * Left-to-Right (in ltr paragraph): - * { 0, 1, 2, 3, 4 } - * Left-to-Right (in rtl paragraph): - * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } - * Right-to-Left (in rtl paragraph): - * { 4, 3, 2, 1, 0 } - * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: - * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 } - */ - static void CalculateTextlineOrder( - bool paragraph_is_ltr, - const std::vector &word_dirs, - std::vector *reading_order); - - static const int kMinorRunStart; - static const int kMinorRunEnd; - static const int kComplexWord; - -protected: - /** - * We presume the data associated with the given iterator will outlive us. - * NB: This is private because it does something that is non-obvious: - * it resets to the beginning of the paragraph instead of staying wherever - * resit might have pointed. - */ - explicit ResultIterator(const LTRResultIterator &resit); - -private: - /** - * Calculates the current paragraph's dominant writing direction. - * Typically, members should use current_paragraph_ltr_ instead. - */ - bool CurrentParagraphIsLtr() const; - - /** - * Returns word indices as measured from resit->RestartRow() = index 0 - * for the reading order of words within a textline given an iterator - * into the middle of the text line. - * In addition to non-negative word indices, the following negative values - * may be inserted: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The previous word contains both left-to-right and - * right-to-left characters and was treated as neutral. - */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *indices) const; - /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *ssd, - std::vector *indices) const; - - /** - * What is the index of the current word in a strict left-to-right reading - * of the row? - */ - int LTRWordIndex() const; - - /** - * Given an iterator pointing at a word, returns the logical reading order - * of blob indices for the word. - */ - void CalculateBlobOrder(std::vector *blob_indices) const; - - /** Precondition: current_paragraph_is_ltr_ is set. */ - void MoveToLogicalStartOfTextline(); - - /** - * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ - * are set. - */ - void MoveToLogicalStartOfWord(); - - /** Are we pointing at the final (reading order) symbol of the word? */ - bool IsAtFinalSymbolOfWord() const; - - /** Are we pointing at the first (reading order) symbol of the word? */ - bool IsAtFirstSymbolOfWord() const; - - /** - * Append any extra marks that should be appended to this word when printed. - * Mostly, these are Unicode BiDi control characters. - */ - void AppendSuffixMarks(std::string *text) const; - - /** Appends the current word in reading order to the given buffer.*/ - void AppendUTF8WordText(std::string *text) const; - - /** - * Appends the text of the current text line, *assuming this iterator is - * positioned at the beginning of the text line* This function - * updates the iterator to point to the first position past the text line. - * Each textline is terminated in a single newline character. - * If the textline ends a paragraph, it gets a second terminal newline. - */ - void IterateAndAppendUTF8TextlineText(std::string *text); - - /** - * Appends the text of the current paragraph in reading order - * to the given buffer. - * Each textline is terminated in a single newline character, and the - * paragraph gets an extra newline at the end. - */ - void AppendUTF8ParagraphText(std::string *text) const; - - /** Returns whether the bidi_debug flag is set to at least min_level. */ - bool BidiDebug(int min_level) const; - - bool current_paragraph_is_ltr_; - - /** - * Is the currently pointed-at character at the beginning of - * a minor-direction run? - */ - bool at_beginning_of_minor_run_; - - /** Is the currently pointed-at character in a minor-direction sequence? */ - bool in_minor_direction_; - - /** - * Should detected inter-word spaces be preserved, or "compressed" to a single - * space character (default behavior). - */ - bool preserve_interword_spaces_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/unichar.h deleted file mode 100644 index 015109d7..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/unichar.h +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: unichar.h -// Description: Unicode character/ligature class. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCUTIL_UNICHAR_H_ -#define TESSERACT_CCUTIL_UNICHAR_H_ - -#include "export.h" - -#include -#include -#include -#include - -namespace tesseract { - -// Maximum number of characters that can be stored in a UNICHAR. Must be -// at least 4. Must not exceed 31 without changing the coding of length. -#define UNICHAR_LEN 30 - -// A UNICHAR_ID is the unique id of a unichar. -using UNICHAR_ID = int; - -// A variable to indicate an invalid or uninitialized unichar id. -static const int INVALID_UNICHAR_ID = -1; -// A special unichar that corresponds to INVALID_UNICHAR_ID. -static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__"; - -enum StrongScriptDirection { - DIR_NEUTRAL = 0, // Text contains only neutral characters. - DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters. - DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters. - DIR_MIX = 3, // Text contains a mixture of left-to-right - // and right-to-left characters. -}; - -using char32 = signed int; - -// The UNICHAR class holds a single classification result. This may be -// a single Unicode character (stored as between 1 and 4 utf8 bytes) or -// multiple Unicode characters representing the NFKC expansion of a ligature -// such as fi, ffl etc. These are also stored as utf8. -class TESS_API UNICHAR { -public: - UNICHAR() { - memset(chars, 0, UNICHAR_LEN); - } - - // Construct from a utf8 string. If len<0 then the string is null terminated. - // If the string is too long to fit in the UNICHAR then it takes only what - // will fit. - UNICHAR(const char *utf8_str, int len); - - // Construct from a single UCS4 character. - explicit UNICHAR(int unicode); - - // Default copy constructor and operator= are OK. - - // Get the first character as UCS-4. - int first_uni() const; - - // Get the length of the UTF8 string. - int utf8_len() const { - int len = chars[UNICHAR_LEN - 1]; - return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; - } - - // Get a UTF8 string, but NOT nullptr terminated. - const char *utf8() const { - return chars; - } - - // Get a terminated UTF8 string: Must delete[] it after use. - char *utf8_str() const; - - // Get the number of bytes in the first character of the given utf8 string. - static int utf8_step(const char *utf8_str); - - // A class to simplify iterating over and accessing elements of a UTF8 - // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or - // take ownership of the underlying byte array. It also does not permit - // modification of the array (as the name suggests). - // - // Example: - // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len); - // it != UNICHAR::end(str, len); - // ++it) { - // printf("UCS-4 symbol code = %d\n", *it); - // char buf[5]; - // int char_len = it.get_utf8(buf); buf[char_len] = '\0'; - // printf("Char = %s\n", buf); - // } - class TESS_API const_iterator { - using CI = const_iterator; - - public: - // Step to the next UTF8 character. - // If the current position is at an illegal UTF8 character, then print an - // error message and step by one byte. If the current position is at a - // nullptr value, don't step past it. - const_iterator &operator++(); - - // Return the UCS-4 value at the current position. - // If the current position is at an illegal UTF8 value, return a single - // space character. - int operator*() const; - - // Store the UTF-8 encoding of the current codepoint into buf, which must be - // at least 4 bytes long. Return the number of bytes written. - // If the current position is at an illegal UTF8 value, writes a single - // space character and returns 1. - // Note that this method does not null-terminate the buffer. - int get_utf8(char *buf) const; - // Returns the number of bytes of the current codepoint. Returns 1 if the - // current position is at an illegal UTF8 value. - int utf8_len() const; - // Returns true if the UTF-8 encoding at the current position is legal. - bool is_legal() const; - - // Return the pointer into the string at the current position. - const char *utf8_data() const { - return it_; - } - - // Iterator equality operators. - friend bool operator==(const CI &lhs, const CI &rhs) { - return lhs.it_ == rhs.it_; - } - friend bool operator!=(const CI &lhs, const CI &rhs) { - return !(lhs == rhs); - } - - private: - friend class UNICHAR; - explicit const_iterator(const char *it) : it_(it) {} - - const char *it_; // Pointer into the string. - }; - - // Create a start/end iterator pointing to a string. Note that these methods - // are static and do NOT create a copy or take ownership of the underlying - // array. - static const_iterator begin(const char *utf8_str, int byte_length); - static const_iterator end(const char *utf8_str, int byte_length); - - // Converts a utf-8 string to a vector of unicodes. - // Returns an empty vector if the input contains invalid UTF-8. - static std::vector UTF8ToUTF32(const char *utf8_str); - // Converts a vector of unicodes to a utf8 string. - // Returns an empty string if the input contains an invalid unicode. - static std::string UTF32ToUTF8(const std::vector &str32); - -private: - // A UTF-8 representation of 1 or more Unicode characters. - // The last element (chars[UNICHAR_LEN - 1]) is a length if - // its value < UNICHAR_LEN, otherwise it is a genuine character. - char chars[UNICHAR_LEN]{}; -}; - -} // namespace tesseract - -#endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/version.h deleted file mode 100644 index 6bac5d66..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/version.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: version.h -// Description: Version information -// -// (C) Copyright 2018, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_VERSION_H_ -#define TESSERACT_API_VERSION_H_ - -// clang-format off - -#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ -#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ -#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ - -#define TESSERACT_VERSION \ - (TESSERACT_MAJOR_VERSION << 16 | \ - TESSERACT_MINOR_VERSION << 8 | \ - TESSERACT_MICRO_VERSION) - -#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" - -// clang-format on - -#endif // TESSERACT_API_VERSION_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/baseapi.h deleted file mode 100644 index 5e1e4830..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/baseapi.h +++ /dev/null @@ -1,812 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: baseapi.h -// Description: Simple API for calling tesseract. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_BASEAPI_H_ -#define TESSERACT_API_BASEAPI_H_ - -#ifdef HAVE_CONFIG_H -# include "config_auto.h" // DISABLED_LEGACY_ENGINE -#endif - -#include "export.h" -#include "pageiterator.h" -#include "publictypes.h" -#include "resultiterator.h" -#include "unichar.h" - -#include "version.h" - -#include -#include // for std::vector - -struct Pix; -struct Pixa; -struct Boxa; - -namespace tesseract { - -class PAGE_RES; -class ParagraphModel; -class BLOCK_LIST; -class ETEXT_DESC; -struct OSResults; -class UNICHARSET; - -class Dawg; -class Dict; -class EquationDetect; -class PageIterator; -class ImageThresholder; -class LTRResultIterator; -class ResultIterator; -class MutableIterator; -class TessResultRenderer; -class Tesseract; - -// Function to read a std::vector from a whole file. -// Returns false on failure. -using FileReader = bool (*)(const char *filename, std::vector *data); - -using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, - bool) const; -using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, - int, const char *, int); - -/** - * Base class for all tesseract APIs. - * Specific classes can add ability to work on different inputs or produce - * different outputs. - * This class is mostly an interface layer on top of the Tesseract instance - * class to hide the data types so that users of this class don't have to - * include any other Tesseract headers. - */ -class TESS_API TessBaseAPI { -public: - TessBaseAPI(); - virtual ~TessBaseAPI(); - // Copy constructor and assignment operator are currently unsupported. - TessBaseAPI(TessBaseAPI const &) = delete; - TessBaseAPI &operator=(TessBaseAPI const &) = delete; - - /** - * Returns the version identifier as a static string. Do not delete. - */ - static const char *Version(); - - /** - * If compiled with OpenCL AND an available OpenCL - * device is deemed faster than serial code, then - * "device" is populated with the cl_device_id - * and returns sizeof(cl_device_id) - * otherwise *device=nullptr and returns 0. - */ - static size_t getOpenCLDevice(void **device); - - /** - * Set the name of the input file. Needed for training and - * reading a UNLV zone file, and for searchable PDF output. - */ - void SetInputName(const char *name); - /** - * These functions are required for searchable PDF output. - * We need our hands on the input file so that we can include - * it in the PDF without transcoding. If that is not possible, - * we need the original image. Finally, resolution metadata - * is stored in the PDF so we need that as well. - */ - const char *GetInputName(); - // Takes ownership of the input pix. - void SetInputImage(Pix *pix); - Pix *GetInputImage(); - int GetSourceYResolution(); - const char *GetDatapath(); - - /** Set the name of the bonus output files. Needed only for debugging. */ - void SetOutputName(const char *name); - - /** - * Set the value of an internal "parameter." - * Supply the name of the parameter and the value as a string, just as - * you would in a config file. - * Returns false if the name lookup failed. - * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. - * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. - * SetVariable may be used before Init, but settings will revert to - * defaults on End(). - * - * Note: Must be called after Init(). Only works for non-init variables - * (init variables should be passed to Init()). - */ - bool SetVariable(const char *name, const char *value); - bool SetDebugVariable(const char *name, const char *value); - - /** - * Returns true if the parameter was found among Tesseract parameters. - * Fills in value with the value of the parameter. - */ - bool GetIntVariable(const char *name, int *value) const; - bool GetBoolVariable(const char *name, bool *value) const; - bool GetDoubleVariable(const char *name, double *value) const; - - /** - * Returns the pointer to the string that represents the value of the - * parameter if it was found among Tesseract parameters. - */ - const char *GetStringVariable(const char *name) const; - -#ifndef DISABLED_LEGACY_ENGINE - - /** - * Print Tesseract fonts table to the given file. - */ - void PrintFontsTable(FILE *fp) const; - -#endif - - /** - * Print Tesseract parameters to the given file. - */ - void PrintVariables(FILE *fp) const; - - /** - * Get value of named variable as a string, if it exists. - */ - bool GetVariableAsString(const char *name, std::string *val) const; - - /** - * Instances are now mostly thread-safe and totally independent, - * but some global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS: - * you use SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your instances. - * - * Start tesseract. Returns zero on success and -1 on failure. - * NOTE that the only members that may be called before Init are those - * listed above here in the class definition. - * - * The datapath must be the name of the tessdata directory. - * The language is (usually) an ISO 639-3 string or nullptr will default to - * eng. It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, or just - * to reset the classifier. - * The language may be a string of the form [~][+[~]]* indicating - * that multiple languages are to be loaded. Eg hin+eng will load Hindi and - * English. Languages may specify internally that they want to be loaded - * with one or more other languages, so the ~ sign is available to override - * that. Eg if hin were set to load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited only by - * memory, with the caveat that loading additional languages will impact - * both speed and accuracy, as there is more work to do to decide on the - * applicable language, and there is more chance of hallucinating incorrect - * words. - * WARNING: On changing languages, all Tesseract parameters are reset - * back to their default values. (Which may vary between languages.) - * If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should explicitly - * call End() and then use SetVariable before Init. This is only a very - * rare use case, since there are very few uses that require any parameters - * to be set before Init. - * - * If set_only_non_debug_params is true, only params that do not contain - * "debug" in the name will be set. - */ - int Init(const char *datapath, const char *language, OcrEngineMode mode, - char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params); - int Init(const char *datapath, const char *language, OcrEngineMode oem) { - return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); - } - int Init(const char *datapath, const char *language) { - return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, - false); - } - // In-memory version reads the traineddata file directly from the given - // data[data_size] array, and/or reads data via a FileReader. - int Init(const char *data, int data_size, const char *language, - OcrEngineMode mode, char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params, FileReader reader); - - /** - * Returns the languages string used in the last valid initialization. - * If the last initialization specified "deu+hin" then that will be - * returned. If hin loaded eng automatically as well, then that will - * not be included in this list. To find the languages actually - * loaded use GetLoadedLanguagesAsVector. - * The returned string should NOT be deleted. - */ - const char *GetInitLanguagesAsString() const; - - /** - * Returns the loaded languages in the vector of std::string. - * Includes all languages loaded by the last Init, including those loaded - * as dependencies of other loaded languages. - */ - void GetLoadedLanguagesAsVector(std::vector *langs) const; - - /** - * Returns the available languages in the sorted vector of std::string. - */ - void GetAvailableLanguagesAsVector(std::vector *langs) const; - - /** - * Init only for page layout analysis. Use only for calls to SetImage and - * AnalysePage. Calls that attempt recognition will generate an error. - */ - void InitForAnalysePage(); - - /** - * Read a "config" file containing a set of param, value pairs. - * Searches the standard places: tessdata/configs, tessdata/tessconfigs - * and also accepts a relative or absolute path name. - * Note: only non-init params will be set (init params are set by Init()). - */ - void ReadConfigFile(const char *filename); - /** Same as above, but only set debug params from the given config file. */ - void ReadDebugConfigFile(const char *filename); - - /** - * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. - * The mode is stored as an IntParam so it can also be modified by - * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). - */ - void SetPageSegMode(PageSegMode mode); - - /** Return the current page segmentation mode. */ - PageSegMode GetPageSegMode() const; - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. - * Currently has no error checking. - * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. - * Palette color images will not work properly and must be converted to - * 24 bit. - * Binary images of 1 bit per pixel may also be given but they must be - * byte packed with the MSB of the first byte being the first pixel, and a - * 1 represents WHITE. For binary images set bytes_per_pixel=0. - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * - * Note that TesseractRect is the simplified convenience interface. - * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, - * and one or more of the Get*Text functions below. - */ - char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, int width, - int height); - - /** - * Call between pages or documents etc to free up memory and forget - * adaptive data. - */ - void ClearAdaptiveClassifier(); - - /** - * @defgroup AdvancedAPI Advanced API - * The following methods break TesseractRect into pieces, so you can - * get hold of the thresholded image, get the text in different formats, - * get bounding boxes, confidences etc. - */ - /* @{ */ - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Copies the image buffer and converts to Pix. - * SetImage clears all recognition results, and sets the rectangle to the - * full image, so it may be followed immediately by a GetUTF8Text, and it - * will automatically perform recognition. - */ - void SetImage(const unsigned char *imagedata, int width, int height, - int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract takes its own copy of the image, so it need not persist until - * after Recognize. - * Pix vs raw, which to use? - * Use Pix where possible. Tesseract uses Pix as its internal representation - * and it is therefore more efficient to provide a Pix directly. - */ - void SetImage(Pix *pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after SetImage(). - */ - void SetSourceResolution(int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after SetImage. - * Each SetRectangle clears the recogntion results so multiple rectangles - * can be recognized with the same image. - */ - void SetRectangle(int left, int top, int width, int height); - - /** - * Get a copy of the internal thresholded image from Tesseract. - * Caller takes ownership of the Pix and must pixDestroy it. - * May be called any time after SetImage, or after TesseractRect. - */ - Pix *GetThresholdedImage(); - - /** - * Get the result of page layout analysis as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetRegions(Pixa **pixa); - - /** - * Get the textlines as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If raw_image is true, then extract from the original image instead of the - * thresholded image and pad by raw_padding pixels. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. If paraids is not - * nullptr, the paragraph-id of each line within its block is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - /* - Helper method to extract from the thresholded image. (most common usage) -*/ - Boxa *GetTextlines(Pixa **pixa, int **blockids) { - return GetTextlines(false, 0, pixa, blockids, nullptr); - } - - /** - * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa - * pair, in reading order. Enables downstream handling of non-rectangular - * regions. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetStrips(Pixa **pixa, int **blockids); - - /** - * Get the words as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetWords(Pixa **pixa); - - /** - * Gets the individual connected (text) components (created - * after pages segmentation step, but before recognition) - * as a leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * Note: the caller is responsible for calling boxaDestroy() - * on the returned Boxa array and pixaDestroy() on cc array. - */ - Boxa *GetConnectedComponents(Pixa **cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. - * If blockids is not nullptr, the paragraph-id of each component with its - * block is also returned as an array of one element per component. delete [] - * after use. If raw_image is true, then portions of the original image are - * extracted instead of the thresholded image and padded with raw_padding. If - * text_only is true, then only text components are returned. - */ - Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, - bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - // Helper function to get binary images with no padding (most common usage). - Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, - Pixa **pixa, int **blockids) { - return GetComponentImages(level, text_only, false, 0, pixa, blockids, - nullptr); - } - - /** - * Returns the scale factor of the thresholded image that would be returned by - * GetThresholdedImage() and the various GetX() methods that call - * GetComponentImages(). - * Returns 0 if no thresholder has been set. - */ - int GetThresholdedImageScaleFactor() const; - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to just - * the page layout results. Returns an iterator to the results. - * If merge_similar_words is true, words are combined where suitable for use - * with a line recognizer. Use if you want to use AnalyseLayout to find the - * textlines, and then want to process textline fragments with an external - * line recognizer. - * Returns nullptr on error or an empty page. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - PageIterator *AnalyseLayout(); - PageIterator *AnalyseLayout(bool merge_similar_words); - - /** - * Recognize the image from SetAndThresholdImage, generating Tesseract - * internal structures. Returns 0 on success. - * Optional. The Get*Text functions below will call Recognize if needed. - * After Recognize, the output is kept internally until the next SetImage. - */ - int Recognize(ETEXT_DESC *monitor); - - /** - * Methods to retrieve information after SetAndThresholdImage(), - * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) - */ - - /** - * Turns images into symbolic text. - * - * filename can point to a single image, a multi-page TIFF, - * or a plain text list of image filenames. - * - * retry_config is useful for debugging. If not nullptr, you can fall - * back to an alternate configuration if a page fails for some - * reason. - * - * timeout_millisec terminates processing if any single page - * takes too long. Set to 0 for unlimited time. - * - * renderer is responible for creating the output. For example, - * use the TessTextRenderer if you want plaintext output, or - * the TessPDFRender to produce searchable PDF. - * - * If tessedit_page_number is non-negative, will only process that - * single page. Works for multi-page tiff file, or filelist. - * - * Returns true if successful, false on error. - */ - bool ProcessPages(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - // Does the real work of ProcessPages. - bool ProcessPagesInternal(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - - /** - * Turn a single image into symbolic text. - * - * The pix is the image processed. filename and page_index are - * metadata used by side-effect processes, such as reading a box - * file or formatting as hOCR. - * - * See ProcessPages for descriptions of other parameters. - */ - bool ProcessPage(Pix *pix, int page_index, const char *filename, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - ResultIterator *GetIterator(); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - MutableIterator *GetMutableIterator(); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - */ - char *GetUTF8Text(); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * monitor can be used to - * cancel the recognition - * receive progress callbacks - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(ETEXT_DESC *monitor, int page_number); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(ETEXT_DESC *monitor, int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(int page_number); - - /** - * Make a TSV-formatted string from the internal data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetTSVText(int page_number); - - /** - * Make a box file for LSTM training from the internal data structures. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetLSTMBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a box file used in training. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a WordStr box file used in training. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetWordStrBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded - * as UNLV format Latin-1 with specific reject and suspect codes. - * Returned string must be freed with the delete [] operator. - */ - char *GetUNLVText(); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in degrees - * (0, 90, 180, 270) - * orient_conf is the confidence (15.0 is reasonably confident) - * script_name is an ASCII string, the name of the script, e.g. "Latin" - * script_conf is confidence level in the script - * Returns true on success and writes values to each parameter as an output - */ - bool DetectOrientationScript(int *orient_deg, float *orient_conf, - const char **script_name, float *script_conf); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * page_number is a 0-based page index that will appear in the osd file. - */ - char *GetOsdText(int page_number); - - /** Returns the (average) confidence value between 0 and 100. */ - int MeanTextConf(); - /** - * Returns all word confidences (between 0 and 100) in an array, terminated - * by -1. The calling function must delete [] after use. - * The number of confidences should correspond to the number of space- - * delimited words in GetUTF8Text. - */ - int *AllWordConfidences(); - -#ifndef DISABLED_LEGACY_ENGINE - /** - * Applies the given word to the adaptive classifier if possible. - * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can - * tell the boundaries of the graphemes. - * Assumes that SetImage/SetRectangle have been used to set the image - * to the given word. The mode arg should be PSM_SINGLE_WORD or - * PSM_CIRCLE_WORD, as that will be used to control layout analysis. - * The currently set PageSegMode is preserved. - * Returns false if adaption was not possible for some reason. - */ - bool AdaptToWordStr(PageSegMode mode, const char *wordstr); -#endif // ndef DISABLED_LEGACY_ENGINE - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or TesseractRect before doing - * any Recognize or Get* operation. - */ - void Clear(); - - /** - * Close down tesseract and free up all memory. End() is equivalent to - * destructing and reconstructing your TessBaseAPI. - * Once End() has been used, none of the other API functions may be used - * other than Init and anything declared above it in the class definition. - */ - void End(); - - /** - * Clear any library-level memory caches. - * There are a variety of expensive-to-load constant data structures (mostly - * language dictionaries) that are cached globally -- surviving the Init() - * and End() of individual TessBaseAPI's. This function allows the clearing - * of these caches. - **/ - static void ClearPersistentCache(); - - /** - * Check whether a word is valid according to Tesseract's language model - * @return 0 if the word is invalid, non-zero if valid. - * @warning temporary! This function will be removed from here and placed - * in a separate API at some future time. - */ - int IsValidWord(const char *word) const; - // Returns true if utf8_character is defined in the UniCharset. - bool IsValidCharacter(const char *utf8_character) const; - - bool GetTextDirection(int *out_offset, float *out_slope); - - /** Sets Dict::letter_is_okay_ function to point to the given function. */ - void SetDictFunc(DictFunc f); - - /** Sets Dict::probability_in_context_ function to point to the given - * function. - */ - void SetProbabilityInContextFunc(ProbabilityInContextFunc f); - - /** - * Estimates the Orientation And Script of the image. - * @return true if the image was processed successfully. - */ - bool DetectOS(OSResults *); - - /** - * Return text orientation of each block as determined by an earlier run - * of layout analysis. - */ - void GetBlockTextOrientations(int **block_orientation, - bool **vertical_writing); - - /** This method returns the string form of the specified unichar. */ - const char *GetUnichar(int unichar_id) const; - - /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ - const Dawg *GetDawg(int i) const; - - /** Return the number of dawgs loaded into tesseract_ object. */ - int NumDawgs() const; - - Tesseract *tesseract() const { - return tesseract_; - } - - OcrEngineMode oem() const { - return last_oem_requested_; - } - - void set_min_orientation_margin(double margin); - /* @} */ - -protected: - /** Common code for setting the image. Returns true if Init has been called. - */ - bool InternalSetImage(); - - /** - * Run the thresholder to make the thresholded image. If pix is not nullptr, - * the source is thresholded to pix instead of the internal IMAGE. - */ - virtual bool Threshold(Pix **pix); - - /** - * Find lines from the image making the BLOCK_LIST. - * @return 0 on success. - */ - int FindLines(); - - /** Delete the pageres and block list ready for a new page. */ - void ClearResults(); - - /** - * Return an LTR Result Iterator -- used only for training, as we really want - * to ignore all BiDi smarts at that point. - * delete once you're done with it. - */ - LTRResultIterator *GetLTRIterator(); - - /** - * Return the length of the output text string, as UTF8, assuming - * one newline per line and one per block, with a terminator, - * and assuming a single character reject marker for each rejected character. - * Also return the number of recognized blobs in blob_count. - */ - int TextLength(int *blob_count) const; - - //// paragraphs.cpp //////////////////////////////////////////////////// - void DetectParagraphs(bool after_text_recognition); - - const PAGE_RES *GetPageRes() const { - return page_res_; - } - -protected: - Tesseract *tesseract_; ///< The underlying data object. - Tesseract *osd_tesseract_; ///< For orientation & script detection. - EquationDetect *equ_detect_; ///< The equation detector. - FileReader reader_; ///< Reads files from any filesystem. - ImageThresholder *thresholder_; ///< Image thresholding module. - std::vector *paragraph_models_; - BLOCK_LIST *block_list_; ///< The page layout. - PAGE_RES *page_res_; ///< The page-level data. - std::string input_file_; ///< Name used by training code. - std::string output_file_; ///< Name used by debug code. - std::string datapath_; ///< Current location of tessdata. - std::string language_; ///< Last initialized language. - OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested. - bool recognition_done_; ///< page_res_ contains recognition data. - - /** - * @defgroup ThresholderParams Thresholder Parameters - * Parameters saved from the Thresholder. Needed to rebuild coordinates. - */ - /* @{ */ - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; - int image_width_; - int image_height_; - /* @} */ - -private: - // A list of image filenames gets special consideration - bool ProcessPagesFileList(FILE *fp, std::string *buf, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); - // TIFF supports multipage so gets special consideration. - bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, - const char *filename, const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); -}; // class TessBaseAPI. - -/** Escape a char string - remove &<>"' with HTML codes. */ -std::string HOcrEscape(const char *text); - -} // namespace tesseract - -#endif // TESSERACT_API_BASEAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/capi.h deleted file mode 100644 index 40f4856a..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/capi.h +++ /dev/null @@ -1,484 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: capi.h -// Description: C-API TessBaseAPI -// -// (C) Copyright 2012, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_CAPI_H_ -#define API_CAPI_H_ - -#include "export.h" - -#ifdef __cplusplus -# include -# include -# include -# include -# include -#endif - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef BOOL -# define BOOL int -# define TRUE 1 -# define FALSE 0 -#endif - -#ifdef __cplusplus -typedef tesseract::TessResultRenderer TessResultRenderer; -typedef tesseract::TessBaseAPI TessBaseAPI; -typedef tesseract::PageIterator TessPageIterator; -typedef tesseract::ResultIterator TessResultIterator; -typedef tesseract::MutableIterator TessMutableIterator; -typedef tesseract::ChoiceIterator TessChoiceIterator; -typedef tesseract::OcrEngineMode TessOcrEngineMode; -typedef tesseract::PageSegMode TessPageSegMode; -typedef tesseract::PageIteratorLevel TessPageIteratorLevel; -typedef tesseract::Orientation TessOrientation; -typedef tesseract::ParagraphJustification TessParagraphJustification; -typedef tesseract::WritingDirection TessWritingDirection; -typedef tesseract::TextlineOrder TessTextlineOrder; -typedef tesseract::PolyBlockType TessPolyBlockType; -typedef tesseract::ETEXT_DESC ETEXT_DESC; -#else -typedef struct TessResultRenderer TessResultRenderer; -typedef struct TessBaseAPI TessBaseAPI; -typedef struct TessPageIterator TessPageIterator; -typedef struct TessResultIterator TessResultIterator; -typedef struct TessMutableIterator TessMutableIterator; -typedef struct TessChoiceIterator TessChoiceIterator; -typedef enum TessOcrEngineMode { - OEM_TESSERACT_ONLY, - OEM_LSTM_ONLY, - OEM_TESSERACT_LSTM_COMBINED, - OEM_DEFAULT -} TessOcrEngineMode; -typedef enum TessPageSegMode { - PSM_OSD_ONLY, - PSM_AUTO_OSD, - PSM_AUTO_ONLY, - PSM_AUTO, - PSM_SINGLE_COLUMN, - PSM_SINGLE_BLOCK_VERT_TEXT, - PSM_SINGLE_BLOCK, - PSM_SINGLE_LINE, - PSM_SINGLE_WORD, - PSM_CIRCLE_WORD, - PSM_SINGLE_CHAR, - PSM_SPARSE_TEXT, - PSM_SPARSE_TEXT_OSD, - PSM_RAW_LINE, - PSM_COUNT -} TessPageSegMode; -typedef enum TessPageIteratorLevel { - RIL_BLOCK, - RIL_PARA, - RIL_TEXTLINE, - RIL_WORD, - RIL_SYMBOL -} TessPageIteratorLevel; -typedef enum TessPolyBlockType { - PT_UNKNOWN, - PT_FLOWING_TEXT, - PT_HEADING_TEXT, - PT_PULLOUT_TEXT, - PT_EQUATION, - PT_INLINE_EQUATION, - PT_TABLE, - PT_VERTICAL_TEXT, - PT_CAPTION_TEXT, - PT_FLOWING_IMAGE, - PT_HEADING_IMAGE, - PT_PULLOUT_IMAGE, - PT_HORZ_LINE, - PT_VERT_LINE, - PT_NOISE, - PT_COUNT -} TessPolyBlockType; -typedef enum TessOrientation { - ORIENTATION_PAGE_UP, - ORIENTATION_PAGE_RIGHT, - ORIENTATION_PAGE_DOWN, - ORIENTATION_PAGE_LEFT -} TessOrientation; -typedef enum TessParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT -} TessParagraphJustification; -typedef enum TessWritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT, - WRITING_DIRECTION_RIGHT_TO_LEFT, - WRITING_DIRECTION_TOP_TO_BOTTOM -} TessWritingDirection; -typedef enum TessTextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT, - TEXTLINE_ORDER_RIGHT_TO_LEFT, - TEXTLINE_ORDER_TOP_TO_BOTTOM -} TessTextlineOrder; -typedef struct ETEXT_DESC ETEXT_DESC; -#endif - -typedef bool (*TessCancelFunc)(void *cancel_this, int words); -typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top, - int bottom); - -struct Pix; -struct Boxa; -struct Pixa; - -/* General free functions */ - -TESS_API const char *TessVersion(); -TESS_API void TessDeleteText(const char *text); -TESS_API void TessDeleteTextArray(char **arr); -TESS_API void TessDeleteIntArray(const int *arr); - -/* Renderer API */ -TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, - BOOL font_info); -TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase, - const char *datadir, - BOOL textonly); -TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessWordStrBoxRendererCreate( - const char *outputbase); - -TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer); -TESS_API void TessResultRendererInsert(TessResultRenderer *renderer, - TessResultRenderer *next); -TESS_API TessResultRenderer *TessResultRendererNext( - TessResultRenderer *renderer); -TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, - const char *title); -TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer, - TessBaseAPI *api); -TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer); - -TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer); -TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer); -TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer); - -/* Base API */ - -TESS_API TessBaseAPI *TessBaseAPICreate(); -TESS_API void TessBaseAPIDelete(TessBaseAPI *handle); - -TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device); - -TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name); -TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix); -TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle); - -TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle); -TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name); - -TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, - const char *value); -TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, - const char *value); - -TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, - const char *name, int *value); -TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, - const char *name, BOOL *value); -TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, - const char *name, double *value); -TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, - const char *name); - -TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp); -TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, - const char *filename); - -TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem, - char **configs, int configs_size); -TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem); -TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, - const char *language); - -TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API const char *TessBaseAPIGetInitLanguagesAsString( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector( - const TessBaseAPI *handle); - -TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle); - -TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle, - const char *filename); -TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, - const char *filename); - -TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle, - TessPageSegMode mode); -TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle); - -TESS_API char *TessBaseAPIRect(TessBaseAPI *handle, - const unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int left, int top, int width, int height); - -TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetImage(TessBaseAPI *handle, - const unsigned char *imagedata, int width, - int height, int bytes_per_pixel, - int bytes_per_line); -TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix); - -TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi); - -TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, - int width, int height); - -TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle); -TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, - BOOL raw_image, int raw_padding, - struct Pixa **pixa, - int **blockids, int **paraids); -TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, - struct Pixa **pixa, int **blockids); -TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, - struct Pixa **cc); -TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, - TessPageIteratorLevel level, - BOOL text_only, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetComponentImages1( - TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only, - BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids, - int **paraids); - -TESS_API int TessBaseAPIGetThresholdedImageScaleFactor( - const TessBaseAPI *handle); - -TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle); - -TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor); - -TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); -TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, - int page_index, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); - -TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle); -TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator( - TessBaseAPI *handle); - -TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle); -TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, - int page_number); - -TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle); -TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle); - -TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE -TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, - TessPageSegMode mode, - const char *wordstr); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPIClear(TessBaseAPI *handle); -TESS_API void TessBaseAPIEnd(TessBaseAPI *handle); - -TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word); -TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, - float *out_slope); - -TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id); - -TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE - -// Call TessDeleteText(*best_script_name) to free memory allocated by this -// function -TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, - int *orient_deg, - float *orient_conf, - const char **script_name, - float *script_conf); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, - double margin); - -TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle); - -TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle); - -TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, - int **block_orientation, - bool **vertical_writing); - -/* Page iterator */ - -TESS_API void TessPageIteratorDelete(TessPageIterator *handle); - -TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle); - -TESS_API void TessPageIteratorBegin(TessPageIterator *handle); - -TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, - TessPageIteratorLevel level, - TessPageIteratorLevel element); - -TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, - TessPageIteratorLevel level, - int *left, int *top, int *right, - int *bottom); - -TESS_API TessPolyBlockType -TessPageIteratorBlockType(const TessPageIterator *handle); - -TESS_API struct Pix *TessPageIteratorGetBinaryImage( - const TessPageIterator *handle, TessPageIteratorLevel level); - -TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, - TessPageIteratorLevel level, - int padding, - struct Pix *original_image, - int *left, int *top); - -TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle, - TessPageIteratorLevel level, int *x1, - int *y1, int *x2, int *y2); - -TESS_API void TessPageIteratorOrientation( - TessPageIterator *handle, TessOrientation *orientation, - TessWritingDirection *writing_direction, TessTextlineOrder *textline_order, - float *deskew_angle); - -TESS_API void TessPageIteratorParagraphInfo( - TessPageIterator *handle, TessParagraphJustification *justification, - BOOL *is_list_item, BOOL *is_crown, int *first_line_indent); - -/* Result iterator */ - -TESS_API void TessResultIteratorDelete(TessResultIterator *handle); -TESS_API TessResultIterator *TessResultIteratorCopy( - const TessResultIterator *handle); -TESS_API TessPageIterator *TessResultIteratorGetPageIterator( - TessResultIterator *handle); -TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst( - const TessResultIterator *handle); -TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator( - const TessResultIterator *handle); - -TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API const char *TessResultIteratorWordRecognitionLanguage( - const TessResultIterator *handle); -TESS_API const char *TessResultIteratorWordFontAttributes( - const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic, - BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps, - int *pointsize, int *font_id); - -TESS_API BOOL -TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle); -TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle); - -TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle); -TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle); -TESS_API const char *TessChoiceIteratorGetUTF8Text( - const TessChoiceIterator *handle); -TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle); - -/* Progress monitor */ - -TESS_API ETEXT_DESC *TessMonitorCreate(); -TESS_API void TessMonitorDelete(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, - TessCancelFunc cancelFunc); -TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis); -TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, - TessProgressFunc progressFunc); -TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline); - -#ifdef __cplusplus -} -#endif - -#endif // API_CAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/export.h deleted file mode 100644 index d238b628..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/export.h +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: export.h -// Description: Place holder -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_PLATFORM_H_ -#define TESSERACT_PLATFORM_H_ - -#ifndef TESS_API -# if defined(_WIN32) || defined(__CYGWIN__) -# if defined(TESS_EXPORTS) -# define TESS_API __declspec(dllexport) -# elif defined(TESS_IMPORTS) -# define TESS_API __declspec(dllimport) -# else -# define TESS_API -# endif -# else -# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS) -# define TESS_API __attribute__((visibility("default"))) -# else -# define TESS_API -# endif -# endif -#endif - -#endif // TESSERACT_PLATFORM_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ltrresultiterator.h deleted file mode 100644 index 6ca0a98e..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ltrresultiterator.h +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: ltrresultiterator.h -// Description: Iterator for tesseract results in strict left-to-right -// order that avoids using tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API -#include "pageiterator.h" // for PageIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -namespace tesseract { - -class BLOB_CHOICE_IT; -class PAGE_RES; -class WERD_RES; - -class Tesseract; - -// Class to iterate over tesseract results, providing access to all levels -// of the page hierarchy, without including any tesseract headers or having -// to handle any tesseract structures. -// WARNING! This class points to data held within the TessBaseAPI class, and -// therefore can only be used while the TessBaseAPI class still exists and -// has not been subjected to a call of Init, SetImage, Recognize, Clear, End -// DetectOS, or anything else that changes the internal PAGE_RES. -// See tesseract/publictypes.h for the definition of PageIteratorLevel. -// See also base class PageIterator, which contains the bulk of the interface. -// LTRResultIterator adds text-specific methods for access to OCR output. - -class TESS_API LTRResultIterator : public PageIterator { - friend class ChoiceIterator; - -public: - // page_res and tesseract come directly from the BaseAPI. - // The rectangle parameters are copied indirectly from the Thresholder, - // via the BaseAPI. They represent the coordinates of some rectangle in an - // original image (in top-left-origin coordinates) and therefore the top-left - // needs to be added to any output boxes in order to specify coordinates - // in the original image. See TessBaseAPI::SetRectangle. - // The scale and scaled_yres are in case the Thresholder scaled the image - // rectangle prior to thresholding. Any coordinates in tesseract's image - // must be divided by scale before adding (rect_left, rect_top). - // The scaled_yres indicates the effective resolution of the binary image - // that tesseract has been given by the Thresholder. - // After the constructor, Begin has already been called. - LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, - int rect_width, int rect_height); - - ~LTRResultIterator() override; - - // LTRResultIterators may be copied! This makes it possible to iterate over - // all the objects at a lower level, while maintaining an iterator to - // objects at a higher level. These constructors DO NOT CALL Begin, so - // iterations will continue from the location of src. - // TODO: For now the copy constructor and operator= only need the base class - // versions, but if new data members are added, don't forget to add them! - - // ============= Moving around within the page ============. - - // See PageIterator. - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // object at the given level. Use delete [] to free after use. - char *GetUTF8Text(PageIteratorLevel level) const; - - // Set the string inserted at the end of each text line. "\n" by default. - void SetLineSeparator(const char *new_line); - - // Set the string inserted at the end of each paragraph. "\n" by default. - void SetParagraphSeparator(const char *new_para); - - // Returns the mean confidence of the current object at the given level. - // The number should be interpreted as a percent probability. (0.0f-100.0f) - float Confidence(PageIteratorLevel level) const; - - // ============= Functions that refer to words only ============. - - // Returns the font attributes of the current word. If iterating at a higher - // level object than words, eg textlines, then this will return the - // attributes of the first word in that textline. - // The actual return value is a string representing a font name. It points - // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as - // the iterator itself, ie rendered invalid by various members of - // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. - // Pointsize is returned in printers points (1/72 inch.) - const char *WordFontAttributes(bool *is_bold, bool *is_italic, - bool *is_underlined, bool *is_monospace, - bool *is_serif, bool *is_smallcaps, - int *pointsize, int *font_id) const; - - // Return the name of the language used to recognize this word. - // On error, nullptr. Do not delete this pointer. - const char *WordRecognitionLanguage() const; - - // Return the overall directionality of this word. - StrongScriptDirection WordDirection() const; - - // Returns true if the current word was found in a dictionary. - bool WordIsFromDictionary() const; - - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // Returns true if the current word is numeric. - bool WordIsNumeric() const; - - // Returns true if the word contains blamer information. - bool HasBlamerInfo() const; - - // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle - // of the current word. - const void *GetParamsTrainingBundle() const; - - // Returns a pointer to the string with blamer information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerDebug() const; - - // Returns a pointer to the string with misadaption information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerMisadaptionDebug() const; - - // Returns true if a truth string was recorded for the current word. - bool HasTruthString() const; - - // Returns true if the given string is equivalent to the truth string for - // the current word. - bool EquivalentToTruth(const char *str) const; - - // Returns a null terminated UTF-8 encoded truth string for the current word. - // Use delete [] to free after use. - char *WordTruthUTF8Text() const; - - // Returns a null terminated UTF-8 encoded normalized OCR string for the - // current word. Use delete [] to free after use. - char *WordNormedUTF8Text() const; - - // Returns a pointer to serialized choice lattice. - // Fills lattice_size with the number of bytes in lattice data. - const char *WordLattice(int *lattice_size) const; - - // ============= Functions that refer to symbols only ============. - - // Returns true if the current symbol is a superscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSuperscript() const; - // Returns true if the current symbol is a subscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSubscript() const; - // Returns true if the current symbol is a dropcap. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsDropcap() const; - -protected: - const char *line_separator_; - const char *paragraph_separator_; -}; - -// Class to iterate over the classifier choices for a single RIL_SYMBOL. -class TESS_API ChoiceIterator { -public: - // Construction is from a LTRResultIterator that points to the symbol of - // interest. The ChoiceIterator allows a one-shot iteration over the - // choices for this symbol and after that it is useless. - explicit ChoiceIterator(const LTRResultIterator &result_it); - ~ChoiceIterator(); - - // Moves to the next choice for the symbol and returns false if there - // are none left. - bool Next(); - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // choice. - // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an - // internal structure and should NOT be delete[]ed to free after use. - const char *GetUTF8Text() const; - - // Returns the confidence of the current choice depending on the used language - // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All - // choices for one symbol should roughly add up to 1.0f. - // If only traineddata of the legacy engine is used, the number should be - // interpreted as a percent probability. (0.0f-100.0f) In this case - // probabilities won't add up to 100. Each one stands on its own. - float Confidence() const; - - // Returns a vector containing all timesteps, which belong to the currently - // selected symbol. A timestep is a vector containing pairs of symbols and - // floating point numbers. The number states the probability for the - // corresponding symbol. - std::vector>> *Timesteps() const; - -private: - // clears the remaining spaces out of the results and adapt the probabilities - void filterSpaces(); - // Pointer to the WERD_RES object owned by the API. - WERD_RES *word_res_; - // Iterator over the blob choices. - BLOB_CHOICE_IT *choice_it_; - std::vector> *LSTM_choices_ = nullptr; - std::vector>::iterator LSTM_choice_it_; - - const int *tstep_index_; - // regulates the rating granularity - double rating_coefficient_; - // leading blanks - int blanks_before_word_; - // true when there is lstm engine related trained data - bool oemLSTM_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ocrclass.h deleted file mode 100644 index a55e6528..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ocrclass.h +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/********************************************************************** - * File: ocrclass.h - * Description: Class definitions and constants for the OCR API. - * Author: Hewlett-Packard Co - * - * (C) Copyright 1996, Hewlett-Packard Co. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -/********************************************************************** - * This file contains typedefs for all the structures used by - * the HP OCR interface. - * The structures are designed to allow them to be used with any - * structure alignment up to 8. - **********************************************************************/ - -#ifndef CCUTIL_OCRCLASS_H_ -#define CCUTIL_OCRCLASS_H_ - -#include -#include - -namespace tesseract { - -/********************************************************************** - * EANYCODE_CHAR - * Description of a single character. The character code is defined by - * the character set of the current font. - * Output text is sent as an array of these structures. - * Spaces and line endings in the output are represented in the - * structures of the surrounding characters. They are not directly - * represented as characters. - * The first character in a word has a positive value of blanks. - * Missing information should be set to the defaults in the comments. - * If word bounds are known, but not character bounds, then the top and - * bottom of each character should be those of the word. The left of the - * first and right of the last char in each word should be set. All other - * lefts and rights should be set to -1. - * If set, the values of right and bottom are left+width and top+height. - * Most of the members come directly from the parameters to ocr_append_char. - * The formatting member uses the enhancement parameter and combines the - * line direction stuff into the top 3 bits. - * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para, - * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what - * the coding is, only that it is backwards compatible with the previous - * version. - **********************************************************************/ - -struct EANYCODE_CHAR { /*single character */ - // It should be noted that the format for char_code for version 2.0 and beyond - // is UTF8 which means that ASCII characters will come out as one structure - // but other characters will be returned in two or more instances of this - // structure with a single byte of the UTF8 code in each, but each will have - // the same bounding box. Programs which want to handle languagues with - // different characters sets will need to handle extended characters - // appropriately, but *all* code needs to be prepared to receive UTF8 coded - // characters for characters such as bullet and fancy quotes. - uint16_t char_code; /*character itself */ - int16_t left; /*of char (-1) */ - int16_t right; /*of char (-1) */ - int16_t top; /*of char (-1) */ - int16_t bottom; /*of char (-1) */ - int16_t font_index; /*what font (0) */ - uint8_t confidence; /*0=perfect, 100=reject (0/100) */ - uint8_t point_size; /*of char, 72=i inch, (10) */ - int8_t blanks; /*no of spaces before this char (1) */ - uint8_t formatting; /*char formatting (0) */ -}; - -/********************************************************************** - * ETEXT_DESC - * Description of the output of the OCR engine. - * This structure is used as both a progress monitor and the final - * output header, since it needs to be a valid progress monitor while - * the OCR engine is storing its output to shared memory. - * During progress, all the buffer info is -1. - * Progress starts at 0 and increases to 100 during OCR. No other constraint. - * Additionally the progress callback contains the bounding box of the word that - * is currently being processed. - * Every progress callback, the OCR engine must set ocr_alive to 1. - * The HP side will set ocr_alive to 0. Repeated failure to reset - * to 1 indicates that the OCR engine is dead. - * If the cancel function is not null then it is called with the number of - * user words found. If it returns true then operation is cancelled. - **********************************************************************/ -class ETEXT_DESC; - -using CANCEL_FUNC = bool (*)(void *, int); -using PROGRESS_FUNC = bool (*)(int, int, int, int, int); -using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int); - -class ETEXT_DESC { // output header -public: - int16_t count{0}; /// chars in this buffer(0) - int16_t progress{0}; /// percent complete increasing (0-100) - /** Progress monitor covers word recognition and it does not cover layout - * analysis. - * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */ - int8_t more_to_come{0}; /// true if not last - volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0 - int8_t err_code{0}; /// for errcode use - CANCEL_FUNC cancel{nullptr}; /// returns true to cancel - PROGRESS_FUNC progress_callback{ - nullptr}; /// called whenever progress increases - PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback - void *cancel_this{nullptr}; /// this or other data for cancel - std::chrono::steady_clock::time_point end_time; - /// Time to stop. Expected to be set only - /// by call to set_deadline_msecs(). - EANYCODE_CHAR text[1]{}; /// character data - - ETEXT_DESC() : progress_callback2(&default_progress_func) { - end_time = std::chrono::time_point(); - } - - // Sets the end time to be deadline_msecs milliseconds from now. - void set_deadline_msecs(int32_t deadline_msecs) { - if (deadline_msecs > 0) { - end_time = std::chrono::steady_clock::now() + - std::chrono::milliseconds(deadline_msecs); - } - } - - // Returns false if we've not passed the end_time, or have not set a deadline. - bool deadline_exceeded() const { - if (end_time.time_since_epoch() == - std::chrono::steady_clock::duration::zero()) { - return false; - } - auto now = std::chrono::steady_clock::now(); - return (now > end_time); - } - -private: - static bool default_progress_func(ETEXT_DESC *ths, int left, int right, - int top, int bottom) { - if (ths->progress_callback != nullptr) { - return (*(ths->progress_callback))(ths->progress, left, right, top, - bottom); - } - return true; - } -}; - -} // namespace tesseract - -#endif // CCUTIL_OCRCLASS_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/osdetect.h deleted file mode 100644 index 34bfb557..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/osdetect.h +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: osdetect.h -// Description: Orientation and script detection. -// Author: Samuel Charron -// Ranjith Unnikrishnan -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_OSDETECT_H_ -#define TESSERACT_CCMAIN_OSDETECT_H_ - -#include "export.h" // for TESS_API - -#include // for std::vector - -namespace tesseract { - -class BLOBNBOX; -class BLOBNBOX_CLIST; -class BLOB_CHOICE_LIST; -class TO_BLOCK_LIST; -class UNICHARSET; - -class Tesseract; - -// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur -const int kMaxNumberOfScripts = 116 + 1 + 2 + 1; - -struct OSBestResult { - OSBestResult() - : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {} - int orientation_id; - int script_id; - float sconfidence; - float oconfidence; -}; - -struct OSResults { - OSResults() : unicharset(nullptr) { - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < kMaxNumberOfScripts; ++j) { - scripts_na[i][j] = 0; - } - orientations[i] = 0; - } - } - void update_best_orientation(); - // Set the estimate of the orientation to the given id. - void set_best_orientation(int orientation_id); - // Update/Compute the best estimate of the script assuming the given - // orientation id. - void update_best_script(int orientation_id); - // Return the index of the script with the highest score for this orientation. - TESS_API int get_best_script(int orientation_id) const; - // Accumulate scores with given OSResults instance and update the best script. - void accumulate(const OSResults &osr); - - // Print statistics. - void print_scores(void) const; - void print_scores(int orientation_id) const; - - // Array holding scores for each orientation id [0,3]. - // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the - // page respectively, where the values refer to the amount of clockwise - // rotation to be applied to the page for the text to be upright and readable. - float orientations[4]; - // Script confidence scores for each of 4 possible orientations. - float scripts_na[4][kMaxNumberOfScripts]; - - UNICHARSET *unicharset; - OSBestResult best_result; -}; - -class OrientationDetector { -public: - OrientationDetector(const std::vector *allowed_scripts, - OSResults *results); - bool detect_blob(BLOB_CHOICE_LIST *scores); - int get_orientation(); - -private: - OSResults *osr_; - const std::vector *allowed_scripts_; -}; - -class ScriptDetector { -public: - ScriptDetector(const std::vector *allowed_scripts, OSResults *osr, - tesseract::Tesseract *tess); - void detect_blob(BLOB_CHOICE_LIST *scores); - bool must_stop(int orientation) const; - -private: - OSResults *osr_; - static const char *korean_script_; - static const char *japanese_script_; - static const char *fraktur_script_; - int korean_id_; - int japanese_id_; - int katakana_id_; - int hiragana_id_; - int han_id_; - int hangul_id_; - int latin_id_; - int fraktur_id_; - tesseract::Tesseract *tess_; - const std::vector *allowed_scripts_; -}; - -int orientation_and_script_detection(const char *filename, OSResults *, - tesseract::Tesseract *); - -int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, - tesseract::Tesseract *tess); - -int os_detect_blobs(const std::vector *allowed_scripts, - BLOBNBOX_CLIST *blob_list, OSResults *osr, - tesseract::Tesseract *tess); - -bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, - OSResults *, tesseract::Tesseract *tess); - -// Helper method to convert an orientation index to its value in degrees. -// The value represents the amount of clockwise rotation in degrees that must be -// applied for the text to be upright (readable). -TESS_API int OrientationIdToValue(const int &id); - -} // namespace tesseract - -#endif // TESSERACT_CCMAIN_OSDETECT_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/pageiterator.h deleted file mode 100644 index 68739715..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/pageiterator.h +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: pageiterator.h -// Description: Iterator for tesseract page structure that avoids using -// tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ -#define TESSERACT_CCMAIN_PAGEITERATOR_H_ - -#include "export.h" -#include "publictypes.h" - -struct Pix; -struct Pta; - -namespace tesseract { - -struct BlamerBundle; -class C_BLOB_IT; -class PAGE_RES; -class PAGE_RES_IT; -class WERD; - -class Tesseract; - -/** - * Class to iterate over tesseract page structure, providing access to all - * levels of the page hierarchy, without including any tesseract headers or - * having to handle any tesseract structures. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - * See tesseract/publictypes.h for the definition of PageIteratorLevel. - * See also ResultIterator, derived from PageIterator, which adds in the - * ability to access OCR output with text-specific methods. - */ - -class TESS_API PageIterator { -public: - /** - * page_res and tesseract come directly from the BaseAPI. - * The rectangle parameters are copied indirectly from the Thresholder, - * via the BaseAPI. They represent the coordinates of some rectangle in an - * original image (in top-left-origin coordinates) and therefore the top-left - * needs to be added to any output boxes in order to specify coordinates - * in the original image. See TessBaseAPI::SetRectangle. - * The scale and scaled_yres are in case the Thresholder scaled the image - * rectangle prior to thresholding. Any coordinates in tesseract's image - * must be divided by scale before adding (rect_left, rect_top). - * The scaled_yres indicates the effective resolution of the binary image - * that tesseract has been given by the Thresholder. - * After the constructor, Begin has already been called. - */ - PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, int rect_width, - int rect_height); - virtual ~PageIterator(); - - /** - * Page/ResultIterators may be copied! This makes it possible to iterate over - * all the objects at a lower level, while maintaining an iterator to - * objects at a higher level. These constructors DO NOT CALL Begin, so - * iterations will continue from the location of src. - */ - PageIterator(const PageIterator &src); - const PageIterator &operator=(const PageIterator &src); - - /** Are we positioned at the same location as other? */ - bool PositionedAtSameWord(const PAGE_RES_IT *other) const; - - // ============= Moving around within the page ============. - - /** - * Moves the iterator to point to the start of the page to begin an - * iteration. - */ - virtual void Begin(); - - /** - * Moves the iterator to the beginning of the paragraph. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word on the first row of the paragraph. - */ - virtual void RestartParagraph(); - - /** - * Return whether this iterator points anywhere in the first textline of a - * paragraph. - */ - bool IsWithinFirstTextlineOfParagraph() const; - - /** - * Moves the iterator to the beginning of the text line. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word of the row. - */ - virtual void RestartRow(); - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy, and returns false if the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - virtual bool Next(PageIteratorLevel level); - - /** - * Returns true if the iterator is at the start of an object at the given - * level. - * - * For instance, suppose an iterator it is pointed to the first symbol of the - * first word of the third line of the second paragraph of the first block in - * a page, then: - * it.IsAtBeginningOf(RIL_BLOCK) = false - * it.IsAtBeginningOf(RIL_PARA) = false - * it.IsAtBeginningOf(RIL_TEXTLINE) = true - * it.IsAtBeginningOf(RIL_WORD) = true - * it.IsAtBeginningOf(RIL_SYMBOL) = true - */ - virtual bool IsAtBeginningOf(PageIteratorLevel level) const; - - /** - * Returns whether the iterator is positioned at the last element in a - * given level. (e.g. the last word in a line, the last line in a block) - * - * Here's some two-paragraph example - * text. It starts off innocuously - * enough but quickly turns bizarre. - * The author inserts a cornucopia - * of words to guard against confused - * references. - * - * Now take an iterator it pointed to the start of "bizarre." - * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false - * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true - * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false - */ - virtual bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const; - - /** - * Returns whether this iterator is positioned - * before other: -1 - * equal to other: 0 - * after other: 1 - */ - int Cmp(const PageIterator &other) const; - - // ============= Accessing data ==============. - // Coordinate system: - // Integer coordinates are at the cracks between the pixels. - // The top-left corner of the top-left pixel in the image is at (0,0). - // The bottom-right corner of the bottom-right pixel in the image is at - // (width, height). - // Every bounding box goes from the top-left of the top-left contained - // pixel to the bottom-right of the bottom-right contained pixel, so - // the bounding box of the single top-left pixel in the image is: - // (0,0)->(1,1). - // If an image rectangle has been set in the API, then returned coordinates - // relate to the original (full) image, rather than the rectangle. - - /** - * Controls what to include in a bounding box. Bounding boxes of all levels - * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. - * Between layout analysis and recognition, it isn't known where all - * diacritics belong, so this control is used to include or exclude some - * diacritics that are above or below the main body of the word. In most cases - * where the placement is obvious, and after recognition, it doesn't make as - * much difference, as the diacritics will already be included in the word. - */ - void SetBoundingBoxComponents(bool include_upper_dots, - bool include_lower_dots) { - include_upper_dots_ = include_upper_dots; - include_lower_dots_ = include_lower_dots; - } - - /** - * Returns the bounding rectangle of the current object at the given level. - * See comment on coordinate system above. - * Returns false if there is no such object at the current position. - * The returned bounding box is guaranteed to match the size and position - * of the image returned by GetBinaryImage, but may clip foreground pixels - * from a grey image. The padding argument to GetImage can be used to expand - * the image to include more foreground pixels. See GetImage below. - */ - bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, - int *bottom) const; - bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, - int *right, int *bottom) const; - /** - * Returns the bounding rectangle of the object in a coordinate system of the - * working image rectangle having its origin at (rect_left_, rect_top_) with - * respect to the original image and is scaled by a factor scale_. - */ - bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, - int *right, int *bottom) const; - - /** Returns whether there is no object of a given level. */ - bool Empty(PageIteratorLevel level) const; - - /** - * Returns the type of the current block. - * See tesseract/publictypes.h for PolyBlockType. - */ - PolyBlockType BlockType() const; - - /** - * Returns the polygon outline of the current block. The returned Pta must - * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices - * of the polygon, and the last edge is the line segment between the last - * point and the first point. nullptr will be returned if the iterator is - * at the end of the document or layout analysis was not used. - */ - Pta *BlockPolygon() const; - - /** - * Returns a binary image of the current object at the given level. - * The position and size match the return from BoundingBoxInternal, and so - * this could be upscaled with respect to the original input image. - * Use pixDestroy to delete the image after use. - */ - Pix *GetBinaryImage(PageIteratorLevel level) const; - - /** - * Returns an image of the current object at the given level in greyscale - * if available in the input. To guarantee a binary image use BinaryImage. - * NOTE that in order to give the best possible image, the bounds are - * expanded slightly over the binary connected component, by the supplied - * padding, so the top-left position of the returned image is returned - * in (left,top). These will most likely not match the coordinates - * returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. - * Use pixDestroy to delete the image after use. - */ - Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, - int *left, int *top) const; - - /** - * Returns the baseline of the current object at the given level. - * The baseline is the line that passes through (x1, y1) and (x2, y2). - * WARNING: with vertical text, baselines may be vertical! - * Returns false if there is no baseline at the current position. - */ - bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, - int *y2) const; - - // Returns the attributes of the current row. - void RowAttributes(float *row_height, float *descenders, - float *ascenders) const; - - /** - * Returns orientation for the block the iterator points to. - * orientation, writing_direction, textline_order: see publictypes.h - * deskew_angle: after rotating the block so the text orientation is - * upright, how many radians does one have to rotate the - * block anti-clockwise for it to be level? - * -Pi/4 <= deskew_angle <= Pi/4 - */ - void Orientation(tesseract::Orientation *orientation, - tesseract::WritingDirection *writing_direction, - tesseract::TextlineOrder *textline_order, - float *deskew_angle) const; - - /** - * Returns information about the current paragraph, if available. - * - * justification - - * LEFT if ragged right, or fully justified and script is left-to-right. - * RIGHT if ragged left, or fully justified and script is right-to-left. - * unknown if it looks like source code or we have very few lines. - * is_list_item - - * true if we believe this is a member of an ordered or unordered list. - * is_crown - - * true if the first line of the paragraph is aligned with the other - * lines of the paragraph even though subsequent paragraphs have first - * line indents. This typically indicates that this is the continuation - * of a previous paragraph or that it is the very first paragraph in - * the chapter. - * first_line_indent - - * For LEFT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the left edge of the - * rest of the paragraph. - * for RIGHT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the right edge of the - * rest of the paragraph. - * NOTE 1: This value may be negative. - * NOTE 2: if *is_crown == true, the first line of this paragraph is - * actually flush, and first_line_indent is set to the "common" - * first_line_indent for subsequent paragraphs in this block - * of text. - */ - void ParagraphInfo(tesseract::ParagraphJustification *justification, - bool *is_list_item, bool *is_crown, - int *first_line_indent) const; - - // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle - // of the current word to the given pointer (takes ownership of the pointer) - // and returns true. - // Can only be used when iterating on the word level. - bool SetWordBlamerBundle(BlamerBundle *blamer_bundle); - -protected: - /** - * Sets up the internal data for iterating the blobs of a new word, then - * moves the iterator to the given offset. - */ - void BeginWord(int offset); - - /** Pointer to the page_res owned by the API. */ - PAGE_RES *page_res_; - /** Pointer to the Tesseract object owned by the API. */ - Tesseract *tesseract_; - /** - * The iterator to the page_res_. Owned by this ResultIterator. - * A pointer just to avoid dragging in Tesseract includes. - */ - PAGE_RES_IT *it_; - /** - * The current input WERD being iterated. If there is an output from OCR, - * then word_ is nullptr. Owned by the API - */ - WERD *word_; - /** The length of the current word_. */ - int word_length_; - /** The current blob index within the word. */ - int blob_index_; - /** - * Iterator to the blobs within the word. If nullptr, then we are iterating - * OCR results in the box_word. - * Owned by this ResultIterator. - */ - C_BLOB_IT *cblob_it_; - /** Control over what to include in bounding boxes. */ - bool include_upper_dots_; - bool include_lower_dots_; - /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ - int scale_; - int scaled_yres_; - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/publictypes.h deleted file mode 100644 index 0069cf28..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/publictypes.h +++ /dev/null @@ -1,281 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: publictypes.h -// Description: Types used in both the API and internally -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_ -#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_ - -namespace tesseract { - -// This file contains types that are used both by the API and internally -// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic -// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT. -// Restated: It is OK for low-level Tesseract files to include publictypes.h, -// but not for the low-level tesseract code to include top-level API code. -// This file should not use other Tesseract types, as that would drag -// their includes into the API-level. - -/** Number of printers' points in an inch. The unit of the pointsize return. */ -constexpr int kPointsPerInch = 72; -/** - * Minimum believable resolution. Used as a default if there is no other - * information, as it is safer to under-estimate than over-estimate. - */ -constexpr int kMinCredibleResolution = 70; -/** Maximum believable resolution. */ -constexpr int kMaxCredibleResolution = 2400; -/** - * Ratio between median blob size and likely resolution. Used to estimate - * resolution when none is provided. This is basically 1/usual text size in - * inches. */ -constexpr int kResolutionEstimationFactor = 10; - -/** - * Possible types for a POLY_BLOCK or ColPartition. - * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions - * below, as well as kPolyBlockNames in layout_test.cc. - * Used extensively by ColPartition, and POLY_BLOCK. - */ -enum PolyBlockType { - PT_UNKNOWN, // Type is not yet known. Keep as the first element. - PT_FLOWING_TEXT, // Text that lives inside a column. - PT_HEADING_TEXT, // Text that spans more than one column. - PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region. - PT_EQUATION, // Partition belonging to an equation region. - PT_INLINE_EQUATION, // Partition has inline equation. - PT_TABLE, // Partition belonging to a table region. - PT_VERTICAL_TEXT, // Text-line runs vertically. - PT_CAPTION_TEXT, // Text that belongs to an image. - PT_FLOWING_IMAGE, // Image that lives inside a column. - PT_HEADING_IMAGE, // Image that spans more than one column. - PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region. - PT_HORZ_LINE, // Horizontal Line. - PT_VERT_LINE, // Vertical Line. - PT_NOISE, // Lies outside of any column. - PT_COUNT -}; - -/** Returns true if PolyBlockType is of horizontal line type */ -inline bool PTIsLineType(PolyBlockType type) { - return type == PT_HORZ_LINE || type == PT_VERT_LINE; -} -/** Returns true if PolyBlockType is of image type */ -inline bool PTIsImageType(PolyBlockType type) { - return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE || - type == PT_PULLOUT_IMAGE; -} -/** Returns true if PolyBlockType is of text type */ -inline bool PTIsTextType(PolyBlockType type) { - return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT || - type == PT_PULLOUT_TEXT || type == PT_TABLE || - type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT || - type == PT_INLINE_EQUATION; -} -// Returns true if PolyBlockType is of pullout(inter-column) type -inline bool PTIsPulloutType(PolyBlockType type) { - return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT; -} - -/** - * +------------------+ Orientation Example: - * | 1 Aaaa Aaaa Aaaa | ==================== - * | Aaa aa aaa aa | To left is a diagram of some (1) English and - * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit. - * | 2 | - * | ####### c c C | Upright Latin characters are represented as A and a. - * | ####### c c c | '<' represents a latin character rotated - * | < ####### c c c | anti-clockwise 90 degrees. - * | < ####### c c | - * | < ####### . c | Upright Chinese characters are represented C and c. - * | 3 ####### c | - * +------------------+ NOTA BENE: enum values here should match goodoc.proto - - * If you orient your head so that "up" aligns with Orientation, - * then the characters will appear "right side up" and readable. - * - * In the example above, both the English and Chinese paragraphs are oriented - * so their "up" is the top of the page (page up). The photo credit is read - * with one's head turned leftward ("up" is to page left). - * - * The values of this enum match the convention of Tesseract's osdetect.h -*/ -enum Orientation { - ORIENTATION_PAGE_UP = 0, - ORIENTATION_PAGE_RIGHT = 1, - ORIENTATION_PAGE_DOWN = 2, - ORIENTATION_PAGE_LEFT = 3, -}; - -/** - * The grapheme clusters within a line of text are laid out logically - * in this direction, judged when looking at the text line rotated so that - * its Orientation is "page up". - * - * For English text, the writing direction is left-to-right. For the - * Chinese text in the above example, the writing direction is top-to-bottom. - */ -enum WritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT = 0, - WRITING_DIRECTION_RIGHT_TO_LEFT = 1, - WRITING_DIRECTION_TOP_TO_BOTTOM = 2, -}; - -/** - * The text lines are read in the given sequence. - * - * In English, the order is top-to-bottom. - * In Chinese, vertical text lines are read right-to-left. Mongolian is - * written in vertical columns top to bottom like Chinese, but the lines - * order left-to right. - * - * Note that only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM - */ -enum TextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, - TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, - TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, -}; - -/** - * Possible modes for page layout analysis. These *must* be kept in order - * of decreasing amount of layout analysis to be done, except for OSD_ONLY, - * so that the inequality test macros below work. - */ -enum PageSegMode { - PSM_OSD_ONLY = 0, ///< Orientation and script detection only. - PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and - ///< script detection. (OSD) - PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR. - PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD. - PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes. - PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of - ///< vertically aligned text. - PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.) - PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line. - PSM_SINGLE_WORD = 8, ///< Treat the image as a single word. - PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle. - PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character. - PSM_SPARSE_TEXT = - 11, ///< Find as much text as possible in no particular order. - PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det. - PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing - ///< hacks that are Tesseract-specific. - - PSM_COUNT ///< Number of enum entries. -}; - -/** - * Inline functions that act on a PageSegMode to determine whether components of - * layout analysis are enabled. - * *Depend critically on the order of elements of PageSegMode.* - * NOTE that arg is an int for compatibility with INT_PARAM. - */ -inline bool PSM_OSD_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO; -} -inline bool PSM_SPARSE(int pageseg_mode) { - return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN; -} -inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK; -} -inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) { - return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) || - pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} - -/** - * enum of the elements of the page hierarchy, used in ResultIterator - * to provide functions that operate on each level without having to - * have 5x as many functions. - */ -enum PageIteratorLevel { - RIL_BLOCK, // Block of text/image/separator line. - RIL_PARA, // Paragraph within a block. - RIL_TEXTLINE, // Line within a paragraph. - RIL_WORD, // Word within a textline. - RIL_SYMBOL // Symbol/character within a word. -}; - -/** - * JUSTIFICATION_UNKNOWN - * The alignment is not clearly one of the other options. This could happen - * for example if there are only one or two lines of text or the text looks - * like source code or poetry. - * - * NOTA BENE: Fully justified paragraphs (text aligned to both left and right - * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text - * is written with a left-to-right script and with JUSTIFICATION_RIGHT if - * their text is written in a right-to-left script. - * - * Interpretation for text read in vertical lines: - * "Left" is wherever the starting reading position is. - * - * JUSTIFICATION_LEFT - * Each line, except possibly the first, is flush to the same left tab stop. - * - * JUSTIFICATION_CENTER - * The text lines of the paragraph are centered about a line going - * down through their middle of the text lines. - * - * JUSTIFICATION_RIGHT - * Each line, except possibly the first, is flush to the same right tab stop. - */ -enum ParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT, -}; - -/** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the combiner. - * The preference of which engine to use is stored in tessedit_ocr_engine_mode. - * - * ATTENTION: When modifying this enum, please make sure to make the - * appropriate changes to all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ -enum OcrEngineMode { - OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated - OEM_LSTM_ONLY, // Run just the LSTM line recognizer. - OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback - // to Tesseract when things get difficult. - // deprecated - OEM_DEFAULT, // Specify this mode when calling init_*(), - // to indicate that any of the above modes - // should be automatically inferred from the - // variables in the language-specific config, - // command-line configs, or if not specified - // in any of the above should be set to the - // default OEM_TESSERACT_ONLY. - OEM_COUNT // Number of OEMs -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/renderer.h deleted file mode 100644 index 6f405233..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/renderer.h +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: renderer.h -// Description: Rendering interface to inject into TessBaseAPI -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_RENDERER_H_ -#define TESSERACT_API_RENDERER_H_ - -#include "export.h" - -// To avoid collision with other typenames include the ABSOLUTE MINIMUM -// complexity of includes here. Use forward declarations wherever possible -// and hide includes of complex types in baseapi.cpp. -#include -#include // for std::string -#include // for std::vector - -struct Pix; - -namespace tesseract { - -class TessBaseAPI; - -/** - * Interface for rendering tesseract results into a document, such as text, - * HOCR or pdf. This class is abstract. Specific classes handle individual - * formats. This interface is then used to inject the renderer class into - * tesseract when processing images. - * - * For simplicity implementing this with tesseract version 3.01, - * the renderer contains document state that is cleared from document - * to document just as the TessBaseAPI is. This way the base API can just - * delegate its rendering functionality to injected renderers, and the - * renderers can manage the associated state needed for the specific formats - * in addition to the heuristics for producing it. - */ -class TESS_API TessResultRenderer { -public: - virtual ~TessResultRenderer(); - - // Takes ownership of pointer so must be new'd instance. - // Renderers aren't ordered, but appends the sequences of next parameter - // and existing next(). The renderers should be unique across both lists. - void insert(TessResultRenderer *next); - - // Returns the next renderer or nullptr. - TessResultRenderer *next() { - return next_; - } - - /** - * Starts a new document with the given title. - * This clears the contents of the output data. - * Title should use UTF-8 encoding. - */ - bool BeginDocument(const char *title); - - /** - * Adds the recognized text from the source image to the current document. - * Invalid if BeginDocument not yet called. - * - * Note that this API is a bit weird but is designed to fit into the - * current TessBaseAPI implementation where the api has lots of state - * information that we might want to add in. - */ - bool AddImage(TessBaseAPI *api); - - /** - * Finishes the document and finalizes the output data - * Invalid if BeginDocument not yet called. - */ - bool EndDocument(); - - const char *file_extension() const { - return file_extension_; - } - const char *title() const { - return title_.c_str(); - } - - // Is everything fine? Otherwise something went wrong. - bool happy() const { - return happy_; - } - - /** - * Returns the index of the last image given to AddImage - * (i.e. images are incremented whether the image succeeded or not) - * - * This is always defined. It means either the number of the - * current image, the last image ended, or in the completed document - * depending on when in the document lifecycle you are looking at it. - * Will return -1 if a document was never started. - */ - int imagenum() const { - return imagenum_; - } - -protected: - /** - * Called by concrete classes. - * - * outputbase is the name of the output file excluding - * extension. For example, "/path/to/chocolate-chip-cookie-recipe" - * - * extension indicates the file extension to be used for output - * files. For example "pdf" will produce a .pdf file, and "hocr" - * will produce .hocr files. - */ - TessResultRenderer(const char *outputbase, const char *extension); - - // Hook for specialized handling in BeginDocument() - virtual bool BeginDocumentHandler(); - - // This must be overridden to render the OCR'd results - virtual bool AddImageHandler(TessBaseAPI *api) = 0; - - // Hook for specialized handling in EndDocument() - virtual bool EndDocumentHandler(); - - // Renderers can call this to append '\0' terminated strings into - // the output string returned by GetOutput. - // This method will grow the output buffer if needed. - void AppendString(const char *s); - - // Renderers can call this to append binary byte sequences into - // the output string returned by GetOutput. Note that s is not necessarily - // '\0' terminated (and can contain '\0' within it). - // This method will grow the output buffer if needed. - void AppendData(const char *s, int len); - -private: - TessResultRenderer *next_; // Can link multiple renderers together - FILE *fout_; // output file pointer - const char *file_extension_; // standard extension for generated output - std::string title_; // title of document being rendered - int imagenum_; // index of last image added - bool happy_; // I get grumpy when the disk fills up, etc. -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessTextRenderer : public TessResultRenderer { -public: - explicit TessTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into an hocr text string - */ -class TESS_API TessHOcrRenderer : public TessResultRenderer { -public: - explicit TessHOcrRenderer(const char *outputbase, bool font_info); - explicit TessHOcrRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into an alto text string - */ -class TESS_API TessAltoRenderer : public TessResultRenderer { -public: - explicit TessAltoRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool begin_document; -}; - -/** - * Renders Tesseract output into a TSV string - */ -class TESS_API TessTsvRenderer : public TessResultRenderer { -public: - explicit TessTsvRenderer(const char *outputbase, bool font_info); - explicit TessTsvRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into searchable PDF - */ -class TESS_API TessPDFRenderer : public TessResultRenderer { -public: - // datadir is the location of the TESSDATA. We need it because - // we load a custom PDF font from this location. - TessPDFRenderer(const char *outputbase, const char *datadir, - bool textonly = false); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - // We don't want to have every image in memory at once, - // so we store some metadata as we go along producing - // PDFs one page at a time. At the end, that metadata is - // used to make everything that isn't easily handled in a - // streaming fashion. - long int obj_; // counter for PDF objects - std::vector offsets_; // offset of every PDF object in bytes - std::vector pages_; // object number for every /Page object - std::string datadir_; // where to find the custom font - bool textonly_; // skip images if set - // Bookkeeping only. DIY = Do It Yourself. - void AppendPDFObjectDIY(size_t objectsize); - // Bookkeeping + emit data. - void AppendPDFObject(const char *data); - // Create the /Contents object for an entire page. - char *GetPDFTextObjects(TessBaseAPI *api, double width, double height); - // Turn an image into a PDF object. Only transcode if we have to. - static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum, - char **pdf_object, long int *pdf_object_size, - int jpg_quality); -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessUnlvRenderer : public TessResultRenderer { -public: - explicit TessUnlvRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string for LSTMBox - */ -class TESS_API TessLSTMBoxRenderer : public TessResultRenderer { -public: - explicit TessLSTMBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessBoxTextRenderer : public TessResultRenderer { -public: - explicit TessBoxTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string in WordStr format - */ -class TESS_API TessWordStrBoxRenderer : public TessResultRenderer { -public: - explicit TessWordStrBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#ifndef DISABLED_LEGACY_ENGINE - -/** - * Renders tesseract output into an osd text string - */ -class TESS_API TessOsdRenderer : public TessResultRenderer { -public: - explicit TessOsdRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#endif // ndef DISABLED_LEGACY_ENGINE - -} // namespace tesseract. - -#endif // TESSERACT_API_RENDERER_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/resultiterator.h deleted file mode 100644 index 3e4d5807..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/resultiterator.h +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: resultiterator.h -// Description: Iterator for tesseract results that is capable of -// iterating in proper reading order over Bi Directional -// (e.g. mixed Hebrew and English) text. -// Author: David Eger -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API, TESS_LOCAL -#include "ltrresultiterator.h" // for LTRResultIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -#include // for std::pair -#include // for std::vector - -namespace tesseract { - -class TESS_API ResultIterator : public LTRResultIterator { -public: - static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); - - /** - * ResultIterator is copy constructible! - * The default copy constructor works just fine for us. - */ - ~ResultIterator() override = default; - - // ============= Moving around within the page ============. - /** - * Moves the iterator to point to the start of the page to begin - * an iteration. - */ - void Begin() override; - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy in the appropriate reading order and returns false if - * the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - bool Next(PageIteratorLevel level) override; - - /** - * IsAtBeginningOf() returns whether we're at the logical beginning of the - * given level. (as opposed to ResultIterator's left-to-right top-to-bottom - * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). - * For a full description, see pageiterator.h - */ - bool IsAtBeginningOf(PageIteratorLevel level) const override; - - /** - * Implement PageIterator's IsAtFinalElement correctly in a BiDi context. - * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we - * point at the last word in a paragraph. See PageIterator for full comment. - */ - bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const override; - - // ============= Functions that refer to words only ============. - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // ============= Accessing data ==============. - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - */ - virtual char *GetUTF8Text(PageIteratorLevel level) const; - - /** - * Returns the LSTM choices for every LSTM timestep for the current word. - */ - virtual std::vector>>> - *GetRawLSTMTimesteps() const; - virtual std::vector>> - *GetBestLSTMSymbolChoices() const; - - /** - * Return whether the current paragraph's dominant reading direction - * is left-to-right (as opposed to right-to-left). - */ - bool ParagraphIsLtr() const; - - // ============= Exposed only for testing =============. - - /** - * Yields the reading order as a sequence of indices and (optional) - * meta-marks for a set of words (given left-to-right). - * The meta marks are passed as negative values: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The next indexed word contains both left-to-right and - * right-to-left characters and was treated as neutral. - * - * For example, suppose we have five words in a text line, - * indexed [0,1,2,3,4] from the leftmost side of the text line. - * The following are all believable reading_orders: - * - * Left-to-Right (in ltr paragraph): - * { 0, 1, 2, 3, 4 } - * Left-to-Right (in rtl paragraph): - * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } - * Right-to-Left (in rtl paragraph): - * { 4, 3, 2, 1, 0 } - * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: - * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 } - */ - static void CalculateTextlineOrder( - bool paragraph_is_ltr, - const std::vector &word_dirs, - std::vector *reading_order); - - static const int kMinorRunStart; - static const int kMinorRunEnd; - static const int kComplexWord; - -protected: - /** - * We presume the data associated with the given iterator will outlive us. - * NB: This is private because it does something that is non-obvious: - * it resets to the beginning of the paragraph instead of staying wherever - * resit might have pointed. - */ - explicit ResultIterator(const LTRResultIterator &resit); - -private: - /** - * Calculates the current paragraph's dominant writing direction. - * Typically, members should use current_paragraph_ltr_ instead. - */ - bool CurrentParagraphIsLtr() const; - - /** - * Returns word indices as measured from resit->RestartRow() = index 0 - * for the reading order of words within a textline given an iterator - * into the middle of the text line. - * In addition to non-negative word indices, the following negative values - * may be inserted: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The previous word contains both left-to-right and - * right-to-left characters and was treated as neutral. - */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *indices) const; - /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *ssd, - std::vector *indices) const; - - /** - * What is the index of the current word in a strict left-to-right reading - * of the row? - */ - int LTRWordIndex() const; - - /** - * Given an iterator pointing at a word, returns the logical reading order - * of blob indices for the word. - */ - void CalculateBlobOrder(std::vector *blob_indices) const; - - /** Precondition: current_paragraph_is_ltr_ is set. */ - void MoveToLogicalStartOfTextline(); - - /** - * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ - * are set. - */ - void MoveToLogicalStartOfWord(); - - /** Are we pointing at the final (reading order) symbol of the word? */ - bool IsAtFinalSymbolOfWord() const; - - /** Are we pointing at the first (reading order) symbol of the word? */ - bool IsAtFirstSymbolOfWord() const; - - /** - * Append any extra marks that should be appended to this word when printed. - * Mostly, these are Unicode BiDi control characters. - */ - void AppendSuffixMarks(std::string *text) const; - - /** Appends the current word in reading order to the given buffer.*/ - void AppendUTF8WordText(std::string *text) const; - - /** - * Appends the text of the current text line, *assuming this iterator is - * positioned at the beginning of the text line* This function - * updates the iterator to point to the first position past the text line. - * Each textline is terminated in a single newline character. - * If the textline ends a paragraph, it gets a second terminal newline. - */ - void IterateAndAppendUTF8TextlineText(std::string *text); - - /** - * Appends the text of the current paragraph in reading order - * to the given buffer. - * Each textline is terminated in a single newline character, and the - * paragraph gets an extra newline at the end. - */ - void AppendUTF8ParagraphText(std::string *text) const; - - /** Returns whether the bidi_debug flag is set to at least min_level. */ - bool BidiDebug(int min_level) const; - - bool current_paragraph_is_ltr_; - - /** - * Is the currently pointed-at character at the beginning of - * a minor-direction run? - */ - bool at_beginning_of_minor_run_; - - /** Is the currently pointed-at character in a minor-direction sequence? */ - bool in_minor_direction_; - - /** - * Should detected inter-word spaces be preserved, or "compressed" to a single - * space character (default behavior). - */ - bool preserve_interword_spaces_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/unichar.h deleted file mode 100644 index 015109d7..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/unichar.h +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: unichar.h -// Description: Unicode character/ligature class. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCUTIL_UNICHAR_H_ -#define TESSERACT_CCUTIL_UNICHAR_H_ - -#include "export.h" - -#include -#include -#include -#include - -namespace tesseract { - -// Maximum number of characters that can be stored in a UNICHAR. Must be -// at least 4. Must not exceed 31 without changing the coding of length. -#define UNICHAR_LEN 30 - -// A UNICHAR_ID is the unique id of a unichar. -using UNICHAR_ID = int; - -// A variable to indicate an invalid or uninitialized unichar id. -static const int INVALID_UNICHAR_ID = -1; -// A special unichar that corresponds to INVALID_UNICHAR_ID. -static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__"; - -enum StrongScriptDirection { - DIR_NEUTRAL = 0, // Text contains only neutral characters. - DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters. - DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters. - DIR_MIX = 3, // Text contains a mixture of left-to-right - // and right-to-left characters. -}; - -using char32 = signed int; - -// The UNICHAR class holds a single classification result. This may be -// a single Unicode character (stored as between 1 and 4 utf8 bytes) or -// multiple Unicode characters representing the NFKC expansion of a ligature -// such as fi, ffl etc. These are also stored as utf8. -class TESS_API UNICHAR { -public: - UNICHAR() { - memset(chars, 0, UNICHAR_LEN); - } - - // Construct from a utf8 string. If len<0 then the string is null terminated. - // If the string is too long to fit in the UNICHAR then it takes only what - // will fit. - UNICHAR(const char *utf8_str, int len); - - // Construct from a single UCS4 character. - explicit UNICHAR(int unicode); - - // Default copy constructor and operator= are OK. - - // Get the first character as UCS-4. - int first_uni() const; - - // Get the length of the UTF8 string. - int utf8_len() const { - int len = chars[UNICHAR_LEN - 1]; - return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; - } - - // Get a UTF8 string, but NOT nullptr terminated. - const char *utf8() const { - return chars; - } - - // Get a terminated UTF8 string: Must delete[] it after use. - char *utf8_str() const; - - // Get the number of bytes in the first character of the given utf8 string. - static int utf8_step(const char *utf8_str); - - // A class to simplify iterating over and accessing elements of a UTF8 - // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or - // take ownership of the underlying byte array. It also does not permit - // modification of the array (as the name suggests). - // - // Example: - // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len); - // it != UNICHAR::end(str, len); - // ++it) { - // printf("UCS-4 symbol code = %d\n", *it); - // char buf[5]; - // int char_len = it.get_utf8(buf); buf[char_len] = '\0'; - // printf("Char = %s\n", buf); - // } - class TESS_API const_iterator { - using CI = const_iterator; - - public: - // Step to the next UTF8 character. - // If the current position is at an illegal UTF8 character, then print an - // error message and step by one byte. If the current position is at a - // nullptr value, don't step past it. - const_iterator &operator++(); - - // Return the UCS-4 value at the current position. - // If the current position is at an illegal UTF8 value, return a single - // space character. - int operator*() const; - - // Store the UTF-8 encoding of the current codepoint into buf, which must be - // at least 4 bytes long. Return the number of bytes written. - // If the current position is at an illegal UTF8 value, writes a single - // space character and returns 1. - // Note that this method does not null-terminate the buffer. - int get_utf8(char *buf) const; - // Returns the number of bytes of the current codepoint. Returns 1 if the - // current position is at an illegal UTF8 value. - int utf8_len() const; - // Returns true if the UTF-8 encoding at the current position is legal. - bool is_legal() const; - - // Return the pointer into the string at the current position. - const char *utf8_data() const { - return it_; - } - - // Iterator equality operators. - friend bool operator==(const CI &lhs, const CI &rhs) { - return lhs.it_ == rhs.it_; - } - friend bool operator!=(const CI &lhs, const CI &rhs) { - return !(lhs == rhs); - } - - private: - friend class UNICHAR; - explicit const_iterator(const char *it) : it_(it) {} - - const char *it_; // Pointer into the string. - }; - - // Create a start/end iterator pointing to a string. Note that these methods - // are static and do NOT create a copy or take ownership of the underlying - // array. - static const_iterator begin(const char *utf8_str, int byte_length); - static const_iterator end(const char *utf8_str, int byte_length); - - // Converts a utf-8 string to a vector of unicodes. - // Returns an empty vector if the input contains invalid UTF-8. - static std::vector UTF8ToUTF32(const char *utf8_str); - // Converts a vector of unicodes to a utf8 string. - // Returns an empty string if the input contains an invalid unicode. - static std::string UTF32ToUTF8(const std::vector &str32); - -private: - // A UTF-8 representation of 1 or more Unicode characters. - // The last element (chars[UNICHAR_LEN - 1]) is a length if - // its value < UNICHAR_LEN, otherwise it is a genuine character. - char chars[UNICHAR_LEN]{}; -}; - -} // namespace tesseract - -#endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/version.h deleted file mode 100644 index 6bac5d66..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/version.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: version.h -// Description: Version information -// -// (C) Copyright 2018, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_VERSION_H_ -#define TESSERACT_API_VERSION_H_ - -// clang-format off - -#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ -#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ -#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ - -#define TESSERACT_VERSION \ - (TESSERACT_MAJOR_VERSION << 16 | \ - TESSERACT_MINOR_VERSION << 8 | \ - TESSERACT_MICRO_VERSION) - -#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" - -// clang-format on - -#endif // TESSERACT_API_VERSION_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/baseapi.h deleted file mode 100644 index 5e1e4830..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/baseapi.h +++ /dev/null @@ -1,812 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: baseapi.h -// Description: Simple API for calling tesseract. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_BASEAPI_H_ -#define TESSERACT_API_BASEAPI_H_ - -#ifdef HAVE_CONFIG_H -# include "config_auto.h" // DISABLED_LEGACY_ENGINE -#endif - -#include "export.h" -#include "pageiterator.h" -#include "publictypes.h" -#include "resultiterator.h" -#include "unichar.h" - -#include "version.h" - -#include -#include // for std::vector - -struct Pix; -struct Pixa; -struct Boxa; - -namespace tesseract { - -class PAGE_RES; -class ParagraphModel; -class BLOCK_LIST; -class ETEXT_DESC; -struct OSResults; -class UNICHARSET; - -class Dawg; -class Dict; -class EquationDetect; -class PageIterator; -class ImageThresholder; -class LTRResultIterator; -class ResultIterator; -class MutableIterator; -class TessResultRenderer; -class Tesseract; - -// Function to read a std::vector from a whole file. -// Returns false on failure. -using FileReader = bool (*)(const char *filename, std::vector *data); - -using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, - bool) const; -using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, - int, const char *, int); - -/** - * Base class for all tesseract APIs. - * Specific classes can add ability to work on different inputs or produce - * different outputs. - * This class is mostly an interface layer on top of the Tesseract instance - * class to hide the data types so that users of this class don't have to - * include any other Tesseract headers. - */ -class TESS_API TessBaseAPI { -public: - TessBaseAPI(); - virtual ~TessBaseAPI(); - // Copy constructor and assignment operator are currently unsupported. - TessBaseAPI(TessBaseAPI const &) = delete; - TessBaseAPI &operator=(TessBaseAPI const &) = delete; - - /** - * Returns the version identifier as a static string. Do not delete. - */ - static const char *Version(); - - /** - * If compiled with OpenCL AND an available OpenCL - * device is deemed faster than serial code, then - * "device" is populated with the cl_device_id - * and returns sizeof(cl_device_id) - * otherwise *device=nullptr and returns 0. - */ - static size_t getOpenCLDevice(void **device); - - /** - * Set the name of the input file. Needed for training and - * reading a UNLV zone file, and for searchable PDF output. - */ - void SetInputName(const char *name); - /** - * These functions are required for searchable PDF output. - * We need our hands on the input file so that we can include - * it in the PDF without transcoding. If that is not possible, - * we need the original image. Finally, resolution metadata - * is stored in the PDF so we need that as well. - */ - const char *GetInputName(); - // Takes ownership of the input pix. - void SetInputImage(Pix *pix); - Pix *GetInputImage(); - int GetSourceYResolution(); - const char *GetDatapath(); - - /** Set the name of the bonus output files. Needed only for debugging. */ - void SetOutputName(const char *name); - - /** - * Set the value of an internal "parameter." - * Supply the name of the parameter and the value as a string, just as - * you would in a config file. - * Returns false if the name lookup failed. - * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. - * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. - * SetVariable may be used before Init, but settings will revert to - * defaults on End(). - * - * Note: Must be called after Init(). Only works for non-init variables - * (init variables should be passed to Init()). - */ - bool SetVariable(const char *name, const char *value); - bool SetDebugVariable(const char *name, const char *value); - - /** - * Returns true if the parameter was found among Tesseract parameters. - * Fills in value with the value of the parameter. - */ - bool GetIntVariable(const char *name, int *value) const; - bool GetBoolVariable(const char *name, bool *value) const; - bool GetDoubleVariable(const char *name, double *value) const; - - /** - * Returns the pointer to the string that represents the value of the - * parameter if it was found among Tesseract parameters. - */ - const char *GetStringVariable(const char *name) const; - -#ifndef DISABLED_LEGACY_ENGINE - - /** - * Print Tesseract fonts table to the given file. - */ - void PrintFontsTable(FILE *fp) const; - -#endif - - /** - * Print Tesseract parameters to the given file. - */ - void PrintVariables(FILE *fp) const; - - /** - * Get value of named variable as a string, if it exists. - */ - bool GetVariableAsString(const char *name, std::string *val) const; - - /** - * Instances are now mostly thread-safe and totally independent, - * but some global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS: - * you use SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your instances. - * - * Start tesseract. Returns zero on success and -1 on failure. - * NOTE that the only members that may be called before Init are those - * listed above here in the class definition. - * - * The datapath must be the name of the tessdata directory. - * The language is (usually) an ISO 639-3 string or nullptr will default to - * eng. It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, or just - * to reset the classifier. - * The language may be a string of the form [~][+[~]]* indicating - * that multiple languages are to be loaded. Eg hin+eng will load Hindi and - * English. Languages may specify internally that they want to be loaded - * with one or more other languages, so the ~ sign is available to override - * that. Eg if hin were set to load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited only by - * memory, with the caveat that loading additional languages will impact - * both speed and accuracy, as there is more work to do to decide on the - * applicable language, and there is more chance of hallucinating incorrect - * words. - * WARNING: On changing languages, all Tesseract parameters are reset - * back to their default values. (Which may vary between languages.) - * If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should explicitly - * call End() and then use SetVariable before Init. This is only a very - * rare use case, since there are very few uses that require any parameters - * to be set before Init. - * - * If set_only_non_debug_params is true, only params that do not contain - * "debug" in the name will be set. - */ - int Init(const char *datapath, const char *language, OcrEngineMode mode, - char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params); - int Init(const char *datapath, const char *language, OcrEngineMode oem) { - return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); - } - int Init(const char *datapath, const char *language) { - return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, - false); - } - // In-memory version reads the traineddata file directly from the given - // data[data_size] array, and/or reads data via a FileReader. - int Init(const char *data, int data_size, const char *language, - OcrEngineMode mode, char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params, FileReader reader); - - /** - * Returns the languages string used in the last valid initialization. - * If the last initialization specified "deu+hin" then that will be - * returned. If hin loaded eng automatically as well, then that will - * not be included in this list. To find the languages actually - * loaded use GetLoadedLanguagesAsVector. - * The returned string should NOT be deleted. - */ - const char *GetInitLanguagesAsString() const; - - /** - * Returns the loaded languages in the vector of std::string. - * Includes all languages loaded by the last Init, including those loaded - * as dependencies of other loaded languages. - */ - void GetLoadedLanguagesAsVector(std::vector *langs) const; - - /** - * Returns the available languages in the sorted vector of std::string. - */ - void GetAvailableLanguagesAsVector(std::vector *langs) const; - - /** - * Init only for page layout analysis. Use only for calls to SetImage and - * AnalysePage. Calls that attempt recognition will generate an error. - */ - void InitForAnalysePage(); - - /** - * Read a "config" file containing a set of param, value pairs. - * Searches the standard places: tessdata/configs, tessdata/tessconfigs - * and also accepts a relative or absolute path name. - * Note: only non-init params will be set (init params are set by Init()). - */ - void ReadConfigFile(const char *filename); - /** Same as above, but only set debug params from the given config file. */ - void ReadDebugConfigFile(const char *filename); - - /** - * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. - * The mode is stored as an IntParam so it can also be modified by - * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). - */ - void SetPageSegMode(PageSegMode mode); - - /** Return the current page segmentation mode. */ - PageSegMode GetPageSegMode() const; - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. - * Currently has no error checking. - * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. - * Palette color images will not work properly and must be converted to - * 24 bit. - * Binary images of 1 bit per pixel may also be given but they must be - * byte packed with the MSB of the first byte being the first pixel, and a - * 1 represents WHITE. For binary images set bytes_per_pixel=0. - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * - * Note that TesseractRect is the simplified convenience interface. - * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, - * and one or more of the Get*Text functions below. - */ - char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, int width, - int height); - - /** - * Call between pages or documents etc to free up memory and forget - * adaptive data. - */ - void ClearAdaptiveClassifier(); - - /** - * @defgroup AdvancedAPI Advanced API - * The following methods break TesseractRect into pieces, so you can - * get hold of the thresholded image, get the text in different formats, - * get bounding boxes, confidences etc. - */ - /* @{ */ - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Copies the image buffer and converts to Pix. - * SetImage clears all recognition results, and sets the rectangle to the - * full image, so it may be followed immediately by a GetUTF8Text, and it - * will automatically perform recognition. - */ - void SetImage(const unsigned char *imagedata, int width, int height, - int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract takes its own copy of the image, so it need not persist until - * after Recognize. - * Pix vs raw, which to use? - * Use Pix where possible. Tesseract uses Pix as its internal representation - * and it is therefore more efficient to provide a Pix directly. - */ - void SetImage(Pix *pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after SetImage(). - */ - void SetSourceResolution(int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after SetImage. - * Each SetRectangle clears the recogntion results so multiple rectangles - * can be recognized with the same image. - */ - void SetRectangle(int left, int top, int width, int height); - - /** - * Get a copy of the internal thresholded image from Tesseract. - * Caller takes ownership of the Pix and must pixDestroy it. - * May be called any time after SetImage, or after TesseractRect. - */ - Pix *GetThresholdedImage(); - - /** - * Get the result of page layout analysis as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetRegions(Pixa **pixa); - - /** - * Get the textlines as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If raw_image is true, then extract from the original image instead of the - * thresholded image and pad by raw_padding pixels. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. If paraids is not - * nullptr, the paragraph-id of each line within its block is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - /* - Helper method to extract from the thresholded image. (most common usage) -*/ - Boxa *GetTextlines(Pixa **pixa, int **blockids) { - return GetTextlines(false, 0, pixa, blockids, nullptr); - } - - /** - * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa - * pair, in reading order. Enables downstream handling of non-rectangular - * regions. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetStrips(Pixa **pixa, int **blockids); - - /** - * Get the words as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetWords(Pixa **pixa); - - /** - * Gets the individual connected (text) components (created - * after pages segmentation step, but before recognition) - * as a leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * Note: the caller is responsible for calling boxaDestroy() - * on the returned Boxa array and pixaDestroy() on cc array. - */ - Boxa *GetConnectedComponents(Pixa **cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. - * If blockids is not nullptr, the paragraph-id of each component with its - * block is also returned as an array of one element per component. delete [] - * after use. If raw_image is true, then portions of the original image are - * extracted instead of the thresholded image and padded with raw_padding. If - * text_only is true, then only text components are returned. - */ - Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, - bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - // Helper function to get binary images with no padding (most common usage). - Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, - Pixa **pixa, int **blockids) { - return GetComponentImages(level, text_only, false, 0, pixa, blockids, - nullptr); - } - - /** - * Returns the scale factor of the thresholded image that would be returned by - * GetThresholdedImage() and the various GetX() methods that call - * GetComponentImages(). - * Returns 0 if no thresholder has been set. - */ - int GetThresholdedImageScaleFactor() const; - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to just - * the page layout results. Returns an iterator to the results. - * If merge_similar_words is true, words are combined where suitable for use - * with a line recognizer. Use if you want to use AnalyseLayout to find the - * textlines, and then want to process textline fragments with an external - * line recognizer. - * Returns nullptr on error or an empty page. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - PageIterator *AnalyseLayout(); - PageIterator *AnalyseLayout(bool merge_similar_words); - - /** - * Recognize the image from SetAndThresholdImage, generating Tesseract - * internal structures. Returns 0 on success. - * Optional. The Get*Text functions below will call Recognize if needed. - * After Recognize, the output is kept internally until the next SetImage. - */ - int Recognize(ETEXT_DESC *monitor); - - /** - * Methods to retrieve information after SetAndThresholdImage(), - * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) - */ - - /** - * Turns images into symbolic text. - * - * filename can point to a single image, a multi-page TIFF, - * or a plain text list of image filenames. - * - * retry_config is useful for debugging. If not nullptr, you can fall - * back to an alternate configuration if a page fails for some - * reason. - * - * timeout_millisec terminates processing if any single page - * takes too long. Set to 0 for unlimited time. - * - * renderer is responible for creating the output. For example, - * use the TessTextRenderer if you want plaintext output, or - * the TessPDFRender to produce searchable PDF. - * - * If tessedit_page_number is non-negative, will only process that - * single page. Works for multi-page tiff file, or filelist. - * - * Returns true if successful, false on error. - */ - bool ProcessPages(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - // Does the real work of ProcessPages. - bool ProcessPagesInternal(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - - /** - * Turn a single image into symbolic text. - * - * The pix is the image processed. filename and page_index are - * metadata used by side-effect processes, such as reading a box - * file or formatting as hOCR. - * - * See ProcessPages for descriptions of other parameters. - */ - bool ProcessPage(Pix *pix, int page_index, const char *filename, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - ResultIterator *GetIterator(); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - MutableIterator *GetMutableIterator(); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - */ - char *GetUTF8Text(); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * monitor can be used to - * cancel the recognition - * receive progress callbacks - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(ETEXT_DESC *monitor, int page_number); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(ETEXT_DESC *monitor, int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(int page_number); - - /** - * Make a TSV-formatted string from the internal data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetTSVText(int page_number); - - /** - * Make a box file for LSTM training from the internal data structures. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetLSTMBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a box file used in training. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a WordStr box file used in training. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetWordStrBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded - * as UNLV format Latin-1 with specific reject and suspect codes. - * Returned string must be freed with the delete [] operator. - */ - char *GetUNLVText(); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in degrees - * (0, 90, 180, 270) - * orient_conf is the confidence (15.0 is reasonably confident) - * script_name is an ASCII string, the name of the script, e.g. "Latin" - * script_conf is confidence level in the script - * Returns true on success and writes values to each parameter as an output - */ - bool DetectOrientationScript(int *orient_deg, float *orient_conf, - const char **script_name, float *script_conf); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * page_number is a 0-based page index that will appear in the osd file. - */ - char *GetOsdText(int page_number); - - /** Returns the (average) confidence value between 0 and 100. */ - int MeanTextConf(); - /** - * Returns all word confidences (between 0 and 100) in an array, terminated - * by -1. The calling function must delete [] after use. - * The number of confidences should correspond to the number of space- - * delimited words in GetUTF8Text. - */ - int *AllWordConfidences(); - -#ifndef DISABLED_LEGACY_ENGINE - /** - * Applies the given word to the adaptive classifier if possible. - * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can - * tell the boundaries of the graphemes. - * Assumes that SetImage/SetRectangle have been used to set the image - * to the given word. The mode arg should be PSM_SINGLE_WORD or - * PSM_CIRCLE_WORD, as that will be used to control layout analysis. - * The currently set PageSegMode is preserved. - * Returns false if adaption was not possible for some reason. - */ - bool AdaptToWordStr(PageSegMode mode, const char *wordstr); -#endif // ndef DISABLED_LEGACY_ENGINE - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or TesseractRect before doing - * any Recognize or Get* operation. - */ - void Clear(); - - /** - * Close down tesseract and free up all memory. End() is equivalent to - * destructing and reconstructing your TessBaseAPI. - * Once End() has been used, none of the other API functions may be used - * other than Init and anything declared above it in the class definition. - */ - void End(); - - /** - * Clear any library-level memory caches. - * There are a variety of expensive-to-load constant data structures (mostly - * language dictionaries) that are cached globally -- surviving the Init() - * and End() of individual TessBaseAPI's. This function allows the clearing - * of these caches. - **/ - static void ClearPersistentCache(); - - /** - * Check whether a word is valid according to Tesseract's language model - * @return 0 if the word is invalid, non-zero if valid. - * @warning temporary! This function will be removed from here and placed - * in a separate API at some future time. - */ - int IsValidWord(const char *word) const; - // Returns true if utf8_character is defined in the UniCharset. - bool IsValidCharacter(const char *utf8_character) const; - - bool GetTextDirection(int *out_offset, float *out_slope); - - /** Sets Dict::letter_is_okay_ function to point to the given function. */ - void SetDictFunc(DictFunc f); - - /** Sets Dict::probability_in_context_ function to point to the given - * function. - */ - void SetProbabilityInContextFunc(ProbabilityInContextFunc f); - - /** - * Estimates the Orientation And Script of the image. - * @return true if the image was processed successfully. - */ - bool DetectOS(OSResults *); - - /** - * Return text orientation of each block as determined by an earlier run - * of layout analysis. - */ - void GetBlockTextOrientations(int **block_orientation, - bool **vertical_writing); - - /** This method returns the string form of the specified unichar. */ - const char *GetUnichar(int unichar_id) const; - - /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ - const Dawg *GetDawg(int i) const; - - /** Return the number of dawgs loaded into tesseract_ object. */ - int NumDawgs() const; - - Tesseract *tesseract() const { - return tesseract_; - } - - OcrEngineMode oem() const { - return last_oem_requested_; - } - - void set_min_orientation_margin(double margin); - /* @} */ - -protected: - /** Common code for setting the image. Returns true if Init has been called. - */ - bool InternalSetImage(); - - /** - * Run the thresholder to make the thresholded image. If pix is not nullptr, - * the source is thresholded to pix instead of the internal IMAGE. - */ - virtual bool Threshold(Pix **pix); - - /** - * Find lines from the image making the BLOCK_LIST. - * @return 0 on success. - */ - int FindLines(); - - /** Delete the pageres and block list ready for a new page. */ - void ClearResults(); - - /** - * Return an LTR Result Iterator -- used only for training, as we really want - * to ignore all BiDi smarts at that point. - * delete once you're done with it. - */ - LTRResultIterator *GetLTRIterator(); - - /** - * Return the length of the output text string, as UTF8, assuming - * one newline per line and one per block, with a terminator, - * and assuming a single character reject marker for each rejected character. - * Also return the number of recognized blobs in blob_count. - */ - int TextLength(int *blob_count) const; - - //// paragraphs.cpp //////////////////////////////////////////////////// - void DetectParagraphs(bool after_text_recognition); - - const PAGE_RES *GetPageRes() const { - return page_res_; - } - -protected: - Tesseract *tesseract_; ///< The underlying data object. - Tesseract *osd_tesseract_; ///< For orientation & script detection. - EquationDetect *equ_detect_; ///< The equation detector. - FileReader reader_; ///< Reads files from any filesystem. - ImageThresholder *thresholder_; ///< Image thresholding module. - std::vector *paragraph_models_; - BLOCK_LIST *block_list_; ///< The page layout. - PAGE_RES *page_res_; ///< The page-level data. - std::string input_file_; ///< Name used by training code. - std::string output_file_; ///< Name used by debug code. - std::string datapath_; ///< Current location of tessdata. - std::string language_; ///< Last initialized language. - OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested. - bool recognition_done_; ///< page_res_ contains recognition data. - - /** - * @defgroup ThresholderParams Thresholder Parameters - * Parameters saved from the Thresholder. Needed to rebuild coordinates. - */ - /* @{ */ - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; - int image_width_; - int image_height_; - /* @} */ - -private: - // A list of image filenames gets special consideration - bool ProcessPagesFileList(FILE *fp, std::string *buf, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); - // TIFF supports multipage so gets special consideration. - bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, - const char *filename, const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); -}; // class TessBaseAPI. - -/** Escape a char string - remove &<>"' with HTML codes. */ -std::string HOcrEscape(const char *text); - -} // namespace tesseract - -#endif // TESSERACT_API_BASEAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/capi.h deleted file mode 100644 index 40f4856a..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/capi.h +++ /dev/null @@ -1,484 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: capi.h -// Description: C-API TessBaseAPI -// -// (C) Copyright 2012, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_CAPI_H_ -#define API_CAPI_H_ - -#include "export.h" - -#ifdef __cplusplus -# include -# include -# include -# include -# include -#endif - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef BOOL -# define BOOL int -# define TRUE 1 -# define FALSE 0 -#endif - -#ifdef __cplusplus -typedef tesseract::TessResultRenderer TessResultRenderer; -typedef tesseract::TessBaseAPI TessBaseAPI; -typedef tesseract::PageIterator TessPageIterator; -typedef tesseract::ResultIterator TessResultIterator; -typedef tesseract::MutableIterator TessMutableIterator; -typedef tesseract::ChoiceIterator TessChoiceIterator; -typedef tesseract::OcrEngineMode TessOcrEngineMode; -typedef tesseract::PageSegMode TessPageSegMode; -typedef tesseract::PageIteratorLevel TessPageIteratorLevel; -typedef tesseract::Orientation TessOrientation; -typedef tesseract::ParagraphJustification TessParagraphJustification; -typedef tesseract::WritingDirection TessWritingDirection; -typedef tesseract::TextlineOrder TessTextlineOrder; -typedef tesseract::PolyBlockType TessPolyBlockType; -typedef tesseract::ETEXT_DESC ETEXT_DESC; -#else -typedef struct TessResultRenderer TessResultRenderer; -typedef struct TessBaseAPI TessBaseAPI; -typedef struct TessPageIterator TessPageIterator; -typedef struct TessResultIterator TessResultIterator; -typedef struct TessMutableIterator TessMutableIterator; -typedef struct TessChoiceIterator TessChoiceIterator; -typedef enum TessOcrEngineMode { - OEM_TESSERACT_ONLY, - OEM_LSTM_ONLY, - OEM_TESSERACT_LSTM_COMBINED, - OEM_DEFAULT -} TessOcrEngineMode; -typedef enum TessPageSegMode { - PSM_OSD_ONLY, - PSM_AUTO_OSD, - PSM_AUTO_ONLY, - PSM_AUTO, - PSM_SINGLE_COLUMN, - PSM_SINGLE_BLOCK_VERT_TEXT, - PSM_SINGLE_BLOCK, - PSM_SINGLE_LINE, - PSM_SINGLE_WORD, - PSM_CIRCLE_WORD, - PSM_SINGLE_CHAR, - PSM_SPARSE_TEXT, - PSM_SPARSE_TEXT_OSD, - PSM_RAW_LINE, - PSM_COUNT -} TessPageSegMode; -typedef enum TessPageIteratorLevel { - RIL_BLOCK, - RIL_PARA, - RIL_TEXTLINE, - RIL_WORD, - RIL_SYMBOL -} TessPageIteratorLevel; -typedef enum TessPolyBlockType { - PT_UNKNOWN, - PT_FLOWING_TEXT, - PT_HEADING_TEXT, - PT_PULLOUT_TEXT, - PT_EQUATION, - PT_INLINE_EQUATION, - PT_TABLE, - PT_VERTICAL_TEXT, - PT_CAPTION_TEXT, - PT_FLOWING_IMAGE, - PT_HEADING_IMAGE, - PT_PULLOUT_IMAGE, - PT_HORZ_LINE, - PT_VERT_LINE, - PT_NOISE, - PT_COUNT -} TessPolyBlockType; -typedef enum TessOrientation { - ORIENTATION_PAGE_UP, - ORIENTATION_PAGE_RIGHT, - ORIENTATION_PAGE_DOWN, - ORIENTATION_PAGE_LEFT -} TessOrientation; -typedef enum TessParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT -} TessParagraphJustification; -typedef enum TessWritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT, - WRITING_DIRECTION_RIGHT_TO_LEFT, - WRITING_DIRECTION_TOP_TO_BOTTOM -} TessWritingDirection; -typedef enum TessTextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT, - TEXTLINE_ORDER_RIGHT_TO_LEFT, - TEXTLINE_ORDER_TOP_TO_BOTTOM -} TessTextlineOrder; -typedef struct ETEXT_DESC ETEXT_DESC; -#endif - -typedef bool (*TessCancelFunc)(void *cancel_this, int words); -typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top, - int bottom); - -struct Pix; -struct Boxa; -struct Pixa; - -/* General free functions */ - -TESS_API const char *TessVersion(); -TESS_API void TessDeleteText(const char *text); -TESS_API void TessDeleteTextArray(char **arr); -TESS_API void TessDeleteIntArray(const int *arr); - -/* Renderer API */ -TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, - BOOL font_info); -TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase, - const char *datadir, - BOOL textonly); -TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessWordStrBoxRendererCreate( - const char *outputbase); - -TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer); -TESS_API void TessResultRendererInsert(TessResultRenderer *renderer, - TessResultRenderer *next); -TESS_API TessResultRenderer *TessResultRendererNext( - TessResultRenderer *renderer); -TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, - const char *title); -TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer, - TessBaseAPI *api); -TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer); - -TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer); -TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer); -TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer); - -/* Base API */ - -TESS_API TessBaseAPI *TessBaseAPICreate(); -TESS_API void TessBaseAPIDelete(TessBaseAPI *handle); - -TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device); - -TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name); -TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix); -TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle); - -TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle); -TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name); - -TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, - const char *value); -TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, - const char *value); - -TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, - const char *name, int *value); -TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, - const char *name, BOOL *value); -TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, - const char *name, double *value); -TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, - const char *name); - -TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp); -TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, - const char *filename); - -TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem, - char **configs, int configs_size); -TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem); -TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, - const char *language); - -TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API const char *TessBaseAPIGetInitLanguagesAsString( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector( - const TessBaseAPI *handle); - -TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle); - -TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle, - const char *filename); -TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, - const char *filename); - -TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle, - TessPageSegMode mode); -TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle); - -TESS_API char *TessBaseAPIRect(TessBaseAPI *handle, - const unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int left, int top, int width, int height); - -TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetImage(TessBaseAPI *handle, - const unsigned char *imagedata, int width, - int height, int bytes_per_pixel, - int bytes_per_line); -TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix); - -TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi); - -TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, - int width, int height); - -TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle); -TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, - BOOL raw_image, int raw_padding, - struct Pixa **pixa, - int **blockids, int **paraids); -TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, - struct Pixa **pixa, int **blockids); -TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, - struct Pixa **cc); -TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, - TessPageIteratorLevel level, - BOOL text_only, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetComponentImages1( - TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only, - BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids, - int **paraids); - -TESS_API int TessBaseAPIGetThresholdedImageScaleFactor( - const TessBaseAPI *handle); - -TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle); - -TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor); - -TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); -TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, - int page_index, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); - -TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle); -TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator( - TessBaseAPI *handle); - -TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle); -TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, - int page_number); - -TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle); -TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle); - -TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE -TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, - TessPageSegMode mode, - const char *wordstr); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPIClear(TessBaseAPI *handle); -TESS_API void TessBaseAPIEnd(TessBaseAPI *handle); - -TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word); -TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, - float *out_slope); - -TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id); - -TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE - -// Call TessDeleteText(*best_script_name) to free memory allocated by this -// function -TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, - int *orient_deg, - float *orient_conf, - const char **script_name, - float *script_conf); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, - double margin); - -TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle); - -TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle); - -TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, - int **block_orientation, - bool **vertical_writing); - -/* Page iterator */ - -TESS_API void TessPageIteratorDelete(TessPageIterator *handle); - -TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle); - -TESS_API void TessPageIteratorBegin(TessPageIterator *handle); - -TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, - TessPageIteratorLevel level, - TessPageIteratorLevel element); - -TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, - TessPageIteratorLevel level, - int *left, int *top, int *right, - int *bottom); - -TESS_API TessPolyBlockType -TessPageIteratorBlockType(const TessPageIterator *handle); - -TESS_API struct Pix *TessPageIteratorGetBinaryImage( - const TessPageIterator *handle, TessPageIteratorLevel level); - -TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, - TessPageIteratorLevel level, - int padding, - struct Pix *original_image, - int *left, int *top); - -TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle, - TessPageIteratorLevel level, int *x1, - int *y1, int *x2, int *y2); - -TESS_API void TessPageIteratorOrientation( - TessPageIterator *handle, TessOrientation *orientation, - TessWritingDirection *writing_direction, TessTextlineOrder *textline_order, - float *deskew_angle); - -TESS_API void TessPageIteratorParagraphInfo( - TessPageIterator *handle, TessParagraphJustification *justification, - BOOL *is_list_item, BOOL *is_crown, int *first_line_indent); - -/* Result iterator */ - -TESS_API void TessResultIteratorDelete(TessResultIterator *handle); -TESS_API TessResultIterator *TessResultIteratorCopy( - const TessResultIterator *handle); -TESS_API TessPageIterator *TessResultIteratorGetPageIterator( - TessResultIterator *handle); -TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst( - const TessResultIterator *handle); -TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator( - const TessResultIterator *handle); - -TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API const char *TessResultIteratorWordRecognitionLanguage( - const TessResultIterator *handle); -TESS_API const char *TessResultIteratorWordFontAttributes( - const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic, - BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps, - int *pointsize, int *font_id); - -TESS_API BOOL -TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle); -TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle); - -TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle); -TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle); -TESS_API const char *TessChoiceIteratorGetUTF8Text( - const TessChoiceIterator *handle); -TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle); - -/* Progress monitor */ - -TESS_API ETEXT_DESC *TessMonitorCreate(); -TESS_API void TessMonitorDelete(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, - TessCancelFunc cancelFunc); -TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis); -TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, - TessProgressFunc progressFunc); -TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline); - -#ifdef __cplusplus -} -#endif - -#endif // API_CAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/export.h deleted file mode 100644 index d238b628..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/export.h +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: export.h -// Description: Place holder -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_PLATFORM_H_ -#define TESSERACT_PLATFORM_H_ - -#ifndef TESS_API -# if defined(_WIN32) || defined(__CYGWIN__) -# if defined(TESS_EXPORTS) -# define TESS_API __declspec(dllexport) -# elif defined(TESS_IMPORTS) -# define TESS_API __declspec(dllimport) -# else -# define TESS_API -# endif -# else -# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS) -# define TESS_API __attribute__((visibility("default"))) -# else -# define TESS_API -# endif -# endif -#endif - -#endif // TESSERACT_PLATFORM_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ltrresultiterator.h deleted file mode 100644 index 6ca0a98e..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ltrresultiterator.h +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: ltrresultiterator.h -// Description: Iterator for tesseract results in strict left-to-right -// order that avoids using tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API -#include "pageiterator.h" // for PageIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -namespace tesseract { - -class BLOB_CHOICE_IT; -class PAGE_RES; -class WERD_RES; - -class Tesseract; - -// Class to iterate over tesseract results, providing access to all levels -// of the page hierarchy, without including any tesseract headers or having -// to handle any tesseract structures. -// WARNING! This class points to data held within the TessBaseAPI class, and -// therefore can only be used while the TessBaseAPI class still exists and -// has not been subjected to a call of Init, SetImage, Recognize, Clear, End -// DetectOS, or anything else that changes the internal PAGE_RES. -// See tesseract/publictypes.h for the definition of PageIteratorLevel. -// See also base class PageIterator, which contains the bulk of the interface. -// LTRResultIterator adds text-specific methods for access to OCR output. - -class TESS_API LTRResultIterator : public PageIterator { - friend class ChoiceIterator; - -public: - // page_res and tesseract come directly from the BaseAPI. - // The rectangle parameters are copied indirectly from the Thresholder, - // via the BaseAPI. They represent the coordinates of some rectangle in an - // original image (in top-left-origin coordinates) and therefore the top-left - // needs to be added to any output boxes in order to specify coordinates - // in the original image. See TessBaseAPI::SetRectangle. - // The scale and scaled_yres are in case the Thresholder scaled the image - // rectangle prior to thresholding. Any coordinates in tesseract's image - // must be divided by scale before adding (rect_left, rect_top). - // The scaled_yres indicates the effective resolution of the binary image - // that tesseract has been given by the Thresholder. - // After the constructor, Begin has already been called. - LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, - int rect_width, int rect_height); - - ~LTRResultIterator() override; - - // LTRResultIterators may be copied! This makes it possible to iterate over - // all the objects at a lower level, while maintaining an iterator to - // objects at a higher level. These constructors DO NOT CALL Begin, so - // iterations will continue from the location of src. - // TODO: For now the copy constructor and operator= only need the base class - // versions, but if new data members are added, don't forget to add them! - - // ============= Moving around within the page ============. - - // See PageIterator. - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // object at the given level. Use delete [] to free after use. - char *GetUTF8Text(PageIteratorLevel level) const; - - // Set the string inserted at the end of each text line. "\n" by default. - void SetLineSeparator(const char *new_line); - - // Set the string inserted at the end of each paragraph. "\n" by default. - void SetParagraphSeparator(const char *new_para); - - // Returns the mean confidence of the current object at the given level. - // The number should be interpreted as a percent probability. (0.0f-100.0f) - float Confidence(PageIteratorLevel level) const; - - // ============= Functions that refer to words only ============. - - // Returns the font attributes of the current word. If iterating at a higher - // level object than words, eg textlines, then this will return the - // attributes of the first word in that textline. - // The actual return value is a string representing a font name. It points - // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as - // the iterator itself, ie rendered invalid by various members of - // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. - // Pointsize is returned in printers points (1/72 inch.) - const char *WordFontAttributes(bool *is_bold, bool *is_italic, - bool *is_underlined, bool *is_monospace, - bool *is_serif, bool *is_smallcaps, - int *pointsize, int *font_id) const; - - // Return the name of the language used to recognize this word. - // On error, nullptr. Do not delete this pointer. - const char *WordRecognitionLanguage() const; - - // Return the overall directionality of this word. - StrongScriptDirection WordDirection() const; - - // Returns true if the current word was found in a dictionary. - bool WordIsFromDictionary() const; - - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // Returns true if the current word is numeric. - bool WordIsNumeric() const; - - // Returns true if the word contains blamer information. - bool HasBlamerInfo() const; - - // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle - // of the current word. - const void *GetParamsTrainingBundle() const; - - // Returns a pointer to the string with blamer information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerDebug() const; - - // Returns a pointer to the string with misadaption information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerMisadaptionDebug() const; - - // Returns true if a truth string was recorded for the current word. - bool HasTruthString() const; - - // Returns true if the given string is equivalent to the truth string for - // the current word. - bool EquivalentToTruth(const char *str) const; - - // Returns a null terminated UTF-8 encoded truth string for the current word. - // Use delete [] to free after use. - char *WordTruthUTF8Text() const; - - // Returns a null terminated UTF-8 encoded normalized OCR string for the - // current word. Use delete [] to free after use. - char *WordNormedUTF8Text() const; - - // Returns a pointer to serialized choice lattice. - // Fills lattice_size with the number of bytes in lattice data. - const char *WordLattice(int *lattice_size) const; - - // ============= Functions that refer to symbols only ============. - - // Returns true if the current symbol is a superscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSuperscript() const; - // Returns true if the current symbol is a subscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSubscript() const; - // Returns true if the current symbol is a dropcap. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsDropcap() const; - -protected: - const char *line_separator_; - const char *paragraph_separator_; -}; - -// Class to iterate over the classifier choices for a single RIL_SYMBOL. -class TESS_API ChoiceIterator { -public: - // Construction is from a LTRResultIterator that points to the symbol of - // interest. The ChoiceIterator allows a one-shot iteration over the - // choices for this symbol and after that it is useless. - explicit ChoiceIterator(const LTRResultIterator &result_it); - ~ChoiceIterator(); - - // Moves to the next choice for the symbol and returns false if there - // are none left. - bool Next(); - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // choice. - // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an - // internal structure and should NOT be delete[]ed to free after use. - const char *GetUTF8Text() const; - - // Returns the confidence of the current choice depending on the used language - // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All - // choices for one symbol should roughly add up to 1.0f. - // If only traineddata of the legacy engine is used, the number should be - // interpreted as a percent probability. (0.0f-100.0f) In this case - // probabilities won't add up to 100. Each one stands on its own. - float Confidence() const; - - // Returns a vector containing all timesteps, which belong to the currently - // selected symbol. A timestep is a vector containing pairs of symbols and - // floating point numbers. The number states the probability for the - // corresponding symbol. - std::vector>> *Timesteps() const; - -private: - // clears the remaining spaces out of the results and adapt the probabilities - void filterSpaces(); - // Pointer to the WERD_RES object owned by the API. - WERD_RES *word_res_; - // Iterator over the blob choices. - BLOB_CHOICE_IT *choice_it_; - std::vector> *LSTM_choices_ = nullptr; - std::vector>::iterator LSTM_choice_it_; - - const int *tstep_index_; - // regulates the rating granularity - double rating_coefficient_; - // leading blanks - int blanks_before_word_; - // true when there is lstm engine related trained data - bool oemLSTM_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ocrclass.h deleted file mode 100644 index a55e6528..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ocrclass.h +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/********************************************************************** - * File: ocrclass.h - * Description: Class definitions and constants for the OCR API. - * Author: Hewlett-Packard Co - * - * (C) Copyright 1996, Hewlett-Packard Co. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -/********************************************************************** - * This file contains typedefs for all the structures used by - * the HP OCR interface. - * The structures are designed to allow them to be used with any - * structure alignment up to 8. - **********************************************************************/ - -#ifndef CCUTIL_OCRCLASS_H_ -#define CCUTIL_OCRCLASS_H_ - -#include -#include - -namespace tesseract { - -/********************************************************************** - * EANYCODE_CHAR - * Description of a single character. The character code is defined by - * the character set of the current font. - * Output text is sent as an array of these structures. - * Spaces and line endings in the output are represented in the - * structures of the surrounding characters. They are not directly - * represented as characters. - * The first character in a word has a positive value of blanks. - * Missing information should be set to the defaults in the comments. - * If word bounds are known, but not character bounds, then the top and - * bottom of each character should be those of the word. The left of the - * first and right of the last char in each word should be set. All other - * lefts and rights should be set to -1. - * If set, the values of right and bottom are left+width and top+height. - * Most of the members come directly from the parameters to ocr_append_char. - * The formatting member uses the enhancement parameter and combines the - * line direction stuff into the top 3 bits. - * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para, - * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what - * the coding is, only that it is backwards compatible with the previous - * version. - **********************************************************************/ - -struct EANYCODE_CHAR { /*single character */ - // It should be noted that the format for char_code for version 2.0 and beyond - // is UTF8 which means that ASCII characters will come out as one structure - // but other characters will be returned in two or more instances of this - // structure with a single byte of the UTF8 code in each, but each will have - // the same bounding box. Programs which want to handle languagues with - // different characters sets will need to handle extended characters - // appropriately, but *all* code needs to be prepared to receive UTF8 coded - // characters for characters such as bullet and fancy quotes. - uint16_t char_code; /*character itself */ - int16_t left; /*of char (-1) */ - int16_t right; /*of char (-1) */ - int16_t top; /*of char (-1) */ - int16_t bottom; /*of char (-1) */ - int16_t font_index; /*what font (0) */ - uint8_t confidence; /*0=perfect, 100=reject (0/100) */ - uint8_t point_size; /*of char, 72=i inch, (10) */ - int8_t blanks; /*no of spaces before this char (1) */ - uint8_t formatting; /*char formatting (0) */ -}; - -/********************************************************************** - * ETEXT_DESC - * Description of the output of the OCR engine. - * This structure is used as both a progress monitor and the final - * output header, since it needs to be a valid progress monitor while - * the OCR engine is storing its output to shared memory. - * During progress, all the buffer info is -1. - * Progress starts at 0 and increases to 100 during OCR. No other constraint. - * Additionally the progress callback contains the bounding box of the word that - * is currently being processed. - * Every progress callback, the OCR engine must set ocr_alive to 1. - * The HP side will set ocr_alive to 0. Repeated failure to reset - * to 1 indicates that the OCR engine is dead. - * If the cancel function is not null then it is called with the number of - * user words found. If it returns true then operation is cancelled. - **********************************************************************/ -class ETEXT_DESC; - -using CANCEL_FUNC = bool (*)(void *, int); -using PROGRESS_FUNC = bool (*)(int, int, int, int, int); -using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int); - -class ETEXT_DESC { // output header -public: - int16_t count{0}; /// chars in this buffer(0) - int16_t progress{0}; /// percent complete increasing (0-100) - /** Progress monitor covers word recognition and it does not cover layout - * analysis. - * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */ - int8_t more_to_come{0}; /// true if not last - volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0 - int8_t err_code{0}; /// for errcode use - CANCEL_FUNC cancel{nullptr}; /// returns true to cancel - PROGRESS_FUNC progress_callback{ - nullptr}; /// called whenever progress increases - PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback - void *cancel_this{nullptr}; /// this or other data for cancel - std::chrono::steady_clock::time_point end_time; - /// Time to stop. Expected to be set only - /// by call to set_deadline_msecs(). - EANYCODE_CHAR text[1]{}; /// character data - - ETEXT_DESC() : progress_callback2(&default_progress_func) { - end_time = std::chrono::time_point(); - } - - // Sets the end time to be deadline_msecs milliseconds from now. - void set_deadline_msecs(int32_t deadline_msecs) { - if (deadline_msecs > 0) { - end_time = std::chrono::steady_clock::now() + - std::chrono::milliseconds(deadline_msecs); - } - } - - // Returns false if we've not passed the end_time, or have not set a deadline. - bool deadline_exceeded() const { - if (end_time.time_since_epoch() == - std::chrono::steady_clock::duration::zero()) { - return false; - } - auto now = std::chrono::steady_clock::now(); - return (now > end_time); - } - -private: - static bool default_progress_func(ETEXT_DESC *ths, int left, int right, - int top, int bottom) { - if (ths->progress_callback != nullptr) { - return (*(ths->progress_callback))(ths->progress, left, right, top, - bottom); - } - return true; - } -}; - -} // namespace tesseract - -#endif // CCUTIL_OCRCLASS_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/osdetect.h deleted file mode 100644 index 34bfb557..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/osdetect.h +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: osdetect.h -// Description: Orientation and script detection. -// Author: Samuel Charron -// Ranjith Unnikrishnan -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_OSDETECT_H_ -#define TESSERACT_CCMAIN_OSDETECT_H_ - -#include "export.h" // for TESS_API - -#include // for std::vector - -namespace tesseract { - -class BLOBNBOX; -class BLOBNBOX_CLIST; -class BLOB_CHOICE_LIST; -class TO_BLOCK_LIST; -class UNICHARSET; - -class Tesseract; - -// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur -const int kMaxNumberOfScripts = 116 + 1 + 2 + 1; - -struct OSBestResult { - OSBestResult() - : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {} - int orientation_id; - int script_id; - float sconfidence; - float oconfidence; -}; - -struct OSResults { - OSResults() : unicharset(nullptr) { - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < kMaxNumberOfScripts; ++j) { - scripts_na[i][j] = 0; - } - orientations[i] = 0; - } - } - void update_best_orientation(); - // Set the estimate of the orientation to the given id. - void set_best_orientation(int orientation_id); - // Update/Compute the best estimate of the script assuming the given - // orientation id. - void update_best_script(int orientation_id); - // Return the index of the script with the highest score for this orientation. - TESS_API int get_best_script(int orientation_id) const; - // Accumulate scores with given OSResults instance and update the best script. - void accumulate(const OSResults &osr); - - // Print statistics. - void print_scores(void) const; - void print_scores(int orientation_id) const; - - // Array holding scores for each orientation id [0,3]. - // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the - // page respectively, where the values refer to the amount of clockwise - // rotation to be applied to the page for the text to be upright and readable. - float orientations[4]; - // Script confidence scores for each of 4 possible orientations. - float scripts_na[4][kMaxNumberOfScripts]; - - UNICHARSET *unicharset; - OSBestResult best_result; -}; - -class OrientationDetector { -public: - OrientationDetector(const std::vector *allowed_scripts, - OSResults *results); - bool detect_blob(BLOB_CHOICE_LIST *scores); - int get_orientation(); - -private: - OSResults *osr_; - const std::vector *allowed_scripts_; -}; - -class ScriptDetector { -public: - ScriptDetector(const std::vector *allowed_scripts, OSResults *osr, - tesseract::Tesseract *tess); - void detect_blob(BLOB_CHOICE_LIST *scores); - bool must_stop(int orientation) const; - -private: - OSResults *osr_; - static const char *korean_script_; - static const char *japanese_script_; - static const char *fraktur_script_; - int korean_id_; - int japanese_id_; - int katakana_id_; - int hiragana_id_; - int han_id_; - int hangul_id_; - int latin_id_; - int fraktur_id_; - tesseract::Tesseract *tess_; - const std::vector *allowed_scripts_; -}; - -int orientation_and_script_detection(const char *filename, OSResults *, - tesseract::Tesseract *); - -int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, - tesseract::Tesseract *tess); - -int os_detect_blobs(const std::vector *allowed_scripts, - BLOBNBOX_CLIST *blob_list, OSResults *osr, - tesseract::Tesseract *tess); - -bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, - OSResults *, tesseract::Tesseract *tess); - -// Helper method to convert an orientation index to its value in degrees. -// The value represents the amount of clockwise rotation in degrees that must be -// applied for the text to be upright (readable). -TESS_API int OrientationIdToValue(const int &id); - -} // namespace tesseract - -#endif // TESSERACT_CCMAIN_OSDETECT_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/pageiterator.h deleted file mode 100644 index 68739715..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/pageiterator.h +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: pageiterator.h -// Description: Iterator for tesseract page structure that avoids using -// tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ -#define TESSERACT_CCMAIN_PAGEITERATOR_H_ - -#include "export.h" -#include "publictypes.h" - -struct Pix; -struct Pta; - -namespace tesseract { - -struct BlamerBundle; -class C_BLOB_IT; -class PAGE_RES; -class PAGE_RES_IT; -class WERD; - -class Tesseract; - -/** - * Class to iterate over tesseract page structure, providing access to all - * levels of the page hierarchy, without including any tesseract headers or - * having to handle any tesseract structures. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - * See tesseract/publictypes.h for the definition of PageIteratorLevel. - * See also ResultIterator, derived from PageIterator, which adds in the - * ability to access OCR output with text-specific methods. - */ - -class TESS_API PageIterator { -public: - /** - * page_res and tesseract come directly from the BaseAPI. - * The rectangle parameters are copied indirectly from the Thresholder, - * via the BaseAPI. They represent the coordinates of some rectangle in an - * original image (in top-left-origin coordinates) and therefore the top-left - * needs to be added to any output boxes in order to specify coordinates - * in the original image. See TessBaseAPI::SetRectangle. - * The scale and scaled_yres are in case the Thresholder scaled the image - * rectangle prior to thresholding. Any coordinates in tesseract's image - * must be divided by scale before adding (rect_left, rect_top). - * The scaled_yres indicates the effective resolution of the binary image - * that tesseract has been given by the Thresholder. - * After the constructor, Begin has already been called. - */ - PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, int rect_width, - int rect_height); - virtual ~PageIterator(); - - /** - * Page/ResultIterators may be copied! This makes it possible to iterate over - * all the objects at a lower level, while maintaining an iterator to - * objects at a higher level. These constructors DO NOT CALL Begin, so - * iterations will continue from the location of src. - */ - PageIterator(const PageIterator &src); - const PageIterator &operator=(const PageIterator &src); - - /** Are we positioned at the same location as other? */ - bool PositionedAtSameWord(const PAGE_RES_IT *other) const; - - // ============= Moving around within the page ============. - - /** - * Moves the iterator to point to the start of the page to begin an - * iteration. - */ - virtual void Begin(); - - /** - * Moves the iterator to the beginning of the paragraph. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word on the first row of the paragraph. - */ - virtual void RestartParagraph(); - - /** - * Return whether this iterator points anywhere in the first textline of a - * paragraph. - */ - bool IsWithinFirstTextlineOfParagraph() const; - - /** - * Moves the iterator to the beginning of the text line. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word of the row. - */ - virtual void RestartRow(); - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy, and returns false if the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - virtual bool Next(PageIteratorLevel level); - - /** - * Returns true if the iterator is at the start of an object at the given - * level. - * - * For instance, suppose an iterator it is pointed to the first symbol of the - * first word of the third line of the second paragraph of the first block in - * a page, then: - * it.IsAtBeginningOf(RIL_BLOCK) = false - * it.IsAtBeginningOf(RIL_PARA) = false - * it.IsAtBeginningOf(RIL_TEXTLINE) = true - * it.IsAtBeginningOf(RIL_WORD) = true - * it.IsAtBeginningOf(RIL_SYMBOL) = true - */ - virtual bool IsAtBeginningOf(PageIteratorLevel level) const; - - /** - * Returns whether the iterator is positioned at the last element in a - * given level. (e.g. the last word in a line, the last line in a block) - * - * Here's some two-paragraph example - * text. It starts off innocuously - * enough but quickly turns bizarre. - * The author inserts a cornucopia - * of words to guard against confused - * references. - * - * Now take an iterator it pointed to the start of "bizarre." - * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false - * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true - * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false - */ - virtual bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const; - - /** - * Returns whether this iterator is positioned - * before other: -1 - * equal to other: 0 - * after other: 1 - */ - int Cmp(const PageIterator &other) const; - - // ============= Accessing data ==============. - // Coordinate system: - // Integer coordinates are at the cracks between the pixels. - // The top-left corner of the top-left pixel in the image is at (0,0). - // The bottom-right corner of the bottom-right pixel in the image is at - // (width, height). - // Every bounding box goes from the top-left of the top-left contained - // pixel to the bottom-right of the bottom-right contained pixel, so - // the bounding box of the single top-left pixel in the image is: - // (0,0)->(1,1). - // If an image rectangle has been set in the API, then returned coordinates - // relate to the original (full) image, rather than the rectangle. - - /** - * Controls what to include in a bounding box. Bounding boxes of all levels - * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. - * Between layout analysis and recognition, it isn't known where all - * diacritics belong, so this control is used to include or exclude some - * diacritics that are above or below the main body of the word. In most cases - * where the placement is obvious, and after recognition, it doesn't make as - * much difference, as the diacritics will already be included in the word. - */ - void SetBoundingBoxComponents(bool include_upper_dots, - bool include_lower_dots) { - include_upper_dots_ = include_upper_dots; - include_lower_dots_ = include_lower_dots; - } - - /** - * Returns the bounding rectangle of the current object at the given level. - * See comment on coordinate system above. - * Returns false if there is no such object at the current position. - * The returned bounding box is guaranteed to match the size and position - * of the image returned by GetBinaryImage, but may clip foreground pixels - * from a grey image. The padding argument to GetImage can be used to expand - * the image to include more foreground pixels. See GetImage below. - */ - bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, - int *bottom) const; - bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, - int *right, int *bottom) const; - /** - * Returns the bounding rectangle of the object in a coordinate system of the - * working image rectangle having its origin at (rect_left_, rect_top_) with - * respect to the original image and is scaled by a factor scale_. - */ - bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, - int *right, int *bottom) const; - - /** Returns whether there is no object of a given level. */ - bool Empty(PageIteratorLevel level) const; - - /** - * Returns the type of the current block. - * See tesseract/publictypes.h for PolyBlockType. - */ - PolyBlockType BlockType() const; - - /** - * Returns the polygon outline of the current block. The returned Pta must - * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices - * of the polygon, and the last edge is the line segment between the last - * point and the first point. nullptr will be returned if the iterator is - * at the end of the document or layout analysis was not used. - */ - Pta *BlockPolygon() const; - - /** - * Returns a binary image of the current object at the given level. - * The position and size match the return from BoundingBoxInternal, and so - * this could be upscaled with respect to the original input image. - * Use pixDestroy to delete the image after use. - */ - Pix *GetBinaryImage(PageIteratorLevel level) const; - - /** - * Returns an image of the current object at the given level in greyscale - * if available in the input. To guarantee a binary image use BinaryImage. - * NOTE that in order to give the best possible image, the bounds are - * expanded slightly over the binary connected component, by the supplied - * padding, so the top-left position of the returned image is returned - * in (left,top). These will most likely not match the coordinates - * returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. - * Use pixDestroy to delete the image after use. - */ - Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, - int *left, int *top) const; - - /** - * Returns the baseline of the current object at the given level. - * The baseline is the line that passes through (x1, y1) and (x2, y2). - * WARNING: with vertical text, baselines may be vertical! - * Returns false if there is no baseline at the current position. - */ - bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, - int *y2) const; - - // Returns the attributes of the current row. - void RowAttributes(float *row_height, float *descenders, - float *ascenders) const; - - /** - * Returns orientation for the block the iterator points to. - * orientation, writing_direction, textline_order: see publictypes.h - * deskew_angle: after rotating the block so the text orientation is - * upright, how many radians does one have to rotate the - * block anti-clockwise for it to be level? - * -Pi/4 <= deskew_angle <= Pi/4 - */ - void Orientation(tesseract::Orientation *orientation, - tesseract::WritingDirection *writing_direction, - tesseract::TextlineOrder *textline_order, - float *deskew_angle) const; - - /** - * Returns information about the current paragraph, if available. - * - * justification - - * LEFT if ragged right, or fully justified and script is left-to-right. - * RIGHT if ragged left, or fully justified and script is right-to-left. - * unknown if it looks like source code or we have very few lines. - * is_list_item - - * true if we believe this is a member of an ordered or unordered list. - * is_crown - - * true if the first line of the paragraph is aligned with the other - * lines of the paragraph even though subsequent paragraphs have first - * line indents. This typically indicates that this is the continuation - * of a previous paragraph or that it is the very first paragraph in - * the chapter. - * first_line_indent - - * For LEFT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the left edge of the - * rest of the paragraph. - * for RIGHT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the right edge of the - * rest of the paragraph. - * NOTE 1: This value may be negative. - * NOTE 2: if *is_crown == true, the first line of this paragraph is - * actually flush, and first_line_indent is set to the "common" - * first_line_indent for subsequent paragraphs in this block - * of text. - */ - void ParagraphInfo(tesseract::ParagraphJustification *justification, - bool *is_list_item, bool *is_crown, - int *first_line_indent) const; - - // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle - // of the current word to the given pointer (takes ownership of the pointer) - // and returns true. - // Can only be used when iterating on the word level. - bool SetWordBlamerBundle(BlamerBundle *blamer_bundle); - -protected: - /** - * Sets up the internal data for iterating the blobs of a new word, then - * moves the iterator to the given offset. - */ - void BeginWord(int offset); - - /** Pointer to the page_res owned by the API. */ - PAGE_RES *page_res_; - /** Pointer to the Tesseract object owned by the API. */ - Tesseract *tesseract_; - /** - * The iterator to the page_res_. Owned by this ResultIterator. - * A pointer just to avoid dragging in Tesseract includes. - */ - PAGE_RES_IT *it_; - /** - * The current input WERD being iterated. If there is an output from OCR, - * then word_ is nullptr. Owned by the API - */ - WERD *word_; - /** The length of the current word_. */ - int word_length_; - /** The current blob index within the word. */ - int blob_index_; - /** - * Iterator to the blobs within the word. If nullptr, then we are iterating - * OCR results in the box_word. - * Owned by this ResultIterator. - */ - C_BLOB_IT *cblob_it_; - /** Control over what to include in bounding boxes. */ - bool include_upper_dots_; - bool include_lower_dots_; - /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ - int scale_; - int scaled_yres_; - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/publictypes.h deleted file mode 100644 index 0069cf28..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/publictypes.h +++ /dev/null @@ -1,281 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: publictypes.h -// Description: Types used in both the API and internally -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_ -#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_ - -namespace tesseract { - -// This file contains types that are used both by the API and internally -// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic -// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT. -// Restated: It is OK for low-level Tesseract files to include publictypes.h, -// but not for the low-level tesseract code to include top-level API code. -// This file should not use other Tesseract types, as that would drag -// their includes into the API-level. - -/** Number of printers' points in an inch. The unit of the pointsize return. */ -constexpr int kPointsPerInch = 72; -/** - * Minimum believable resolution. Used as a default if there is no other - * information, as it is safer to under-estimate than over-estimate. - */ -constexpr int kMinCredibleResolution = 70; -/** Maximum believable resolution. */ -constexpr int kMaxCredibleResolution = 2400; -/** - * Ratio between median blob size and likely resolution. Used to estimate - * resolution when none is provided. This is basically 1/usual text size in - * inches. */ -constexpr int kResolutionEstimationFactor = 10; - -/** - * Possible types for a POLY_BLOCK or ColPartition. - * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions - * below, as well as kPolyBlockNames in layout_test.cc. - * Used extensively by ColPartition, and POLY_BLOCK. - */ -enum PolyBlockType { - PT_UNKNOWN, // Type is not yet known. Keep as the first element. - PT_FLOWING_TEXT, // Text that lives inside a column. - PT_HEADING_TEXT, // Text that spans more than one column. - PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region. - PT_EQUATION, // Partition belonging to an equation region. - PT_INLINE_EQUATION, // Partition has inline equation. - PT_TABLE, // Partition belonging to a table region. - PT_VERTICAL_TEXT, // Text-line runs vertically. - PT_CAPTION_TEXT, // Text that belongs to an image. - PT_FLOWING_IMAGE, // Image that lives inside a column. - PT_HEADING_IMAGE, // Image that spans more than one column. - PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region. - PT_HORZ_LINE, // Horizontal Line. - PT_VERT_LINE, // Vertical Line. - PT_NOISE, // Lies outside of any column. - PT_COUNT -}; - -/** Returns true if PolyBlockType is of horizontal line type */ -inline bool PTIsLineType(PolyBlockType type) { - return type == PT_HORZ_LINE || type == PT_VERT_LINE; -} -/** Returns true if PolyBlockType is of image type */ -inline bool PTIsImageType(PolyBlockType type) { - return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE || - type == PT_PULLOUT_IMAGE; -} -/** Returns true if PolyBlockType is of text type */ -inline bool PTIsTextType(PolyBlockType type) { - return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT || - type == PT_PULLOUT_TEXT || type == PT_TABLE || - type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT || - type == PT_INLINE_EQUATION; -} -// Returns true if PolyBlockType is of pullout(inter-column) type -inline bool PTIsPulloutType(PolyBlockType type) { - return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT; -} - -/** - * +------------------+ Orientation Example: - * | 1 Aaaa Aaaa Aaaa | ==================== - * | Aaa aa aaa aa | To left is a diagram of some (1) English and - * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit. - * | 2 | - * | ####### c c C | Upright Latin characters are represented as A and a. - * | ####### c c c | '<' represents a latin character rotated - * | < ####### c c c | anti-clockwise 90 degrees. - * | < ####### c c | - * | < ####### . c | Upright Chinese characters are represented C and c. - * | 3 ####### c | - * +------------------+ NOTA BENE: enum values here should match goodoc.proto - - * If you orient your head so that "up" aligns with Orientation, - * then the characters will appear "right side up" and readable. - * - * In the example above, both the English and Chinese paragraphs are oriented - * so their "up" is the top of the page (page up). The photo credit is read - * with one's head turned leftward ("up" is to page left). - * - * The values of this enum match the convention of Tesseract's osdetect.h -*/ -enum Orientation { - ORIENTATION_PAGE_UP = 0, - ORIENTATION_PAGE_RIGHT = 1, - ORIENTATION_PAGE_DOWN = 2, - ORIENTATION_PAGE_LEFT = 3, -}; - -/** - * The grapheme clusters within a line of text are laid out logically - * in this direction, judged when looking at the text line rotated so that - * its Orientation is "page up". - * - * For English text, the writing direction is left-to-right. For the - * Chinese text in the above example, the writing direction is top-to-bottom. - */ -enum WritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT = 0, - WRITING_DIRECTION_RIGHT_TO_LEFT = 1, - WRITING_DIRECTION_TOP_TO_BOTTOM = 2, -}; - -/** - * The text lines are read in the given sequence. - * - * In English, the order is top-to-bottom. - * In Chinese, vertical text lines are read right-to-left. Mongolian is - * written in vertical columns top to bottom like Chinese, but the lines - * order left-to right. - * - * Note that only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM - */ -enum TextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, - TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, - TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, -}; - -/** - * Possible modes for page layout analysis. These *must* be kept in order - * of decreasing amount of layout analysis to be done, except for OSD_ONLY, - * so that the inequality test macros below work. - */ -enum PageSegMode { - PSM_OSD_ONLY = 0, ///< Orientation and script detection only. - PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and - ///< script detection. (OSD) - PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR. - PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD. - PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes. - PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of - ///< vertically aligned text. - PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.) - PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line. - PSM_SINGLE_WORD = 8, ///< Treat the image as a single word. - PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle. - PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character. - PSM_SPARSE_TEXT = - 11, ///< Find as much text as possible in no particular order. - PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det. - PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing - ///< hacks that are Tesseract-specific. - - PSM_COUNT ///< Number of enum entries. -}; - -/** - * Inline functions that act on a PageSegMode to determine whether components of - * layout analysis are enabled. - * *Depend critically on the order of elements of PageSegMode.* - * NOTE that arg is an int for compatibility with INT_PARAM. - */ -inline bool PSM_OSD_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO; -} -inline bool PSM_SPARSE(int pageseg_mode) { - return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN; -} -inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK; -} -inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) { - return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) || - pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} - -/** - * enum of the elements of the page hierarchy, used in ResultIterator - * to provide functions that operate on each level without having to - * have 5x as many functions. - */ -enum PageIteratorLevel { - RIL_BLOCK, // Block of text/image/separator line. - RIL_PARA, // Paragraph within a block. - RIL_TEXTLINE, // Line within a paragraph. - RIL_WORD, // Word within a textline. - RIL_SYMBOL // Symbol/character within a word. -}; - -/** - * JUSTIFICATION_UNKNOWN - * The alignment is not clearly one of the other options. This could happen - * for example if there are only one or two lines of text or the text looks - * like source code or poetry. - * - * NOTA BENE: Fully justified paragraphs (text aligned to both left and right - * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text - * is written with a left-to-right script and with JUSTIFICATION_RIGHT if - * their text is written in a right-to-left script. - * - * Interpretation for text read in vertical lines: - * "Left" is wherever the starting reading position is. - * - * JUSTIFICATION_LEFT - * Each line, except possibly the first, is flush to the same left tab stop. - * - * JUSTIFICATION_CENTER - * The text lines of the paragraph are centered about a line going - * down through their middle of the text lines. - * - * JUSTIFICATION_RIGHT - * Each line, except possibly the first, is flush to the same right tab stop. - */ -enum ParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT, -}; - -/** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the combiner. - * The preference of which engine to use is stored in tessedit_ocr_engine_mode. - * - * ATTENTION: When modifying this enum, please make sure to make the - * appropriate changes to all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ -enum OcrEngineMode { - OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated - OEM_LSTM_ONLY, // Run just the LSTM line recognizer. - OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback - // to Tesseract when things get difficult. - // deprecated - OEM_DEFAULT, // Specify this mode when calling init_*(), - // to indicate that any of the above modes - // should be automatically inferred from the - // variables in the language-specific config, - // command-line configs, or if not specified - // in any of the above should be set to the - // default OEM_TESSERACT_ONLY. - OEM_COUNT // Number of OEMs -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/renderer.h deleted file mode 100644 index 6f405233..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/renderer.h +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: renderer.h -// Description: Rendering interface to inject into TessBaseAPI -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_RENDERER_H_ -#define TESSERACT_API_RENDERER_H_ - -#include "export.h" - -// To avoid collision with other typenames include the ABSOLUTE MINIMUM -// complexity of includes here. Use forward declarations wherever possible -// and hide includes of complex types in baseapi.cpp. -#include -#include // for std::string -#include // for std::vector - -struct Pix; - -namespace tesseract { - -class TessBaseAPI; - -/** - * Interface for rendering tesseract results into a document, such as text, - * HOCR or pdf. This class is abstract. Specific classes handle individual - * formats. This interface is then used to inject the renderer class into - * tesseract when processing images. - * - * For simplicity implementing this with tesseract version 3.01, - * the renderer contains document state that is cleared from document - * to document just as the TessBaseAPI is. This way the base API can just - * delegate its rendering functionality to injected renderers, and the - * renderers can manage the associated state needed for the specific formats - * in addition to the heuristics for producing it. - */ -class TESS_API TessResultRenderer { -public: - virtual ~TessResultRenderer(); - - // Takes ownership of pointer so must be new'd instance. - // Renderers aren't ordered, but appends the sequences of next parameter - // and existing next(). The renderers should be unique across both lists. - void insert(TessResultRenderer *next); - - // Returns the next renderer or nullptr. - TessResultRenderer *next() { - return next_; - } - - /** - * Starts a new document with the given title. - * This clears the contents of the output data. - * Title should use UTF-8 encoding. - */ - bool BeginDocument(const char *title); - - /** - * Adds the recognized text from the source image to the current document. - * Invalid if BeginDocument not yet called. - * - * Note that this API is a bit weird but is designed to fit into the - * current TessBaseAPI implementation where the api has lots of state - * information that we might want to add in. - */ - bool AddImage(TessBaseAPI *api); - - /** - * Finishes the document and finalizes the output data - * Invalid if BeginDocument not yet called. - */ - bool EndDocument(); - - const char *file_extension() const { - return file_extension_; - } - const char *title() const { - return title_.c_str(); - } - - // Is everything fine? Otherwise something went wrong. - bool happy() const { - return happy_; - } - - /** - * Returns the index of the last image given to AddImage - * (i.e. images are incremented whether the image succeeded or not) - * - * This is always defined. It means either the number of the - * current image, the last image ended, or in the completed document - * depending on when in the document lifecycle you are looking at it. - * Will return -1 if a document was never started. - */ - int imagenum() const { - return imagenum_; - } - -protected: - /** - * Called by concrete classes. - * - * outputbase is the name of the output file excluding - * extension. For example, "/path/to/chocolate-chip-cookie-recipe" - * - * extension indicates the file extension to be used for output - * files. For example "pdf" will produce a .pdf file, and "hocr" - * will produce .hocr files. - */ - TessResultRenderer(const char *outputbase, const char *extension); - - // Hook for specialized handling in BeginDocument() - virtual bool BeginDocumentHandler(); - - // This must be overridden to render the OCR'd results - virtual bool AddImageHandler(TessBaseAPI *api) = 0; - - // Hook for specialized handling in EndDocument() - virtual bool EndDocumentHandler(); - - // Renderers can call this to append '\0' terminated strings into - // the output string returned by GetOutput. - // This method will grow the output buffer if needed. - void AppendString(const char *s); - - // Renderers can call this to append binary byte sequences into - // the output string returned by GetOutput. Note that s is not necessarily - // '\0' terminated (and can contain '\0' within it). - // This method will grow the output buffer if needed. - void AppendData(const char *s, int len); - -private: - TessResultRenderer *next_; // Can link multiple renderers together - FILE *fout_; // output file pointer - const char *file_extension_; // standard extension for generated output - std::string title_; // title of document being rendered - int imagenum_; // index of last image added - bool happy_; // I get grumpy when the disk fills up, etc. -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessTextRenderer : public TessResultRenderer { -public: - explicit TessTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into an hocr text string - */ -class TESS_API TessHOcrRenderer : public TessResultRenderer { -public: - explicit TessHOcrRenderer(const char *outputbase, bool font_info); - explicit TessHOcrRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into an alto text string - */ -class TESS_API TessAltoRenderer : public TessResultRenderer { -public: - explicit TessAltoRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool begin_document; -}; - -/** - * Renders Tesseract output into a TSV string - */ -class TESS_API TessTsvRenderer : public TessResultRenderer { -public: - explicit TessTsvRenderer(const char *outputbase, bool font_info); - explicit TessTsvRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into searchable PDF - */ -class TESS_API TessPDFRenderer : public TessResultRenderer { -public: - // datadir is the location of the TESSDATA. We need it because - // we load a custom PDF font from this location. - TessPDFRenderer(const char *outputbase, const char *datadir, - bool textonly = false); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - // We don't want to have every image in memory at once, - // so we store some metadata as we go along producing - // PDFs one page at a time. At the end, that metadata is - // used to make everything that isn't easily handled in a - // streaming fashion. - long int obj_; // counter for PDF objects - std::vector offsets_; // offset of every PDF object in bytes - std::vector pages_; // object number for every /Page object - std::string datadir_; // where to find the custom font - bool textonly_; // skip images if set - // Bookkeeping only. DIY = Do It Yourself. - void AppendPDFObjectDIY(size_t objectsize); - // Bookkeeping + emit data. - void AppendPDFObject(const char *data); - // Create the /Contents object for an entire page. - char *GetPDFTextObjects(TessBaseAPI *api, double width, double height); - // Turn an image into a PDF object. Only transcode if we have to. - static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum, - char **pdf_object, long int *pdf_object_size, - int jpg_quality); -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessUnlvRenderer : public TessResultRenderer { -public: - explicit TessUnlvRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string for LSTMBox - */ -class TESS_API TessLSTMBoxRenderer : public TessResultRenderer { -public: - explicit TessLSTMBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessBoxTextRenderer : public TessResultRenderer { -public: - explicit TessBoxTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string in WordStr format - */ -class TESS_API TessWordStrBoxRenderer : public TessResultRenderer { -public: - explicit TessWordStrBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#ifndef DISABLED_LEGACY_ENGINE - -/** - * Renders tesseract output into an osd text string - */ -class TESS_API TessOsdRenderer : public TessResultRenderer { -public: - explicit TessOsdRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#endif // ndef DISABLED_LEGACY_ENGINE - -} // namespace tesseract. - -#endif // TESSERACT_API_RENDERER_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/resultiterator.h deleted file mode 100644 index 3e4d5807..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/resultiterator.h +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: resultiterator.h -// Description: Iterator for tesseract results that is capable of -// iterating in proper reading order over Bi Directional -// (e.g. mixed Hebrew and English) text. -// Author: David Eger -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API, TESS_LOCAL -#include "ltrresultiterator.h" // for LTRResultIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -#include // for std::pair -#include // for std::vector - -namespace tesseract { - -class TESS_API ResultIterator : public LTRResultIterator { -public: - static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); - - /** - * ResultIterator is copy constructible! - * The default copy constructor works just fine for us. - */ - ~ResultIterator() override = default; - - // ============= Moving around within the page ============. - /** - * Moves the iterator to point to the start of the page to begin - * an iteration. - */ - void Begin() override; - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy in the appropriate reading order and returns false if - * the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - bool Next(PageIteratorLevel level) override; - - /** - * IsAtBeginningOf() returns whether we're at the logical beginning of the - * given level. (as opposed to ResultIterator's left-to-right top-to-bottom - * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). - * For a full description, see pageiterator.h - */ - bool IsAtBeginningOf(PageIteratorLevel level) const override; - - /** - * Implement PageIterator's IsAtFinalElement correctly in a BiDi context. - * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we - * point at the last word in a paragraph. See PageIterator for full comment. - */ - bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const override; - - // ============= Functions that refer to words only ============. - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // ============= Accessing data ==============. - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - */ - virtual char *GetUTF8Text(PageIteratorLevel level) const; - - /** - * Returns the LSTM choices for every LSTM timestep for the current word. - */ - virtual std::vector>>> - *GetRawLSTMTimesteps() const; - virtual std::vector>> - *GetBestLSTMSymbolChoices() const; - - /** - * Return whether the current paragraph's dominant reading direction - * is left-to-right (as opposed to right-to-left). - */ - bool ParagraphIsLtr() const; - - // ============= Exposed only for testing =============. - - /** - * Yields the reading order as a sequence of indices and (optional) - * meta-marks for a set of words (given left-to-right). - * The meta marks are passed as negative values: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The next indexed word contains both left-to-right and - * right-to-left characters and was treated as neutral. - * - * For example, suppose we have five words in a text line, - * indexed [0,1,2,3,4] from the leftmost side of the text line. - * The following are all believable reading_orders: - * - * Left-to-Right (in ltr paragraph): - * { 0, 1, 2, 3, 4 } - * Left-to-Right (in rtl paragraph): - * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } - * Right-to-Left (in rtl paragraph): - * { 4, 3, 2, 1, 0 } - * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: - * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 } - */ - static void CalculateTextlineOrder( - bool paragraph_is_ltr, - const std::vector &word_dirs, - std::vector *reading_order); - - static const int kMinorRunStart; - static const int kMinorRunEnd; - static const int kComplexWord; - -protected: - /** - * We presume the data associated with the given iterator will outlive us. - * NB: This is private because it does something that is non-obvious: - * it resets to the beginning of the paragraph instead of staying wherever - * resit might have pointed. - */ - explicit ResultIterator(const LTRResultIterator &resit); - -private: - /** - * Calculates the current paragraph's dominant writing direction. - * Typically, members should use current_paragraph_ltr_ instead. - */ - bool CurrentParagraphIsLtr() const; - - /** - * Returns word indices as measured from resit->RestartRow() = index 0 - * for the reading order of words within a textline given an iterator - * into the middle of the text line. - * In addition to non-negative word indices, the following negative values - * may be inserted: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The previous word contains both left-to-right and - * right-to-left characters and was treated as neutral. - */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *indices) const; - /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *ssd, - std::vector *indices) const; - - /** - * What is the index of the current word in a strict left-to-right reading - * of the row? - */ - int LTRWordIndex() const; - - /** - * Given an iterator pointing at a word, returns the logical reading order - * of blob indices for the word. - */ - void CalculateBlobOrder(std::vector *blob_indices) const; - - /** Precondition: current_paragraph_is_ltr_ is set. */ - void MoveToLogicalStartOfTextline(); - - /** - * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ - * are set. - */ - void MoveToLogicalStartOfWord(); - - /** Are we pointing at the final (reading order) symbol of the word? */ - bool IsAtFinalSymbolOfWord() const; - - /** Are we pointing at the first (reading order) symbol of the word? */ - bool IsAtFirstSymbolOfWord() const; - - /** - * Append any extra marks that should be appended to this word when printed. - * Mostly, these are Unicode BiDi control characters. - */ - void AppendSuffixMarks(std::string *text) const; - - /** Appends the current word in reading order to the given buffer.*/ - void AppendUTF8WordText(std::string *text) const; - - /** - * Appends the text of the current text line, *assuming this iterator is - * positioned at the beginning of the text line* This function - * updates the iterator to point to the first position past the text line. - * Each textline is terminated in a single newline character. - * If the textline ends a paragraph, it gets a second terminal newline. - */ - void IterateAndAppendUTF8TextlineText(std::string *text); - - /** - * Appends the text of the current paragraph in reading order - * to the given buffer. - * Each textline is terminated in a single newline character, and the - * paragraph gets an extra newline at the end. - */ - void AppendUTF8ParagraphText(std::string *text) const; - - /** Returns whether the bidi_debug flag is set to at least min_level. */ - bool BidiDebug(int min_level) const; - - bool current_paragraph_is_ltr_; - - /** - * Is the currently pointed-at character at the beginning of - * a minor-direction run? - */ - bool at_beginning_of_minor_run_; - - /** Is the currently pointed-at character in a minor-direction sequence? */ - bool in_minor_direction_; - - /** - * Should detected inter-word spaces be preserved, or "compressed" to a single - * space character (default behavior). - */ - bool preserve_interword_spaces_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/unichar.h deleted file mode 100644 index 015109d7..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/unichar.h +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: unichar.h -// Description: Unicode character/ligature class. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCUTIL_UNICHAR_H_ -#define TESSERACT_CCUTIL_UNICHAR_H_ - -#include "export.h" - -#include -#include -#include -#include - -namespace tesseract { - -// Maximum number of characters that can be stored in a UNICHAR. Must be -// at least 4. Must not exceed 31 without changing the coding of length. -#define UNICHAR_LEN 30 - -// A UNICHAR_ID is the unique id of a unichar. -using UNICHAR_ID = int; - -// A variable to indicate an invalid or uninitialized unichar id. -static const int INVALID_UNICHAR_ID = -1; -// A special unichar that corresponds to INVALID_UNICHAR_ID. -static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__"; - -enum StrongScriptDirection { - DIR_NEUTRAL = 0, // Text contains only neutral characters. - DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters. - DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters. - DIR_MIX = 3, // Text contains a mixture of left-to-right - // and right-to-left characters. -}; - -using char32 = signed int; - -// The UNICHAR class holds a single classification result. This may be -// a single Unicode character (stored as between 1 and 4 utf8 bytes) or -// multiple Unicode characters representing the NFKC expansion of a ligature -// such as fi, ffl etc. These are also stored as utf8. -class TESS_API UNICHAR { -public: - UNICHAR() { - memset(chars, 0, UNICHAR_LEN); - } - - // Construct from a utf8 string. If len<0 then the string is null terminated. - // If the string is too long to fit in the UNICHAR then it takes only what - // will fit. - UNICHAR(const char *utf8_str, int len); - - // Construct from a single UCS4 character. - explicit UNICHAR(int unicode); - - // Default copy constructor and operator= are OK. - - // Get the first character as UCS-4. - int first_uni() const; - - // Get the length of the UTF8 string. - int utf8_len() const { - int len = chars[UNICHAR_LEN - 1]; - return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; - } - - // Get a UTF8 string, but NOT nullptr terminated. - const char *utf8() const { - return chars; - } - - // Get a terminated UTF8 string: Must delete[] it after use. - char *utf8_str() const; - - // Get the number of bytes in the first character of the given utf8 string. - static int utf8_step(const char *utf8_str); - - // A class to simplify iterating over and accessing elements of a UTF8 - // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or - // take ownership of the underlying byte array. It also does not permit - // modification of the array (as the name suggests). - // - // Example: - // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len); - // it != UNICHAR::end(str, len); - // ++it) { - // printf("UCS-4 symbol code = %d\n", *it); - // char buf[5]; - // int char_len = it.get_utf8(buf); buf[char_len] = '\0'; - // printf("Char = %s\n", buf); - // } - class TESS_API const_iterator { - using CI = const_iterator; - - public: - // Step to the next UTF8 character. - // If the current position is at an illegal UTF8 character, then print an - // error message and step by one byte. If the current position is at a - // nullptr value, don't step past it. - const_iterator &operator++(); - - // Return the UCS-4 value at the current position. - // If the current position is at an illegal UTF8 value, return a single - // space character. - int operator*() const; - - // Store the UTF-8 encoding of the current codepoint into buf, which must be - // at least 4 bytes long. Return the number of bytes written. - // If the current position is at an illegal UTF8 value, writes a single - // space character and returns 1. - // Note that this method does not null-terminate the buffer. - int get_utf8(char *buf) const; - // Returns the number of bytes of the current codepoint. Returns 1 if the - // current position is at an illegal UTF8 value. - int utf8_len() const; - // Returns true if the UTF-8 encoding at the current position is legal. - bool is_legal() const; - - // Return the pointer into the string at the current position. - const char *utf8_data() const { - return it_; - } - - // Iterator equality operators. - friend bool operator==(const CI &lhs, const CI &rhs) { - return lhs.it_ == rhs.it_; - } - friend bool operator!=(const CI &lhs, const CI &rhs) { - return !(lhs == rhs); - } - - private: - friend class UNICHAR; - explicit const_iterator(const char *it) : it_(it) {} - - const char *it_; // Pointer into the string. - }; - - // Create a start/end iterator pointing to a string. Note that these methods - // are static and do NOT create a copy or take ownership of the underlying - // array. - static const_iterator begin(const char *utf8_str, int byte_length); - static const_iterator end(const char *utf8_str, int byte_length); - - // Converts a utf-8 string to a vector of unicodes. - // Returns an empty vector if the input contains invalid UTF-8. - static std::vector UTF8ToUTF32(const char *utf8_str); - // Converts a vector of unicodes to a utf8 string. - // Returns an empty string if the input contains an invalid unicode. - static std::string UTF32ToUTF8(const std::vector &str32); - -private: - // A UTF-8 representation of 1 or more Unicode characters. - // The last element (chars[UNICHAR_LEN - 1]) is a length if - // its value < UNICHAR_LEN, otherwise it is a genuine character. - char chars[UNICHAR_LEN]{}; -}; - -} // namespace tesseract - -#endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/version.h deleted file mode 100644 index 6bac5d66..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/version.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: version.h -// Description: Version information -// -// (C) Copyright 2018, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_VERSION_H_ -#define TESSERACT_API_VERSION_H_ - -// clang-format off - -#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ -#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ -#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ - -#define TESSERACT_VERSION \ - (TESSERACT_MAJOR_VERSION << 16 | \ - TESSERACT_MINOR_VERSION << 8 | \ - TESSERACT_MICRO_VERSION) - -#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" - -// clang-format on - -#endif // TESSERACT_API_VERSION_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/baseapi.h deleted file mode 100644 index 5e1e4830..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/baseapi.h +++ /dev/null @@ -1,812 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: baseapi.h -// Description: Simple API for calling tesseract. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_BASEAPI_H_ -#define TESSERACT_API_BASEAPI_H_ - -#ifdef HAVE_CONFIG_H -# include "config_auto.h" // DISABLED_LEGACY_ENGINE -#endif - -#include "export.h" -#include "pageiterator.h" -#include "publictypes.h" -#include "resultiterator.h" -#include "unichar.h" - -#include "version.h" - -#include -#include // for std::vector - -struct Pix; -struct Pixa; -struct Boxa; - -namespace tesseract { - -class PAGE_RES; -class ParagraphModel; -class BLOCK_LIST; -class ETEXT_DESC; -struct OSResults; -class UNICHARSET; - -class Dawg; -class Dict; -class EquationDetect; -class PageIterator; -class ImageThresholder; -class LTRResultIterator; -class ResultIterator; -class MutableIterator; -class TessResultRenderer; -class Tesseract; - -// Function to read a std::vector from a whole file. -// Returns false on failure. -using FileReader = bool (*)(const char *filename, std::vector *data); - -using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, - bool) const; -using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, - int, const char *, int); - -/** - * Base class for all tesseract APIs. - * Specific classes can add ability to work on different inputs or produce - * different outputs. - * This class is mostly an interface layer on top of the Tesseract instance - * class to hide the data types so that users of this class don't have to - * include any other Tesseract headers. - */ -class TESS_API TessBaseAPI { -public: - TessBaseAPI(); - virtual ~TessBaseAPI(); - // Copy constructor and assignment operator are currently unsupported. - TessBaseAPI(TessBaseAPI const &) = delete; - TessBaseAPI &operator=(TessBaseAPI const &) = delete; - - /** - * Returns the version identifier as a static string. Do not delete. - */ - static const char *Version(); - - /** - * If compiled with OpenCL AND an available OpenCL - * device is deemed faster than serial code, then - * "device" is populated with the cl_device_id - * and returns sizeof(cl_device_id) - * otherwise *device=nullptr and returns 0. - */ - static size_t getOpenCLDevice(void **device); - - /** - * Set the name of the input file. Needed for training and - * reading a UNLV zone file, and for searchable PDF output. - */ - void SetInputName(const char *name); - /** - * These functions are required for searchable PDF output. - * We need our hands on the input file so that we can include - * it in the PDF without transcoding. If that is not possible, - * we need the original image. Finally, resolution metadata - * is stored in the PDF so we need that as well. - */ - const char *GetInputName(); - // Takes ownership of the input pix. - void SetInputImage(Pix *pix); - Pix *GetInputImage(); - int GetSourceYResolution(); - const char *GetDatapath(); - - /** Set the name of the bonus output files. Needed only for debugging. */ - void SetOutputName(const char *name); - - /** - * Set the value of an internal "parameter." - * Supply the name of the parameter and the value as a string, just as - * you would in a config file. - * Returns false if the name lookup failed. - * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. - * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. - * SetVariable may be used before Init, but settings will revert to - * defaults on End(). - * - * Note: Must be called after Init(). Only works for non-init variables - * (init variables should be passed to Init()). - */ - bool SetVariable(const char *name, const char *value); - bool SetDebugVariable(const char *name, const char *value); - - /** - * Returns true if the parameter was found among Tesseract parameters. - * Fills in value with the value of the parameter. - */ - bool GetIntVariable(const char *name, int *value) const; - bool GetBoolVariable(const char *name, bool *value) const; - bool GetDoubleVariable(const char *name, double *value) const; - - /** - * Returns the pointer to the string that represents the value of the - * parameter if it was found among Tesseract parameters. - */ - const char *GetStringVariable(const char *name) const; - -#ifndef DISABLED_LEGACY_ENGINE - - /** - * Print Tesseract fonts table to the given file. - */ - void PrintFontsTable(FILE *fp) const; - -#endif - - /** - * Print Tesseract parameters to the given file. - */ - void PrintVariables(FILE *fp) const; - - /** - * Get value of named variable as a string, if it exists. - */ - bool GetVariableAsString(const char *name, std::string *val) const; - - /** - * Instances are now mostly thread-safe and totally independent, - * but some global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS: - * you use SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your instances. - * - * Start tesseract. Returns zero on success and -1 on failure. - * NOTE that the only members that may be called before Init are those - * listed above here in the class definition. - * - * The datapath must be the name of the tessdata directory. - * The language is (usually) an ISO 639-3 string or nullptr will default to - * eng. It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, or just - * to reset the classifier. - * The language may be a string of the form [~][+[~]]* indicating - * that multiple languages are to be loaded. Eg hin+eng will load Hindi and - * English. Languages may specify internally that they want to be loaded - * with one or more other languages, so the ~ sign is available to override - * that. Eg if hin were set to load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited only by - * memory, with the caveat that loading additional languages will impact - * both speed and accuracy, as there is more work to do to decide on the - * applicable language, and there is more chance of hallucinating incorrect - * words. - * WARNING: On changing languages, all Tesseract parameters are reset - * back to their default values. (Which may vary between languages.) - * If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should explicitly - * call End() and then use SetVariable before Init. This is only a very - * rare use case, since there are very few uses that require any parameters - * to be set before Init. - * - * If set_only_non_debug_params is true, only params that do not contain - * "debug" in the name will be set. - */ - int Init(const char *datapath, const char *language, OcrEngineMode mode, - char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params); - int Init(const char *datapath, const char *language, OcrEngineMode oem) { - return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); - } - int Init(const char *datapath, const char *language) { - return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, - false); - } - // In-memory version reads the traineddata file directly from the given - // data[data_size] array, and/or reads data via a FileReader. - int Init(const char *data, int data_size, const char *language, - OcrEngineMode mode, char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params, FileReader reader); - - /** - * Returns the languages string used in the last valid initialization. - * If the last initialization specified "deu+hin" then that will be - * returned. If hin loaded eng automatically as well, then that will - * not be included in this list. To find the languages actually - * loaded use GetLoadedLanguagesAsVector. - * The returned string should NOT be deleted. - */ - const char *GetInitLanguagesAsString() const; - - /** - * Returns the loaded languages in the vector of std::string. - * Includes all languages loaded by the last Init, including those loaded - * as dependencies of other loaded languages. - */ - void GetLoadedLanguagesAsVector(std::vector *langs) const; - - /** - * Returns the available languages in the sorted vector of std::string. - */ - void GetAvailableLanguagesAsVector(std::vector *langs) const; - - /** - * Init only for page layout analysis. Use only for calls to SetImage and - * AnalysePage. Calls that attempt recognition will generate an error. - */ - void InitForAnalysePage(); - - /** - * Read a "config" file containing a set of param, value pairs. - * Searches the standard places: tessdata/configs, tessdata/tessconfigs - * and also accepts a relative or absolute path name. - * Note: only non-init params will be set (init params are set by Init()). - */ - void ReadConfigFile(const char *filename); - /** Same as above, but only set debug params from the given config file. */ - void ReadDebugConfigFile(const char *filename); - - /** - * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. - * The mode is stored as an IntParam so it can also be modified by - * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). - */ - void SetPageSegMode(PageSegMode mode); - - /** Return the current page segmentation mode. */ - PageSegMode GetPageSegMode() const; - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. - * Currently has no error checking. - * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. - * Palette color images will not work properly and must be converted to - * 24 bit. - * Binary images of 1 bit per pixel may also be given but they must be - * byte packed with the MSB of the first byte being the first pixel, and a - * 1 represents WHITE. For binary images set bytes_per_pixel=0. - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * - * Note that TesseractRect is the simplified convenience interface. - * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, - * and one or more of the Get*Text functions below. - */ - char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, int width, - int height); - - /** - * Call between pages or documents etc to free up memory and forget - * adaptive data. - */ - void ClearAdaptiveClassifier(); - - /** - * @defgroup AdvancedAPI Advanced API - * The following methods break TesseractRect into pieces, so you can - * get hold of the thresholded image, get the text in different formats, - * get bounding boxes, confidences etc. - */ - /* @{ */ - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Copies the image buffer and converts to Pix. - * SetImage clears all recognition results, and sets the rectangle to the - * full image, so it may be followed immediately by a GetUTF8Text, and it - * will automatically perform recognition. - */ - void SetImage(const unsigned char *imagedata, int width, int height, - int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract takes its own copy of the image, so it need not persist until - * after Recognize. - * Pix vs raw, which to use? - * Use Pix where possible. Tesseract uses Pix as its internal representation - * and it is therefore more efficient to provide a Pix directly. - */ - void SetImage(Pix *pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after SetImage(). - */ - void SetSourceResolution(int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after SetImage. - * Each SetRectangle clears the recogntion results so multiple rectangles - * can be recognized with the same image. - */ - void SetRectangle(int left, int top, int width, int height); - - /** - * Get a copy of the internal thresholded image from Tesseract. - * Caller takes ownership of the Pix and must pixDestroy it. - * May be called any time after SetImage, or after TesseractRect. - */ - Pix *GetThresholdedImage(); - - /** - * Get the result of page layout analysis as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetRegions(Pixa **pixa); - - /** - * Get the textlines as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If raw_image is true, then extract from the original image instead of the - * thresholded image and pad by raw_padding pixels. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. If paraids is not - * nullptr, the paragraph-id of each line within its block is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - /* - Helper method to extract from the thresholded image. (most common usage) -*/ - Boxa *GetTextlines(Pixa **pixa, int **blockids) { - return GetTextlines(false, 0, pixa, blockids, nullptr); - } - - /** - * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa - * pair, in reading order. Enables downstream handling of non-rectangular - * regions. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetStrips(Pixa **pixa, int **blockids); - - /** - * Get the words as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetWords(Pixa **pixa); - - /** - * Gets the individual connected (text) components (created - * after pages segmentation step, but before recognition) - * as a leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * Note: the caller is responsible for calling boxaDestroy() - * on the returned Boxa array and pixaDestroy() on cc array. - */ - Boxa *GetConnectedComponents(Pixa **cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. - * If blockids is not nullptr, the paragraph-id of each component with its - * block is also returned as an array of one element per component. delete [] - * after use. If raw_image is true, then portions of the original image are - * extracted instead of the thresholded image and padded with raw_padding. If - * text_only is true, then only text components are returned. - */ - Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, - bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - // Helper function to get binary images with no padding (most common usage). - Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, - Pixa **pixa, int **blockids) { - return GetComponentImages(level, text_only, false, 0, pixa, blockids, - nullptr); - } - - /** - * Returns the scale factor of the thresholded image that would be returned by - * GetThresholdedImage() and the various GetX() methods that call - * GetComponentImages(). - * Returns 0 if no thresholder has been set. - */ - int GetThresholdedImageScaleFactor() const; - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to just - * the page layout results. Returns an iterator to the results. - * If merge_similar_words is true, words are combined where suitable for use - * with a line recognizer. Use if you want to use AnalyseLayout to find the - * textlines, and then want to process textline fragments with an external - * line recognizer. - * Returns nullptr on error or an empty page. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - PageIterator *AnalyseLayout(); - PageIterator *AnalyseLayout(bool merge_similar_words); - - /** - * Recognize the image from SetAndThresholdImage, generating Tesseract - * internal structures. Returns 0 on success. - * Optional. The Get*Text functions below will call Recognize if needed. - * After Recognize, the output is kept internally until the next SetImage. - */ - int Recognize(ETEXT_DESC *monitor); - - /** - * Methods to retrieve information after SetAndThresholdImage(), - * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) - */ - - /** - * Turns images into symbolic text. - * - * filename can point to a single image, a multi-page TIFF, - * or a plain text list of image filenames. - * - * retry_config is useful for debugging. If not nullptr, you can fall - * back to an alternate configuration if a page fails for some - * reason. - * - * timeout_millisec terminates processing if any single page - * takes too long. Set to 0 for unlimited time. - * - * renderer is responible for creating the output. For example, - * use the TessTextRenderer if you want plaintext output, or - * the TessPDFRender to produce searchable PDF. - * - * If tessedit_page_number is non-negative, will only process that - * single page. Works for multi-page tiff file, or filelist. - * - * Returns true if successful, false on error. - */ - bool ProcessPages(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - // Does the real work of ProcessPages. - bool ProcessPagesInternal(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - - /** - * Turn a single image into symbolic text. - * - * The pix is the image processed. filename and page_index are - * metadata used by side-effect processes, such as reading a box - * file or formatting as hOCR. - * - * See ProcessPages for descriptions of other parameters. - */ - bool ProcessPage(Pix *pix, int page_index, const char *filename, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - ResultIterator *GetIterator(); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - MutableIterator *GetMutableIterator(); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - */ - char *GetUTF8Text(); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * monitor can be used to - * cancel the recognition - * receive progress callbacks - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(ETEXT_DESC *monitor, int page_number); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(ETEXT_DESC *monitor, int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(int page_number); - - /** - * Make a TSV-formatted string from the internal data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetTSVText(int page_number); - - /** - * Make a box file for LSTM training from the internal data structures. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetLSTMBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a box file used in training. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a WordStr box file used in training. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetWordStrBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded - * as UNLV format Latin-1 with specific reject and suspect codes. - * Returned string must be freed with the delete [] operator. - */ - char *GetUNLVText(); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in degrees - * (0, 90, 180, 270) - * orient_conf is the confidence (15.0 is reasonably confident) - * script_name is an ASCII string, the name of the script, e.g. "Latin" - * script_conf is confidence level in the script - * Returns true on success and writes values to each parameter as an output - */ - bool DetectOrientationScript(int *orient_deg, float *orient_conf, - const char **script_name, float *script_conf); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * page_number is a 0-based page index that will appear in the osd file. - */ - char *GetOsdText(int page_number); - - /** Returns the (average) confidence value between 0 and 100. */ - int MeanTextConf(); - /** - * Returns all word confidences (between 0 and 100) in an array, terminated - * by -1. The calling function must delete [] after use. - * The number of confidences should correspond to the number of space- - * delimited words in GetUTF8Text. - */ - int *AllWordConfidences(); - -#ifndef DISABLED_LEGACY_ENGINE - /** - * Applies the given word to the adaptive classifier if possible. - * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can - * tell the boundaries of the graphemes. - * Assumes that SetImage/SetRectangle have been used to set the image - * to the given word. The mode arg should be PSM_SINGLE_WORD or - * PSM_CIRCLE_WORD, as that will be used to control layout analysis. - * The currently set PageSegMode is preserved. - * Returns false if adaption was not possible for some reason. - */ - bool AdaptToWordStr(PageSegMode mode, const char *wordstr); -#endif // ndef DISABLED_LEGACY_ENGINE - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or TesseractRect before doing - * any Recognize or Get* operation. - */ - void Clear(); - - /** - * Close down tesseract and free up all memory. End() is equivalent to - * destructing and reconstructing your TessBaseAPI. - * Once End() has been used, none of the other API functions may be used - * other than Init and anything declared above it in the class definition. - */ - void End(); - - /** - * Clear any library-level memory caches. - * There are a variety of expensive-to-load constant data structures (mostly - * language dictionaries) that are cached globally -- surviving the Init() - * and End() of individual TessBaseAPI's. This function allows the clearing - * of these caches. - **/ - static void ClearPersistentCache(); - - /** - * Check whether a word is valid according to Tesseract's language model - * @return 0 if the word is invalid, non-zero if valid. - * @warning temporary! This function will be removed from here and placed - * in a separate API at some future time. - */ - int IsValidWord(const char *word) const; - // Returns true if utf8_character is defined in the UniCharset. - bool IsValidCharacter(const char *utf8_character) const; - - bool GetTextDirection(int *out_offset, float *out_slope); - - /** Sets Dict::letter_is_okay_ function to point to the given function. */ - void SetDictFunc(DictFunc f); - - /** Sets Dict::probability_in_context_ function to point to the given - * function. - */ - void SetProbabilityInContextFunc(ProbabilityInContextFunc f); - - /** - * Estimates the Orientation And Script of the image. - * @return true if the image was processed successfully. - */ - bool DetectOS(OSResults *); - - /** - * Return text orientation of each block as determined by an earlier run - * of layout analysis. - */ - void GetBlockTextOrientations(int **block_orientation, - bool **vertical_writing); - - /** This method returns the string form of the specified unichar. */ - const char *GetUnichar(int unichar_id) const; - - /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ - const Dawg *GetDawg(int i) const; - - /** Return the number of dawgs loaded into tesseract_ object. */ - int NumDawgs() const; - - Tesseract *tesseract() const { - return tesseract_; - } - - OcrEngineMode oem() const { - return last_oem_requested_; - } - - void set_min_orientation_margin(double margin); - /* @} */ - -protected: - /** Common code for setting the image. Returns true if Init has been called. - */ - bool InternalSetImage(); - - /** - * Run the thresholder to make the thresholded image. If pix is not nullptr, - * the source is thresholded to pix instead of the internal IMAGE. - */ - virtual bool Threshold(Pix **pix); - - /** - * Find lines from the image making the BLOCK_LIST. - * @return 0 on success. - */ - int FindLines(); - - /** Delete the pageres and block list ready for a new page. */ - void ClearResults(); - - /** - * Return an LTR Result Iterator -- used only for training, as we really want - * to ignore all BiDi smarts at that point. - * delete once you're done with it. - */ - LTRResultIterator *GetLTRIterator(); - - /** - * Return the length of the output text string, as UTF8, assuming - * one newline per line and one per block, with a terminator, - * and assuming a single character reject marker for each rejected character. - * Also return the number of recognized blobs in blob_count. - */ - int TextLength(int *blob_count) const; - - //// paragraphs.cpp //////////////////////////////////////////////////// - void DetectParagraphs(bool after_text_recognition); - - const PAGE_RES *GetPageRes() const { - return page_res_; - } - -protected: - Tesseract *tesseract_; ///< The underlying data object. - Tesseract *osd_tesseract_; ///< For orientation & script detection. - EquationDetect *equ_detect_; ///< The equation detector. - FileReader reader_; ///< Reads files from any filesystem. - ImageThresholder *thresholder_; ///< Image thresholding module. - std::vector *paragraph_models_; - BLOCK_LIST *block_list_; ///< The page layout. - PAGE_RES *page_res_; ///< The page-level data. - std::string input_file_; ///< Name used by training code. - std::string output_file_; ///< Name used by debug code. - std::string datapath_; ///< Current location of tessdata. - std::string language_; ///< Last initialized language. - OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested. - bool recognition_done_; ///< page_res_ contains recognition data. - - /** - * @defgroup ThresholderParams Thresholder Parameters - * Parameters saved from the Thresholder. Needed to rebuild coordinates. - */ - /* @{ */ - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; - int image_width_; - int image_height_; - /* @} */ - -private: - // A list of image filenames gets special consideration - bool ProcessPagesFileList(FILE *fp, std::string *buf, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); - // TIFF supports multipage so gets special consideration. - bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, - const char *filename, const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); -}; // class TessBaseAPI. - -/** Escape a char string - remove &<>"' with HTML codes. */ -std::string HOcrEscape(const char *text); - -} // namespace tesseract - -#endif // TESSERACT_API_BASEAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/capi.h deleted file mode 100644 index 40f4856a..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/capi.h +++ /dev/null @@ -1,484 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: capi.h -// Description: C-API TessBaseAPI -// -// (C) Copyright 2012, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_CAPI_H_ -#define API_CAPI_H_ - -#include "export.h" - -#ifdef __cplusplus -# include -# include -# include -# include -# include -#endif - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef BOOL -# define BOOL int -# define TRUE 1 -# define FALSE 0 -#endif - -#ifdef __cplusplus -typedef tesseract::TessResultRenderer TessResultRenderer; -typedef tesseract::TessBaseAPI TessBaseAPI; -typedef tesseract::PageIterator TessPageIterator; -typedef tesseract::ResultIterator TessResultIterator; -typedef tesseract::MutableIterator TessMutableIterator; -typedef tesseract::ChoiceIterator TessChoiceIterator; -typedef tesseract::OcrEngineMode TessOcrEngineMode; -typedef tesseract::PageSegMode TessPageSegMode; -typedef tesseract::PageIteratorLevel TessPageIteratorLevel; -typedef tesseract::Orientation TessOrientation; -typedef tesseract::ParagraphJustification TessParagraphJustification; -typedef tesseract::WritingDirection TessWritingDirection; -typedef tesseract::TextlineOrder TessTextlineOrder; -typedef tesseract::PolyBlockType TessPolyBlockType; -typedef tesseract::ETEXT_DESC ETEXT_DESC; -#else -typedef struct TessResultRenderer TessResultRenderer; -typedef struct TessBaseAPI TessBaseAPI; -typedef struct TessPageIterator TessPageIterator; -typedef struct TessResultIterator TessResultIterator; -typedef struct TessMutableIterator TessMutableIterator; -typedef struct TessChoiceIterator TessChoiceIterator; -typedef enum TessOcrEngineMode { - OEM_TESSERACT_ONLY, - OEM_LSTM_ONLY, - OEM_TESSERACT_LSTM_COMBINED, - OEM_DEFAULT -} TessOcrEngineMode; -typedef enum TessPageSegMode { - PSM_OSD_ONLY, - PSM_AUTO_OSD, - PSM_AUTO_ONLY, - PSM_AUTO, - PSM_SINGLE_COLUMN, - PSM_SINGLE_BLOCK_VERT_TEXT, - PSM_SINGLE_BLOCK, - PSM_SINGLE_LINE, - PSM_SINGLE_WORD, - PSM_CIRCLE_WORD, - PSM_SINGLE_CHAR, - PSM_SPARSE_TEXT, - PSM_SPARSE_TEXT_OSD, - PSM_RAW_LINE, - PSM_COUNT -} TessPageSegMode; -typedef enum TessPageIteratorLevel { - RIL_BLOCK, - RIL_PARA, - RIL_TEXTLINE, - RIL_WORD, - RIL_SYMBOL -} TessPageIteratorLevel; -typedef enum TessPolyBlockType { - PT_UNKNOWN, - PT_FLOWING_TEXT, - PT_HEADING_TEXT, - PT_PULLOUT_TEXT, - PT_EQUATION, - PT_INLINE_EQUATION, - PT_TABLE, - PT_VERTICAL_TEXT, - PT_CAPTION_TEXT, - PT_FLOWING_IMAGE, - PT_HEADING_IMAGE, - PT_PULLOUT_IMAGE, - PT_HORZ_LINE, - PT_VERT_LINE, - PT_NOISE, - PT_COUNT -} TessPolyBlockType; -typedef enum TessOrientation { - ORIENTATION_PAGE_UP, - ORIENTATION_PAGE_RIGHT, - ORIENTATION_PAGE_DOWN, - ORIENTATION_PAGE_LEFT -} TessOrientation; -typedef enum TessParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT -} TessParagraphJustification; -typedef enum TessWritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT, - WRITING_DIRECTION_RIGHT_TO_LEFT, - WRITING_DIRECTION_TOP_TO_BOTTOM -} TessWritingDirection; -typedef enum TessTextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT, - TEXTLINE_ORDER_RIGHT_TO_LEFT, - TEXTLINE_ORDER_TOP_TO_BOTTOM -} TessTextlineOrder; -typedef struct ETEXT_DESC ETEXT_DESC; -#endif - -typedef bool (*TessCancelFunc)(void *cancel_this, int words); -typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top, - int bottom); - -struct Pix; -struct Boxa; -struct Pixa; - -/* General free functions */ - -TESS_API const char *TessVersion(); -TESS_API void TessDeleteText(const char *text); -TESS_API void TessDeleteTextArray(char **arr); -TESS_API void TessDeleteIntArray(const int *arr); - -/* Renderer API */ -TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, - BOOL font_info); -TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase, - const char *datadir, - BOOL textonly); -TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessWordStrBoxRendererCreate( - const char *outputbase); - -TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer); -TESS_API void TessResultRendererInsert(TessResultRenderer *renderer, - TessResultRenderer *next); -TESS_API TessResultRenderer *TessResultRendererNext( - TessResultRenderer *renderer); -TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, - const char *title); -TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer, - TessBaseAPI *api); -TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer); - -TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer); -TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer); -TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer); - -/* Base API */ - -TESS_API TessBaseAPI *TessBaseAPICreate(); -TESS_API void TessBaseAPIDelete(TessBaseAPI *handle); - -TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device); - -TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name); -TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix); -TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle); - -TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle); -TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name); - -TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, - const char *value); -TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, - const char *value); - -TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, - const char *name, int *value); -TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, - const char *name, BOOL *value); -TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, - const char *name, double *value); -TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, - const char *name); - -TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp); -TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, - const char *filename); - -TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem, - char **configs, int configs_size); -TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem); -TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, - const char *language); - -TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API const char *TessBaseAPIGetInitLanguagesAsString( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector( - const TessBaseAPI *handle); - -TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle); - -TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle, - const char *filename); -TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, - const char *filename); - -TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle, - TessPageSegMode mode); -TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle); - -TESS_API char *TessBaseAPIRect(TessBaseAPI *handle, - const unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int left, int top, int width, int height); - -TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetImage(TessBaseAPI *handle, - const unsigned char *imagedata, int width, - int height, int bytes_per_pixel, - int bytes_per_line); -TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix); - -TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi); - -TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, - int width, int height); - -TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle); -TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, - BOOL raw_image, int raw_padding, - struct Pixa **pixa, - int **blockids, int **paraids); -TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, - struct Pixa **pixa, int **blockids); -TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, - struct Pixa **cc); -TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, - TessPageIteratorLevel level, - BOOL text_only, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetComponentImages1( - TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only, - BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids, - int **paraids); - -TESS_API int TessBaseAPIGetThresholdedImageScaleFactor( - const TessBaseAPI *handle); - -TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle); - -TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor); - -TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); -TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, - int page_index, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); - -TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle); -TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator( - TessBaseAPI *handle); - -TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle); -TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, - int page_number); - -TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle); -TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle); - -TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE -TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, - TessPageSegMode mode, - const char *wordstr); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPIClear(TessBaseAPI *handle); -TESS_API void TessBaseAPIEnd(TessBaseAPI *handle); - -TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word); -TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, - float *out_slope); - -TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id); - -TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE - -// Call TessDeleteText(*best_script_name) to free memory allocated by this -// function -TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, - int *orient_deg, - float *orient_conf, - const char **script_name, - float *script_conf); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, - double margin); - -TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle); - -TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle); - -TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, - int **block_orientation, - bool **vertical_writing); - -/* Page iterator */ - -TESS_API void TessPageIteratorDelete(TessPageIterator *handle); - -TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle); - -TESS_API void TessPageIteratorBegin(TessPageIterator *handle); - -TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, - TessPageIteratorLevel level, - TessPageIteratorLevel element); - -TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, - TessPageIteratorLevel level, - int *left, int *top, int *right, - int *bottom); - -TESS_API TessPolyBlockType -TessPageIteratorBlockType(const TessPageIterator *handle); - -TESS_API struct Pix *TessPageIteratorGetBinaryImage( - const TessPageIterator *handle, TessPageIteratorLevel level); - -TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, - TessPageIteratorLevel level, - int padding, - struct Pix *original_image, - int *left, int *top); - -TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle, - TessPageIteratorLevel level, int *x1, - int *y1, int *x2, int *y2); - -TESS_API void TessPageIteratorOrientation( - TessPageIterator *handle, TessOrientation *orientation, - TessWritingDirection *writing_direction, TessTextlineOrder *textline_order, - float *deskew_angle); - -TESS_API void TessPageIteratorParagraphInfo( - TessPageIterator *handle, TessParagraphJustification *justification, - BOOL *is_list_item, BOOL *is_crown, int *first_line_indent); - -/* Result iterator */ - -TESS_API void TessResultIteratorDelete(TessResultIterator *handle); -TESS_API TessResultIterator *TessResultIteratorCopy( - const TessResultIterator *handle); -TESS_API TessPageIterator *TessResultIteratorGetPageIterator( - TessResultIterator *handle); -TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst( - const TessResultIterator *handle); -TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator( - const TessResultIterator *handle); - -TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API const char *TessResultIteratorWordRecognitionLanguage( - const TessResultIterator *handle); -TESS_API const char *TessResultIteratorWordFontAttributes( - const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic, - BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps, - int *pointsize, int *font_id); - -TESS_API BOOL -TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle); -TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle); - -TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle); -TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle); -TESS_API const char *TessChoiceIteratorGetUTF8Text( - const TessChoiceIterator *handle); -TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle); - -/* Progress monitor */ - -TESS_API ETEXT_DESC *TessMonitorCreate(); -TESS_API void TessMonitorDelete(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, - TessCancelFunc cancelFunc); -TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis); -TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, - TessProgressFunc progressFunc); -TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline); - -#ifdef __cplusplus -} -#endif - -#endif // API_CAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/export.h deleted file mode 100644 index d238b628..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/export.h +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: export.h -// Description: Place holder -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_PLATFORM_H_ -#define TESSERACT_PLATFORM_H_ - -#ifndef TESS_API -# if defined(_WIN32) || defined(__CYGWIN__) -# if defined(TESS_EXPORTS) -# define TESS_API __declspec(dllexport) -# elif defined(TESS_IMPORTS) -# define TESS_API __declspec(dllimport) -# else -# define TESS_API -# endif -# else -# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS) -# define TESS_API __attribute__((visibility("default"))) -# else -# define TESS_API -# endif -# endif -#endif - -#endif // TESSERACT_PLATFORM_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ltrresultiterator.h deleted file mode 100644 index 6ca0a98e..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ltrresultiterator.h +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: ltrresultiterator.h -// Description: Iterator for tesseract results in strict left-to-right -// order that avoids using tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API -#include "pageiterator.h" // for PageIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -namespace tesseract { - -class BLOB_CHOICE_IT; -class PAGE_RES; -class WERD_RES; - -class Tesseract; - -// Class to iterate over tesseract results, providing access to all levels -// of the page hierarchy, without including any tesseract headers or having -// to handle any tesseract structures. -// WARNING! This class points to data held within the TessBaseAPI class, and -// therefore can only be used while the TessBaseAPI class still exists and -// has not been subjected to a call of Init, SetImage, Recognize, Clear, End -// DetectOS, or anything else that changes the internal PAGE_RES. -// See tesseract/publictypes.h for the definition of PageIteratorLevel. -// See also base class PageIterator, which contains the bulk of the interface. -// LTRResultIterator adds text-specific methods for access to OCR output. - -class TESS_API LTRResultIterator : public PageIterator { - friend class ChoiceIterator; - -public: - // page_res and tesseract come directly from the BaseAPI. - // The rectangle parameters are copied indirectly from the Thresholder, - // via the BaseAPI. They represent the coordinates of some rectangle in an - // original image (in top-left-origin coordinates) and therefore the top-left - // needs to be added to any output boxes in order to specify coordinates - // in the original image. See TessBaseAPI::SetRectangle. - // The scale and scaled_yres are in case the Thresholder scaled the image - // rectangle prior to thresholding. Any coordinates in tesseract's image - // must be divided by scale before adding (rect_left, rect_top). - // The scaled_yres indicates the effective resolution of the binary image - // that tesseract has been given by the Thresholder. - // After the constructor, Begin has already been called. - LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, - int rect_width, int rect_height); - - ~LTRResultIterator() override; - - // LTRResultIterators may be copied! This makes it possible to iterate over - // all the objects at a lower level, while maintaining an iterator to - // objects at a higher level. These constructors DO NOT CALL Begin, so - // iterations will continue from the location of src. - // TODO: For now the copy constructor and operator= only need the base class - // versions, but if new data members are added, don't forget to add them! - - // ============= Moving around within the page ============. - - // See PageIterator. - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // object at the given level. Use delete [] to free after use. - char *GetUTF8Text(PageIteratorLevel level) const; - - // Set the string inserted at the end of each text line. "\n" by default. - void SetLineSeparator(const char *new_line); - - // Set the string inserted at the end of each paragraph. "\n" by default. - void SetParagraphSeparator(const char *new_para); - - // Returns the mean confidence of the current object at the given level. - // The number should be interpreted as a percent probability. (0.0f-100.0f) - float Confidence(PageIteratorLevel level) const; - - // ============= Functions that refer to words only ============. - - // Returns the font attributes of the current word. If iterating at a higher - // level object than words, eg textlines, then this will return the - // attributes of the first word in that textline. - // The actual return value is a string representing a font name. It points - // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as - // the iterator itself, ie rendered invalid by various members of - // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. - // Pointsize is returned in printers points (1/72 inch.) - const char *WordFontAttributes(bool *is_bold, bool *is_italic, - bool *is_underlined, bool *is_monospace, - bool *is_serif, bool *is_smallcaps, - int *pointsize, int *font_id) const; - - // Return the name of the language used to recognize this word. - // On error, nullptr. Do not delete this pointer. - const char *WordRecognitionLanguage() const; - - // Return the overall directionality of this word. - StrongScriptDirection WordDirection() const; - - // Returns true if the current word was found in a dictionary. - bool WordIsFromDictionary() const; - - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // Returns true if the current word is numeric. - bool WordIsNumeric() const; - - // Returns true if the word contains blamer information. - bool HasBlamerInfo() const; - - // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle - // of the current word. - const void *GetParamsTrainingBundle() const; - - // Returns a pointer to the string with blamer information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerDebug() const; - - // Returns a pointer to the string with misadaption information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerMisadaptionDebug() const; - - // Returns true if a truth string was recorded for the current word. - bool HasTruthString() const; - - // Returns true if the given string is equivalent to the truth string for - // the current word. - bool EquivalentToTruth(const char *str) const; - - // Returns a null terminated UTF-8 encoded truth string for the current word. - // Use delete [] to free after use. - char *WordTruthUTF8Text() const; - - // Returns a null terminated UTF-8 encoded normalized OCR string for the - // current word. Use delete [] to free after use. - char *WordNormedUTF8Text() const; - - // Returns a pointer to serialized choice lattice. - // Fills lattice_size with the number of bytes in lattice data. - const char *WordLattice(int *lattice_size) const; - - // ============= Functions that refer to symbols only ============. - - // Returns true if the current symbol is a superscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSuperscript() const; - // Returns true if the current symbol is a subscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSubscript() const; - // Returns true if the current symbol is a dropcap. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsDropcap() const; - -protected: - const char *line_separator_; - const char *paragraph_separator_; -}; - -// Class to iterate over the classifier choices for a single RIL_SYMBOL. -class TESS_API ChoiceIterator { -public: - // Construction is from a LTRResultIterator that points to the symbol of - // interest. The ChoiceIterator allows a one-shot iteration over the - // choices for this symbol and after that it is useless. - explicit ChoiceIterator(const LTRResultIterator &result_it); - ~ChoiceIterator(); - - // Moves to the next choice for the symbol and returns false if there - // are none left. - bool Next(); - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // choice. - // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an - // internal structure and should NOT be delete[]ed to free after use. - const char *GetUTF8Text() const; - - // Returns the confidence of the current choice depending on the used language - // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All - // choices for one symbol should roughly add up to 1.0f. - // If only traineddata of the legacy engine is used, the number should be - // interpreted as a percent probability. (0.0f-100.0f) In this case - // probabilities won't add up to 100. Each one stands on its own. - float Confidence() const; - - // Returns a vector containing all timesteps, which belong to the currently - // selected symbol. A timestep is a vector containing pairs of symbols and - // floating point numbers. The number states the probability for the - // corresponding symbol. - std::vector>> *Timesteps() const; - -private: - // clears the remaining spaces out of the results and adapt the probabilities - void filterSpaces(); - // Pointer to the WERD_RES object owned by the API. - WERD_RES *word_res_; - // Iterator over the blob choices. - BLOB_CHOICE_IT *choice_it_; - std::vector> *LSTM_choices_ = nullptr; - std::vector>::iterator LSTM_choice_it_; - - const int *tstep_index_; - // regulates the rating granularity - double rating_coefficient_; - // leading blanks - int blanks_before_word_; - // true when there is lstm engine related trained data - bool oemLSTM_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ocrclass.h deleted file mode 100644 index a55e6528..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ocrclass.h +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/********************************************************************** - * File: ocrclass.h - * Description: Class definitions and constants for the OCR API. - * Author: Hewlett-Packard Co - * - * (C) Copyright 1996, Hewlett-Packard Co. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -/********************************************************************** - * This file contains typedefs for all the structures used by - * the HP OCR interface. - * The structures are designed to allow them to be used with any - * structure alignment up to 8. - **********************************************************************/ - -#ifndef CCUTIL_OCRCLASS_H_ -#define CCUTIL_OCRCLASS_H_ - -#include -#include - -namespace tesseract { - -/********************************************************************** - * EANYCODE_CHAR - * Description of a single character. The character code is defined by - * the character set of the current font. - * Output text is sent as an array of these structures. - * Spaces and line endings in the output are represented in the - * structures of the surrounding characters. They are not directly - * represented as characters. - * The first character in a word has a positive value of blanks. - * Missing information should be set to the defaults in the comments. - * If word bounds are known, but not character bounds, then the top and - * bottom of each character should be those of the word. The left of the - * first and right of the last char in each word should be set. All other - * lefts and rights should be set to -1. - * If set, the values of right and bottom are left+width and top+height. - * Most of the members come directly from the parameters to ocr_append_char. - * The formatting member uses the enhancement parameter and combines the - * line direction stuff into the top 3 bits. - * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para, - * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what - * the coding is, only that it is backwards compatible with the previous - * version. - **********************************************************************/ - -struct EANYCODE_CHAR { /*single character */ - // It should be noted that the format for char_code for version 2.0 and beyond - // is UTF8 which means that ASCII characters will come out as one structure - // but other characters will be returned in two or more instances of this - // structure with a single byte of the UTF8 code in each, but each will have - // the same bounding box. Programs which want to handle languagues with - // different characters sets will need to handle extended characters - // appropriately, but *all* code needs to be prepared to receive UTF8 coded - // characters for characters such as bullet and fancy quotes. - uint16_t char_code; /*character itself */ - int16_t left; /*of char (-1) */ - int16_t right; /*of char (-1) */ - int16_t top; /*of char (-1) */ - int16_t bottom; /*of char (-1) */ - int16_t font_index; /*what font (0) */ - uint8_t confidence; /*0=perfect, 100=reject (0/100) */ - uint8_t point_size; /*of char, 72=i inch, (10) */ - int8_t blanks; /*no of spaces before this char (1) */ - uint8_t formatting; /*char formatting (0) */ -}; - -/********************************************************************** - * ETEXT_DESC - * Description of the output of the OCR engine. - * This structure is used as both a progress monitor and the final - * output header, since it needs to be a valid progress monitor while - * the OCR engine is storing its output to shared memory. - * During progress, all the buffer info is -1. - * Progress starts at 0 and increases to 100 during OCR. No other constraint. - * Additionally the progress callback contains the bounding box of the word that - * is currently being processed. - * Every progress callback, the OCR engine must set ocr_alive to 1. - * The HP side will set ocr_alive to 0. Repeated failure to reset - * to 1 indicates that the OCR engine is dead. - * If the cancel function is not null then it is called with the number of - * user words found. If it returns true then operation is cancelled. - **********************************************************************/ -class ETEXT_DESC; - -using CANCEL_FUNC = bool (*)(void *, int); -using PROGRESS_FUNC = bool (*)(int, int, int, int, int); -using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int); - -class ETEXT_DESC { // output header -public: - int16_t count{0}; /// chars in this buffer(0) - int16_t progress{0}; /// percent complete increasing (0-100) - /** Progress monitor covers word recognition and it does not cover layout - * analysis. - * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */ - int8_t more_to_come{0}; /// true if not last - volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0 - int8_t err_code{0}; /// for errcode use - CANCEL_FUNC cancel{nullptr}; /// returns true to cancel - PROGRESS_FUNC progress_callback{ - nullptr}; /// called whenever progress increases - PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback - void *cancel_this{nullptr}; /// this or other data for cancel - std::chrono::steady_clock::time_point end_time; - /// Time to stop. Expected to be set only - /// by call to set_deadline_msecs(). - EANYCODE_CHAR text[1]{}; /// character data - - ETEXT_DESC() : progress_callback2(&default_progress_func) { - end_time = std::chrono::time_point(); - } - - // Sets the end time to be deadline_msecs milliseconds from now. - void set_deadline_msecs(int32_t deadline_msecs) { - if (deadline_msecs > 0) { - end_time = std::chrono::steady_clock::now() + - std::chrono::milliseconds(deadline_msecs); - } - } - - // Returns false if we've not passed the end_time, or have not set a deadline. - bool deadline_exceeded() const { - if (end_time.time_since_epoch() == - std::chrono::steady_clock::duration::zero()) { - return false; - } - auto now = std::chrono::steady_clock::now(); - return (now > end_time); - } - -private: - static bool default_progress_func(ETEXT_DESC *ths, int left, int right, - int top, int bottom) { - if (ths->progress_callback != nullptr) { - return (*(ths->progress_callback))(ths->progress, left, right, top, - bottom); - } - return true; - } -}; - -} // namespace tesseract - -#endif // CCUTIL_OCRCLASS_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/osdetect.h deleted file mode 100644 index 34bfb557..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/osdetect.h +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: osdetect.h -// Description: Orientation and script detection. -// Author: Samuel Charron -// Ranjith Unnikrishnan -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_OSDETECT_H_ -#define TESSERACT_CCMAIN_OSDETECT_H_ - -#include "export.h" // for TESS_API - -#include // for std::vector - -namespace tesseract { - -class BLOBNBOX; -class BLOBNBOX_CLIST; -class BLOB_CHOICE_LIST; -class TO_BLOCK_LIST; -class UNICHARSET; - -class Tesseract; - -// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur -const int kMaxNumberOfScripts = 116 + 1 + 2 + 1; - -struct OSBestResult { - OSBestResult() - : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {} - int orientation_id; - int script_id; - float sconfidence; - float oconfidence; -}; - -struct OSResults { - OSResults() : unicharset(nullptr) { - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < kMaxNumberOfScripts; ++j) { - scripts_na[i][j] = 0; - } - orientations[i] = 0; - } - } - void update_best_orientation(); - // Set the estimate of the orientation to the given id. - void set_best_orientation(int orientation_id); - // Update/Compute the best estimate of the script assuming the given - // orientation id. - void update_best_script(int orientation_id); - // Return the index of the script with the highest score for this orientation. - TESS_API int get_best_script(int orientation_id) const; - // Accumulate scores with given OSResults instance and update the best script. - void accumulate(const OSResults &osr); - - // Print statistics. - void print_scores(void) const; - void print_scores(int orientation_id) const; - - // Array holding scores for each orientation id [0,3]. - // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the - // page respectively, where the values refer to the amount of clockwise - // rotation to be applied to the page for the text to be upright and readable. - float orientations[4]; - // Script confidence scores for each of 4 possible orientations. - float scripts_na[4][kMaxNumberOfScripts]; - - UNICHARSET *unicharset; - OSBestResult best_result; -}; - -class OrientationDetector { -public: - OrientationDetector(const std::vector *allowed_scripts, - OSResults *results); - bool detect_blob(BLOB_CHOICE_LIST *scores); - int get_orientation(); - -private: - OSResults *osr_; - const std::vector *allowed_scripts_; -}; - -class ScriptDetector { -public: - ScriptDetector(const std::vector *allowed_scripts, OSResults *osr, - tesseract::Tesseract *tess); - void detect_blob(BLOB_CHOICE_LIST *scores); - bool must_stop(int orientation) const; - -private: - OSResults *osr_; - static const char *korean_script_; - static const char *japanese_script_; - static const char *fraktur_script_; - int korean_id_; - int japanese_id_; - int katakana_id_; - int hiragana_id_; - int han_id_; - int hangul_id_; - int latin_id_; - int fraktur_id_; - tesseract::Tesseract *tess_; - const std::vector *allowed_scripts_; -}; - -int orientation_and_script_detection(const char *filename, OSResults *, - tesseract::Tesseract *); - -int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, - tesseract::Tesseract *tess); - -int os_detect_blobs(const std::vector *allowed_scripts, - BLOBNBOX_CLIST *blob_list, OSResults *osr, - tesseract::Tesseract *tess); - -bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, - OSResults *, tesseract::Tesseract *tess); - -// Helper method to convert an orientation index to its value in degrees. -// The value represents the amount of clockwise rotation in degrees that must be -// applied for the text to be upright (readable). -TESS_API int OrientationIdToValue(const int &id); - -} // namespace tesseract - -#endif // TESSERACT_CCMAIN_OSDETECT_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/pageiterator.h deleted file mode 100644 index 68739715..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/pageiterator.h +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: pageiterator.h -// Description: Iterator for tesseract page structure that avoids using -// tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ -#define TESSERACT_CCMAIN_PAGEITERATOR_H_ - -#include "export.h" -#include "publictypes.h" - -struct Pix; -struct Pta; - -namespace tesseract { - -struct BlamerBundle; -class C_BLOB_IT; -class PAGE_RES; -class PAGE_RES_IT; -class WERD; - -class Tesseract; - -/** - * Class to iterate over tesseract page structure, providing access to all - * levels of the page hierarchy, without including any tesseract headers or - * having to handle any tesseract structures. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - * See tesseract/publictypes.h for the definition of PageIteratorLevel. - * See also ResultIterator, derived from PageIterator, which adds in the - * ability to access OCR output with text-specific methods. - */ - -class TESS_API PageIterator { -public: - /** - * page_res and tesseract come directly from the BaseAPI. - * The rectangle parameters are copied indirectly from the Thresholder, - * via the BaseAPI. They represent the coordinates of some rectangle in an - * original image (in top-left-origin coordinates) and therefore the top-left - * needs to be added to any output boxes in order to specify coordinates - * in the original image. See TessBaseAPI::SetRectangle. - * The scale and scaled_yres are in case the Thresholder scaled the image - * rectangle prior to thresholding. Any coordinates in tesseract's image - * must be divided by scale before adding (rect_left, rect_top). - * The scaled_yres indicates the effective resolution of the binary image - * that tesseract has been given by the Thresholder. - * After the constructor, Begin has already been called. - */ - PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, int rect_width, - int rect_height); - virtual ~PageIterator(); - - /** - * Page/ResultIterators may be copied! This makes it possible to iterate over - * all the objects at a lower level, while maintaining an iterator to - * objects at a higher level. These constructors DO NOT CALL Begin, so - * iterations will continue from the location of src. - */ - PageIterator(const PageIterator &src); - const PageIterator &operator=(const PageIterator &src); - - /** Are we positioned at the same location as other? */ - bool PositionedAtSameWord(const PAGE_RES_IT *other) const; - - // ============= Moving around within the page ============. - - /** - * Moves the iterator to point to the start of the page to begin an - * iteration. - */ - virtual void Begin(); - - /** - * Moves the iterator to the beginning of the paragraph. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word on the first row of the paragraph. - */ - virtual void RestartParagraph(); - - /** - * Return whether this iterator points anywhere in the first textline of a - * paragraph. - */ - bool IsWithinFirstTextlineOfParagraph() const; - - /** - * Moves the iterator to the beginning of the text line. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word of the row. - */ - virtual void RestartRow(); - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy, and returns false if the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - virtual bool Next(PageIteratorLevel level); - - /** - * Returns true if the iterator is at the start of an object at the given - * level. - * - * For instance, suppose an iterator it is pointed to the first symbol of the - * first word of the third line of the second paragraph of the first block in - * a page, then: - * it.IsAtBeginningOf(RIL_BLOCK) = false - * it.IsAtBeginningOf(RIL_PARA) = false - * it.IsAtBeginningOf(RIL_TEXTLINE) = true - * it.IsAtBeginningOf(RIL_WORD) = true - * it.IsAtBeginningOf(RIL_SYMBOL) = true - */ - virtual bool IsAtBeginningOf(PageIteratorLevel level) const; - - /** - * Returns whether the iterator is positioned at the last element in a - * given level. (e.g. the last word in a line, the last line in a block) - * - * Here's some two-paragraph example - * text. It starts off innocuously - * enough but quickly turns bizarre. - * The author inserts a cornucopia - * of words to guard against confused - * references. - * - * Now take an iterator it pointed to the start of "bizarre." - * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false - * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true - * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false - */ - virtual bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const; - - /** - * Returns whether this iterator is positioned - * before other: -1 - * equal to other: 0 - * after other: 1 - */ - int Cmp(const PageIterator &other) const; - - // ============= Accessing data ==============. - // Coordinate system: - // Integer coordinates are at the cracks between the pixels. - // The top-left corner of the top-left pixel in the image is at (0,0). - // The bottom-right corner of the bottom-right pixel in the image is at - // (width, height). - // Every bounding box goes from the top-left of the top-left contained - // pixel to the bottom-right of the bottom-right contained pixel, so - // the bounding box of the single top-left pixel in the image is: - // (0,0)->(1,1). - // If an image rectangle has been set in the API, then returned coordinates - // relate to the original (full) image, rather than the rectangle. - - /** - * Controls what to include in a bounding box. Bounding boxes of all levels - * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. - * Between layout analysis and recognition, it isn't known where all - * diacritics belong, so this control is used to include or exclude some - * diacritics that are above or below the main body of the word. In most cases - * where the placement is obvious, and after recognition, it doesn't make as - * much difference, as the diacritics will already be included in the word. - */ - void SetBoundingBoxComponents(bool include_upper_dots, - bool include_lower_dots) { - include_upper_dots_ = include_upper_dots; - include_lower_dots_ = include_lower_dots; - } - - /** - * Returns the bounding rectangle of the current object at the given level. - * See comment on coordinate system above. - * Returns false if there is no such object at the current position. - * The returned bounding box is guaranteed to match the size and position - * of the image returned by GetBinaryImage, but may clip foreground pixels - * from a grey image. The padding argument to GetImage can be used to expand - * the image to include more foreground pixels. See GetImage below. - */ - bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, - int *bottom) const; - bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, - int *right, int *bottom) const; - /** - * Returns the bounding rectangle of the object in a coordinate system of the - * working image rectangle having its origin at (rect_left_, rect_top_) with - * respect to the original image and is scaled by a factor scale_. - */ - bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, - int *right, int *bottom) const; - - /** Returns whether there is no object of a given level. */ - bool Empty(PageIteratorLevel level) const; - - /** - * Returns the type of the current block. - * See tesseract/publictypes.h for PolyBlockType. - */ - PolyBlockType BlockType() const; - - /** - * Returns the polygon outline of the current block. The returned Pta must - * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices - * of the polygon, and the last edge is the line segment between the last - * point and the first point. nullptr will be returned if the iterator is - * at the end of the document or layout analysis was not used. - */ - Pta *BlockPolygon() const; - - /** - * Returns a binary image of the current object at the given level. - * The position and size match the return from BoundingBoxInternal, and so - * this could be upscaled with respect to the original input image. - * Use pixDestroy to delete the image after use. - */ - Pix *GetBinaryImage(PageIteratorLevel level) const; - - /** - * Returns an image of the current object at the given level in greyscale - * if available in the input. To guarantee a binary image use BinaryImage. - * NOTE that in order to give the best possible image, the bounds are - * expanded slightly over the binary connected component, by the supplied - * padding, so the top-left position of the returned image is returned - * in (left,top). These will most likely not match the coordinates - * returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. - * Use pixDestroy to delete the image after use. - */ - Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, - int *left, int *top) const; - - /** - * Returns the baseline of the current object at the given level. - * The baseline is the line that passes through (x1, y1) and (x2, y2). - * WARNING: with vertical text, baselines may be vertical! - * Returns false if there is no baseline at the current position. - */ - bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, - int *y2) const; - - // Returns the attributes of the current row. - void RowAttributes(float *row_height, float *descenders, - float *ascenders) const; - - /** - * Returns orientation for the block the iterator points to. - * orientation, writing_direction, textline_order: see publictypes.h - * deskew_angle: after rotating the block so the text orientation is - * upright, how many radians does one have to rotate the - * block anti-clockwise for it to be level? - * -Pi/4 <= deskew_angle <= Pi/4 - */ - void Orientation(tesseract::Orientation *orientation, - tesseract::WritingDirection *writing_direction, - tesseract::TextlineOrder *textline_order, - float *deskew_angle) const; - - /** - * Returns information about the current paragraph, if available. - * - * justification - - * LEFT if ragged right, or fully justified and script is left-to-right. - * RIGHT if ragged left, or fully justified and script is right-to-left. - * unknown if it looks like source code or we have very few lines. - * is_list_item - - * true if we believe this is a member of an ordered or unordered list. - * is_crown - - * true if the first line of the paragraph is aligned with the other - * lines of the paragraph even though subsequent paragraphs have first - * line indents. This typically indicates that this is the continuation - * of a previous paragraph or that it is the very first paragraph in - * the chapter. - * first_line_indent - - * For LEFT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the left edge of the - * rest of the paragraph. - * for RIGHT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the right edge of the - * rest of the paragraph. - * NOTE 1: This value may be negative. - * NOTE 2: if *is_crown == true, the first line of this paragraph is - * actually flush, and first_line_indent is set to the "common" - * first_line_indent for subsequent paragraphs in this block - * of text. - */ - void ParagraphInfo(tesseract::ParagraphJustification *justification, - bool *is_list_item, bool *is_crown, - int *first_line_indent) const; - - // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle - // of the current word to the given pointer (takes ownership of the pointer) - // and returns true. - // Can only be used when iterating on the word level. - bool SetWordBlamerBundle(BlamerBundle *blamer_bundle); - -protected: - /** - * Sets up the internal data for iterating the blobs of a new word, then - * moves the iterator to the given offset. - */ - void BeginWord(int offset); - - /** Pointer to the page_res owned by the API. */ - PAGE_RES *page_res_; - /** Pointer to the Tesseract object owned by the API. */ - Tesseract *tesseract_; - /** - * The iterator to the page_res_. Owned by this ResultIterator. - * A pointer just to avoid dragging in Tesseract includes. - */ - PAGE_RES_IT *it_; - /** - * The current input WERD being iterated. If there is an output from OCR, - * then word_ is nullptr. Owned by the API - */ - WERD *word_; - /** The length of the current word_. */ - int word_length_; - /** The current blob index within the word. */ - int blob_index_; - /** - * Iterator to the blobs within the word. If nullptr, then we are iterating - * OCR results in the box_word. - * Owned by this ResultIterator. - */ - C_BLOB_IT *cblob_it_; - /** Control over what to include in bounding boxes. */ - bool include_upper_dots_; - bool include_lower_dots_; - /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ - int scale_; - int scaled_yres_; - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/publictypes.h deleted file mode 100644 index 0069cf28..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/publictypes.h +++ /dev/null @@ -1,281 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: publictypes.h -// Description: Types used in both the API and internally -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_ -#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_ - -namespace tesseract { - -// This file contains types that are used both by the API and internally -// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic -// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT. -// Restated: It is OK for low-level Tesseract files to include publictypes.h, -// but not for the low-level tesseract code to include top-level API code. -// This file should not use other Tesseract types, as that would drag -// their includes into the API-level. - -/** Number of printers' points in an inch. The unit of the pointsize return. */ -constexpr int kPointsPerInch = 72; -/** - * Minimum believable resolution. Used as a default if there is no other - * information, as it is safer to under-estimate than over-estimate. - */ -constexpr int kMinCredibleResolution = 70; -/** Maximum believable resolution. */ -constexpr int kMaxCredibleResolution = 2400; -/** - * Ratio between median blob size and likely resolution. Used to estimate - * resolution when none is provided. This is basically 1/usual text size in - * inches. */ -constexpr int kResolutionEstimationFactor = 10; - -/** - * Possible types for a POLY_BLOCK or ColPartition. - * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions - * below, as well as kPolyBlockNames in layout_test.cc. - * Used extensively by ColPartition, and POLY_BLOCK. - */ -enum PolyBlockType { - PT_UNKNOWN, // Type is not yet known. Keep as the first element. - PT_FLOWING_TEXT, // Text that lives inside a column. - PT_HEADING_TEXT, // Text that spans more than one column. - PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region. - PT_EQUATION, // Partition belonging to an equation region. - PT_INLINE_EQUATION, // Partition has inline equation. - PT_TABLE, // Partition belonging to a table region. - PT_VERTICAL_TEXT, // Text-line runs vertically. - PT_CAPTION_TEXT, // Text that belongs to an image. - PT_FLOWING_IMAGE, // Image that lives inside a column. - PT_HEADING_IMAGE, // Image that spans more than one column. - PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region. - PT_HORZ_LINE, // Horizontal Line. - PT_VERT_LINE, // Vertical Line. - PT_NOISE, // Lies outside of any column. - PT_COUNT -}; - -/** Returns true if PolyBlockType is of horizontal line type */ -inline bool PTIsLineType(PolyBlockType type) { - return type == PT_HORZ_LINE || type == PT_VERT_LINE; -} -/** Returns true if PolyBlockType is of image type */ -inline bool PTIsImageType(PolyBlockType type) { - return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE || - type == PT_PULLOUT_IMAGE; -} -/** Returns true if PolyBlockType is of text type */ -inline bool PTIsTextType(PolyBlockType type) { - return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT || - type == PT_PULLOUT_TEXT || type == PT_TABLE || - type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT || - type == PT_INLINE_EQUATION; -} -// Returns true if PolyBlockType is of pullout(inter-column) type -inline bool PTIsPulloutType(PolyBlockType type) { - return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT; -} - -/** - * +------------------+ Orientation Example: - * | 1 Aaaa Aaaa Aaaa | ==================== - * | Aaa aa aaa aa | To left is a diagram of some (1) English and - * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit. - * | 2 | - * | ####### c c C | Upright Latin characters are represented as A and a. - * | ####### c c c | '<' represents a latin character rotated - * | < ####### c c c | anti-clockwise 90 degrees. - * | < ####### c c | - * | < ####### . c | Upright Chinese characters are represented C and c. - * | 3 ####### c | - * +------------------+ NOTA BENE: enum values here should match goodoc.proto - - * If you orient your head so that "up" aligns with Orientation, - * then the characters will appear "right side up" and readable. - * - * In the example above, both the English and Chinese paragraphs are oriented - * so their "up" is the top of the page (page up). The photo credit is read - * with one's head turned leftward ("up" is to page left). - * - * The values of this enum match the convention of Tesseract's osdetect.h -*/ -enum Orientation { - ORIENTATION_PAGE_UP = 0, - ORIENTATION_PAGE_RIGHT = 1, - ORIENTATION_PAGE_DOWN = 2, - ORIENTATION_PAGE_LEFT = 3, -}; - -/** - * The grapheme clusters within a line of text are laid out logically - * in this direction, judged when looking at the text line rotated so that - * its Orientation is "page up". - * - * For English text, the writing direction is left-to-right. For the - * Chinese text in the above example, the writing direction is top-to-bottom. - */ -enum WritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT = 0, - WRITING_DIRECTION_RIGHT_TO_LEFT = 1, - WRITING_DIRECTION_TOP_TO_BOTTOM = 2, -}; - -/** - * The text lines are read in the given sequence. - * - * In English, the order is top-to-bottom. - * In Chinese, vertical text lines are read right-to-left. Mongolian is - * written in vertical columns top to bottom like Chinese, but the lines - * order left-to right. - * - * Note that only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM - */ -enum TextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, - TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, - TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, -}; - -/** - * Possible modes for page layout analysis. These *must* be kept in order - * of decreasing amount of layout analysis to be done, except for OSD_ONLY, - * so that the inequality test macros below work. - */ -enum PageSegMode { - PSM_OSD_ONLY = 0, ///< Orientation and script detection only. - PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and - ///< script detection. (OSD) - PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR. - PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD. - PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes. - PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of - ///< vertically aligned text. - PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.) - PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line. - PSM_SINGLE_WORD = 8, ///< Treat the image as a single word. - PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle. - PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character. - PSM_SPARSE_TEXT = - 11, ///< Find as much text as possible in no particular order. - PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det. - PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing - ///< hacks that are Tesseract-specific. - - PSM_COUNT ///< Number of enum entries. -}; - -/** - * Inline functions that act on a PageSegMode to determine whether components of - * layout analysis are enabled. - * *Depend critically on the order of elements of PageSegMode.* - * NOTE that arg is an int for compatibility with INT_PARAM. - */ -inline bool PSM_OSD_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO; -} -inline bool PSM_SPARSE(int pageseg_mode) { - return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN; -} -inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK; -} -inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) { - return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) || - pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} - -/** - * enum of the elements of the page hierarchy, used in ResultIterator - * to provide functions that operate on each level without having to - * have 5x as many functions. - */ -enum PageIteratorLevel { - RIL_BLOCK, // Block of text/image/separator line. - RIL_PARA, // Paragraph within a block. - RIL_TEXTLINE, // Line within a paragraph. - RIL_WORD, // Word within a textline. - RIL_SYMBOL // Symbol/character within a word. -}; - -/** - * JUSTIFICATION_UNKNOWN - * The alignment is not clearly one of the other options. This could happen - * for example if there are only one or two lines of text or the text looks - * like source code or poetry. - * - * NOTA BENE: Fully justified paragraphs (text aligned to both left and right - * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text - * is written with a left-to-right script and with JUSTIFICATION_RIGHT if - * their text is written in a right-to-left script. - * - * Interpretation for text read in vertical lines: - * "Left" is wherever the starting reading position is. - * - * JUSTIFICATION_LEFT - * Each line, except possibly the first, is flush to the same left tab stop. - * - * JUSTIFICATION_CENTER - * The text lines of the paragraph are centered about a line going - * down through their middle of the text lines. - * - * JUSTIFICATION_RIGHT - * Each line, except possibly the first, is flush to the same right tab stop. - */ -enum ParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT, -}; - -/** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the combiner. - * The preference of which engine to use is stored in tessedit_ocr_engine_mode. - * - * ATTENTION: When modifying this enum, please make sure to make the - * appropriate changes to all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ -enum OcrEngineMode { - OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated - OEM_LSTM_ONLY, // Run just the LSTM line recognizer. - OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback - // to Tesseract when things get difficult. - // deprecated - OEM_DEFAULT, // Specify this mode when calling init_*(), - // to indicate that any of the above modes - // should be automatically inferred from the - // variables in the language-specific config, - // command-line configs, or if not specified - // in any of the above should be set to the - // default OEM_TESSERACT_ONLY. - OEM_COUNT // Number of OEMs -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/renderer.h deleted file mode 100644 index 6f405233..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/renderer.h +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: renderer.h -// Description: Rendering interface to inject into TessBaseAPI -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_RENDERER_H_ -#define TESSERACT_API_RENDERER_H_ - -#include "export.h" - -// To avoid collision with other typenames include the ABSOLUTE MINIMUM -// complexity of includes here. Use forward declarations wherever possible -// and hide includes of complex types in baseapi.cpp. -#include -#include // for std::string -#include // for std::vector - -struct Pix; - -namespace tesseract { - -class TessBaseAPI; - -/** - * Interface for rendering tesseract results into a document, such as text, - * HOCR or pdf. This class is abstract. Specific classes handle individual - * formats. This interface is then used to inject the renderer class into - * tesseract when processing images. - * - * For simplicity implementing this with tesseract version 3.01, - * the renderer contains document state that is cleared from document - * to document just as the TessBaseAPI is. This way the base API can just - * delegate its rendering functionality to injected renderers, and the - * renderers can manage the associated state needed for the specific formats - * in addition to the heuristics for producing it. - */ -class TESS_API TessResultRenderer { -public: - virtual ~TessResultRenderer(); - - // Takes ownership of pointer so must be new'd instance. - // Renderers aren't ordered, but appends the sequences of next parameter - // and existing next(). The renderers should be unique across both lists. - void insert(TessResultRenderer *next); - - // Returns the next renderer or nullptr. - TessResultRenderer *next() { - return next_; - } - - /** - * Starts a new document with the given title. - * This clears the contents of the output data. - * Title should use UTF-8 encoding. - */ - bool BeginDocument(const char *title); - - /** - * Adds the recognized text from the source image to the current document. - * Invalid if BeginDocument not yet called. - * - * Note that this API is a bit weird but is designed to fit into the - * current TessBaseAPI implementation where the api has lots of state - * information that we might want to add in. - */ - bool AddImage(TessBaseAPI *api); - - /** - * Finishes the document and finalizes the output data - * Invalid if BeginDocument not yet called. - */ - bool EndDocument(); - - const char *file_extension() const { - return file_extension_; - } - const char *title() const { - return title_.c_str(); - } - - // Is everything fine? Otherwise something went wrong. - bool happy() const { - return happy_; - } - - /** - * Returns the index of the last image given to AddImage - * (i.e. images are incremented whether the image succeeded or not) - * - * This is always defined. It means either the number of the - * current image, the last image ended, or in the completed document - * depending on when in the document lifecycle you are looking at it. - * Will return -1 if a document was never started. - */ - int imagenum() const { - return imagenum_; - } - -protected: - /** - * Called by concrete classes. - * - * outputbase is the name of the output file excluding - * extension. For example, "/path/to/chocolate-chip-cookie-recipe" - * - * extension indicates the file extension to be used for output - * files. For example "pdf" will produce a .pdf file, and "hocr" - * will produce .hocr files. - */ - TessResultRenderer(const char *outputbase, const char *extension); - - // Hook for specialized handling in BeginDocument() - virtual bool BeginDocumentHandler(); - - // This must be overridden to render the OCR'd results - virtual bool AddImageHandler(TessBaseAPI *api) = 0; - - // Hook for specialized handling in EndDocument() - virtual bool EndDocumentHandler(); - - // Renderers can call this to append '\0' terminated strings into - // the output string returned by GetOutput. - // This method will grow the output buffer if needed. - void AppendString(const char *s); - - // Renderers can call this to append binary byte sequences into - // the output string returned by GetOutput. Note that s is not necessarily - // '\0' terminated (and can contain '\0' within it). - // This method will grow the output buffer if needed. - void AppendData(const char *s, int len); - -private: - TessResultRenderer *next_; // Can link multiple renderers together - FILE *fout_; // output file pointer - const char *file_extension_; // standard extension for generated output - std::string title_; // title of document being rendered - int imagenum_; // index of last image added - bool happy_; // I get grumpy when the disk fills up, etc. -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessTextRenderer : public TessResultRenderer { -public: - explicit TessTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into an hocr text string - */ -class TESS_API TessHOcrRenderer : public TessResultRenderer { -public: - explicit TessHOcrRenderer(const char *outputbase, bool font_info); - explicit TessHOcrRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into an alto text string - */ -class TESS_API TessAltoRenderer : public TessResultRenderer { -public: - explicit TessAltoRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool begin_document; -}; - -/** - * Renders Tesseract output into a TSV string - */ -class TESS_API TessTsvRenderer : public TessResultRenderer { -public: - explicit TessTsvRenderer(const char *outputbase, bool font_info); - explicit TessTsvRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into searchable PDF - */ -class TESS_API TessPDFRenderer : public TessResultRenderer { -public: - // datadir is the location of the TESSDATA. We need it because - // we load a custom PDF font from this location. - TessPDFRenderer(const char *outputbase, const char *datadir, - bool textonly = false); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - // We don't want to have every image in memory at once, - // so we store some metadata as we go along producing - // PDFs one page at a time. At the end, that metadata is - // used to make everything that isn't easily handled in a - // streaming fashion. - long int obj_; // counter for PDF objects - std::vector offsets_; // offset of every PDF object in bytes - std::vector pages_; // object number for every /Page object - std::string datadir_; // where to find the custom font - bool textonly_; // skip images if set - // Bookkeeping only. DIY = Do It Yourself. - void AppendPDFObjectDIY(size_t objectsize); - // Bookkeeping + emit data. - void AppendPDFObject(const char *data); - // Create the /Contents object for an entire page. - char *GetPDFTextObjects(TessBaseAPI *api, double width, double height); - // Turn an image into a PDF object. Only transcode if we have to. - static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum, - char **pdf_object, long int *pdf_object_size, - int jpg_quality); -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessUnlvRenderer : public TessResultRenderer { -public: - explicit TessUnlvRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string for LSTMBox - */ -class TESS_API TessLSTMBoxRenderer : public TessResultRenderer { -public: - explicit TessLSTMBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessBoxTextRenderer : public TessResultRenderer { -public: - explicit TessBoxTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string in WordStr format - */ -class TESS_API TessWordStrBoxRenderer : public TessResultRenderer { -public: - explicit TessWordStrBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#ifndef DISABLED_LEGACY_ENGINE - -/** - * Renders tesseract output into an osd text string - */ -class TESS_API TessOsdRenderer : public TessResultRenderer { -public: - explicit TessOsdRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#endif // ndef DISABLED_LEGACY_ENGINE - -} // namespace tesseract. - -#endif // TESSERACT_API_RENDERER_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/resultiterator.h deleted file mode 100644 index 3e4d5807..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/resultiterator.h +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: resultiterator.h -// Description: Iterator for tesseract results that is capable of -// iterating in proper reading order over Bi Directional -// (e.g. mixed Hebrew and English) text. -// Author: David Eger -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API, TESS_LOCAL -#include "ltrresultiterator.h" // for LTRResultIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -#include // for std::pair -#include // for std::vector - -namespace tesseract { - -class TESS_API ResultIterator : public LTRResultIterator { -public: - static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); - - /** - * ResultIterator is copy constructible! - * The default copy constructor works just fine for us. - */ - ~ResultIterator() override = default; - - // ============= Moving around within the page ============. - /** - * Moves the iterator to point to the start of the page to begin - * an iteration. - */ - void Begin() override; - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy in the appropriate reading order and returns false if - * the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - bool Next(PageIteratorLevel level) override; - - /** - * IsAtBeginningOf() returns whether we're at the logical beginning of the - * given level. (as opposed to ResultIterator's left-to-right top-to-bottom - * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). - * For a full description, see pageiterator.h - */ - bool IsAtBeginningOf(PageIteratorLevel level) const override; - - /** - * Implement PageIterator's IsAtFinalElement correctly in a BiDi context. - * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we - * point at the last word in a paragraph. See PageIterator for full comment. - */ - bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const override; - - // ============= Functions that refer to words only ============. - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // ============= Accessing data ==============. - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - */ - virtual char *GetUTF8Text(PageIteratorLevel level) const; - - /** - * Returns the LSTM choices for every LSTM timestep for the current word. - */ - virtual std::vector>>> - *GetRawLSTMTimesteps() const; - virtual std::vector>> - *GetBestLSTMSymbolChoices() const; - - /** - * Return whether the current paragraph's dominant reading direction - * is left-to-right (as opposed to right-to-left). - */ - bool ParagraphIsLtr() const; - - // ============= Exposed only for testing =============. - - /** - * Yields the reading order as a sequence of indices and (optional) - * meta-marks for a set of words (given left-to-right). - * The meta marks are passed as negative values: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The next indexed word contains both left-to-right and - * right-to-left characters and was treated as neutral. - * - * For example, suppose we have five words in a text line, - * indexed [0,1,2,3,4] from the leftmost side of the text line. - * The following are all believable reading_orders: - * - * Left-to-Right (in ltr paragraph): - * { 0, 1, 2, 3, 4 } - * Left-to-Right (in rtl paragraph): - * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } - * Right-to-Left (in rtl paragraph): - * { 4, 3, 2, 1, 0 } - * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: - * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 } - */ - static void CalculateTextlineOrder( - bool paragraph_is_ltr, - const std::vector &word_dirs, - std::vector *reading_order); - - static const int kMinorRunStart; - static const int kMinorRunEnd; - static const int kComplexWord; - -protected: - /** - * We presume the data associated with the given iterator will outlive us. - * NB: This is private because it does something that is non-obvious: - * it resets to the beginning of the paragraph instead of staying wherever - * resit might have pointed. - */ - explicit ResultIterator(const LTRResultIterator &resit); - -private: - /** - * Calculates the current paragraph's dominant writing direction. - * Typically, members should use current_paragraph_ltr_ instead. - */ - bool CurrentParagraphIsLtr() const; - - /** - * Returns word indices as measured from resit->RestartRow() = index 0 - * for the reading order of words within a textline given an iterator - * into the middle of the text line. - * In addition to non-negative word indices, the following negative values - * may be inserted: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The previous word contains both left-to-right and - * right-to-left characters and was treated as neutral. - */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *indices) const; - /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *ssd, - std::vector *indices) const; - - /** - * What is the index of the current word in a strict left-to-right reading - * of the row? - */ - int LTRWordIndex() const; - - /** - * Given an iterator pointing at a word, returns the logical reading order - * of blob indices for the word. - */ - void CalculateBlobOrder(std::vector *blob_indices) const; - - /** Precondition: current_paragraph_is_ltr_ is set. */ - void MoveToLogicalStartOfTextline(); - - /** - * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ - * are set. - */ - void MoveToLogicalStartOfWord(); - - /** Are we pointing at the final (reading order) symbol of the word? */ - bool IsAtFinalSymbolOfWord() const; - - /** Are we pointing at the first (reading order) symbol of the word? */ - bool IsAtFirstSymbolOfWord() const; - - /** - * Append any extra marks that should be appended to this word when printed. - * Mostly, these are Unicode BiDi control characters. - */ - void AppendSuffixMarks(std::string *text) const; - - /** Appends the current word in reading order to the given buffer.*/ - void AppendUTF8WordText(std::string *text) const; - - /** - * Appends the text of the current text line, *assuming this iterator is - * positioned at the beginning of the text line* This function - * updates the iterator to point to the first position past the text line. - * Each textline is terminated in a single newline character. - * If the textline ends a paragraph, it gets a second terminal newline. - */ - void IterateAndAppendUTF8TextlineText(std::string *text); - - /** - * Appends the text of the current paragraph in reading order - * to the given buffer. - * Each textline is terminated in a single newline character, and the - * paragraph gets an extra newline at the end. - */ - void AppendUTF8ParagraphText(std::string *text) const; - - /** Returns whether the bidi_debug flag is set to at least min_level. */ - bool BidiDebug(int min_level) const; - - bool current_paragraph_is_ltr_; - - /** - * Is the currently pointed-at character at the beginning of - * a minor-direction run? - */ - bool at_beginning_of_minor_run_; - - /** Is the currently pointed-at character in a minor-direction sequence? */ - bool in_minor_direction_; - - /** - * Should detected inter-word spaces be preserved, or "compressed" to a single - * space character (default behavior). - */ - bool preserve_interword_spaces_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/unichar.h deleted file mode 100644 index 015109d7..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/unichar.h +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: unichar.h -// Description: Unicode character/ligature class. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCUTIL_UNICHAR_H_ -#define TESSERACT_CCUTIL_UNICHAR_H_ - -#include "export.h" - -#include -#include -#include -#include - -namespace tesseract { - -// Maximum number of characters that can be stored in a UNICHAR. Must be -// at least 4. Must not exceed 31 without changing the coding of length. -#define UNICHAR_LEN 30 - -// A UNICHAR_ID is the unique id of a unichar. -using UNICHAR_ID = int; - -// A variable to indicate an invalid or uninitialized unichar id. -static const int INVALID_UNICHAR_ID = -1; -// A special unichar that corresponds to INVALID_UNICHAR_ID. -static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__"; - -enum StrongScriptDirection { - DIR_NEUTRAL = 0, // Text contains only neutral characters. - DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters. - DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters. - DIR_MIX = 3, // Text contains a mixture of left-to-right - // and right-to-left characters. -}; - -using char32 = signed int; - -// The UNICHAR class holds a single classification result. This may be -// a single Unicode character (stored as between 1 and 4 utf8 bytes) or -// multiple Unicode characters representing the NFKC expansion of a ligature -// such as fi, ffl etc. These are also stored as utf8. -class TESS_API UNICHAR { -public: - UNICHAR() { - memset(chars, 0, UNICHAR_LEN); - } - - // Construct from a utf8 string. If len<0 then the string is null terminated. - // If the string is too long to fit in the UNICHAR then it takes only what - // will fit. - UNICHAR(const char *utf8_str, int len); - - // Construct from a single UCS4 character. - explicit UNICHAR(int unicode); - - // Default copy constructor and operator= are OK. - - // Get the first character as UCS-4. - int first_uni() const; - - // Get the length of the UTF8 string. - int utf8_len() const { - int len = chars[UNICHAR_LEN - 1]; - return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; - } - - // Get a UTF8 string, but NOT nullptr terminated. - const char *utf8() const { - return chars; - } - - // Get a terminated UTF8 string: Must delete[] it after use. - char *utf8_str() const; - - // Get the number of bytes in the first character of the given utf8 string. - static int utf8_step(const char *utf8_str); - - // A class to simplify iterating over and accessing elements of a UTF8 - // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or - // take ownership of the underlying byte array. It also does not permit - // modification of the array (as the name suggests). - // - // Example: - // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len); - // it != UNICHAR::end(str, len); - // ++it) { - // printf("UCS-4 symbol code = %d\n", *it); - // char buf[5]; - // int char_len = it.get_utf8(buf); buf[char_len] = '\0'; - // printf("Char = %s\n", buf); - // } - class TESS_API const_iterator { - using CI = const_iterator; - - public: - // Step to the next UTF8 character. - // If the current position is at an illegal UTF8 character, then print an - // error message and step by one byte. If the current position is at a - // nullptr value, don't step past it. - const_iterator &operator++(); - - // Return the UCS-4 value at the current position. - // If the current position is at an illegal UTF8 value, return a single - // space character. - int operator*() const; - - // Store the UTF-8 encoding of the current codepoint into buf, which must be - // at least 4 bytes long. Return the number of bytes written. - // If the current position is at an illegal UTF8 value, writes a single - // space character and returns 1. - // Note that this method does not null-terminate the buffer. - int get_utf8(char *buf) const; - // Returns the number of bytes of the current codepoint. Returns 1 if the - // current position is at an illegal UTF8 value. - int utf8_len() const; - // Returns true if the UTF-8 encoding at the current position is legal. - bool is_legal() const; - - // Return the pointer into the string at the current position. - const char *utf8_data() const { - return it_; - } - - // Iterator equality operators. - friend bool operator==(const CI &lhs, const CI &rhs) { - return lhs.it_ == rhs.it_; - } - friend bool operator!=(const CI &lhs, const CI &rhs) { - return !(lhs == rhs); - } - - private: - friend class UNICHAR; - explicit const_iterator(const char *it) : it_(it) {} - - const char *it_; // Pointer into the string. - }; - - // Create a start/end iterator pointing to a string. Note that these methods - // are static and do NOT create a copy or take ownership of the underlying - // array. - static const_iterator begin(const char *utf8_str, int byte_length); - static const_iterator end(const char *utf8_str, int byte_length); - - // Converts a utf-8 string to a vector of unicodes. - // Returns an empty vector if the input contains invalid UTF-8. - static std::vector UTF8ToUTF32(const char *utf8_str); - // Converts a vector of unicodes to a utf8 string. - // Returns an empty string if the input contains an invalid unicode. - static std::string UTF32ToUTF8(const std::vector &str32); - -private: - // A UTF-8 representation of 1 or more Unicode characters. - // The last element (chars[UNICHAR_LEN - 1]) is a length if - // its value < UNICHAR_LEN, otherwise it is a genuine character. - char chars[UNICHAR_LEN]{}; -}; - -} // namespace tesseract - -#endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/version.h deleted file mode 100644 index 6bac5d66..00000000 --- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/version.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: version.h -// Description: Version information -// -// (C) Copyright 2018, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_VERSION_H_ -#define TESSERACT_API_VERSION_H_ - -// clang-format off - -#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ -#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ -#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ - -#define TESSERACT_VERSION \ - (TESSERACT_MAJOR_VERSION << 16 | \ - TESSERACT_MINOR_VERSION << 8 | \ - TESSERACT_MICRO_VERSION) - -#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" - -// clang-format on - -#endif // TESSERACT_API_VERSION_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/baseapi.h deleted file mode 100644 index 5e1e4830..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/baseapi.h +++ /dev/null @@ -1,812 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: baseapi.h -// Description: Simple API for calling tesseract. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_BASEAPI_H_ -#define TESSERACT_API_BASEAPI_H_ - -#ifdef HAVE_CONFIG_H -# include "config_auto.h" // DISABLED_LEGACY_ENGINE -#endif - -#include "export.h" -#include "pageiterator.h" -#include "publictypes.h" -#include "resultiterator.h" -#include "unichar.h" - -#include "version.h" - -#include -#include // for std::vector - -struct Pix; -struct Pixa; -struct Boxa; - -namespace tesseract { - -class PAGE_RES; -class ParagraphModel; -class BLOCK_LIST; -class ETEXT_DESC; -struct OSResults; -class UNICHARSET; - -class Dawg; -class Dict; -class EquationDetect; -class PageIterator; -class ImageThresholder; -class LTRResultIterator; -class ResultIterator; -class MutableIterator; -class TessResultRenderer; -class Tesseract; - -// Function to read a std::vector from a whole file. -// Returns false on failure. -using FileReader = bool (*)(const char *filename, std::vector *data); - -using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, - bool) const; -using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, - int, const char *, int); - -/** - * Base class for all tesseract APIs. - * Specific classes can add ability to work on different inputs or produce - * different outputs. - * This class is mostly an interface layer on top of the Tesseract instance - * class to hide the data types so that users of this class don't have to - * include any other Tesseract headers. - */ -class TESS_API TessBaseAPI { -public: - TessBaseAPI(); - virtual ~TessBaseAPI(); - // Copy constructor and assignment operator are currently unsupported. - TessBaseAPI(TessBaseAPI const &) = delete; - TessBaseAPI &operator=(TessBaseAPI const &) = delete; - - /** - * Returns the version identifier as a static string. Do not delete. - */ - static const char *Version(); - - /** - * If compiled with OpenCL AND an available OpenCL - * device is deemed faster than serial code, then - * "device" is populated with the cl_device_id - * and returns sizeof(cl_device_id) - * otherwise *device=nullptr and returns 0. - */ - static size_t getOpenCLDevice(void **device); - - /** - * Set the name of the input file. Needed for training and - * reading a UNLV zone file, and for searchable PDF output. - */ - void SetInputName(const char *name); - /** - * These functions are required for searchable PDF output. - * We need our hands on the input file so that we can include - * it in the PDF without transcoding. If that is not possible, - * we need the original image. Finally, resolution metadata - * is stored in the PDF so we need that as well. - */ - const char *GetInputName(); - // Takes ownership of the input pix. - void SetInputImage(Pix *pix); - Pix *GetInputImage(); - int GetSourceYResolution(); - const char *GetDatapath(); - - /** Set the name of the bonus output files. Needed only for debugging. */ - void SetOutputName(const char *name); - - /** - * Set the value of an internal "parameter." - * Supply the name of the parameter and the value as a string, just as - * you would in a config file. - * Returns false if the name lookup failed. - * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. - * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. - * SetVariable may be used before Init, but settings will revert to - * defaults on End(). - * - * Note: Must be called after Init(). Only works for non-init variables - * (init variables should be passed to Init()). - */ - bool SetVariable(const char *name, const char *value); - bool SetDebugVariable(const char *name, const char *value); - - /** - * Returns true if the parameter was found among Tesseract parameters. - * Fills in value with the value of the parameter. - */ - bool GetIntVariable(const char *name, int *value) const; - bool GetBoolVariable(const char *name, bool *value) const; - bool GetDoubleVariable(const char *name, double *value) const; - - /** - * Returns the pointer to the string that represents the value of the - * parameter if it was found among Tesseract parameters. - */ - const char *GetStringVariable(const char *name) const; - -#ifndef DISABLED_LEGACY_ENGINE - - /** - * Print Tesseract fonts table to the given file. - */ - void PrintFontsTable(FILE *fp) const; - -#endif - - /** - * Print Tesseract parameters to the given file. - */ - void PrintVariables(FILE *fp) const; - - /** - * Get value of named variable as a string, if it exists. - */ - bool GetVariableAsString(const char *name, std::string *val) const; - - /** - * Instances are now mostly thread-safe and totally independent, - * but some global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS: - * you use SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your instances. - * - * Start tesseract. Returns zero on success and -1 on failure. - * NOTE that the only members that may be called before Init are those - * listed above here in the class definition. - * - * The datapath must be the name of the tessdata directory. - * The language is (usually) an ISO 639-3 string or nullptr will default to - * eng. It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, or just - * to reset the classifier. - * The language may be a string of the form [~][+[~]]* indicating - * that multiple languages are to be loaded. Eg hin+eng will load Hindi and - * English. Languages may specify internally that they want to be loaded - * with one or more other languages, so the ~ sign is available to override - * that. Eg if hin were set to load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited only by - * memory, with the caveat that loading additional languages will impact - * both speed and accuracy, as there is more work to do to decide on the - * applicable language, and there is more chance of hallucinating incorrect - * words. - * WARNING: On changing languages, all Tesseract parameters are reset - * back to their default values. (Which may vary between languages.) - * If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should explicitly - * call End() and then use SetVariable before Init. This is only a very - * rare use case, since there are very few uses that require any parameters - * to be set before Init. - * - * If set_only_non_debug_params is true, only params that do not contain - * "debug" in the name will be set. - */ - int Init(const char *datapath, const char *language, OcrEngineMode mode, - char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params); - int Init(const char *datapath, const char *language, OcrEngineMode oem) { - return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); - } - int Init(const char *datapath, const char *language) { - return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, - false); - } - // In-memory version reads the traineddata file directly from the given - // data[data_size] array, and/or reads data via a FileReader. - int Init(const char *data, int data_size, const char *language, - OcrEngineMode mode, char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params, FileReader reader); - - /** - * Returns the languages string used in the last valid initialization. - * If the last initialization specified "deu+hin" then that will be - * returned. If hin loaded eng automatically as well, then that will - * not be included in this list. To find the languages actually - * loaded use GetLoadedLanguagesAsVector. - * The returned string should NOT be deleted. - */ - const char *GetInitLanguagesAsString() const; - - /** - * Returns the loaded languages in the vector of std::string. - * Includes all languages loaded by the last Init, including those loaded - * as dependencies of other loaded languages. - */ - void GetLoadedLanguagesAsVector(std::vector *langs) const; - - /** - * Returns the available languages in the sorted vector of std::string. - */ - void GetAvailableLanguagesAsVector(std::vector *langs) const; - - /** - * Init only for page layout analysis. Use only for calls to SetImage and - * AnalysePage. Calls that attempt recognition will generate an error. - */ - void InitForAnalysePage(); - - /** - * Read a "config" file containing a set of param, value pairs. - * Searches the standard places: tessdata/configs, tessdata/tessconfigs - * and also accepts a relative or absolute path name. - * Note: only non-init params will be set (init params are set by Init()). - */ - void ReadConfigFile(const char *filename); - /** Same as above, but only set debug params from the given config file. */ - void ReadDebugConfigFile(const char *filename); - - /** - * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. - * The mode is stored as an IntParam so it can also be modified by - * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). - */ - void SetPageSegMode(PageSegMode mode); - - /** Return the current page segmentation mode. */ - PageSegMode GetPageSegMode() const; - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. - * Currently has no error checking. - * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. - * Palette color images will not work properly and must be converted to - * 24 bit. - * Binary images of 1 bit per pixel may also be given but they must be - * byte packed with the MSB of the first byte being the first pixel, and a - * 1 represents WHITE. For binary images set bytes_per_pixel=0. - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * - * Note that TesseractRect is the simplified convenience interface. - * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, - * and one or more of the Get*Text functions below. - */ - char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, int width, - int height); - - /** - * Call between pages or documents etc to free up memory and forget - * adaptive data. - */ - void ClearAdaptiveClassifier(); - - /** - * @defgroup AdvancedAPI Advanced API - * The following methods break TesseractRect into pieces, so you can - * get hold of the thresholded image, get the text in different formats, - * get bounding boxes, confidences etc. - */ - /* @{ */ - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Copies the image buffer and converts to Pix. - * SetImage clears all recognition results, and sets the rectangle to the - * full image, so it may be followed immediately by a GetUTF8Text, and it - * will automatically perform recognition. - */ - void SetImage(const unsigned char *imagedata, int width, int height, - int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract takes its own copy of the image, so it need not persist until - * after Recognize. - * Pix vs raw, which to use? - * Use Pix where possible. Tesseract uses Pix as its internal representation - * and it is therefore more efficient to provide a Pix directly. - */ - void SetImage(Pix *pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after SetImage(). - */ - void SetSourceResolution(int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after SetImage. - * Each SetRectangle clears the recogntion results so multiple rectangles - * can be recognized with the same image. - */ - void SetRectangle(int left, int top, int width, int height); - - /** - * Get a copy of the internal thresholded image from Tesseract. - * Caller takes ownership of the Pix and must pixDestroy it. - * May be called any time after SetImage, or after TesseractRect. - */ - Pix *GetThresholdedImage(); - - /** - * Get the result of page layout analysis as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetRegions(Pixa **pixa); - - /** - * Get the textlines as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If raw_image is true, then extract from the original image instead of the - * thresholded image and pad by raw_padding pixels. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. If paraids is not - * nullptr, the paragraph-id of each line within its block is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - /* - Helper method to extract from the thresholded image. (most common usage) -*/ - Boxa *GetTextlines(Pixa **pixa, int **blockids) { - return GetTextlines(false, 0, pixa, blockids, nullptr); - } - - /** - * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa - * pair, in reading order. Enables downstream handling of non-rectangular - * regions. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetStrips(Pixa **pixa, int **blockids); - - /** - * Get the words as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetWords(Pixa **pixa); - - /** - * Gets the individual connected (text) components (created - * after pages segmentation step, but before recognition) - * as a leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * Note: the caller is responsible for calling boxaDestroy() - * on the returned Boxa array and pixaDestroy() on cc array. - */ - Boxa *GetConnectedComponents(Pixa **cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. - * If blockids is not nullptr, the paragraph-id of each component with its - * block is also returned as an array of one element per component. delete [] - * after use. If raw_image is true, then portions of the original image are - * extracted instead of the thresholded image and padded with raw_padding. If - * text_only is true, then only text components are returned. - */ - Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, - bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - // Helper function to get binary images with no padding (most common usage). - Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, - Pixa **pixa, int **blockids) { - return GetComponentImages(level, text_only, false, 0, pixa, blockids, - nullptr); - } - - /** - * Returns the scale factor of the thresholded image that would be returned by - * GetThresholdedImage() and the various GetX() methods that call - * GetComponentImages(). - * Returns 0 if no thresholder has been set. - */ - int GetThresholdedImageScaleFactor() const; - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to just - * the page layout results. Returns an iterator to the results. - * If merge_similar_words is true, words are combined where suitable for use - * with a line recognizer. Use if you want to use AnalyseLayout to find the - * textlines, and then want to process textline fragments with an external - * line recognizer. - * Returns nullptr on error or an empty page. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - PageIterator *AnalyseLayout(); - PageIterator *AnalyseLayout(bool merge_similar_words); - - /** - * Recognize the image from SetAndThresholdImage, generating Tesseract - * internal structures. Returns 0 on success. - * Optional. The Get*Text functions below will call Recognize if needed. - * After Recognize, the output is kept internally until the next SetImage. - */ - int Recognize(ETEXT_DESC *monitor); - - /** - * Methods to retrieve information after SetAndThresholdImage(), - * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) - */ - - /** - * Turns images into symbolic text. - * - * filename can point to a single image, a multi-page TIFF, - * or a plain text list of image filenames. - * - * retry_config is useful for debugging. If not nullptr, you can fall - * back to an alternate configuration if a page fails for some - * reason. - * - * timeout_millisec terminates processing if any single page - * takes too long. Set to 0 for unlimited time. - * - * renderer is responible for creating the output. For example, - * use the TessTextRenderer if you want plaintext output, or - * the TessPDFRender to produce searchable PDF. - * - * If tessedit_page_number is non-negative, will only process that - * single page. Works for multi-page tiff file, or filelist. - * - * Returns true if successful, false on error. - */ - bool ProcessPages(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - // Does the real work of ProcessPages. - bool ProcessPagesInternal(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - - /** - * Turn a single image into symbolic text. - * - * The pix is the image processed. filename and page_index are - * metadata used by side-effect processes, such as reading a box - * file or formatting as hOCR. - * - * See ProcessPages for descriptions of other parameters. - */ - bool ProcessPage(Pix *pix, int page_index, const char *filename, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - ResultIterator *GetIterator(); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - MutableIterator *GetMutableIterator(); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - */ - char *GetUTF8Text(); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * monitor can be used to - * cancel the recognition - * receive progress callbacks - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(ETEXT_DESC *monitor, int page_number); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(ETEXT_DESC *monitor, int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(int page_number); - - /** - * Make a TSV-formatted string from the internal data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetTSVText(int page_number); - - /** - * Make a box file for LSTM training from the internal data structures. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetLSTMBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a box file used in training. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a WordStr box file used in training. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetWordStrBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded - * as UNLV format Latin-1 with specific reject and suspect codes. - * Returned string must be freed with the delete [] operator. - */ - char *GetUNLVText(); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in degrees - * (0, 90, 180, 270) - * orient_conf is the confidence (15.0 is reasonably confident) - * script_name is an ASCII string, the name of the script, e.g. "Latin" - * script_conf is confidence level in the script - * Returns true on success and writes values to each parameter as an output - */ - bool DetectOrientationScript(int *orient_deg, float *orient_conf, - const char **script_name, float *script_conf); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * page_number is a 0-based page index that will appear in the osd file. - */ - char *GetOsdText(int page_number); - - /** Returns the (average) confidence value between 0 and 100. */ - int MeanTextConf(); - /** - * Returns all word confidences (between 0 and 100) in an array, terminated - * by -1. The calling function must delete [] after use. - * The number of confidences should correspond to the number of space- - * delimited words in GetUTF8Text. - */ - int *AllWordConfidences(); - -#ifndef DISABLED_LEGACY_ENGINE - /** - * Applies the given word to the adaptive classifier if possible. - * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can - * tell the boundaries of the graphemes. - * Assumes that SetImage/SetRectangle have been used to set the image - * to the given word. The mode arg should be PSM_SINGLE_WORD or - * PSM_CIRCLE_WORD, as that will be used to control layout analysis. - * The currently set PageSegMode is preserved. - * Returns false if adaption was not possible for some reason. - */ - bool AdaptToWordStr(PageSegMode mode, const char *wordstr); -#endif // ndef DISABLED_LEGACY_ENGINE - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or TesseractRect before doing - * any Recognize or Get* operation. - */ - void Clear(); - - /** - * Close down tesseract and free up all memory. End() is equivalent to - * destructing and reconstructing your TessBaseAPI. - * Once End() has been used, none of the other API functions may be used - * other than Init and anything declared above it in the class definition. - */ - void End(); - - /** - * Clear any library-level memory caches. - * There are a variety of expensive-to-load constant data structures (mostly - * language dictionaries) that are cached globally -- surviving the Init() - * and End() of individual TessBaseAPI's. This function allows the clearing - * of these caches. - **/ - static void ClearPersistentCache(); - - /** - * Check whether a word is valid according to Tesseract's language model - * @return 0 if the word is invalid, non-zero if valid. - * @warning temporary! This function will be removed from here and placed - * in a separate API at some future time. - */ - int IsValidWord(const char *word) const; - // Returns true if utf8_character is defined in the UniCharset. - bool IsValidCharacter(const char *utf8_character) const; - - bool GetTextDirection(int *out_offset, float *out_slope); - - /** Sets Dict::letter_is_okay_ function to point to the given function. */ - void SetDictFunc(DictFunc f); - - /** Sets Dict::probability_in_context_ function to point to the given - * function. - */ - void SetProbabilityInContextFunc(ProbabilityInContextFunc f); - - /** - * Estimates the Orientation And Script of the image. - * @return true if the image was processed successfully. - */ - bool DetectOS(OSResults *); - - /** - * Return text orientation of each block as determined by an earlier run - * of layout analysis. - */ - void GetBlockTextOrientations(int **block_orientation, - bool **vertical_writing); - - /** This method returns the string form of the specified unichar. */ - const char *GetUnichar(int unichar_id) const; - - /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ - const Dawg *GetDawg(int i) const; - - /** Return the number of dawgs loaded into tesseract_ object. */ - int NumDawgs() const; - - Tesseract *tesseract() const { - return tesseract_; - } - - OcrEngineMode oem() const { - return last_oem_requested_; - } - - void set_min_orientation_margin(double margin); - /* @} */ - -protected: - /** Common code for setting the image. Returns true if Init has been called. - */ - bool InternalSetImage(); - - /** - * Run the thresholder to make the thresholded image. If pix is not nullptr, - * the source is thresholded to pix instead of the internal IMAGE. - */ - virtual bool Threshold(Pix **pix); - - /** - * Find lines from the image making the BLOCK_LIST. - * @return 0 on success. - */ - int FindLines(); - - /** Delete the pageres and block list ready for a new page. */ - void ClearResults(); - - /** - * Return an LTR Result Iterator -- used only for training, as we really want - * to ignore all BiDi smarts at that point. - * delete once you're done with it. - */ - LTRResultIterator *GetLTRIterator(); - - /** - * Return the length of the output text string, as UTF8, assuming - * one newline per line and one per block, with a terminator, - * and assuming a single character reject marker for each rejected character. - * Also return the number of recognized blobs in blob_count. - */ - int TextLength(int *blob_count) const; - - //// paragraphs.cpp //////////////////////////////////////////////////// - void DetectParagraphs(bool after_text_recognition); - - const PAGE_RES *GetPageRes() const { - return page_res_; - } - -protected: - Tesseract *tesseract_; ///< The underlying data object. - Tesseract *osd_tesseract_; ///< For orientation & script detection. - EquationDetect *equ_detect_; ///< The equation detector. - FileReader reader_; ///< Reads files from any filesystem. - ImageThresholder *thresholder_; ///< Image thresholding module. - std::vector *paragraph_models_; - BLOCK_LIST *block_list_; ///< The page layout. - PAGE_RES *page_res_; ///< The page-level data. - std::string input_file_; ///< Name used by training code. - std::string output_file_; ///< Name used by debug code. - std::string datapath_; ///< Current location of tessdata. - std::string language_; ///< Last initialized language. - OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested. - bool recognition_done_; ///< page_res_ contains recognition data. - - /** - * @defgroup ThresholderParams Thresholder Parameters - * Parameters saved from the Thresholder. Needed to rebuild coordinates. - */ - /* @{ */ - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; - int image_width_; - int image_height_; - /* @} */ - -private: - // A list of image filenames gets special consideration - bool ProcessPagesFileList(FILE *fp, std::string *buf, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); - // TIFF supports multipage so gets special consideration. - bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, - const char *filename, const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); -}; // class TessBaseAPI. - -/** Escape a char string - remove &<>"' with HTML codes. */ -std::string HOcrEscape(const char *text); - -} // namespace tesseract - -#endif // TESSERACT_API_BASEAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/capi.h deleted file mode 100644 index 40f4856a..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/capi.h +++ /dev/null @@ -1,484 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: capi.h -// Description: C-API TessBaseAPI -// -// (C) Copyright 2012, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_CAPI_H_ -#define API_CAPI_H_ - -#include "export.h" - -#ifdef __cplusplus -# include -# include -# include -# include -# include -#endif - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef BOOL -# define BOOL int -# define TRUE 1 -# define FALSE 0 -#endif - -#ifdef __cplusplus -typedef tesseract::TessResultRenderer TessResultRenderer; -typedef tesseract::TessBaseAPI TessBaseAPI; -typedef tesseract::PageIterator TessPageIterator; -typedef tesseract::ResultIterator TessResultIterator; -typedef tesseract::MutableIterator TessMutableIterator; -typedef tesseract::ChoiceIterator TessChoiceIterator; -typedef tesseract::OcrEngineMode TessOcrEngineMode; -typedef tesseract::PageSegMode TessPageSegMode; -typedef tesseract::PageIteratorLevel TessPageIteratorLevel; -typedef tesseract::Orientation TessOrientation; -typedef tesseract::ParagraphJustification TessParagraphJustification; -typedef tesseract::WritingDirection TessWritingDirection; -typedef tesseract::TextlineOrder TessTextlineOrder; -typedef tesseract::PolyBlockType TessPolyBlockType; -typedef tesseract::ETEXT_DESC ETEXT_DESC; -#else -typedef struct TessResultRenderer TessResultRenderer; -typedef struct TessBaseAPI TessBaseAPI; -typedef struct TessPageIterator TessPageIterator; -typedef struct TessResultIterator TessResultIterator; -typedef struct TessMutableIterator TessMutableIterator; -typedef struct TessChoiceIterator TessChoiceIterator; -typedef enum TessOcrEngineMode { - OEM_TESSERACT_ONLY, - OEM_LSTM_ONLY, - OEM_TESSERACT_LSTM_COMBINED, - OEM_DEFAULT -} TessOcrEngineMode; -typedef enum TessPageSegMode { - PSM_OSD_ONLY, - PSM_AUTO_OSD, - PSM_AUTO_ONLY, - PSM_AUTO, - PSM_SINGLE_COLUMN, - PSM_SINGLE_BLOCK_VERT_TEXT, - PSM_SINGLE_BLOCK, - PSM_SINGLE_LINE, - PSM_SINGLE_WORD, - PSM_CIRCLE_WORD, - PSM_SINGLE_CHAR, - PSM_SPARSE_TEXT, - PSM_SPARSE_TEXT_OSD, - PSM_RAW_LINE, - PSM_COUNT -} TessPageSegMode; -typedef enum TessPageIteratorLevel { - RIL_BLOCK, - RIL_PARA, - RIL_TEXTLINE, - RIL_WORD, - RIL_SYMBOL -} TessPageIteratorLevel; -typedef enum TessPolyBlockType { - PT_UNKNOWN, - PT_FLOWING_TEXT, - PT_HEADING_TEXT, - PT_PULLOUT_TEXT, - PT_EQUATION, - PT_INLINE_EQUATION, - PT_TABLE, - PT_VERTICAL_TEXT, - PT_CAPTION_TEXT, - PT_FLOWING_IMAGE, - PT_HEADING_IMAGE, - PT_PULLOUT_IMAGE, - PT_HORZ_LINE, - PT_VERT_LINE, - PT_NOISE, - PT_COUNT -} TessPolyBlockType; -typedef enum TessOrientation { - ORIENTATION_PAGE_UP, - ORIENTATION_PAGE_RIGHT, - ORIENTATION_PAGE_DOWN, - ORIENTATION_PAGE_LEFT -} TessOrientation; -typedef enum TessParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT -} TessParagraphJustification; -typedef enum TessWritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT, - WRITING_DIRECTION_RIGHT_TO_LEFT, - WRITING_DIRECTION_TOP_TO_BOTTOM -} TessWritingDirection; -typedef enum TessTextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT, - TEXTLINE_ORDER_RIGHT_TO_LEFT, - TEXTLINE_ORDER_TOP_TO_BOTTOM -} TessTextlineOrder; -typedef struct ETEXT_DESC ETEXT_DESC; -#endif - -typedef bool (*TessCancelFunc)(void *cancel_this, int words); -typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top, - int bottom); - -struct Pix; -struct Boxa; -struct Pixa; - -/* General free functions */ - -TESS_API const char *TessVersion(); -TESS_API void TessDeleteText(const char *text); -TESS_API void TessDeleteTextArray(char **arr); -TESS_API void TessDeleteIntArray(const int *arr); - -/* Renderer API */ -TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, - BOOL font_info); -TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase, - const char *datadir, - BOOL textonly); -TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessWordStrBoxRendererCreate( - const char *outputbase); - -TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer); -TESS_API void TessResultRendererInsert(TessResultRenderer *renderer, - TessResultRenderer *next); -TESS_API TessResultRenderer *TessResultRendererNext( - TessResultRenderer *renderer); -TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, - const char *title); -TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer, - TessBaseAPI *api); -TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer); - -TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer); -TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer); -TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer); - -/* Base API */ - -TESS_API TessBaseAPI *TessBaseAPICreate(); -TESS_API void TessBaseAPIDelete(TessBaseAPI *handle); - -TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device); - -TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name); -TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix); -TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle); - -TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle); -TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name); - -TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, - const char *value); -TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, - const char *value); - -TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, - const char *name, int *value); -TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, - const char *name, BOOL *value); -TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, - const char *name, double *value); -TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, - const char *name); - -TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp); -TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, - const char *filename); - -TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem, - char **configs, int configs_size); -TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem); -TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, - const char *language); - -TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API const char *TessBaseAPIGetInitLanguagesAsString( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector( - const TessBaseAPI *handle); - -TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle); - -TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle, - const char *filename); -TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, - const char *filename); - -TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle, - TessPageSegMode mode); -TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle); - -TESS_API char *TessBaseAPIRect(TessBaseAPI *handle, - const unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int left, int top, int width, int height); - -TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetImage(TessBaseAPI *handle, - const unsigned char *imagedata, int width, - int height, int bytes_per_pixel, - int bytes_per_line); -TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix); - -TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi); - -TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, - int width, int height); - -TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle); -TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, - BOOL raw_image, int raw_padding, - struct Pixa **pixa, - int **blockids, int **paraids); -TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, - struct Pixa **pixa, int **blockids); -TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, - struct Pixa **cc); -TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, - TessPageIteratorLevel level, - BOOL text_only, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetComponentImages1( - TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only, - BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids, - int **paraids); - -TESS_API int TessBaseAPIGetThresholdedImageScaleFactor( - const TessBaseAPI *handle); - -TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle); - -TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor); - -TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); -TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, - int page_index, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); - -TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle); -TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator( - TessBaseAPI *handle); - -TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle); -TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, - int page_number); - -TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle); -TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle); - -TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE -TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, - TessPageSegMode mode, - const char *wordstr); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPIClear(TessBaseAPI *handle); -TESS_API void TessBaseAPIEnd(TessBaseAPI *handle); - -TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word); -TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, - float *out_slope); - -TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id); - -TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE - -// Call TessDeleteText(*best_script_name) to free memory allocated by this -// function -TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, - int *orient_deg, - float *orient_conf, - const char **script_name, - float *script_conf); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, - double margin); - -TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle); - -TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle); - -TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, - int **block_orientation, - bool **vertical_writing); - -/* Page iterator */ - -TESS_API void TessPageIteratorDelete(TessPageIterator *handle); - -TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle); - -TESS_API void TessPageIteratorBegin(TessPageIterator *handle); - -TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, - TessPageIteratorLevel level, - TessPageIteratorLevel element); - -TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, - TessPageIteratorLevel level, - int *left, int *top, int *right, - int *bottom); - -TESS_API TessPolyBlockType -TessPageIteratorBlockType(const TessPageIterator *handle); - -TESS_API struct Pix *TessPageIteratorGetBinaryImage( - const TessPageIterator *handle, TessPageIteratorLevel level); - -TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, - TessPageIteratorLevel level, - int padding, - struct Pix *original_image, - int *left, int *top); - -TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle, - TessPageIteratorLevel level, int *x1, - int *y1, int *x2, int *y2); - -TESS_API void TessPageIteratorOrientation( - TessPageIterator *handle, TessOrientation *orientation, - TessWritingDirection *writing_direction, TessTextlineOrder *textline_order, - float *deskew_angle); - -TESS_API void TessPageIteratorParagraphInfo( - TessPageIterator *handle, TessParagraphJustification *justification, - BOOL *is_list_item, BOOL *is_crown, int *first_line_indent); - -/* Result iterator */ - -TESS_API void TessResultIteratorDelete(TessResultIterator *handle); -TESS_API TessResultIterator *TessResultIteratorCopy( - const TessResultIterator *handle); -TESS_API TessPageIterator *TessResultIteratorGetPageIterator( - TessResultIterator *handle); -TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst( - const TessResultIterator *handle); -TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator( - const TessResultIterator *handle); - -TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API const char *TessResultIteratorWordRecognitionLanguage( - const TessResultIterator *handle); -TESS_API const char *TessResultIteratorWordFontAttributes( - const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic, - BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps, - int *pointsize, int *font_id); - -TESS_API BOOL -TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle); -TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle); - -TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle); -TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle); -TESS_API const char *TessChoiceIteratorGetUTF8Text( - const TessChoiceIterator *handle); -TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle); - -/* Progress monitor */ - -TESS_API ETEXT_DESC *TessMonitorCreate(); -TESS_API void TessMonitorDelete(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, - TessCancelFunc cancelFunc); -TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis); -TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, - TessProgressFunc progressFunc); -TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline); - -#ifdef __cplusplus -} -#endif - -#endif // API_CAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/export.h deleted file mode 100644 index d238b628..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/export.h +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: export.h -// Description: Place holder -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_PLATFORM_H_ -#define TESSERACT_PLATFORM_H_ - -#ifndef TESS_API -# if defined(_WIN32) || defined(__CYGWIN__) -# if defined(TESS_EXPORTS) -# define TESS_API __declspec(dllexport) -# elif defined(TESS_IMPORTS) -# define TESS_API __declspec(dllimport) -# else -# define TESS_API -# endif -# else -# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS) -# define TESS_API __attribute__((visibility("default"))) -# else -# define TESS_API -# endif -# endif -#endif - -#endif // TESSERACT_PLATFORM_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ltrresultiterator.h deleted file mode 100644 index 6ca0a98e..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ltrresultiterator.h +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: ltrresultiterator.h -// Description: Iterator for tesseract results in strict left-to-right -// order that avoids using tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API -#include "pageiterator.h" // for PageIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -namespace tesseract { - -class BLOB_CHOICE_IT; -class PAGE_RES; -class WERD_RES; - -class Tesseract; - -// Class to iterate over tesseract results, providing access to all levels -// of the page hierarchy, without including any tesseract headers or having -// to handle any tesseract structures. -// WARNING! This class points to data held within the TessBaseAPI class, and -// therefore can only be used while the TessBaseAPI class still exists and -// has not been subjected to a call of Init, SetImage, Recognize, Clear, End -// DetectOS, or anything else that changes the internal PAGE_RES. -// See tesseract/publictypes.h for the definition of PageIteratorLevel. -// See also base class PageIterator, which contains the bulk of the interface. -// LTRResultIterator adds text-specific methods for access to OCR output. - -class TESS_API LTRResultIterator : public PageIterator { - friend class ChoiceIterator; - -public: - // page_res and tesseract come directly from the BaseAPI. - // The rectangle parameters are copied indirectly from the Thresholder, - // via the BaseAPI. They represent the coordinates of some rectangle in an - // original image (in top-left-origin coordinates) and therefore the top-left - // needs to be added to any output boxes in order to specify coordinates - // in the original image. See TessBaseAPI::SetRectangle. - // The scale and scaled_yres are in case the Thresholder scaled the image - // rectangle prior to thresholding. Any coordinates in tesseract's image - // must be divided by scale before adding (rect_left, rect_top). - // The scaled_yres indicates the effective resolution of the binary image - // that tesseract has been given by the Thresholder. - // After the constructor, Begin has already been called. - LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, - int rect_width, int rect_height); - - ~LTRResultIterator() override; - - // LTRResultIterators may be copied! This makes it possible to iterate over - // all the objects at a lower level, while maintaining an iterator to - // objects at a higher level. These constructors DO NOT CALL Begin, so - // iterations will continue from the location of src. - // TODO: For now the copy constructor and operator= only need the base class - // versions, but if new data members are added, don't forget to add them! - - // ============= Moving around within the page ============. - - // See PageIterator. - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // object at the given level. Use delete [] to free after use. - char *GetUTF8Text(PageIteratorLevel level) const; - - // Set the string inserted at the end of each text line. "\n" by default. - void SetLineSeparator(const char *new_line); - - // Set the string inserted at the end of each paragraph. "\n" by default. - void SetParagraphSeparator(const char *new_para); - - // Returns the mean confidence of the current object at the given level. - // The number should be interpreted as a percent probability. (0.0f-100.0f) - float Confidence(PageIteratorLevel level) const; - - // ============= Functions that refer to words only ============. - - // Returns the font attributes of the current word. If iterating at a higher - // level object than words, eg textlines, then this will return the - // attributes of the first word in that textline. - // The actual return value is a string representing a font name. It points - // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as - // the iterator itself, ie rendered invalid by various members of - // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. - // Pointsize is returned in printers points (1/72 inch.) - const char *WordFontAttributes(bool *is_bold, bool *is_italic, - bool *is_underlined, bool *is_monospace, - bool *is_serif, bool *is_smallcaps, - int *pointsize, int *font_id) const; - - // Return the name of the language used to recognize this word. - // On error, nullptr. Do not delete this pointer. - const char *WordRecognitionLanguage() const; - - // Return the overall directionality of this word. - StrongScriptDirection WordDirection() const; - - // Returns true if the current word was found in a dictionary. - bool WordIsFromDictionary() const; - - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // Returns true if the current word is numeric. - bool WordIsNumeric() const; - - // Returns true if the word contains blamer information. - bool HasBlamerInfo() const; - - // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle - // of the current word. - const void *GetParamsTrainingBundle() const; - - // Returns a pointer to the string with blamer information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerDebug() const; - - // Returns a pointer to the string with misadaption information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerMisadaptionDebug() const; - - // Returns true if a truth string was recorded for the current word. - bool HasTruthString() const; - - // Returns true if the given string is equivalent to the truth string for - // the current word. - bool EquivalentToTruth(const char *str) const; - - // Returns a null terminated UTF-8 encoded truth string for the current word. - // Use delete [] to free after use. - char *WordTruthUTF8Text() const; - - // Returns a null terminated UTF-8 encoded normalized OCR string for the - // current word. Use delete [] to free after use. - char *WordNormedUTF8Text() const; - - // Returns a pointer to serialized choice lattice. - // Fills lattice_size with the number of bytes in lattice data. - const char *WordLattice(int *lattice_size) const; - - // ============= Functions that refer to symbols only ============. - - // Returns true if the current symbol is a superscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSuperscript() const; - // Returns true if the current symbol is a subscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSubscript() const; - // Returns true if the current symbol is a dropcap. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsDropcap() const; - -protected: - const char *line_separator_; - const char *paragraph_separator_; -}; - -// Class to iterate over the classifier choices for a single RIL_SYMBOL. -class TESS_API ChoiceIterator { -public: - // Construction is from a LTRResultIterator that points to the symbol of - // interest. The ChoiceIterator allows a one-shot iteration over the - // choices for this symbol and after that it is useless. - explicit ChoiceIterator(const LTRResultIterator &result_it); - ~ChoiceIterator(); - - // Moves to the next choice for the symbol and returns false if there - // are none left. - bool Next(); - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // choice. - // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an - // internal structure and should NOT be delete[]ed to free after use. - const char *GetUTF8Text() const; - - // Returns the confidence of the current choice depending on the used language - // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All - // choices for one symbol should roughly add up to 1.0f. - // If only traineddata of the legacy engine is used, the number should be - // interpreted as a percent probability. (0.0f-100.0f) In this case - // probabilities won't add up to 100. Each one stands on its own. - float Confidence() const; - - // Returns a vector containing all timesteps, which belong to the currently - // selected symbol. A timestep is a vector containing pairs of symbols and - // floating point numbers. The number states the probability for the - // corresponding symbol. - std::vector>> *Timesteps() const; - -private: - // clears the remaining spaces out of the results and adapt the probabilities - void filterSpaces(); - // Pointer to the WERD_RES object owned by the API. - WERD_RES *word_res_; - // Iterator over the blob choices. - BLOB_CHOICE_IT *choice_it_; - std::vector> *LSTM_choices_ = nullptr; - std::vector>::iterator LSTM_choice_it_; - - const int *tstep_index_; - // regulates the rating granularity - double rating_coefficient_; - // leading blanks - int blanks_before_word_; - // true when there is lstm engine related trained data - bool oemLSTM_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ocrclass.h deleted file mode 100644 index a55e6528..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ocrclass.h +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/********************************************************************** - * File: ocrclass.h - * Description: Class definitions and constants for the OCR API. - * Author: Hewlett-Packard Co - * - * (C) Copyright 1996, Hewlett-Packard Co. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -/********************************************************************** - * This file contains typedefs for all the structures used by - * the HP OCR interface. - * The structures are designed to allow them to be used with any - * structure alignment up to 8. - **********************************************************************/ - -#ifndef CCUTIL_OCRCLASS_H_ -#define CCUTIL_OCRCLASS_H_ - -#include -#include - -namespace tesseract { - -/********************************************************************** - * EANYCODE_CHAR - * Description of a single character. The character code is defined by - * the character set of the current font. - * Output text is sent as an array of these structures. - * Spaces and line endings in the output are represented in the - * structures of the surrounding characters. They are not directly - * represented as characters. - * The first character in a word has a positive value of blanks. - * Missing information should be set to the defaults in the comments. - * If word bounds are known, but not character bounds, then the top and - * bottom of each character should be those of the word. The left of the - * first and right of the last char in each word should be set. All other - * lefts and rights should be set to -1. - * If set, the values of right and bottom are left+width and top+height. - * Most of the members come directly from the parameters to ocr_append_char. - * The formatting member uses the enhancement parameter and combines the - * line direction stuff into the top 3 bits. - * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para, - * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what - * the coding is, only that it is backwards compatible with the previous - * version. - **********************************************************************/ - -struct EANYCODE_CHAR { /*single character */ - // It should be noted that the format for char_code for version 2.0 and beyond - // is UTF8 which means that ASCII characters will come out as one structure - // but other characters will be returned in two or more instances of this - // structure with a single byte of the UTF8 code in each, but each will have - // the same bounding box. Programs which want to handle languagues with - // different characters sets will need to handle extended characters - // appropriately, but *all* code needs to be prepared to receive UTF8 coded - // characters for characters such as bullet and fancy quotes. - uint16_t char_code; /*character itself */ - int16_t left; /*of char (-1) */ - int16_t right; /*of char (-1) */ - int16_t top; /*of char (-1) */ - int16_t bottom; /*of char (-1) */ - int16_t font_index; /*what font (0) */ - uint8_t confidence; /*0=perfect, 100=reject (0/100) */ - uint8_t point_size; /*of char, 72=i inch, (10) */ - int8_t blanks; /*no of spaces before this char (1) */ - uint8_t formatting; /*char formatting (0) */ -}; - -/********************************************************************** - * ETEXT_DESC - * Description of the output of the OCR engine. - * This structure is used as both a progress monitor and the final - * output header, since it needs to be a valid progress monitor while - * the OCR engine is storing its output to shared memory. - * During progress, all the buffer info is -1. - * Progress starts at 0 and increases to 100 during OCR. No other constraint. - * Additionally the progress callback contains the bounding box of the word that - * is currently being processed. - * Every progress callback, the OCR engine must set ocr_alive to 1. - * The HP side will set ocr_alive to 0. Repeated failure to reset - * to 1 indicates that the OCR engine is dead. - * If the cancel function is not null then it is called with the number of - * user words found. If it returns true then operation is cancelled. - **********************************************************************/ -class ETEXT_DESC; - -using CANCEL_FUNC = bool (*)(void *, int); -using PROGRESS_FUNC = bool (*)(int, int, int, int, int); -using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int); - -class ETEXT_DESC { // output header -public: - int16_t count{0}; /// chars in this buffer(0) - int16_t progress{0}; /// percent complete increasing (0-100) - /** Progress monitor covers word recognition and it does not cover layout - * analysis. - * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */ - int8_t more_to_come{0}; /// true if not last - volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0 - int8_t err_code{0}; /// for errcode use - CANCEL_FUNC cancel{nullptr}; /// returns true to cancel - PROGRESS_FUNC progress_callback{ - nullptr}; /// called whenever progress increases - PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback - void *cancel_this{nullptr}; /// this or other data for cancel - std::chrono::steady_clock::time_point end_time; - /// Time to stop. Expected to be set only - /// by call to set_deadline_msecs(). - EANYCODE_CHAR text[1]{}; /// character data - - ETEXT_DESC() : progress_callback2(&default_progress_func) { - end_time = std::chrono::time_point(); - } - - // Sets the end time to be deadline_msecs milliseconds from now. - void set_deadline_msecs(int32_t deadline_msecs) { - if (deadline_msecs > 0) { - end_time = std::chrono::steady_clock::now() + - std::chrono::milliseconds(deadline_msecs); - } - } - - // Returns false if we've not passed the end_time, or have not set a deadline. - bool deadline_exceeded() const { - if (end_time.time_since_epoch() == - std::chrono::steady_clock::duration::zero()) { - return false; - } - auto now = std::chrono::steady_clock::now(); - return (now > end_time); - } - -private: - static bool default_progress_func(ETEXT_DESC *ths, int left, int right, - int top, int bottom) { - if (ths->progress_callback != nullptr) { - return (*(ths->progress_callback))(ths->progress, left, right, top, - bottom); - } - return true; - } -}; - -} // namespace tesseract - -#endif // CCUTIL_OCRCLASS_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/osdetect.h deleted file mode 100644 index 34bfb557..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/osdetect.h +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: osdetect.h -// Description: Orientation and script detection. -// Author: Samuel Charron -// Ranjith Unnikrishnan -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_OSDETECT_H_ -#define TESSERACT_CCMAIN_OSDETECT_H_ - -#include "export.h" // for TESS_API - -#include // for std::vector - -namespace tesseract { - -class BLOBNBOX; -class BLOBNBOX_CLIST; -class BLOB_CHOICE_LIST; -class TO_BLOCK_LIST; -class UNICHARSET; - -class Tesseract; - -// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur -const int kMaxNumberOfScripts = 116 + 1 + 2 + 1; - -struct OSBestResult { - OSBestResult() - : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {} - int orientation_id; - int script_id; - float sconfidence; - float oconfidence; -}; - -struct OSResults { - OSResults() : unicharset(nullptr) { - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < kMaxNumberOfScripts; ++j) { - scripts_na[i][j] = 0; - } - orientations[i] = 0; - } - } - void update_best_orientation(); - // Set the estimate of the orientation to the given id. - void set_best_orientation(int orientation_id); - // Update/Compute the best estimate of the script assuming the given - // orientation id. - void update_best_script(int orientation_id); - // Return the index of the script with the highest score for this orientation. - TESS_API int get_best_script(int orientation_id) const; - // Accumulate scores with given OSResults instance and update the best script. - void accumulate(const OSResults &osr); - - // Print statistics. - void print_scores(void) const; - void print_scores(int orientation_id) const; - - // Array holding scores for each orientation id [0,3]. - // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the - // page respectively, where the values refer to the amount of clockwise - // rotation to be applied to the page for the text to be upright and readable. - float orientations[4]; - // Script confidence scores for each of 4 possible orientations. - float scripts_na[4][kMaxNumberOfScripts]; - - UNICHARSET *unicharset; - OSBestResult best_result; -}; - -class OrientationDetector { -public: - OrientationDetector(const std::vector *allowed_scripts, - OSResults *results); - bool detect_blob(BLOB_CHOICE_LIST *scores); - int get_orientation(); - -private: - OSResults *osr_; - const std::vector *allowed_scripts_; -}; - -class ScriptDetector { -public: - ScriptDetector(const std::vector *allowed_scripts, OSResults *osr, - tesseract::Tesseract *tess); - void detect_blob(BLOB_CHOICE_LIST *scores); - bool must_stop(int orientation) const; - -private: - OSResults *osr_; - static const char *korean_script_; - static const char *japanese_script_; - static const char *fraktur_script_; - int korean_id_; - int japanese_id_; - int katakana_id_; - int hiragana_id_; - int han_id_; - int hangul_id_; - int latin_id_; - int fraktur_id_; - tesseract::Tesseract *tess_; - const std::vector *allowed_scripts_; -}; - -int orientation_and_script_detection(const char *filename, OSResults *, - tesseract::Tesseract *); - -int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, - tesseract::Tesseract *tess); - -int os_detect_blobs(const std::vector *allowed_scripts, - BLOBNBOX_CLIST *blob_list, OSResults *osr, - tesseract::Tesseract *tess); - -bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, - OSResults *, tesseract::Tesseract *tess); - -// Helper method to convert an orientation index to its value in degrees. -// The value represents the amount of clockwise rotation in degrees that must be -// applied for the text to be upright (readable). -TESS_API int OrientationIdToValue(const int &id); - -} // namespace tesseract - -#endif // TESSERACT_CCMAIN_OSDETECT_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/pageiterator.h deleted file mode 100644 index 68739715..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/pageiterator.h +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: pageiterator.h -// Description: Iterator for tesseract page structure that avoids using -// tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ -#define TESSERACT_CCMAIN_PAGEITERATOR_H_ - -#include "export.h" -#include "publictypes.h" - -struct Pix; -struct Pta; - -namespace tesseract { - -struct BlamerBundle; -class C_BLOB_IT; -class PAGE_RES; -class PAGE_RES_IT; -class WERD; - -class Tesseract; - -/** - * Class to iterate over tesseract page structure, providing access to all - * levels of the page hierarchy, without including any tesseract headers or - * having to handle any tesseract structures. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - * See tesseract/publictypes.h for the definition of PageIteratorLevel. - * See also ResultIterator, derived from PageIterator, which adds in the - * ability to access OCR output with text-specific methods. - */ - -class TESS_API PageIterator { -public: - /** - * page_res and tesseract come directly from the BaseAPI. - * The rectangle parameters are copied indirectly from the Thresholder, - * via the BaseAPI. They represent the coordinates of some rectangle in an - * original image (in top-left-origin coordinates) and therefore the top-left - * needs to be added to any output boxes in order to specify coordinates - * in the original image. See TessBaseAPI::SetRectangle. - * The scale and scaled_yres are in case the Thresholder scaled the image - * rectangle prior to thresholding. Any coordinates in tesseract's image - * must be divided by scale before adding (rect_left, rect_top). - * The scaled_yres indicates the effective resolution of the binary image - * that tesseract has been given by the Thresholder. - * After the constructor, Begin has already been called. - */ - PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, int rect_width, - int rect_height); - virtual ~PageIterator(); - - /** - * Page/ResultIterators may be copied! This makes it possible to iterate over - * all the objects at a lower level, while maintaining an iterator to - * objects at a higher level. These constructors DO NOT CALL Begin, so - * iterations will continue from the location of src. - */ - PageIterator(const PageIterator &src); - const PageIterator &operator=(const PageIterator &src); - - /** Are we positioned at the same location as other? */ - bool PositionedAtSameWord(const PAGE_RES_IT *other) const; - - // ============= Moving around within the page ============. - - /** - * Moves the iterator to point to the start of the page to begin an - * iteration. - */ - virtual void Begin(); - - /** - * Moves the iterator to the beginning of the paragraph. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word on the first row of the paragraph. - */ - virtual void RestartParagraph(); - - /** - * Return whether this iterator points anywhere in the first textline of a - * paragraph. - */ - bool IsWithinFirstTextlineOfParagraph() const; - - /** - * Moves the iterator to the beginning of the text line. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word of the row. - */ - virtual void RestartRow(); - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy, and returns false if the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - virtual bool Next(PageIteratorLevel level); - - /** - * Returns true if the iterator is at the start of an object at the given - * level. - * - * For instance, suppose an iterator it is pointed to the first symbol of the - * first word of the third line of the second paragraph of the first block in - * a page, then: - * it.IsAtBeginningOf(RIL_BLOCK) = false - * it.IsAtBeginningOf(RIL_PARA) = false - * it.IsAtBeginningOf(RIL_TEXTLINE) = true - * it.IsAtBeginningOf(RIL_WORD) = true - * it.IsAtBeginningOf(RIL_SYMBOL) = true - */ - virtual bool IsAtBeginningOf(PageIteratorLevel level) const; - - /** - * Returns whether the iterator is positioned at the last element in a - * given level. (e.g. the last word in a line, the last line in a block) - * - * Here's some two-paragraph example - * text. It starts off innocuously - * enough but quickly turns bizarre. - * The author inserts a cornucopia - * of words to guard against confused - * references. - * - * Now take an iterator it pointed to the start of "bizarre." - * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false - * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true - * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false - */ - virtual bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const; - - /** - * Returns whether this iterator is positioned - * before other: -1 - * equal to other: 0 - * after other: 1 - */ - int Cmp(const PageIterator &other) const; - - // ============= Accessing data ==============. - // Coordinate system: - // Integer coordinates are at the cracks between the pixels. - // The top-left corner of the top-left pixel in the image is at (0,0). - // The bottom-right corner of the bottom-right pixel in the image is at - // (width, height). - // Every bounding box goes from the top-left of the top-left contained - // pixel to the bottom-right of the bottom-right contained pixel, so - // the bounding box of the single top-left pixel in the image is: - // (0,0)->(1,1). - // If an image rectangle has been set in the API, then returned coordinates - // relate to the original (full) image, rather than the rectangle. - - /** - * Controls what to include in a bounding box. Bounding boxes of all levels - * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. - * Between layout analysis and recognition, it isn't known where all - * diacritics belong, so this control is used to include or exclude some - * diacritics that are above or below the main body of the word. In most cases - * where the placement is obvious, and after recognition, it doesn't make as - * much difference, as the diacritics will already be included in the word. - */ - void SetBoundingBoxComponents(bool include_upper_dots, - bool include_lower_dots) { - include_upper_dots_ = include_upper_dots; - include_lower_dots_ = include_lower_dots; - } - - /** - * Returns the bounding rectangle of the current object at the given level. - * See comment on coordinate system above. - * Returns false if there is no such object at the current position. - * The returned bounding box is guaranteed to match the size and position - * of the image returned by GetBinaryImage, but may clip foreground pixels - * from a grey image. The padding argument to GetImage can be used to expand - * the image to include more foreground pixels. See GetImage below. - */ - bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, - int *bottom) const; - bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, - int *right, int *bottom) const; - /** - * Returns the bounding rectangle of the object in a coordinate system of the - * working image rectangle having its origin at (rect_left_, rect_top_) with - * respect to the original image and is scaled by a factor scale_. - */ - bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, - int *right, int *bottom) const; - - /** Returns whether there is no object of a given level. */ - bool Empty(PageIteratorLevel level) const; - - /** - * Returns the type of the current block. - * See tesseract/publictypes.h for PolyBlockType. - */ - PolyBlockType BlockType() const; - - /** - * Returns the polygon outline of the current block. The returned Pta must - * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices - * of the polygon, and the last edge is the line segment between the last - * point and the first point. nullptr will be returned if the iterator is - * at the end of the document or layout analysis was not used. - */ - Pta *BlockPolygon() const; - - /** - * Returns a binary image of the current object at the given level. - * The position and size match the return from BoundingBoxInternal, and so - * this could be upscaled with respect to the original input image. - * Use pixDestroy to delete the image after use. - */ - Pix *GetBinaryImage(PageIteratorLevel level) const; - - /** - * Returns an image of the current object at the given level in greyscale - * if available in the input. To guarantee a binary image use BinaryImage. - * NOTE that in order to give the best possible image, the bounds are - * expanded slightly over the binary connected component, by the supplied - * padding, so the top-left position of the returned image is returned - * in (left,top). These will most likely not match the coordinates - * returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. - * Use pixDestroy to delete the image after use. - */ - Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, - int *left, int *top) const; - - /** - * Returns the baseline of the current object at the given level. - * The baseline is the line that passes through (x1, y1) and (x2, y2). - * WARNING: with vertical text, baselines may be vertical! - * Returns false if there is no baseline at the current position. - */ - bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, - int *y2) const; - - // Returns the attributes of the current row. - void RowAttributes(float *row_height, float *descenders, - float *ascenders) const; - - /** - * Returns orientation for the block the iterator points to. - * orientation, writing_direction, textline_order: see publictypes.h - * deskew_angle: after rotating the block so the text orientation is - * upright, how many radians does one have to rotate the - * block anti-clockwise for it to be level? - * -Pi/4 <= deskew_angle <= Pi/4 - */ - void Orientation(tesseract::Orientation *orientation, - tesseract::WritingDirection *writing_direction, - tesseract::TextlineOrder *textline_order, - float *deskew_angle) const; - - /** - * Returns information about the current paragraph, if available. - * - * justification - - * LEFT if ragged right, or fully justified and script is left-to-right. - * RIGHT if ragged left, or fully justified and script is right-to-left. - * unknown if it looks like source code or we have very few lines. - * is_list_item - - * true if we believe this is a member of an ordered or unordered list. - * is_crown - - * true if the first line of the paragraph is aligned with the other - * lines of the paragraph even though subsequent paragraphs have first - * line indents. This typically indicates that this is the continuation - * of a previous paragraph or that it is the very first paragraph in - * the chapter. - * first_line_indent - - * For LEFT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the left edge of the - * rest of the paragraph. - * for RIGHT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the right edge of the - * rest of the paragraph. - * NOTE 1: This value may be negative. - * NOTE 2: if *is_crown == true, the first line of this paragraph is - * actually flush, and first_line_indent is set to the "common" - * first_line_indent for subsequent paragraphs in this block - * of text. - */ - void ParagraphInfo(tesseract::ParagraphJustification *justification, - bool *is_list_item, bool *is_crown, - int *first_line_indent) const; - - // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle - // of the current word to the given pointer (takes ownership of the pointer) - // and returns true. - // Can only be used when iterating on the word level. - bool SetWordBlamerBundle(BlamerBundle *blamer_bundle); - -protected: - /** - * Sets up the internal data for iterating the blobs of a new word, then - * moves the iterator to the given offset. - */ - void BeginWord(int offset); - - /** Pointer to the page_res owned by the API. */ - PAGE_RES *page_res_; - /** Pointer to the Tesseract object owned by the API. */ - Tesseract *tesseract_; - /** - * The iterator to the page_res_. Owned by this ResultIterator. - * A pointer just to avoid dragging in Tesseract includes. - */ - PAGE_RES_IT *it_; - /** - * The current input WERD being iterated. If there is an output from OCR, - * then word_ is nullptr. Owned by the API - */ - WERD *word_; - /** The length of the current word_. */ - int word_length_; - /** The current blob index within the word. */ - int blob_index_; - /** - * Iterator to the blobs within the word. If nullptr, then we are iterating - * OCR results in the box_word. - * Owned by this ResultIterator. - */ - C_BLOB_IT *cblob_it_; - /** Control over what to include in bounding boxes. */ - bool include_upper_dots_; - bool include_lower_dots_; - /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ - int scale_; - int scaled_yres_; - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/publictypes.h deleted file mode 100644 index 0069cf28..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/publictypes.h +++ /dev/null @@ -1,281 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: publictypes.h -// Description: Types used in both the API and internally -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_ -#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_ - -namespace tesseract { - -// This file contains types that are used both by the API and internally -// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic -// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT. -// Restated: It is OK for low-level Tesseract files to include publictypes.h, -// but not for the low-level tesseract code to include top-level API code. -// This file should not use other Tesseract types, as that would drag -// their includes into the API-level. - -/** Number of printers' points in an inch. The unit of the pointsize return. */ -constexpr int kPointsPerInch = 72; -/** - * Minimum believable resolution. Used as a default if there is no other - * information, as it is safer to under-estimate than over-estimate. - */ -constexpr int kMinCredibleResolution = 70; -/** Maximum believable resolution. */ -constexpr int kMaxCredibleResolution = 2400; -/** - * Ratio between median blob size and likely resolution. Used to estimate - * resolution when none is provided. This is basically 1/usual text size in - * inches. */ -constexpr int kResolutionEstimationFactor = 10; - -/** - * Possible types for a POLY_BLOCK or ColPartition. - * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions - * below, as well as kPolyBlockNames in layout_test.cc. - * Used extensively by ColPartition, and POLY_BLOCK. - */ -enum PolyBlockType { - PT_UNKNOWN, // Type is not yet known. Keep as the first element. - PT_FLOWING_TEXT, // Text that lives inside a column. - PT_HEADING_TEXT, // Text that spans more than one column. - PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region. - PT_EQUATION, // Partition belonging to an equation region. - PT_INLINE_EQUATION, // Partition has inline equation. - PT_TABLE, // Partition belonging to a table region. - PT_VERTICAL_TEXT, // Text-line runs vertically. - PT_CAPTION_TEXT, // Text that belongs to an image. - PT_FLOWING_IMAGE, // Image that lives inside a column. - PT_HEADING_IMAGE, // Image that spans more than one column. - PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region. - PT_HORZ_LINE, // Horizontal Line. - PT_VERT_LINE, // Vertical Line. - PT_NOISE, // Lies outside of any column. - PT_COUNT -}; - -/** Returns true if PolyBlockType is of horizontal line type */ -inline bool PTIsLineType(PolyBlockType type) { - return type == PT_HORZ_LINE || type == PT_VERT_LINE; -} -/** Returns true if PolyBlockType is of image type */ -inline bool PTIsImageType(PolyBlockType type) { - return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE || - type == PT_PULLOUT_IMAGE; -} -/** Returns true if PolyBlockType is of text type */ -inline bool PTIsTextType(PolyBlockType type) { - return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT || - type == PT_PULLOUT_TEXT || type == PT_TABLE || - type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT || - type == PT_INLINE_EQUATION; -} -// Returns true if PolyBlockType is of pullout(inter-column) type -inline bool PTIsPulloutType(PolyBlockType type) { - return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT; -} - -/** - * +------------------+ Orientation Example: - * | 1 Aaaa Aaaa Aaaa | ==================== - * | Aaa aa aaa aa | To left is a diagram of some (1) English and - * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit. - * | 2 | - * | ####### c c C | Upright Latin characters are represented as A and a. - * | ####### c c c | '<' represents a latin character rotated - * | < ####### c c c | anti-clockwise 90 degrees. - * | < ####### c c | - * | < ####### . c | Upright Chinese characters are represented C and c. - * | 3 ####### c | - * +------------------+ NOTA BENE: enum values here should match goodoc.proto - - * If you orient your head so that "up" aligns with Orientation, - * then the characters will appear "right side up" and readable. - * - * In the example above, both the English and Chinese paragraphs are oriented - * so their "up" is the top of the page (page up). The photo credit is read - * with one's head turned leftward ("up" is to page left). - * - * The values of this enum match the convention of Tesseract's osdetect.h -*/ -enum Orientation { - ORIENTATION_PAGE_UP = 0, - ORIENTATION_PAGE_RIGHT = 1, - ORIENTATION_PAGE_DOWN = 2, - ORIENTATION_PAGE_LEFT = 3, -}; - -/** - * The grapheme clusters within a line of text are laid out logically - * in this direction, judged when looking at the text line rotated so that - * its Orientation is "page up". - * - * For English text, the writing direction is left-to-right. For the - * Chinese text in the above example, the writing direction is top-to-bottom. - */ -enum WritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT = 0, - WRITING_DIRECTION_RIGHT_TO_LEFT = 1, - WRITING_DIRECTION_TOP_TO_BOTTOM = 2, -}; - -/** - * The text lines are read in the given sequence. - * - * In English, the order is top-to-bottom. - * In Chinese, vertical text lines are read right-to-left. Mongolian is - * written in vertical columns top to bottom like Chinese, but the lines - * order left-to right. - * - * Note that only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM - */ -enum TextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, - TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, - TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, -}; - -/** - * Possible modes for page layout analysis. These *must* be kept in order - * of decreasing amount of layout analysis to be done, except for OSD_ONLY, - * so that the inequality test macros below work. - */ -enum PageSegMode { - PSM_OSD_ONLY = 0, ///< Orientation and script detection only. - PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and - ///< script detection. (OSD) - PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR. - PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD. - PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes. - PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of - ///< vertically aligned text. - PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.) - PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line. - PSM_SINGLE_WORD = 8, ///< Treat the image as a single word. - PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle. - PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character. - PSM_SPARSE_TEXT = - 11, ///< Find as much text as possible in no particular order. - PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det. - PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing - ///< hacks that are Tesseract-specific. - - PSM_COUNT ///< Number of enum entries. -}; - -/** - * Inline functions that act on a PageSegMode to determine whether components of - * layout analysis are enabled. - * *Depend critically on the order of elements of PageSegMode.* - * NOTE that arg is an int for compatibility with INT_PARAM. - */ -inline bool PSM_OSD_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO; -} -inline bool PSM_SPARSE(int pageseg_mode) { - return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN; -} -inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK; -} -inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) { - return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) || - pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} - -/** - * enum of the elements of the page hierarchy, used in ResultIterator - * to provide functions that operate on each level without having to - * have 5x as many functions. - */ -enum PageIteratorLevel { - RIL_BLOCK, // Block of text/image/separator line. - RIL_PARA, // Paragraph within a block. - RIL_TEXTLINE, // Line within a paragraph. - RIL_WORD, // Word within a textline. - RIL_SYMBOL // Symbol/character within a word. -}; - -/** - * JUSTIFICATION_UNKNOWN - * The alignment is not clearly one of the other options. This could happen - * for example if there are only one or two lines of text or the text looks - * like source code or poetry. - * - * NOTA BENE: Fully justified paragraphs (text aligned to both left and right - * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text - * is written with a left-to-right script and with JUSTIFICATION_RIGHT if - * their text is written in a right-to-left script. - * - * Interpretation for text read in vertical lines: - * "Left" is wherever the starting reading position is. - * - * JUSTIFICATION_LEFT - * Each line, except possibly the first, is flush to the same left tab stop. - * - * JUSTIFICATION_CENTER - * The text lines of the paragraph are centered about a line going - * down through their middle of the text lines. - * - * JUSTIFICATION_RIGHT - * Each line, except possibly the first, is flush to the same right tab stop. - */ -enum ParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT, -}; - -/** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the combiner. - * The preference of which engine to use is stored in tessedit_ocr_engine_mode. - * - * ATTENTION: When modifying this enum, please make sure to make the - * appropriate changes to all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ -enum OcrEngineMode { - OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated - OEM_LSTM_ONLY, // Run just the LSTM line recognizer. - OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback - // to Tesseract when things get difficult. - // deprecated - OEM_DEFAULT, // Specify this mode when calling init_*(), - // to indicate that any of the above modes - // should be automatically inferred from the - // variables in the language-specific config, - // command-line configs, or if not specified - // in any of the above should be set to the - // default OEM_TESSERACT_ONLY. - OEM_COUNT // Number of OEMs -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/renderer.h deleted file mode 100644 index 6f405233..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/renderer.h +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: renderer.h -// Description: Rendering interface to inject into TessBaseAPI -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_RENDERER_H_ -#define TESSERACT_API_RENDERER_H_ - -#include "export.h" - -// To avoid collision with other typenames include the ABSOLUTE MINIMUM -// complexity of includes here. Use forward declarations wherever possible -// and hide includes of complex types in baseapi.cpp. -#include -#include // for std::string -#include // for std::vector - -struct Pix; - -namespace tesseract { - -class TessBaseAPI; - -/** - * Interface for rendering tesseract results into a document, such as text, - * HOCR or pdf. This class is abstract. Specific classes handle individual - * formats. This interface is then used to inject the renderer class into - * tesseract when processing images. - * - * For simplicity implementing this with tesseract version 3.01, - * the renderer contains document state that is cleared from document - * to document just as the TessBaseAPI is. This way the base API can just - * delegate its rendering functionality to injected renderers, and the - * renderers can manage the associated state needed for the specific formats - * in addition to the heuristics for producing it. - */ -class TESS_API TessResultRenderer { -public: - virtual ~TessResultRenderer(); - - // Takes ownership of pointer so must be new'd instance. - // Renderers aren't ordered, but appends the sequences of next parameter - // and existing next(). The renderers should be unique across both lists. - void insert(TessResultRenderer *next); - - // Returns the next renderer or nullptr. - TessResultRenderer *next() { - return next_; - } - - /** - * Starts a new document with the given title. - * This clears the contents of the output data. - * Title should use UTF-8 encoding. - */ - bool BeginDocument(const char *title); - - /** - * Adds the recognized text from the source image to the current document. - * Invalid if BeginDocument not yet called. - * - * Note that this API is a bit weird but is designed to fit into the - * current TessBaseAPI implementation where the api has lots of state - * information that we might want to add in. - */ - bool AddImage(TessBaseAPI *api); - - /** - * Finishes the document and finalizes the output data - * Invalid if BeginDocument not yet called. - */ - bool EndDocument(); - - const char *file_extension() const { - return file_extension_; - } - const char *title() const { - return title_.c_str(); - } - - // Is everything fine? Otherwise something went wrong. - bool happy() const { - return happy_; - } - - /** - * Returns the index of the last image given to AddImage - * (i.e. images are incremented whether the image succeeded or not) - * - * This is always defined. It means either the number of the - * current image, the last image ended, or in the completed document - * depending on when in the document lifecycle you are looking at it. - * Will return -1 if a document was never started. - */ - int imagenum() const { - return imagenum_; - } - -protected: - /** - * Called by concrete classes. - * - * outputbase is the name of the output file excluding - * extension. For example, "/path/to/chocolate-chip-cookie-recipe" - * - * extension indicates the file extension to be used for output - * files. For example "pdf" will produce a .pdf file, and "hocr" - * will produce .hocr files. - */ - TessResultRenderer(const char *outputbase, const char *extension); - - // Hook for specialized handling in BeginDocument() - virtual bool BeginDocumentHandler(); - - // This must be overridden to render the OCR'd results - virtual bool AddImageHandler(TessBaseAPI *api) = 0; - - // Hook for specialized handling in EndDocument() - virtual bool EndDocumentHandler(); - - // Renderers can call this to append '\0' terminated strings into - // the output string returned by GetOutput. - // This method will grow the output buffer if needed. - void AppendString(const char *s); - - // Renderers can call this to append binary byte sequences into - // the output string returned by GetOutput. Note that s is not necessarily - // '\0' terminated (and can contain '\0' within it). - // This method will grow the output buffer if needed. - void AppendData(const char *s, int len); - -private: - TessResultRenderer *next_; // Can link multiple renderers together - FILE *fout_; // output file pointer - const char *file_extension_; // standard extension for generated output - std::string title_; // title of document being rendered - int imagenum_; // index of last image added - bool happy_; // I get grumpy when the disk fills up, etc. -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessTextRenderer : public TessResultRenderer { -public: - explicit TessTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into an hocr text string - */ -class TESS_API TessHOcrRenderer : public TessResultRenderer { -public: - explicit TessHOcrRenderer(const char *outputbase, bool font_info); - explicit TessHOcrRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into an alto text string - */ -class TESS_API TessAltoRenderer : public TessResultRenderer { -public: - explicit TessAltoRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool begin_document; -}; - -/** - * Renders Tesseract output into a TSV string - */ -class TESS_API TessTsvRenderer : public TessResultRenderer { -public: - explicit TessTsvRenderer(const char *outputbase, bool font_info); - explicit TessTsvRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into searchable PDF - */ -class TESS_API TessPDFRenderer : public TessResultRenderer { -public: - // datadir is the location of the TESSDATA. We need it because - // we load a custom PDF font from this location. - TessPDFRenderer(const char *outputbase, const char *datadir, - bool textonly = false); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - // We don't want to have every image in memory at once, - // so we store some metadata as we go along producing - // PDFs one page at a time. At the end, that metadata is - // used to make everything that isn't easily handled in a - // streaming fashion. - long int obj_; // counter for PDF objects - std::vector offsets_; // offset of every PDF object in bytes - std::vector pages_; // object number for every /Page object - std::string datadir_; // where to find the custom font - bool textonly_; // skip images if set - // Bookkeeping only. DIY = Do It Yourself. - void AppendPDFObjectDIY(size_t objectsize); - // Bookkeeping + emit data. - void AppendPDFObject(const char *data); - // Create the /Contents object for an entire page. - char *GetPDFTextObjects(TessBaseAPI *api, double width, double height); - // Turn an image into a PDF object. Only transcode if we have to. - static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum, - char **pdf_object, long int *pdf_object_size, - int jpg_quality); -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessUnlvRenderer : public TessResultRenderer { -public: - explicit TessUnlvRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string for LSTMBox - */ -class TESS_API TessLSTMBoxRenderer : public TessResultRenderer { -public: - explicit TessLSTMBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessBoxTextRenderer : public TessResultRenderer { -public: - explicit TessBoxTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string in WordStr format - */ -class TESS_API TessWordStrBoxRenderer : public TessResultRenderer { -public: - explicit TessWordStrBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#ifndef DISABLED_LEGACY_ENGINE - -/** - * Renders tesseract output into an osd text string - */ -class TESS_API TessOsdRenderer : public TessResultRenderer { -public: - explicit TessOsdRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#endif // ndef DISABLED_LEGACY_ENGINE - -} // namespace tesseract. - -#endif // TESSERACT_API_RENDERER_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/resultiterator.h deleted file mode 100644 index 3e4d5807..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/resultiterator.h +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: resultiterator.h -// Description: Iterator for tesseract results that is capable of -// iterating in proper reading order over Bi Directional -// (e.g. mixed Hebrew and English) text. -// Author: David Eger -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API, TESS_LOCAL -#include "ltrresultiterator.h" // for LTRResultIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -#include // for std::pair -#include // for std::vector - -namespace tesseract { - -class TESS_API ResultIterator : public LTRResultIterator { -public: - static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); - - /** - * ResultIterator is copy constructible! - * The default copy constructor works just fine for us. - */ - ~ResultIterator() override = default; - - // ============= Moving around within the page ============. - /** - * Moves the iterator to point to the start of the page to begin - * an iteration. - */ - void Begin() override; - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy in the appropriate reading order and returns false if - * the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - bool Next(PageIteratorLevel level) override; - - /** - * IsAtBeginningOf() returns whether we're at the logical beginning of the - * given level. (as opposed to ResultIterator's left-to-right top-to-bottom - * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). - * For a full description, see pageiterator.h - */ - bool IsAtBeginningOf(PageIteratorLevel level) const override; - - /** - * Implement PageIterator's IsAtFinalElement correctly in a BiDi context. - * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we - * point at the last word in a paragraph. See PageIterator for full comment. - */ - bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const override; - - // ============= Functions that refer to words only ============. - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // ============= Accessing data ==============. - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - */ - virtual char *GetUTF8Text(PageIteratorLevel level) const; - - /** - * Returns the LSTM choices for every LSTM timestep for the current word. - */ - virtual std::vector>>> - *GetRawLSTMTimesteps() const; - virtual std::vector>> - *GetBestLSTMSymbolChoices() const; - - /** - * Return whether the current paragraph's dominant reading direction - * is left-to-right (as opposed to right-to-left). - */ - bool ParagraphIsLtr() const; - - // ============= Exposed only for testing =============. - - /** - * Yields the reading order as a sequence of indices and (optional) - * meta-marks for a set of words (given left-to-right). - * The meta marks are passed as negative values: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The next indexed word contains both left-to-right and - * right-to-left characters and was treated as neutral. - * - * For example, suppose we have five words in a text line, - * indexed [0,1,2,3,4] from the leftmost side of the text line. - * The following are all believable reading_orders: - * - * Left-to-Right (in ltr paragraph): - * { 0, 1, 2, 3, 4 } - * Left-to-Right (in rtl paragraph): - * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } - * Right-to-Left (in rtl paragraph): - * { 4, 3, 2, 1, 0 } - * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: - * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 } - */ - static void CalculateTextlineOrder( - bool paragraph_is_ltr, - const std::vector &word_dirs, - std::vector *reading_order); - - static const int kMinorRunStart; - static const int kMinorRunEnd; - static const int kComplexWord; - -protected: - /** - * We presume the data associated with the given iterator will outlive us. - * NB: This is private because it does something that is non-obvious: - * it resets to the beginning of the paragraph instead of staying wherever - * resit might have pointed. - */ - explicit ResultIterator(const LTRResultIterator &resit); - -private: - /** - * Calculates the current paragraph's dominant writing direction. - * Typically, members should use current_paragraph_ltr_ instead. - */ - bool CurrentParagraphIsLtr() const; - - /** - * Returns word indices as measured from resit->RestartRow() = index 0 - * for the reading order of words within a textline given an iterator - * into the middle of the text line. - * In addition to non-negative word indices, the following negative values - * may be inserted: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The previous word contains both left-to-right and - * right-to-left characters and was treated as neutral. - */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *indices) const; - /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *ssd, - std::vector *indices) const; - - /** - * What is the index of the current word in a strict left-to-right reading - * of the row? - */ - int LTRWordIndex() const; - - /** - * Given an iterator pointing at a word, returns the logical reading order - * of blob indices for the word. - */ - void CalculateBlobOrder(std::vector *blob_indices) const; - - /** Precondition: current_paragraph_is_ltr_ is set. */ - void MoveToLogicalStartOfTextline(); - - /** - * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ - * are set. - */ - void MoveToLogicalStartOfWord(); - - /** Are we pointing at the final (reading order) symbol of the word? */ - bool IsAtFinalSymbolOfWord() const; - - /** Are we pointing at the first (reading order) symbol of the word? */ - bool IsAtFirstSymbolOfWord() const; - - /** - * Append any extra marks that should be appended to this word when printed. - * Mostly, these are Unicode BiDi control characters. - */ - void AppendSuffixMarks(std::string *text) const; - - /** Appends the current word in reading order to the given buffer.*/ - void AppendUTF8WordText(std::string *text) const; - - /** - * Appends the text of the current text line, *assuming this iterator is - * positioned at the beginning of the text line* This function - * updates the iterator to point to the first position past the text line. - * Each textline is terminated in a single newline character. - * If the textline ends a paragraph, it gets a second terminal newline. - */ - void IterateAndAppendUTF8TextlineText(std::string *text); - - /** - * Appends the text of the current paragraph in reading order - * to the given buffer. - * Each textline is terminated in a single newline character, and the - * paragraph gets an extra newline at the end. - */ - void AppendUTF8ParagraphText(std::string *text) const; - - /** Returns whether the bidi_debug flag is set to at least min_level. */ - bool BidiDebug(int min_level) const; - - bool current_paragraph_is_ltr_; - - /** - * Is the currently pointed-at character at the beginning of - * a minor-direction run? - */ - bool at_beginning_of_minor_run_; - - /** Is the currently pointed-at character in a minor-direction sequence? */ - bool in_minor_direction_; - - /** - * Should detected inter-word spaces be preserved, or "compressed" to a single - * space character (default behavior). - */ - bool preserve_interword_spaces_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/unichar.h deleted file mode 100644 index 015109d7..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/unichar.h +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: unichar.h -// Description: Unicode character/ligature class. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCUTIL_UNICHAR_H_ -#define TESSERACT_CCUTIL_UNICHAR_H_ - -#include "export.h" - -#include -#include -#include -#include - -namespace tesseract { - -// Maximum number of characters that can be stored in a UNICHAR. Must be -// at least 4. Must not exceed 31 without changing the coding of length. -#define UNICHAR_LEN 30 - -// A UNICHAR_ID is the unique id of a unichar. -using UNICHAR_ID = int; - -// A variable to indicate an invalid or uninitialized unichar id. -static const int INVALID_UNICHAR_ID = -1; -// A special unichar that corresponds to INVALID_UNICHAR_ID. -static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__"; - -enum StrongScriptDirection { - DIR_NEUTRAL = 0, // Text contains only neutral characters. - DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters. - DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters. - DIR_MIX = 3, // Text contains a mixture of left-to-right - // and right-to-left characters. -}; - -using char32 = signed int; - -// The UNICHAR class holds a single classification result. This may be -// a single Unicode character (stored as between 1 and 4 utf8 bytes) or -// multiple Unicode characters representing the NFKC expansion of a ligature -// such as fi, ffl etc. These are also stored as utf8. -class TESS_API UNICHAR { -public: - UNICHAR() { - memset(chars, 0, UNICHAR_LEN); - } - - // Construct from a utf8 string. If len<0 then the string is null terminated. - // If the string is too long to fit in the UNICHAR then it takes only what - // will fit. - UNICHAR(const char *utf8_str, int len); - - // Construct from a single UCS4 character. - explicit UNICHAR(int unicode); - - // Default copy constructor and operator= are OK. - - // Get the first character as UCS-4. - int first_uni() const; - - // Get the length of the UTF8 string. - int utf8_len() const { - int len = chars[UNICHAR_LEN - 1]; - return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; - } - - // Get a UTF8 string, but NOT nullptr terminated. - const char *utf8() const { - return chars; - } - - // Get a terminated UTF8 string: Must delete[] it after use. - char *utf8_str() const; - - // Get the number of bytes in the first character of the given utf8 string. - static int utf8_step(const char *utf8_str); - - // A class to simplify iterating over and accessing elements of a UTF8 - // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or - // take ownership of the underlying byte array. It also does not permit - // modification of the array (as the name suggests). - // - // Example: - // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len); - // it != UNICHAR::end(str, len); - // ++it) { - // printf("UCS-4 symbol code = %d\n", *it); - // char buf[5]; - // int char_len = it.get_utf8(buf); buf[char_len] = '\0'; - // printf("Char = %s\n", buf); - // } - class TESS_API const_iterator { - using CI = const_iterator; - - public: - // Step to the next UTF8 character. - // If the current position is at an illegal UTF8 character, then print an - // error message and step by one byte. If the current position is at a - // nullptr value, don't step past it. - const_iterator &operator++(); - - // Return the UCS-4 value at the current position. - // If the current position is at an illegal UTF8 value, return a single - // space character. - int operator*() const; - - // Store the UTF-8 encoding of the current codepoint into buf, which must be - // at least 4 bytes long. Return the number of bytes written. - // If the current position is at an illegal UTF8 value, writes a single - // space character and returns 1. - // Note that this method does not null-terminate the buffer. - int get_utf8(char *buf) const; - // Returns the number of bytes of the current codepoint. Returns 1 if the - // current position is at an illegal UTF8 value. - int utf8_len() const; - // Returns true if the UTF-8 encoding at the current position is legal. - bool is_legal() const; - - // Return the pointer into the string at the current position. - const char *utf8_data() const { - return it_; - } - - // Iterator equality operators. - friend bool operator==(const CI &lhs, const CI &rhs) { - return lhs.it_ == rhs.it_; - } - friend bool operator!=(const CI &lhs, const CI &rhs) { - return !(lhs == rhs); - } - - private: - friend class UNICHAR; - explicit const_iterator(const char *it) : it_(it) {} - - const char *it_; // Pointer into the string. - }; - - // Create a start/end iterator pointing to a string. Note that these methods - // are static and do NOT create a copy or take ownership of the underlying - // array. - static const_iterator begin(const char *utf8_str, int byte_length); - static const_iterator end(const char *utf8_str, int byte_length); - - // Converts a utf-8 string to a vector of unicodes. - // Returns an empty vector if the input contains invalid UTF-8. - static std::vector UTF8ToUTF32(const char *utf8_str); - // Converts a vector of unicodes to a utf8 string. - // Returns an empty string if the input contains an invalid unicode. - static std::string UTF32ToUTF8(const std::vector &str32); - -private: - // A UTF-8 representation of 1 or more Unicode characters. - // The last element (chars[UNICHAR_LEN - 1]) is a length if - // its value < UNICHAR_LEN, otherwise it is a genuine character. - char chars[UNICHAR_LEN]{}; -}; - -} // namespace tesseract - -#endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/version.h deleted file mode 100644 index 6bac5d66..00000000 --- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/version.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: version.h -// Description: Version information -// -// (C) Copyright 2018, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_VERSION_H_ -#define TESSERACT_API_VERSION_H_ - -// clang-format off - -#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ -#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ -#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ - -#define TESSERACT_VERSION \ - (TESSERACT_MAJOR_VERSION << 16 | \ - TESSERACT_MINOR_VERSION << 8 | \ - TESSERACT_MICRO_VERSION) - -#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" - -// clang-format on - -#endif // TESSERACT_API_VERSION_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/baseapi.h deleted file mode 100644 index 5e1e4830..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/baseapi.h +++ /dev/null @@ -1,812 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: baseapi.h -// Description: Simple API for calling tesseract. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_BASEAPI_H_ -#define TESSERACT_API_BASEAPI_H_ - -#ifdef HAVE_CONFIG_H -# include "config_auto.h" // DISABLED_LEGACY_ENGINE -#endif - -#include "export.h" -#include "pageiterator.h" -#include "publictypes.h" -#include "resultiterator.h" -#include "unichar.h" - -#include "version.h" - -#include -#include // for std::vector - -struct Pix; -struct Pixa; -struct Boxa; - -namespace tesseract { - -class PAGE_RES; -class ParagraphModel; -class BLOCK_LIST; -class ETEXT_DESC; -struct OSResults; -class UNICHARSET; - -class Dawg; -class Dict; -class EquationDetect; -class PageIterator; -class ImageThresholder; -class LTRResultIterator; -class ResultIterator; -class MutableIterator; -class TessResultRenderer; -class Tesseract; - -// Function to read a std::vector from a whole file. -// Returns false on failure. -using FileReader = bool (*)(const char *filename, std::vector *data); - -using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, - bool) const; -using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, - int, const char *, int); - -/** - * Base class for all tesseract APIs. - * Specific classes can add ability to work on different inputs or produce - * different outputs. - * This class is mostly an interface layer on top of the Tesseract instance - * class to hide the data types so that users of this class don't have to - * include any other Tesseract headers. - */ -class TESS_API TessBaseAPI { -public: - TessBaseAPI(); - virtual ~TessBaseAPI(); - // Copy constructor and assignment operator are currently unsupported. - TessBaseAPI(TessBaseAPI const &) = delete; - TessBaseAPI &operator=(TessBaseAPI const &) = delete; - - /** - * Returns the version identifier as a static string. Do not delete. - */ - static const char *Version(); - - /** - * If compiled with OpenCL AND an available OpenCL - * device is deemed faster than serial code, then - * "device" is populated with the cl_device_id - * and returns sizeof(cl_device_id) - * otherwise *device=nullptr and returns 0. - */ - static size_t getOpenCLDevice(void **device); - - /** - * Set the name of the input file. Needed for training and - * reading a UNLV zone file, and for searchable PDF output. - */ - void SetInputName(const char *name); - /** - * These functions are required for searchable PDF output. - * We need our hands on the input file so that we can include - * it in the PDF without transcoding. If that is not possible, - * we need the original image. Finally, resolution metadata - * is stored in the PDF so we need that as well. - */ - const char *GetInputName(); - // Takes ownership of the input pix. - void SetInputImage(Pix *pix); - Pix *GetInputImage(); - int GetSourceYResolution(); - const char *GetDatapath(); - - /** Set the name of the bonus output files. Needed only for debugging. */ - void SetOutputName(const char *name); - - /** - * Set the value of an internal "parameter." - * Supply the name of the parameter and the value as a string, just as - * you would in a config file. - * Returns false if the name lookup failed. - * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. - * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. - * SetVariable may be used before Init, but settings will revert to - * defaults on End(). - * - * Note: Must be called after Init(). Only works for non-init variables - * (init variables should be passed to Init()). - */ - bool SetVariable(const char *name, const char *value); - bool SetDebugVariable(const char *name, const char *value); - - /** - * Returns true if the parameter was found among Tesseract parameters. - * Fills in value with the value of the parameter. - */ - bool GetIntVariable(const char *name, int *value) const; - bool GetBoolVariable(const char *name, bool *value) const; - bool GetDoubleVariable(const char *name, double *value) const; - - /** - * Returns the pointer to the string that represents the value of the - * parameter if it was found among Tesseract parameters. - */ - const char *GetStringVariable(const char *name) const; - -#ifndef DISABLED_LEGACY_ENGINE - - /** - * Print Tesseract fonts table to the given file. - */ - void PrintFontsTable(FILE *fp) const; - -#endif - - /** - * Print Tesseract parameters to the given file. - */ - void PrintVariables(FILE *fp) const; - - /** - * Get value of named variable as a string, if it exists. - */ - bool GetVariableAsString(const char *name, std::string *val) const; - - /** - * Instances are now mostly thread-safe and totally independent, - * but some global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS: - * you use SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your instances. - * - * Start tesseract. Returns zero on success and -1 on failure. - * NOTE that the only members that may be called before Init are those - * listed above here in the class definition. - * - * The datapath must be the name of the tessdata directory. - * The language is (usually) an ISO 639-3 string or nullptr will default to - * eng. It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, or just - * to reset the classifier. - * The language may be a string of the form [~][+[~]]* indicating - * that multiple languages are to be loaded. Eg hin+eng will load Hindi and - * English. Languages may specify internally that they want to be loaded - * with one or more other languages, so the ~ sign is available to override - * that. Eg if hin were set to load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited only by - * memory, with the caveat that loading additional languages will impact - * both speed and accuracy, as there is more work to do to decide on the - * applicable language, and there is more chance of hallucinating incorrect - * words. - * WARNING: On changing languages, all Tesseract parameters are reset - * back to their default values. (Which may vary between languages.) - * If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should explicitly - * call End() and then use SetVariable before Init. This is only a very - * rare use case, since there are very few uses that require any parameters - * to be set before Init. - * - * If set_only_non_debug_params is true, only params that do not contain - * "debug" in the name will be set. - */ - int Init(const char *datapath, const char *language, OcrEngineMode mode, - char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params); - int Init(const char *datapath, const char *language, OcrEngineMode oem) { - return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); - } - int Init(const char *datapath, const char *language) { - return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, - false); - } - // In-memory version reads the traineddata file directly from the given - // data[data_size] array, and/or reads data via a FileReader. - int Init(const char *data, int data_size, const char *language, - OcrEngineMode mode, char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params, FileReader reader); - - /** - * Returns the languages string used in the last valid initialization. - * If the last initialization specified "deu+hin" then that will be - * returned. If hin loaded eng automatically as well, then that will - * not be included in this list. To find the languages actually - * loaded use GetLoadedLanguagesAsVector. - * The returned string should NOT be deleted. - */ - const char *GetInitLanguagesAsString() const; - - /** - * Returns the loaded languages in the vector of std::string. - * Includes all languages loaded by the last Init, including those loaded - * as dependencies of other loaded languages. - */ - void GetLoadedLanguagesAsVector(std::vector *langs) const; - - /** - * Returns the available languages in the sorted vector of std::string. - */ - void GetAvailableLanguagesAsVector(std::vector *langs) const; - - /** - * Init only for page layout analysis. Use only for calls to SetImage and - * AnalysePage. Calls that attempt recognition will generate an error. - */ - void InitForAnalysePage(); - - /** - * Read a "config" file containing a set of param, value pairs. - * Searches the standard places: tessdata/configs, tessdata/tessconfigs - * and also accepts a relative or absolute path name. - * Note: only non-init params will be set (init params are set by Init()). - */ - void ReadConfigFile(const char *filename); - /** Same as above, but only set debug params from the given config file. */ - void ReadDebugConfigFile(const char *filename); - - /** - * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. - * The mode is stored as an IntParam so it can also be modified by - * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). - */ - void SetPageSegMode(PageSegMode mode); - - /** Return the current page segmentation mode. */ - PageSegMode GetPageSegMode() const; - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. - * Currently has no error checking. - * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. - * Palette color images will not work properly and must be converted to - * 24 bit. - * Binary images of 1 bit per pixel may also be given but they must be - * byte packed with the MSB of the first byte being the first pixel, and a - * 1 represents WHITE. For binary images set bytes_per_pixel=0. - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * - * Note that TesseractRect is the simplified convenience interface. - * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, - * and one or more of the Get*Text functions below. - */ - char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, int width, - int height); - - /** - * Call between pages or documents etc to free up memory and forget - * adaptive data. - */ - void ClearAdaptiveClassifier(); - - /** - * @defgroup AdvancedAPI Advanced API - * The following methods break TesseractRect into pieces, so you can - * get hold of the thresholded image, get the text in different formats, - * get bounding boxes, confidences etc. - */ - /* @{ */ - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Copies the image buffer and converts to Pix. - * SetImage clears all recognition results, and sets the rectangle to the - * full image, so it may be followed immediately by a GetUTF8Text, and it - * will automatically perform recognition. - */ - void SetImage(const unsigned char *imagedata, int width, int height, - int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract takes its own copy of the image, so it need not persist until - * after Recognize. - * Pix vs raw, which to use? - * Use Pix where possible. Tesseract uses Pix as its internal representation - * and it is therefore more efficient to provide a Pix directly. - */ - void SetImage(Pix *pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after SetImage(). - */ - void SetSourceResolution(int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after SetImage. - * Each SetRectangle clears the recogntion results so multiple rectangles - * can be recognized with the same image. - */ - void SetRectangle(int left, int top, int width, int height); - - /** - * Get a copy of the internal thresholded image from Tesseract. - * Caller takes ownership of the Pix and must pixDestroy it. - * May be called any time after SetImage, or after TesseractRect. - */ - Pix *GetThresholdedImage(); - - /** - * Get the result of page layout analysis as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetRegions(Pixa **pixa); - - /** - * Get the textlines as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If raw_image is true, then extract from the original image instead of the - * thresholded image and pad by raw_padding pixels. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. If paraids is not - * nullptr, the paragraph-id of each line within its block is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - /* - Helper method to extract from the thresholded image. (most common usage) -*/ - Boxa *GetTextlines(Pixa **pixa, int **blockids) { - return GetTextlines(false, 0, pixa, blockids, nullptr); - } - - /** - * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa - * pair, in reading order. Enables downstream handling of non-rectangular - * regions. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetStrips(Pixa **pixa, int **blockids); - - /** - * Get the words as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetWords(Pixa **pixa); - - /** - * Gets the individual connected (text) components (created - * after pages segmentation step, but before recognition) - * as a leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * Note: the caller is responsible for calling boxaDestroy() - * on the returned Boxa array and pixaDestroy() on cc array. - */ - Boxa *GetConnectedComponents(Pixa **cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. - * If blockids is not nullptr, the paragraph-id of each component with its - * block is also returned as an array of one element per component. delete [] - * after use. If raw_image is true, then portions of the original image are - * extracted instead of the thresholded image and padded with raw_padding. If - * text_only is true, then only text components are returned. - */ - Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, - bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - // Helper function to get binary images with no padding (most common usage). - Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, - Pixa **pixa, int **blockids) { - return GetComponentImages(level, text_only, false, 0, pixa, blockids, - nullptr); - } - - /** - * Returns the scale factor of the thresholded image that would be returned by - * GetThresholdedImage() and the various GetX() methods that call - * GetComponentImages(). - * Returns 0 if no thresholder has been set. - */ - int GetThresholdedImageScaleFactor() const; - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to just - * the page layout results. Returns an iterator to the results. - * If merge_similar_words is true, words are combined where suitable for use - * with a line recognizer. Use if you want to use AnalyseLayout to find the - * textlines, and then want to process textline fragments with an external - * line recognizer. - * Returns nullptr on error or an empty page. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - PageIterator *AnalyseLayout(); - PageIterator *AnalyseLayout(bool merge_similar_words); - - /** - * Recognize the image from SetAndThresholdImage, generating Tesseract - * internal structures. Returns 0 on success. - * Optional. The Get*Text functions below will call Recognize if needed. - * After Recognize, the output is kept internally until the next SetImage. - */ - int Recognize(ETEXT_DESC *monitor); - - /** - * Methods to retrieve information after SetAndThresholdImage(), - * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) - */ - - /** - * Turns images into symbolic text. - * - * filename can point to a single image, a multi-page TIFF, - * or a plain text list of image filenames. - * - * retry_config is useful for debugging. If not nullptr, you can fall - * back to an alternate configuration if a page fails for some - * reason. - * - * timeout_millisec terminates processing if any single page - * takes too long. Set to 0 for unlimited time. - * - * renderer is responible for creating the output. For example, - * use the TessTextRenderer if you want plaintext output, or - * the TessPDFRender to produce searchable PDF. - * - * If tessedit_page_number is non-negative, will only process that - * single page. Works for multi-page tiff file, or filelist. - * - * Returns true if successful, false on error. - */ - bool ProcessPages(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - // Does the real work of ProcessPages. - bool ProcessPagesInternal(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - - /** - * Turn a single image into symbolic text. - * - * The pix is the image processed. filename and page_index are - * metadata used by side-effect processes, such as reading a box - * file or formatting as hOCR. - * - * See ProcessPages for descriptions of other parameters. - */ - bool ProcessPage(Pix *pix, int page_index, const char *filename, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - ResultIterator *GetIterator(); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - MutableIterator *GetMutableIterator(); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - */ - char *GetUTF8Text(); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * monitor can be used to - * cancel the recognition - * receive progress callbacks - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(ETEXT_DESC *monitor, int page_number); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(ETEXT_DESC *monitor, int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(int page_number); - - /** - * Make a TSV-formatted string from the internal data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetTSVText(int page_number); - - /** - * Make a box file for LSTM training from the internal data structures. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetLSTMBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a box file used in training. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a WordStr box file used in training. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetWordStrBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded - * as UNLV format Latin-1 with specific reject and suspect codes. - * Returned string must be freed with the delete [] operator. - */ - char *GetUNLVText(); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in degrees - * (0, 90, 180, 270) - * orient_conf is the confidence (15.0 is reasonably confident) - * script_name is an ASCII string, the name of the script, e.g. "Latin" - * script_conf is confidence level in the script - * Returns true on success and writes values to each parameter as an output - */ - bool DetectOrientationScript(int *orient_deg, float *orient_conf, - const char **script_name, float *script_conf); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * page_number is a 0-based page index that will appear in the osd file. - */ - char *GetOsdText(int page_number); - - /** Returns the (average) confidence value between 0 and 100. */ - int MeanTextConf(); - /** - * Returns all word confidences (between 0 and 100) in an array, terminated - * by -1. The calling function must delete [] after use. - * The number of confidences should correspond to the number of space- - * delimited words in GetUTF8Text. - */ - int *AllWordConfidences(); - -#ifndef DISABLED_LEGACY_ENGINE - /** - * Applies the given word to the adaptive classifier if possible. - * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can - * tell the boundaries of the graphemes. - * Assumes that SetImage/SetRectangle have been used to set the image - * to the given word. The mode arg should be PSM_SINGLE_WORD or - * PSM_CIRCLE_WORD, as that will be used to control layout analysis. - * The currently set PageSegMode is preserved. - * Returns false if adaption was not possible for some reason. - */ - bool AdaptToWordStr(PageSegMode mode, const char *wordstr); -#endif // ndef DISABLED_LEGACY_ENGINE - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or TesseractRect before doing - * any Recognize or Get* operation. - */ - void Clear(); - - /** - * Close down tesseract and free up all memory. End() is equivalent to - * destructing and reconstructing your TessBaseAPI. - * Once End() has been used, none of the other API functions may be used - * other than Init and anything declared above it in the class definition. - */ - void End(); - - /** - * Clear any library-level memory caches. - * There are a variety of expensive-to-load constant data structures (mostly - * language dictionaries) that are cached globally -- surviving the Init() - * and End() of individual TessBaseAPI's. This function allows the clearing - * of these caches. - **/ - static void ClearPersistentCache(); - - /** - * Check whether a word is valid according to Tesseract's language model - * @return 0 if the word is invalid, non-zero if valid. - * @warning temporary! This function will be removed from here and placed - * in a separate API at some future time. - */ - int IsValidWord(const char *word) const; - // Returns true if utf8_character is defined in the UniCharset. - bool IsValidCharacter(const char *utf8_character) const; - - bool GetTextDirection(int *out_offset, float *out_slope); - - /** Sets Dict::letter_is_okay_ function to point to the given function. */ - void SetDictFunc(DictFunc f); - - /** Sets Dict::probability_in_context_ function to point to the given - * function. - */ - void SetProbabilityInContextFunc(ProbabilityInContextFunc f); - - /** - * Estimates the Orientation And Script of the image. - * @return true if the image was processed successfully. - */ - bool DetectOS(OSResults *); - - /** - * Return text orientation of each block as determined by an earlier run - * of layout analysis. - */ - void GetBlockTextOrientations(int **block_orientation, - bool **vertical_writing); - - /** This method returns the string form of the specified unichar. */ - const char *GetUnichar(int unichar_id) const; - - /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ - const Dawg *GetDawg(int i) const; - - /** Return the number of dawgs loaded into tesseract_ object. */ - int NumDawgs() const; - - Tesseract *tesseract() const { - return tesseract_; - } - - OcrEngineMode oem() const { - return last_oem_requested_; - } - - void set_min_orientation_margin(double margin); - /* @} */ - -protected: - /** Common code for setting the image. Returns true if Init has been called. - */ - bool InternalSetImage(); - - /** - * Run the thresholder to make the thresholded image. If pix is not nullptr, - * the source is thresholded to pix instead of the internal IMAGE. - */ - virtual bool Threshold(Pix **pix); - - /** - * Find lines from the image making the BLOCK_LIST. - * @return 0 on success. - */ - int FindLines(); - - /** Delete the pageres and block list ready for a new page. */ - void ClearResults(); - - /** - * Return an LTR Result Iterator -- used only for training, as we really want - * to ignore all BiDi smarts at that point. - * delete once you're done with it. - */ - LTRResultIterator *GetLTRIterator(); - - /** - * Return the length of the output text string, as UTF8, assuming - * one newline per line and one per block, with a terminator, - * and assuming a single character reject marker for each rejected character. - * Also return the number of recognized blobs in blob_count. - */ - int TextLength(int *blob_count) const; - - //// paragraphs.cpp //////////////////////////////////////////////////// - void DetectParagraphs(bool after_text_recognition); - - const PAGE_RES *GetPageRes() const { - return page_res_; - } - -protected: - Tesseract *tesseract_; ///< The underlying data object. - Tesseract *osd_tesseract_; ///< For orientation & script detection. - EquationDetect *equ_detect_; ///< The equation detector. - FileReader reader_; ///< Reads files from any filesystem. - ImageThresholder *thresholder_; ///< Image thresholding module. - std::vector *paragraph_models_; - BLOCK_LIST *block_list_; ///< The page layout. - PAGE_RES *page_res_; ///< The page-level data. - std::string input_file_; ///< Name used by training code. - std::string output_file_; ///< Name used by debug code. - std::string datapath_; ///< Current location of tessdata. - std::string language_; ///< Last initialized language. - OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested. - bool recognition_done_; ///< page_res_ contains recognition data. - - /** - * @defgroup ThresholderParams Thresholder Parameters - * Parameters saved from the Thresholder. Needed to rebuild coordinates. - */ - /* @{ */ - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; - int image_width_; - int image_height_; - /* @} */ - -private: - // A list of image filenames gets special consideration - bool ProcessPagesFileList(FILE *fp, std::string *buf, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); - // TIFF supports multipage so gets special consideration. - bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, - const char *filename, const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); -}; // class TessBaseAPI. - -/** Escape a char string - remove &<>"' with HTML codes. */ -std::string HOcrEscape(const char *text); - -} // namespace tesseract - -#endif // TESSERACT_API_BASEAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/capi.h deleted file mode 100644 index 40f4856a..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/capi.h +++ /dev/null @@ -1,484 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: capi.h -// Description: C-API TessBaseAPI -// -// (C) Copyright 2012, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_CAPI_H_ -#define API_CAPI_H_ - -#include "export.h" - -#ifdef __cplusplus -# include -# include -# include -# include -# include -#endif - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef BOOL -# define BOOL int -# define TRUE 1 -# define FALSE 0 -#endif - -#ifdef __cplusplus -typedef tesseract::TessResultRenderer TessResultRenderer; -typedef tesseract::TessBaseAPI TessBaseAPI; -typedef tesseract::PageIterator TessPageIterator; -typedef tesseract::ResultIterator TessResultIterator; -typedef tesseract::MutableIterator TessMutableIterator; -typedef tesseract::ChoiceIterator TessChoiceIterator; -typedef tesseract::OcrEngineMode TessOcrEngineMode; -typedef tesseract::PageSegMode TessPageSegMode; -typedef tesseract::PageIteratorLevel TessPageIteratorLevel; -typedef tesseract::Orientation TessOrientation; -typedef tesseract::ParagraphJustification TessParagraphJustification; -typedef tesseract::WritingDirection TessWritingDirection; -typedef tesseract::TextlineOrder TessTextlineOrder; -typedef tesseract::PolyBlockType TessPolyBlockType; -typedef tesseract::ETEXT_DESC ETEXT_DESC; -#else -typedef struct TessResultRenderer TessResultRenderer; -typedef struct TessBaseAPI TessBaseAPI; -typedef struct TessPageIterator TessPageIterator; -typedef struct TessResultIterator TessResultIterator; -typedef struct TessMutableIterator TessMutableIterator; -typedef struct TessChoiceIterator TessChoiceIterator; -typedef enum TessOcrEngineMode { - OEM_TESSERACT_ONLY, - OEM_LSTM_ONLY, - OEM_TESSERACT_LSTM_COMBINED, - OEM_DEFAULT -} TessOcrEngineMode; -typedef enum TessPageSegMode { - PSM_OSD_ONLY, - PSM_AUTO_OSD, - PSM_AUTO_ONLY, - PSM_AUTO, - PSM_SINGLE_COLUMN, - PSM_SINGLE_BLOCK_VERT_TEXT, - PSM_SINGLE_BLOCK, - PSM_SINGLE_LINE, - PSM_SINGLE_WORD, - PSM_CIRCLE_WORD, - PSM_SINGLE_CHAR, - PSM_SPARSE_TEXT, - PSM_SPARSE_TEXT_OSD, - PSM_RAW_LINE, - PSM_COUNT -} TessPageSegMode; -typedef enum TessPageIteratorLevel { - RIL_BLOCK, - RIL_PARA, - RIL_TEXTLINE, - RIL_WORD, - RIL_SYMBOL -} TessPageIteratorLevel; -typedef enum TessPolyBlockType { - PT_UNKNOWN, - PT_FLOWING_TEXT, - PT_HEADING_TEXT, - PT_PULLOUT_TEXT, - PT_EQUATION, - PT_INLINE_EQUATION, - PT_TABLE, - PT_VERTICAL_TEXT, - PT_CAPTION_TEXT, - PT_FLOWING_IMAGE, - PT_HEADING_IMAGE, - PT_PULLOUT_IMAGE, - PT_HORZ_LINE, - PT_VERT_LINE, - PT_NOISE, - PT_COUNT -} TessPolyBlockType; -typedef enum TessOrientation { - ORIENTATION_PAGE_UP, - ORIENTATION_PAGE_RIGHT, - ORIENTATION_PAGE_DOWN, - ORIENTATION_PAGE_LEFT -} TessOrientation; -typedef enum TessParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT -} TessParagraphJustification; -typedef enum TessWritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT, - WRITING_DIRECTION_RIGHT_TO_LEFT, - WRITING_DIRECTION_TOP_TO_BOTTOM -} TessWritingDirection; -typedef enum TessTextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT, - TEXTLINE_ORDER_RIGHT_TO_LEFT, - TEXTLINE_ORDER_TOP_TO_BOTTOM -} TessTextlineOrder; -typedef struct ETEXT_DESC ETEXT_DESC; -#endif - -typedef bool (*TessCancelFunc)(void *cancel_this, int words); -typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top, - int bottom); - -struct Pix; -struct Boxa; -struct Pixa; - -/* General free functions */ - -TESS_API const char *TessVersion(); -TESS_API void TessDeleteText(const char *text); -TESS_API void TessDeleteTextArray(char **arr); -TESS_API void TessDeleteIntArray(const int *arr); - -/* Renderer API */ -TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, - BOOL font_info); -TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase, - const char *datadir, - BOOL textonly); -TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessWordStrBoxRendererCreate( - const char *outputbase); - -TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer); -TESS_API void TessResultRendererInsert(TessResultRenderer *renderer, - TessResultRenderer *next); -TESS_API TessResultRenderer *TessResultRendererNext( - TessResultRenderer *renderer); -TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, - const char *title); -TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer, - TessBaseAPI *api); -TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer); - -TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer); -TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer); -TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer); - -/* Base API */ - -TESS_API TessBaseAPI *TessBaseAPICreate(); -TESS_API void TessBaseAPIDelete(TessBaseAPI *handle); - -TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device); - -TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name); -TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix); -TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle); - -TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle); -TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name); - -TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, - const char *value); -TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, - const char *value); - -TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, - const char *name, int *value); -TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, - const char *name, BOOL *value); -TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, - const char *name, double *value); -TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, - const char *name); - -TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp); -TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, - const char *filename); - -TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem, - char **configs, int configs_size); -TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem); -TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, - const char *language); - -TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API const char *TessBaseAPIGetInitLanguagesAsString( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector( - const TessBaseAPI *handle); - -TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle); - -TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle, - const char *filename); -TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, - const char *filename); - -TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle, - TessPageSegMode mode); -TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle); - -TESS_API char *TessBaseAPIRect(TessBaseAPI *handle, - const unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int left, int top, int width, int height); - -TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetImage(TessBaseAPI *handle, - const unsigned char *imagedata, int width, - int height, int bytes_per_pixel, - int bytes_per_line); -TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix); - -TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi); - -TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, - int width, int height); - -TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle); -TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, - BOOL raw_image, int raw_padding, - struct Pixa **pixa, - int **blockids, int **paraids); -TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, - struct Pixa **pixa, int **blockids); -TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, - struct Pixa **cc); -TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, - TessPageIteratorLevel level, - BOOL text_only, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetComponentImages1( - TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only, - BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids, - int **paraids); - -TESS_API int TessBaseAPIGetThresholdedImageScaleFactor( - const TessBaseAPI *handle); - -TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle); - -TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor); - -TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); -TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, - int page_index, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); - -TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle); -TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator( - TessBaseAPI *handle); - -TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle); -TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, - int page_number); - -TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle); -TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle); - -TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE -TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, - TessPageSegMode mode, - const char *wordstr); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPIClear(TessBaseAPI *handle); -TESS_API void TessBaseAPIEnd(TessBaseAPI *handle); - -TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word); -TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, - float *out_slope); - -TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id); - -TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE - -// Call TessDeleteText(*best_script_name) to free memory allocated by this -// function -TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, - int *orient_deg, - float *orient_conf, - const char **script_name, - float *script_conf); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, - double margin); - -TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle); - -TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle); - -TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, - int **block_orientation, - bool **vertical_writing); - -/* Page iterator */ - -TESS_API void TessPageIteratorDelete(TessPageIterator *handle); - -TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle); - -TESS_API void TessPageIteratorBegin(TessPageIterator *handle); - -TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, - TessPageIteratorLevel level, - TessPageIteratorLevel element); - -TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, - TessPageIteratorLevel level, - int *left, int *top, int *right, - int *bottom); - -TESS_API TessPolyBlockType -TessPageIteratorBlockType(const TessPageIterator *handle); - -TESS_API struct Pix *TessPageIteratorGetBinaryImage( - const TessPageIterator *handle, TessPageIteratorLevel level); - -TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, - TessPageIteratorLevel level, - int padding, - struct Pix *original_image, - int *left, int *top); - -TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle, - TessPageIteratorLevel level, int *x1, - int *y1, int *x2, int *y2); - -TESS_API void TessPageIteratorOrientation( - TessPageIterator *handle, TessOrientation *orientation, - TessWritingDirection *writing_direction, TessTextlineOrder *textline_order, - float *deskew_angle); - -TESS_API void TessPageIteratorParagraphInfo( - TessPageIterator *handle, TessParagraphJustification *justification, - BOOL *is_list_item, BOOL *is_crown, int *first_line_indent); - -/* Result iterator */ - -TESS_API void TessResultIteratorDelete(TessResultIterator *handle); -TESS_API TessResultIterator *TessResultIteratorCopy( - const TessResultIterator *handle); -TESS_API TessPageIterator *TessResultIteratorGetPageIterator( - TessResultIterator *handle); -TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst( - const TessResultIterator *handle); -TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator( - const TessResultIterator *handle); - -TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API const char *TessResultIteratorWordRecognitionLanguage( - const TessResultIterator *handle); -TESS_API const char *TessResultIteratorWordFontAttributes( - const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic, - BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps, - int *pointsize, int *font_id); - -TESS_API BOOL -TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle); -TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle); - -TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle); -TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle); -TESS_API const char *TessChoiceIteratorGetUTF8Text( - const TessChoiceIterator *handle); -TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle); - -/* Progress monitor */ - -TESS_API ETEXT_DESC *TessMonitorCreate(); -TESS_API void TessMonitorDelete(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, - TessCancelFunc cancelFunc); -TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis); -TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, - TessProgressFunc progressFunc); -TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline); - -#ifdef __cplusplus -} -#endif - -#endif // API_CAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/export.h deleted file mode 100644 index d238b628..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/export.h +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: export.h -// Description: Place holder -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_PLATFORM_H_ -#define TESSERACT_PLATFORM_H_ - -#ifndef TESS_API -# if defined(_WIN32) || defined(__CYGWIN__) -# if defined(TESS_EXPORTS) -# define TESS_API __declspec(dllexport) -# elif defined(TESS_IMPORTS) -# define TESS_API __declspec(dllimport) -# else -# define TESS_API -# endif -# else -# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS) -# define TESS_API __attribute__((visibility("default"))) -# else -# define TESS_API -# endif -# endif -#endif - -#endif // TESSERACT_PLATFORM_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/ltrresultiterator.h deleted file mode 100644 index 6ca0a98e..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/ltrresultiterator.h +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: ltrresultiterator.h -// Description: Iterator for tesseract results in strict left-to-right -// order that avoids using tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API -#include "pageiterator.h" // for PageIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -namespace tesseract { - -class BLOB_CHOICE_IT; -class PAGE_RES; -class WERD_RES; - -class Tesseract; - -// Class to iterate over tesseract results, providing access to all levels -// of the page hierarchy, without including any tesseract headers or having -// to handle any tesseract structures. -// WARNING! This class points to data held within the TessBaseAPI class, and -// therefore can only be used while the TessBaseAPI class still exists and -// has not been subjected to a call of Init, SetImage, Recognize, Clear, End -// DetectOS, or anything else that changes the internal PAGE_RES. -// See tesseract/publictypes.h for the definition of PageIteratorLevel. -// See also base class PageIterator, which contains the bulk of the interface. -// LTRResultIterator adds text-specific methods for access to OCR output. - -class TESS_API LTRResultIterator : public PageIterator { - friend class ChoiceIterator; - -public: - // page_res and tesseract come directly from the BaseAPI. - // The rectangle parameters are copied indirectly from the Thresholder, - // via the BaseAPI. They represent the coordinates of some rectangle in an - // original image (in top-left-origin coordinates) and therefore the top-left - // needs to be added to any output boxes in order to specify coordinates - // in the original image. See TessBaseAPI::SetRectangle. - // The scale and scaled_yres are in case the Thresholder scaled the image - // rectangle prior to thresholding. Any coordinates in tesseract's image - // must be divided by scale before adding (rect_left, rect_top). - // The scaled_yres indicates the effective resolution of the binary image - // that tesseract has been given by the Thresholder. - // After the constructor, Begin has already been called. - LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, - int rect_width, int rect_height); - - ~LTRResultIterator() override; - - // LTRResultIterators may be copied! This makes it possible to iterate over - // all the objects at a lower level, while maintaining an iterator to - // objects at a higher level. These constructors DO NOT CALL Begin, so - // iterations will continue from the location of src. - // TODO: For now the copy constructor and operator= only need the base class - // versions, but if new data members are added, don't forget to add them! - - // ============= Moving around within the page ============. - - // See PageIterator. - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // object at the given level. Use delete [] to free after use. - char *GetUTF8Text(PageIteratorLevel level) const; - - // Set the string inserted at the end of each text line. "\n" by default. - void SetLineSeparator(const char *new_line); - - // Set the string inserted at the end of each paragraph. "\n" by default. - void SetParagraphSeparator(const char *new_para); - - // Returns the mean confidence of the current object at the given level. - // The number should be interpreted as a percent probability. (0.0f-100.0f) - float Confidence(PageIteratorLevel level) const; - - // ============= Functions that refer to words only ============. - - // Returns the font attributes of the current word. If iterating at a higher - // level object than words, eg textlines, then this will return the - // attributes of the first word in that textline. - // The actual return value is a string representing a font name. It points - // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as - // the iterator itself, ie rendered invalid by various members of - // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. - // Pointsize is returned in printers points (1/72 inch.) - const char *WordFontAttributes(bool *is_bold, bool *is_italic, - bool *is_underlined, bool *is_monospace, - bool *is_serif, bool *is_smallcaps, - int *pointsize, int *font_id) const; - - // Return the name of the language used to recognize this word. - // On error, nullptr. Do not delete this pointer. - const char *WordRecognitionLanguage() const; - - // Return the overall directionality of this word. - StrongScriptDirection WordDirection() const; - - // Returns true if the current word was found in a dictionary. - bool WordIsFromDictionary() const; - - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // Returns true if the current word is numeric. - bool WordIsNumeric() const; - - // Returns true if the word contains blamer information. - bool HasBlamerInfo() const; - - // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle - // of the current word. - const void *GetParamsTrainingBundle() const; - - // Returns a pointer to the string with blamer information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerDebug() const; - - // Returns a pointer to the string with misadaption information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerMisadaptionDebug() const; - - // Returns true if a truth string was recorded for the current word. - bool HasTruthString() const; - - // Returns true if the given string is equivalent to the truth string for - // the current word. - bool EquivalentToTruth(const char *str) const; - - // Returns a null terminated UTF-8 encoded truth string for the current word. - // Use delete [] to free after use. - char *WordTruthUTF8Text() const; - - // Returns a null terminated UTF-8 encoded normalized OCR string for the - // current word. Use delete [] to free after use. - char *WordNormedUTF8Text() const; - - // Returns a pointer to serialized choice lattice. - // Fills lattice_size with the number of bytes in lattice data. - const char *WordLattice(int *lattice_size) const; - - // ============= Functions that refer to symbols only ============. - - // Returns true if the current symbol is a superscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSuperscript() const; - // Returns true if the current symbol is a subscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSubscript() const; - // Returns true if the current symbol is a dropcap. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsDropcap() const; - -protected: - const char *line_separator_; - const char *paragraph_separator_; -}; - -// Class to iterate over the classifier choices for a single RIL_SYMBOL. -class TESS_API ChoiceIterator { -public: - // Construction is from a LTRResultIterator that points to the symbol of - // interest. The ChoiceIterator allows a one-shot iteration over the - // choices for this symbol and after that it is useless. - explicit ChoiceIterator(const LTRResultIterator &result_it); - ~ChoiceIterator(); - - // Moves to the next choice for the symbol and returns false if there - // are none left. - bool Next(); - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // choice. - // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an - // internal structure and should NOT be delete[]ed to free after use. - const char *GetUTF8Text() const; - - // Returns the confidence of the current choice depending on the used language - // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All - // choices for one symbol should roughly add up to 1.0f. - // If only traineddata of the legacy engine is used, the number should be - // interpreted as a percent probability. (0.0f-100.0f) In this case - // probabilities won't add up to 100. Each one stands on its own. - float Confidence() const; - - // Returns a vector containing all timesteps, which belong to the currently - // selected symbol. A timestep is a vector containing pairs of symbols and - // floating point numbers. The number states the probability for the - // corresponding symbol. - std::vector>> *Timesteps() const; - -private: - // clears the remaining spaces out of the results and adapt the probabilities - void filterSpaces(); - // Pointer to the WERD_RES object owned by the API. - WERD_RES *word_res_; - // Iterator over the blob choices. - BLOB_CHOICE_IT *choice_it_; - std::vector> *LSTM_choices_ = nullptr; - std::vector>::iterator LSTM_choice_it_; - - const int *tstep_index_; - // regulates the rating granularity - double rating_coefficient_; - // leading blanks - int blanks_before_word_; - // true when there is lstm engine related trained data - bool oemLSTM_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/ocrclass.h deleted file mode 100644 index a55e6528..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/ocrclass.h +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/********************************************************************** - * File: ocrclass.h - * Description: Class definitions and constants for the OCR API. - * Author: Hewlett-Packard Co - * - * (C) Copyright 1996, Hewlett-Packard Co. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -/********************************************************************** - * This file contains typedefs for all the structures used by - * the HP OCR interface. - * The structures are designed to allow them to be used with any - * structure alignment up to 8. - **********************************************************************/ - -#ifndef CCUTIL_OCRCLASS_H_ -#define CCUTIL_OCRCLASS_H_ - -#include -#include - -namespace tesseract { - -/********************************************************************** - * EANYCODE_CHAR - * Description of a single character. The character code is defined by - * the character set of the current font. - * Output text is sent as an array of these structures. - * Spaces and line endings in the output are represented in the - * structures of the surrounding characters. They are not directly - * represented as characters. - * The first character in a word has a positive value of blanks. - * Missing information should be set to the defaults in the comments. - * If word bounds are known, but not character bounds, then the top and - * bottom of each character should be those of the word. The left of the - * first and right of the last char in each word should be set. All other - * lefts and rights should be set to -1. - * If set, the values of right and bottom are left+width and top+height. - * Most of the members come directly from the parameters to ocr_append_char. - * The formatting member uses the enhancement parameter and combines the - * line direction stuff into the top 3 bits. - * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para, - * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what - * the coding is, only that it is backwards compatible with the previous - * version. - **********************************************************************/ - -struct EANYCODE_CHAR { /*single character */ - // It should be noted that the format for char_code for version 2.0 and beyond - // is UTF8 which means that ASCII characters will come out as one structure - // but other characters will be returned in two or more instances of this - // structure with a single byte of the UTF8 code in each, but each will have - // the same bounding box. Programs which want to handle languagues with - // different characters sets will need to handle extended characters - // appropriately, but *all* code needs to be prepared to receive UTF8 coded - // characters for characters such as bullet and fancy quotes. - uint16_t char_code; /*character itself */ - int16_t left; /*of char (-1) */ - int16_t right; /*of char (-1) */ - int16_t top; /*of char (-1) */ - int16_t bottom; /*of char (-1) */ - int16_t font_index; /*what font (0) */ - uint8_t confidence; /*0=perfect, 100=reject (0/100) */ - uint8_t point_size; /*of char, 72=i inch, (10) */ - int8_t blanks; /*no of spaces before this char (1) */ - uint8_t formatting; /*char formatting (0) */ -}; - -/********************************************************************** - * ETEXT_DESC - * Description of the output of the OCR engine. - * This structure is used as both a progress monitor and the final - * output header, since it needs to be a valid progress monitor while - * the OCR engine is storing its output to shared memory. - * During progress, all the buffer info is -1. - * Progress starts at 0 and increases to 100 during OCR. No other constraint. - * Additionally the progress callback contains the bounding box of the word that - * is currently being processed. - * Every progress callback, the OCR engine must set ocr_alive to 1. - * The HP side will set ocr_alive to 0. Repeated failure to reset - * to 1 indicates that the OCR engine is dead. - * If the cancel function is not null then it is called with the number of - * user words found. If it returns true then operation is cancelled. - **********************************************************************/ -class ETEXT_DESC; - -using CANCEL_FUNC = bool (*)(void *, int); -using PROGRESS_FUNC = bool (*)(int, int, int, int, int); -using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int); - -class ETEXT_DESC { // output header -public: - int16_t count{0}; /// chars in this buffer(0) - int16_t progress{0}; /// percent complete increasing (0-100) - /** Progress monitor covers word recognition and it does not cover layout - * analysis. - * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */ - int8_t more_to_come{0}; /// true if not last - volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0 - int8_t err_code{0}; /// for errcode use - CANCEL_FUNC cancel{nullptr}; /// returns true to cancel - PROGRESS_FUNC progress_callback{ - nullptr}; /// called whenever progress increases - PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback - void *cancel_this{nullptr}; /// this or other data for cancel - std::chrono::steady_clock::time_point end_time; - /// Time to stop. Expected to be set only - /// by call to set_deadline_msecs(). - EANYCODE_CHAR text[1]{}; /// character data - - ETEXT_DESC() : progress_callback2(&default_progress_func) { - end_time = std::chrono::time_point(); - } - - // Sets the end time to be deadline_msecs milliseconds from now. - void set_deadline_msecs(int32_t deadline_msecs) { - if (deadline_msecs > 0) { - end_time = std::chrono::steady_clock::now() + - std::chrono::milliseconds(deadline_msecs); - } - } - - // Returns false if we've not passed the end_time, or have not set a deadline. - bool deadline_exceeded() const { - if (end_time.time_since_epoch() == - std::chrono::steady_clock::duration::zero()) { - return false; - } - auto now = std::chrono::steady_clock::now(); - return (now > end_time); - } - -private: - static bool default_progress_func(ETEXT_DESC *ths, int left, int right, - int top, int bottom) { - if (ths->progress_callback != nullptr) { - return (*(ths->progress_callback))(ths->progress, left, right, top, - bottom); - } - return true; - } -}; - -} // namespace tesseract - -#endif // CCUTIL_OCRCLASS_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/osdetect.h deleted file mode 100644 index 34bfb557..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/osdetect.h +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: osdetect.h -// Description: Orientation and script detection. -// Author: Samuel Charron -// Ranjith Unnikrishnan -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_OSDETECT_H_ -#define TESSERACT_CCMAIN_OSDETECT_H_ - -#include "export.h" // for TESS_API - -#include // for std::vector - -namespace tesseract { - -class BLOBNBOX; -class BLOBNBOX_CLIST; -class BLOB_CHOICE_LIST; -class TO_BLOCK_LIST; -class UNICHARSET; - -class Tesseract; - -// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur -const int kMaxNumberOfScripts = 116 + 1 + 2 + 1; - -struct OSBestResult { - OSBestResult() - : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {} - int orientation_id; - int script_id; - float sconfidence; - float oconfidence; -}; - -struct OSResults { - OSResults() : unicharset(nullptr) { - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < kMaxNumberOfScripts; ++j) { - scripts_na[i][j] = 0; - } - orientations[i] = 0; - } - } - void update_best_orientation(); - // Set the estimate of the orientation to the given id. - void set_best_orientation(int orientation_id); - // Update/Compute the best estimate of the script assuming the given - // orientation id. - void update_best_script(int orientation_id); - // Return the index of the script with the highest score for this orientation. - TESS_API int get_best_script(int orientation_id) const; - // Accumulate scores with given OSResults instance and update the best script. - void accumulate(const OSResults &osr); - - // Print statistics. - void print_scores(void) const; - void print_scores(int orientation_id) const; - - // Array holding scores for each orientation id [0,3]. - // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the - // page respectively, where the values refer to the amount of clockwise - // rotation to be applied to the page for the text to be upright and readable. - float orientations[4]; - // Script confidence scores for each of 4 possible orientations. - float scripts_na[4][kMaxNumberOfScripts]; - - UNICHARSET *unicharset; - OSBestResult best_result; -}; - -class OrientationDetector { -public: - OrientationDetector(const std::vector *allowed_scripts, - OSResults *results); - bool detect_blob(BLOB_CHOICE_LIST *scores); - int get_orientation(); - -private: - OSResults *osr_; - const std::vector *allowed_scripts_; -}; - -class ScriptDetector { -public: - ScriptDetector(const std::vector *allowed_scripts, OSResults *osr, - tesseract::Tesseract *tess); - void detect_blob(BLOB_CHOICE_LIST *scores); - bool must_stop(int orientation) const; - -private: - OSResults *osr_; - static const char *korean_script_; - static const char *japanese_script_; - static const char *fraktur_script_; - int korean_id_; - int japanese_id_; - int katakana_id_; - int hiragana_id_; - int han_id_; - int hangul_id_; - int latin_id_; - int fraktur_id_; - tesseract::Tesseract *tess_; - const std::vector *allowed_scripts_; -}; - -int orientation_and_script_detection(const char *filename, OSResults *, - tesseract::Tesseract *); - -int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, - tesseract::Tesseract *tess); - -int os_detect_blobs(const std::vector *allowed_scripts, - BLOBNBOX_CLIST *blob_list, OSResults *osr, - tesseract::Tesseract *tess); - -bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, - OSResults *, tesseract::Tesseract *tess); - -// Helper method to convert an orientation index to its value in degrees. -// The value represents the amount of clockwise rotation in degrees that must be -// applied for the text to be upright (readable). -TESS_API int OrientationIdToValue(const int &id); - -} // namespace tesseract - -#endif // TESSERACT_CCMAIN_OSDETECT_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/pageiterator.h deleted file mode 100644 index 68739715..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/pageiterator.h +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: pageiterator.h -// Description: Iterator for tesseract page structure that avoids using -// tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ -#define TESSERACT_CCMAIN_PAGEITERATOR_H_ - -#include "export.h" -#include "publictypes.h" - -struct Pix; -struct Pta; - -namespace tesseract { - -struct BlamerBundle; -class C_BLOB_IT; -class PAGE_RES; -class PAGE_RES_IT; -class WERD; - -class Tesseract; - -/** - * Class to iterate over tesseract page structure, providing access to all - * levels of the page hierarchy, without including any tesseract headers or - * having to handle any tesseract structures. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - * See tesseract/publictypes.h for the definition of PageIteratorLevel. - * See also ResultIterator, derived from PageIterator, which adds in the - * ability to access OCR output with text-specific methods. - */ - -class TESS_API PageIterator { -public: - /** - * page_res and tesseract come directly from the BaseAPI. - * The rectangle parameters are copied indirectly from the Thresholder, - * via the BaseAPI. They represent the coordinates of some rectangle in an - * original image (in top-left-origin coordinates) and therefore the top-left - * needs to be added to any output boxes in order to specify coordinates - * in the original image. See TessBaseAPI::SetRectangle. - * The scale and scaled_yres are in case the Thresholder scaled the image - * rectangle prior to thresholding. Any coordinates in tesseract's image - * must be divided by scale before adding (rect_left, rect_top). - * The scaled_yres indicates the effective resolution of the binary image - * that tesseract has been given by the Thresholder. - * After the constructor, Begin has already been called. - */ - PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, int rect_width, - int rect_height); - virtual ~PageIterator(); - - /** - * Page/ResultIterators may be copied! This makes it possible to iterate over - * all the objects at a lower level, while maintaining an iterator to - * objects at a higher level. These constructors DO NOT CALL Begin, so - * iterations will continue from the location of src. - */ - PageIterator(const PageIterator &src); - const PageIterator &operator=(const PageIterator &src); - - /** Are we positioned at the same location as other? */ - bool PositionedAtSameWord(const PAGE_RES_IT *other) const; - - // ============= Moving around within the page ============. - - /** - * Moves the iterator to point to the start of the page to begin an - * iteration. - */ - virtual void Begin(); - - /** - * Moves the iterator to the beginning of the paragraph. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word on the first row of the paragraph. - */ - virtual void RestartParagraph(); - - /** - * Return whether this iterator points anywhere in the first textline of a - * paragraph. - */ - bool IsWithinFirstTextlineOfParagraph() const; - - /** - * Moves the iterator to the beginning of the text line. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word of the row. - */ - virtual void RestartRow(); - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy, and returns false if the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - virtual bool Next(PageIteratorLevel level); - - /** - * Returns true if the iterator is at the start of an object at the given - * level. - * - * For instance, suppose an iterator it is pointed to the first symbol of the - * first word of the third line of the second paragraph of the first block in - * a page, then: - * it.IsAtBeginningOf(RIL_BLOCK) = false - * it.IsAtBeginningOf(RIL_PARA) = false - * it.IsAtBeginningOf(RIL_TEXTLINE) = true - * it.IsAtBeginningOf(RIL_WORD) = true - * it.IsAtBeginningOf(RIL_SYMBOL) = true - */ - virtual bool IsAtBeginningOf(PageIteratorLevel level) const; - - /** - * Returns whether the iterator is positioned at the last element in a - * given level. (e.g. the last word in a line, the last line in a block) - * - * Here's some two-paragraph example - * text. It starts off innocuously - * enough but quickly turns bizarre. - * The author inserts a cornucopia - * of words to guard against confused - * references. - * - * Now take an iterator it pointed to the start of "bizarre." - * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false - * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true - * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false - */ - virtual bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const; - - /** - * Returns whether this iterator is positioned - * before other: -1 - * equal to other: 0 - * after other: 1 - */ - int Cmp(const PageIterator &other) const; - - // ============= Accessing data ==============. - // Coordinate system: - // Integer coordinates are at the cracks between the pixels. - // The top-left corner of the top-left pixel in the image is at (0,0). - // The bottom-right corner of the bottom-right pixel in the image is at - // (width, height). - // Every bounding box goes from the top-left of the top-left contained - // pixel to the bottom-right of the bottom-right contained pixel, so - // the bounding box of the single top-left pixel in the image is: - // (0,0)->(1,1). - // If an image rectangle has been set in the API, then returned coordinates - // relate to the original (full) image, rather than the rectangle. - - /** - * Controls what to include in a bounding box. Bounding boxes of all levels - * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. - * Between layout analysis and recognition, it isn't known where all - * diacritics belong, so this control is used to include or exclude some - * diacritics that are above or below the main body of the word. In most cases - * where the placement is obvious, and after recognition, it doesn't make as - * much difference, as the diacritics will already be included in the word. - */ - void SetBoundingBoxComponents(bool include_upper_dots, - bool include_lower_dots) { - include_upper_dots_ = include_upper_dots; - include_lower_dots_ = include_lower_dots; - } - - /** - * Returns the bounding rectangle of the current object at the given level. - * See comment on coordinate system above. - * Returns false if there is no such object at the current position. - * The returned bounding box is guaranteed to match the size and position - * of the image returned by GetBinaryImage, but may clip foreground pixels - * from a grey image. The padding argument to GetImage can be used to expand - * the image to include more foreground pixels. See GetImage below. - */ - bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, - int *bottom) const; - bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, - int *right, int *bottom) const; - /** - * Returns the bounding rectangle of the object in a coordinate system of the - * working image rectangle having its origin at (rect_left_, rect_top_) with - * respect to the original image and is scaled by a factor scale_. - */ - bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, - int *right, int *bottom) const; - - /** Returns whether there is no object of a given level. */ - bool Empty(PageIteratorLevel level) const; - - /** - * Returns the type of the current block. - * See tesseract/publictypes.h for PolyBlockType. - */ - PolyBlockType BlockType() const; - - /** - * Returns the polygon outline of the current block. The returned Pta must - * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices - * of the polygon, and the last edge is the line segment between the last - * point and the first point. nullptr will be returned if the iterator is - * at the end of the document or layout analysis was not used. - */ - Pta *BlockPolygon() const; - - /** - * Returns a binary image of the current object at the given level. - * The position and size match the return from BoundingBoxInternal, and so - * this could be upscaled with respect to the original input image. - * Use pixDestroy to delete the image after use. - */ - Pix *GetBinaryImage(PageIteratorLevel level) const; - - /** - * Returns an image of the current object at the given level in greyscale - * if available in the input. To guarantee a binary image use BinaryImage. - * NOTE that in order to give the best possible image, the bounds are - * expanded slightly over the binary connected component, by the supplied - * padding, so the top-left position of the returned image is returned - * in (left,top). These will most likely not match the coordinates - * returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. - * Use pixDestroy to delete the image after use. - */ - Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, - int *left, int *top) const; - - /** - * Returns the baseline of the current object at the given level. - * The baseline is the line that passes through (x1, y1) and (x2, y2). - * WARNING: with vertical text, baselines may be vertical! - * Returns false if there is no baseline at the current position. - */ - bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, - int *y2) const; - - // Returns the attributes of the current row. - void RowAttributes(float *row_height, float *descenders, - float *ascenders) const; - - /** - * Returns orientation for the block the iterator points to. - * orientation, writing_direction, textline_order: see publictypes.h - * deskew_angle: after rotating the block so the text orientation is - * upright, how many radians does one have to rotate the - * block anti-clockwise for it to be level? - * -Pi/4 <= deskew_angle <= Pi/4 - */ - void Orientation(tesseract::Orientation *orientation, - tesseract::WritingDirection *writing_direction, - tesseract::TextlineOrder *textline_order, - float *deskew_angle) const; - - /** - * Returns information about the current paragraph, if available. - * - * justification - - * LEFT if ragged right, or fully justified and script is left-to-right. - * RIGHT if ragged left, or fully justified and script is right-to-left. - * unknown if it looks like source code or we have very few lines. - * is_list_item - - * true if we believe this is a member of an ordered or unordered list. - * is_crown - - * true if the first line of the paragraph is aligned with the other - * lines of the paragraph even though subsequent paragraphs have first - * line indents. This typically indicates that this is the continuation - * of a previous paragraph or that it is the very first paragraph in - * the chapter. - * first_line_indent - - * For LEFT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the left edge of the - * rest of the paragraph. - * for RIGHT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the right edge of the - * rest of the paragraph. - * NOTE 1: This value may be negative. - * NOTE 2: if *is_crown == true, the first line of this paragraph is - * actually flush, and first_line_indent is set to the "common" - * first_line_indent for subsequent paragraphs in this block - * of text. - */ - void ParagraphInfo(tesseract::ParagraphJustification *justification, - bool *is_list_item, bool *is_crown, - int *first_line_indent) const; - - // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle - // of the current word to the given pointer (takes ownership of the pointer) - // and returns true. - // Can only be used when iterating on the word level. - bool SetWordBlamerBundle(BlamerBundle *blamer_bundle); - -protected: - /** - * Sets up the internal data for iterating the blobs of a new word, then - * moves the iterator to the given offset. - */ - void BeginWord(int offset); - - /** Pointer to the page_res owned by the API. */ - PAGE_RES *page_res_; - /** Pointer to the Tesseract object owned by the API. */ - Tesseract *tesseract_; - /** - * The iterator to the page_res_. Owned by this ResultIterator. - * A pointer just to avoid dragging in Tesseract includes. - */ - PAGE_RES_IT *it_; - /** - * The current input WERD being iterated. If there is an output from OCR, - * then word_ is nullptr. Owned by the API - */ - WERD *word_; - /** The length of the current word_. */ - int word_length_; - /** The current blob index within the word. */ - int blob_index_; - /** - * Iterator to the blobs within the word. If nullptr, then we are iterating - * OCR results in the box_word. - * Owned by this ResultIterator. - */ - C_BLOB_IT *cblob_it_; - /** Control over what to include in bounding boxes. */ - bool include_upper_dots_; - bool include_lower_dots_; - /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ - int scale_; - int scaled_yres_; - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/publictypes.h deleted file mode 100644 index 0069cf28..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/publictypes.h +++ /dev/null @@ -1,281 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: publictypes.h -// Description: Types used in both the API and internally -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_ -#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_ - -namespace tesseract { - -// This file contains types that are used both by the API and internally -// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic -// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT. -// Restated: It is OK for low-level Tesseract files to include publictypes.h, -// but not for the low-level tesseract code to include top-level API code. -// This file should not use other Tesseract types, as that would drag -// their includes into the API-level. - -/** Number of printers' points in an inch. The unit of the pointsize return. */ -constexpr int kPointsPerInch = 72; -/** - * Minimum believable resolution. Used as a default if there is no other - * information, as it is safer to under-estimate than over-estimate. - */ -constexpr int kMinCredibleResolution = 70; -/** Maximum believable resolution. */ -constexpr int kMaxCredibleResolution = 2400; -/** - * Ratio between median blob size and likely resolution. Used to estimate - * resolution when none is provided. This is basically 1/usual text size in - * inches. */ -constexpr int kResolutionEstimationFactor = 10; - -/** - * Possible types for a POLY_BLOCK or ColPartition. - * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions - * below, as well as kPolyBlockNames in layout_test.cc. - * Used extensively by ColPartition, and POLY_BLOCK. - */ -enum PolyBlockType { - PT_UNKNOWN, // Type is not yet known. Keep as the first element. - PT_FLOWING_TEXT, // Text that lives inside a column. - PT_HEADING_TEXT, // Text that spans more than one column. - PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region. - PT_EQUATION, // Partition belonging to an equation region. - PT_INLINE_EQUATION, // Partition has inline equation. - PT_TABLE, // Partition belonging to a table region. - PT_VERTICAL_TEXT, // Text-line runs vertically. - PT_CAPTION_TEXT, // Text that belongs to an image. - PT_FLOWING_IMAGE, // Image that lives inside a column. - PT_HEADING_IMAGE, // Image that spans more than one column. - PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region. - PT_HORZ_LINE, // Horizontal Line. - PT_VERT_LINE, // Vertical Line. - PT_NOISE, // Lies outside of any column. - PT_COUNT -}; - -/** Returns true if PolyBlockType is of horizontal line type */ -inline bool PTIsLineType(PolyBlockType type) { - return type == PT_HORZ_LINE || type == PT_VERT_LINE; -} -/** Returns true if PolyBlockType is of image type */ -inline bool PTIsImageType(PolyBlockType type) { - return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE || - type == PT_PULLOUT_IMAGE; -} -/** Returns true if PolyBlockType is of text type */ -inline bool PTIsTextType(PolyBlockType type) { - return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT || - type == PT_PULLOUT_TEXT || type == PT_TABLE || - type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT || - type == PT_INLINE_EQUATION; -} -// Returns true if PolyBlockType is of pullout(inter-column) type -inline bool PTIsPulloutType(PolyBlockType type) { - return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT; -} - -/** - * +------------------+ Orientation Example: - * | 1 Aaaa Aaaa Aaaa | ==================== - * | Aaa aa aaa aa | To left is a diagram of some (1) English and - * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit. - * | 2 | - * | ####### c c C | Upright Latin characters are represented as A and a. - * | ####### c c c | '<' represents a latin character rotated - * | < ####### c c c | anti-clockwise 90 degrees. - * | < ####### c c | - * | < ####### . c | Upright Chinese characters are represented C and c. - * | 3 ####### c | - * +------------------+ NOTA BENE: enum values here should match goodoc.proto - - * If you orient your head so that "up" aligns with Orientation, - * then the characters will appear "right side up" and readable. - * - * In the example above, both the English and Chinese paragraphs are oriented - * so their "up" is the top of the page (page up). The photo credit is read - * with one's head turned leftward ("up" is to page left). - * - * The values of this enum match the convention of Tesseract's osdetect.h -*/ -enum Orientation { - ORIENTATION_PAGE_UP = 0, - ORIENTATION_PAGE_RIGHT = 1, - ORIENTATION_PAGE_DOWN = 2, - ORIENTATION_PAGE_LEFT = 3, -}; - -/** - * The grapheme clusters within a line of text are laid out logically - * in this direction, judged when looking at the text line rotated so that - * its Orientation is "page up". - * - * For English text, the writing direction is left-to-right. For the - * Chinese text in the above example, the writing direction is top-to-bottom. - */ -enum WritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT = 0, - WRITING_DIRECTION_RIGHT_TO_LEFT = 1, - WRITING_DIRECTION_TOP_TO_BOTTOM = 2, -}; - -/** - * The text lines are read in the given sequence. - * - * In English, the order is top-to-bottom. - * In Chinese, vertical text lines are read right-to-left. Mongolian is - * written in vertical columns top to bottom like Chinese, but the lines - * order left-to right. - * - * Note that only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM - */ -enum TextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, - TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, - TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, -}; - -/** - * Possible modes for page layout analysis. These *must* be kept in order - * of decreasing amount of layout analysis to be done, except for OSD_ONLY, - * so that the inequality test macros below work. - */ -enum PageSegMode { - PSM_OSD_ONLY = 0, ///< Orientation and script detection only. - PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and - ///< script detection. (OSD) - PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR. - PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD. - PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes. - PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of - ///< vertically aligned text. - PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.) - PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line. - PSM_SINGLE_WORD = 8, ///< Treat the image as a single word. - PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle. - PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character. - PSM_SPARSE_TEXT = - 11, ///< Find as much text as possible in no particular order. - PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det. - PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing - ///< hacks that are Tesseract-specific. - - PSM_COUNT ///< Number of enum entries. -}; - -/** - * Inline functions that act on a PageSegMode to determine whether components of - * layout analysis are enabled. - * *Depend critically on the order of elements of PageSegMode.* - * NOTE that arg is an int for compatibility with INT_PARAM. - */ -inline bool PSM_OSD_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO; -} -inline bool PSM_SPARSE(int pageseg_mode) { - return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN; -} -inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK; -} -inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) { - return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) || - pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} - -/** - * enum of the elements of the page hierarchy, used in ResultIterator - * to provide functions that operate on each level without having to - * have 5x as many functions. - */ -enum PageIteratorLevel { - RIL_BLOCK, // Block of text/image/separator line. - RIL_PARA, // Paragraph within a block. - RIL_TEXTLINE, // Line within a paragraph. - RIL_WORD, // Word within a textline. - RIL_SYMBOL // Symbol/character within a word. -}; - -/** - * JUSTIFICATION_UNKNOWN - * The alignment is not clearly one of the other options. This could happen - * for example if there are only one or two lines of text or the text looks - * like source code or poetry. - * - * NOTA BENE: Fully justified paragraphs (text aligned to both left and right - * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text - * is written with a left-to-right script and with JUSTIFICATION_RIGHT if - * their text is written in a right-to-left script. - * - * Interpretation for text read in vertical lines: - * "Left" is wherever the starting reading position is. - * - * JUSTIFICATION_LEFT - * Each line, except possibly the first, is flush to the same left tab stop. - * - * JUSTIFICATION_CENTER - * The text lines of the paragraph are centered about a line going - * down through their middle of the text lines. - * - * JUSTIFICATION_RIGHT - * Each line, except possibly the first, is flush to the same right tab stop. - */ -enum ParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT, -}; - -/** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the combiner. - * The preference of which engine to use is stored in tessedit_ocr_engine_mode. - * - * ATTENTION: When modifying this enum, please make sure to make the - * appropriate changes to all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ -enum OcrEngineMode { - OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated - OEM_LSTM_ONLY, // Run just the LSTM line recognizer. - OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback - // to Tesseract when things get difficult. - // deprecated - OEM_DEFAULT, // Specify this mode when calling init_*(), - // to indicate that any of the above modes - // should be automatically inferred from the - // variables in the language-specific config, - // command-line configs, or if not specified - // in any of the above should be set to the - // default OEM_TESSERACT_ONLY. - OEM_COUNT // Number of OEMs -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/renderer.h deleted file mode 100644 index 6f405233..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/renderer.h +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: renderer.h -// Description: Rendering interface to inject into TessBaseAPI -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_RENDERER_H_ -#define TESSERACT_API_RENDERER_H_ - -#include "export.h" - -// To avoid collision with other typenames include the ABSOLUTE MINIMUM -// complexity of includes here. Use forward declarations wherever possible -// and hide includes of complex types in baseapi.cpp. -#include -#include // for std::string -#include // for std::vector - -struct Pix; - -namespace tesseract { - -class TessBaseAPI; - -/** - * Interface for rendering tesseract results into a document, such as text, - * HOCR or pdf. This class is abstract. Specific classes handle individual - * formats. This interface is then used to inject the renderer class into - * tesseract when processing images. - * - * For simplicity implementing this with tesseract version 3.01, - * the renderer contains document state that is cleared from document - * to document just as the TessBaseAPI is. This way the base API can just - * delegate its rendering functionality to injected renderers, and the - * renderers can manage the associated state needed for the specific formats - * in addition to the heuristics for producing it. - */ -class TESS_API TessResultRenderer { -public: - virtual ~TessResultRenderer(); - - // Takes ownership of pointer so must be new'd instance. - // Renderers aren't ordered, but appends the sequences of next parameter - // and existing next(). The renderers should be unique across both lists. - void insert(TessResultRenderer *next); - - // Returns the next renderer or nullptr. - TessResultRenderer *next() { - return next_; - } - - /** - * Starts a new document with the given title. - * This clears the contents of the output data. - * Title should use UTF-8 encoding. - */ - bool BeginDocument(const char *title); - - /** - * Adds the recognized text from the source image to the current document. - * Invalid if BeginDocument not yet called. - * - * Note that this API is a bit weird but is designed to fit into the - * current TessBaseAPI implementation where the api has lots of state - * information that we might want to add in. - */ - bool AddImage(TessBaseAPI *api); - - /** - * Finishes the document and finalizes the output data - * Invalid if BeginDocument not yet called. - */ - bool EndDocument(); - - const char *file_extension() const { - return file_extension_; - } - const char *title() const { - return title_.c_str(); - } - - // Is everything fine? Otherwise something went wrong. - bool happy() const { - return happy_; - } - - /** - * Returns the index of the last image given to AddImage - * (i.e. images are incremented whether the image succeeded or not) - * - * This is always defined. It means either the number of the - * current image, the last image ended, or in the completed document - * depending on when in the document lifecycle you are looking at it. - * Will return -1 if a document was never started. - */ - int imagenum() const { - return imagenum_; - } - -protected: - /** - * Called by concrete classes. - * - * outputbase is the name of the output file excluding - * extension. For example, "/path/to/chocolate-chip-cookie-recipe" - * - * extension indicates the file extension to be used for output - * files. For example "pdf" will produce a .pdf file, and "hocr" - * will produce .hocr files. - */ - TessResultRenderer(const char *outputbase, const char *extension); - - // Hook for specialized handling in BeginDocument() - virtual bool BeginDocumentHandler(); - - // This must be overridden to render the OCR'd results - virtual bool AddImageHandler(TessBaseAPI *api) = 0; - - // Hook for specialized handling in EndDocument() - virtual bool EndDocumentHandler(); - - // Renderers can call this to append '\0' terminated strings into - // the output string returned by GetOutput. - // This method will grow the output buffer if needed. - void AppendString(const char *s); - - // Renderers can call this to append binary byte sequences into - // the output string returned by GetOutput. Note that s is not necessarily - // '\0' terminated (and can contain '\0' within it). - // This method will grow the output buffer if needed. - void AppendData(const char *s, int len); - -private: - TessResultRenderer *next_; // Can link multiple renderers together - FILE *fout_; // output file pointer - const char *file_extension_; // standard extension for generated output - std::string title_; // title of document being rendered - int imagenum_; // index of last image added - bool happy_; // I get grumpy when the disk fills up, etc. -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessTextRenderer : public TessResultRenderer { -public: - explicit TessTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into an hocr text string - */ -class TESS_API TessHOcrRenderer : public TessResultRenderer { -public: - explicit TessHOcrRenderer(const char *outputbase, bool font_info); - explicit TessHOcrRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into an alto text string - */ -class TESS_API TessAltoRenderer : public TessResultRenderer { -public: - explicit TessAltoRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool begin_document; -}; - -/** - * Renders Tesseract output into a TSV string - */ -class TESS_API TessTsvRenderer : public TessResultRenderer { -public: - explicit TessTsvRenderer(const char *outputbase, bool font_info); - explicit TessTsvRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into searchable PDF - */ -class TESS_API TessPDFRenderer : public TessResultRenderer { -public: - // datadir is the location of the TESSDATA. We need it because - // we load a custom PDF font from this location. - TessPDFRenderer(const char *outputbase, const char *datadir, - bool textonly = false); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - // We don't want to have every image in memory at once, - // so we store some metadata as we go along producing - // PDFs one page at a time. At the end, that metadata is - // used to make everything that isn't easily handled in a - // streaming fashion. - long int obj_; // counter for PDF objects - std::vector offsets_; // offset of every PDF object in bytes - std::vector pages_; // object number for every /Page object - std::string datadir_; // where to find the custom font - bool textonly_; // skip images if set - // Bookkeeping only. DIY = Do It Yourself. - void AppendPDFObjectDIY(size_t objectsize); - // Bookkeeping + emit data. - void AppendPDFObject(const char *data); - // Create the /Contents object for an entire page. - char *GetPDFTextObjects(TessBaseAPI *api, double width, double height); - // Turn an image into a PDF object. Only transcode if we have to. - static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum, - char **pdf_object, long int *pdf_object_size, - int jpg_quality); -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessUnlvRenderer : public TessResultRenderer { -public: - explicit TessUnlvRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string for LSTMBox - */ -class TESS_API TessLSTMBoxRenderer : public TessResultRenderer { -public: - explicit TessLSTMBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessBoxTextRenderer : public TessResultRenderer { -public: - explicit TessBoxTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string in WordStr format - */ -class TESS_API TessWordStrBoxRenderer : public TessResultRenderer { -public: - explicit TessWordStrBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#ifndef DISABLED_LEGACY_ENGINE - -/** - * Renders tesseract output into an osd text string - */ -class TESS_API TessOsdRenderer : public TessResultRenderer { -public: - explicit TessOsdRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#endif // ndef DISABLED_LEGACY_ENGINE - -} // namespace tesseract. - -#endif // TESSERACT_API_RENDERER_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/resultiterator.h deleted file mode 100644 index 3e4d5807..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/resultiterator.h +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: resultiterator.h -// Description: Iterator for tesseract results that is capable of -// iterating in proper reading order over Bi Directional -// (e.g. mixed Hebrew and English) text. -// Author: David Eger -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API, TESS_LOCAL -#include "ltrresultiterator.h" // for LTRResultIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -#include // for std::pair -#include // for std::vector - -namespace tesseract { - -class TESS_API ResultIterator : public LTRResultIterator { -public: - static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); - - /** - * ResultIterator is copy constructible! - * The default copy constructor works just fine for us. - */ - ~ResultIterator() override = default; - - // ============= Moving around within the page ============. - /** - * Moves the iterator to point to the start of the page to begin - * an iteration. - */ - void Begin() override; - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy in the appropriate reading order and returns false if - * the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - bool Next(PageIteratorLevel level) override; - - /** - * IsAtBeginningOf() returns whether we're at the logical beginning of the - * given level. (as opposed to ResultIterator's left-to-right top-to-bottom - * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). - * For a full description, see pageiterator.h - */ - bool IsAtBeginningOf(PageIteratorLevel level) const override; - - /** - * Implement PageIterator's IsAtFinalElement correctly in a BiDi context. - * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we - * point at the last word in a paragraph. See PageIterator for full comment. - */ - bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const override; - - // ============= Functions that refer to words only ============. - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // ============= Accessing data ==============. - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - */ - virtual char *GetUTF8Text(PageIteratorLevel level) const; - - /** - * Returns the LSTM choices for every LSTM timestep for the current word. - */ - virtual std::vector>>> - *GetRawLSTMTimesteps() const; - virtual std::vector>> - *GetBestLSTMSymbolChoices() const; - - /** - * Return whether the current paragraph's dominant reading direction - * is left-to-right (as opposed to right-to-left). - */ - bool ParagraphIsLtr() const; - - // ============= Exposed only for testing =============. - - /** - * Yields the reading order as a sequence of indices and (optional) - * meta-marks for a set of words (given left-to-right). - * The meta marks are passed as negative values: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The next indexed word contains both left-to-right and - * right-to-left characters and was treated as neutral. - * - * For example, suppose we have five words in a text line, - * indexed [0,1,2,3,4] from the leftmost side of the text line. - * The following are all believable reading_orders: - * - * Left-to-Right (in ltr paragraph): - * { 0, 1, 2, 3, 4 } - * Left-to-Right (in rtl paragraph): - * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } - * Right-to-Left (in rtl paragraph): - * { 4, 3, 2, 1, 0 } - * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: - * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 } - */ - static void CalculateTextlineOrder( - bool paragraph_is_ltr, - const std::vector &word_dirs, - std::vector *reading_order); - - static const int kMinorRunStart; - static const int kMinorRunEnd; - static const int kComplexWord; - -protected: - /** - * We presume the data associated with the given iterator will outlive us. - * NB: This is private because it does something that is non-obvious: - * it resets to the beginning of the paragraph instead of staying wherever - * resit might have pointed. - */ - explicit ResultIterator(const LTRResultIterator &resit); - -private: - /** - * Calculates the current paragraph's dominant writing direction. - * Typically, members should use current_paragraph_ltr_ instead. - */ - bool CurrentParagraphIsLtr() const; - - /** - * Returns word indices as measured from resit->RestartRow() = index 0 - * for the reading order of words within a textline given an iterator - * into the middle of the text line. - * In addition to non-negative word indices, the following negative values - * may be inserted: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The previous word contains both left-to-right and - * right-to-left characters and was treated as neutral. - */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *indices) const; - /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *ssd, - std::vector *indices) const; - - /** - * What is the index of the current word in a strict left-to-right reading - * of the row? - */ - int LTRWordIndex() const; - - /** - * Given an iterator pointing at a word, returns the logical reading order - * of blob indices for the word. - */ - void CalculateBlobOrder(std::vector *blob_indices) const; - - /** Precondition: current_paragraph_is_ltr_ is set. */ - void MoveToLogicalStartOfTextline(); - - /** - * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ - * are set. - */ - void MoveToLogicalStartOfWord(); - - /** Are we pointing at the final (reading order) symbol of the word? */ - bool IsAtFinalSymbolOfWord() const; - - /** Are we pointing at the first (reading order) symbol of the word? */ - bool IsAtFirstSymbolOfWord() const; - - /** - * Append any extra marks that should be appended to this word when printed. - * Mostly, these are Unicode BiDi control characters. - */ - void AppendSuffixMarks(std::string *text) const; - - /** Appends the current word in reading order to the given buffer.*/ - void AppendUTF8WordText(std::string *text) const; - - /** - * Appends the text of the current text line, *assuming this iterator is - * positioned at the beginning of the text line* This function - * updates the iterator to point to the first position past the text line. - * Each textline is terminated in a single newline character. - * If the textline ends a paragraph, it gets a second terminal newline. - */ - void IterateAndAppendUTF8TextlineText(std::string *text); - - /** - * Appends the text of the current paragraph in reading order - * to the given buffer. - * Each textline is terminated in a single newline character, and the - * paragraph gets an extra newline at the end. - */ - void AppendUTF8ParagraphText(std::string *text) const; - - /** Returns whether the bidi_debug flag is set to at least min_level. */ - bool BidiDebug(int min_level) const; - - bool current_paragraph_is_ltr_; - - /** - * Is the currently pointed-at character at the beginning of - * a minor-direction run? - */ - bool at_beginning_of_minor_run_; - - /** Is the currently pointed-at character in a minor-direction sequence? */ - bool in_minor_direction_; - - /** - * Should detected inter-word spaces be preserved, or "compressed" to a single - * space character (default behavior). - */ - bool preserve_interword_spaces_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/unichar.h deleted file mode 100644 index 015109d7..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/unichar.h +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: unichar.h -// Description: Unicode character/ligature class. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCUTIL_UNICHAR_H_ -#define TESSERACT_CCUTIL_UNICHAR_H_ - -#include "export.h" - -#include -#include -#include -#include - -namespace tesseract { - -// Maximum number of characters that can be stored in a UNICHAR. Must be -// at least 4. Must not exceed 31 without changing the coding of length. -#define UNICHAR_LEN 30 - -// A UNICHAR_ID is the unique id of a unichar. -using UNICHAR_ID = int; - -// A variable to indicate an invalid or uninitialized unichar id. -static const int INVALID_UNICHAR_ID = -1; -// A special unichar that corresponds to INVALID_UNICHAR_ID. -static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__"; - -enum StrongScriptDirection { - DIR_NEUTRAL = 0, // Text contains only neutral characters. - DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters. - DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters. - DIR_MIX = 3, // Text contains a mixture of left-to-right - // and right-to-left characters. -}; - -using char32 = signed int; - -// The UNICHAR class holds a single classification result. This may be -// a single Unicode character (stored as between 1 and 4 utf8 bytes) or -// multiple Unicode characters representing the NFKC expansion of a ligature -// such as fi, ffl etc. These are also stored as utf8. -class TESS_API UNICHAR { -public: - UNICHAR() { - memset(chars, 0, UNICHAR_LEN); - } - - // Construct from a utf8 string. If len<0 then the string is null terminated. - // If the string is too long to fit in the UNICHAR then it takes only what - // will fit. - UNICHAR(const char *utf8_str, int len); - - // Construct from a single UCS4 character. - explicit UNICHAR(int unicode); - - // Default copy constructor and operator= are OK. - - // Get the first character as UCS-4. - int first_uni() const; - - // Get the length of the UTF8 string. - int utf8_len() const { - int len = chars[UNICHAR_LEN - 1]; - return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; - } - - // Get a UTF8 string, but NOT nullptr terminated. - const char *utf8() const { - return chars; - } - - // Get a terminated UTF8 string: Must delete[] it after use. - char *utf8_str() const; - - // Get the number of bytes in the first character of the given utf8 string. - static int utf8_step(const char *utf8_str); - - // A class to simplify iterating over and accessing elements of a UTF8 - // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or - // take ownership of the underlying byte array. It also does not permit - // modification of the array (as the name suggests). - // - // Example: - // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len); - // it != UNICHAR::end(str, len); - // ++it) { - // printf("UCS-4 symbol code = %d\n", *it); - // char buf[5]; - // int char_len = it.get_utf8(buf); buf[char_len] = '\0'; - // printf("Char = %s\n", buf); - // } - class TESS_API const_iterator { - using CI = const_iterator; - - public: - // Step to the next UTF8 character. - // If the current position is at an illegal UTF8 character, then print an - // error message and step by one byte. If the current position is at a - // nullptr value, don't step past it. - const_iterator &operator++(); - - // Return the UCS-4 value at the current position. - // If the current position is at an illegal UTF8 value, return a single - // space character. - int operator*() const; - - // Store the UTF-8 encoding of the current codepoint into buf, which must be - // at least 4 bytes long. Return the number of bytes written. - // If the current position is at an illegal UTF8 value, writes a single - // space character and returns 1. - // Note that this method does not null-terminate the buffer. - int get_utf8(char *buf) const; - // Returns the number of bytes of the current codepoint. Returns 1 if the - // current position is at an illegal UTF8 value. - int utf8_len() const; - // Returns true if the UTF-8 encoding at the current position is legal. - bool is_legal() const; - - // Return the pointer into the string at the current position. - const char *utf8_data() const { - return it_; - } - - // Iterator equality operators. - friend bool operator==(const CI &lhs, const CI &rhs) { - return lhs.it_ == rhs.it_; - } - friend bool operator!=(const CI &lhs, const CI &rhs) { - return !(lhs == rhs); - } - - private: - friend class UNICHAR; - explicit const_iterator(const char *it) : it_(it) {} - - const char *it_; // Pointer into the string. - }; - - // Create a start/end iterator pointing to a string. Note that these methods - // are static and do NOT create a copy or take ownership of the underlying - // array. - static const_iterator begin(const char *utf8_str, int byte_length); - static const_iterator end(const char *utf8_str, int byte_length); - - // Converts a utf-8 string to a vector of unicodes. - // Returns an empty vector if the input contains invalid UTF-8. - static std::vector UTF8ToUTF32(const char *utf8_str); - // Converts a vector of unicodes to a utf8 string. - // Returns an empty string if the input contains an invalid unicode. - static std::string UTF32ToUTF8(const std::vector &str32); - -private: - // A UTF-8 representation of 1 or more Unicode characters. - // The last element (chars[UNICHAR_LEN - 1]) is a length if - // its value < UNICHAR_LEN, otherwise it is a genuine character. - char chars[UNICHAR_LEN]{}; -}; - -} // namespace tesseract - -#endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/version.h deleted file mode 100644 index 6bac5d66..00000000 --- a/third_party/ocr/tesseract-ocr/uos/loongarch64/include/tesseract/version.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: version.h -// Description: Version information -// -// (C) Copyright 2018, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_VERSION_H_ -#define TESSERACT_API_VERSION_H_ - -// clang-format off - -#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ -#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ -#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ - -#define TESSERACT_VERSION \ - (TESSERACT_MAJOR_VERSION << 16 | \ - TESSERACT_MINOR_VERSION << 8 | \ - TESSERACT_MICRO_VERSION) - -#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" - -// clang-format on - -#endif // TESSERACT_API_VERSION_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/baseapi.h deleted file mode 100644 index 5e1e4830..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/baseapi.h +++ /dev/null @@ -1,812 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: baseapi.h -// Description: Simple API for calling tesseract. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_BASEAPI_H_ -#define TESSERACT_API_BASEAPI_H_ - -#ifdef HAVE_CONFIG_H -# include "config_auto.h" // DISABLED_LEGACY_ENGINE -#endif - -#include "export.h" -#include "pageiterator.h" -#include "publictypes.h" -#include "resultiterator.h" -#include "unichar.h" - -#include "version.h" - -#include -#include // for std::vector - -struct Pix; -struct Pixa; -struct Boxa; - -namespace tesseract { - -class PAGE_RES; -class ParagraphModel; -class BLOCK_LIST; -class ETEXT_DESC; -struct OSResults; -class UNICHARSET; - -class Dawg; -class Dict; -class EquationDetect; -class PageIterator; -class ImageThresholder; -class LTRResultIterator; -class ResultIterator; -class MutableIterator; -class TessResultRenderer; -class Tesseract; - -// Function to read a std::vector from a whole file. -// Returns false on failure. -using FileReader = bool (*)(const char *filename, std::vector *data); - -using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, - bool) const; -using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, - int, const char *, int); - -/** - * Base class for all tesseract APIs. - * Specific classes can add ability to work on different inputs or produce - * different outputs. - * This class is mostly an interface layer on top of the Tesseract instance - * class to hide the data types so that users of this class don't have to - * include any other Tesseract headers. - */ -class TESS_API TessBaseAPI { -public: - TessBaseAPI(); - virtual ~TessBaseAPI(); - // Copy constructor and assignment operator are currently unsupported. - TessBaseAPI(TessBaseAPI const &) = delete; - TessBaseAPI &operator=(TessBaseAPI const &) = delete; - - /** - * Returns the version identifier as a static string. Do not delete. - */ - static const char *Version(); - - /** - * If compiled with OpenCL AND an available OpenCL - * device is deemed faster than serial code, then - * "device" is populated with the cl_device_id - * and returns sizeof(cl_device_id) - * otherwise *device=nullptr and returns 0. - */ - static size_t getOpenCLDevice(void **device); - - /** - * Set the name of the input file. Needed for training and - * reading a UNLV zone file, and for searchable PDF output. - */ - void SetInputName(const char *name); - /** - * These functions are required for searchable PDF output. - * We need our hands on the input file so that we can include - * it in the PDF without transcoding. If that is not possible, - * we need the original image. Finally, resolution metadata - * is stored in the PDF so we need that as well. - */ - const char *GetInputName(); - // Takes ownership of the input pix. - void SetInputImage(Pix *pix); - Pix *GetInputImage(); - int GetSourceYResolution(); - const char *GetDatapath(); - - /** Set the name of the bonus output files. Needed only for debugging. */ - void SetOutputName(const char *name); - - /** - * Set the value of an internal "parameter." - * Supply the name of the parameter and the value as a string, just as - * you would in a config file. - * Returns false if the name lookup failed. - * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. - * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. - * SetVariable may be used before Init, but settings will revert to - * defaults on End(). - * - * Note: Must be called after Init(). Only works for non-init variables - * (init variables should be passed to Init()). - */ - bool SetVariable(const char *name, const char *value); - bool SetDebugVariable(const char *name, const char *value); - - /** - * Returns true if the parameter was found among Tesseract parameters. - * Fills in value with the value of the parameter. - */ - bool GetIntVariable(const char *name, int *value) const; - bool GetBoolVariable(const char *name, bool *value) const; - bool GetDoubleVariable(const char *name, double *value) const; - - /** - * Returns the pointer to the string that represents the value of the - * parameter if it was found among Tesseract parameters. - */ - const char *GetStringVariable(const char *name) const; - -#ifndef DISABLED_LEGACY_ENGINE - - /** - * Print Tesseract fonts table to the given file. - */ - void PrintFontsTable(FILE *fp) const; - -#endif - - /** - * Print Tesseract parameters to the given file. - */ - void PrintVariables(FILE *fp) const; - - /** - * Get value of named variable as a string, if it exists. - */ - bool GetVariableAsString(const char *name, std::string *val) const; - - /** - * Instances are now mostly thread-safe and totally independent, - * but some global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS: - * you use SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your instances. - * - * Start tesseract. Returns zero on success and -1 on failure. - * NOTE that the only members that may be called before Init are those - * listed above here in the class definition. - * - * The datapath must be the name of the tessdata directory. - * The language is (usually) an ISO 639-3 string or nullptr will default to - * eng. It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, or just - * to reset the classifier. - * The language may be a string of the form [~][+[~]]* indicating - * that multiple languages are to be loaded. Eg hin+eng will load Hindi and - * English. Languages may specify internally that they want to be loaded - * with one or more other languages, so the ~ sign is available to override - * that. Eg if hin were set to load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited only by - * memory, with the caveat that loading additional languages will impact - * both speed and accuracy, as there is more work to do to decide on the - * applicable language, and there is more chance of hallucinating incorrect - * words. - * WARNING: On changing languages, all Tesseract parameters are reset - * back to their default values. (Which may vary between languages.) - * If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should explicitly - * call End() and then use SetVariable before Init. This is only a very - * rare use case, since there are very few uses that require any parameters - * to be set before Init. - * - * If set_only_non_debug_params is true, only params that do not contain - * "debug" in the name will be set. - */ - int Init(const char *datapath, const char *language, OcrEngineMode mode, - char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params); - int Init(const char *datapath, const char *language, OcrEngineMode oem) { - return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); - } - int Init(const char *datapath, const char *language) { - return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, - false); - } - // In-memory version reads the traineddata file directly from the given - // data[data_size] array, and/or reads data via a FileReader. - int Init(const char *data, int data_size, const char *language, - OcrEngineMode mode, char **configs, int configs_size, - const std::vector *vars_vec, - const std::vector *vars_values, - bool set_only_non_debug_params, FileReader reader); - - /** - * Returns the languages string used in the last valid initialization. - * If the last initialization specified "deu+hin" then that will be - * returned. If hin loaded eng automatically as well, then that will - * not be included in this list. To find the languages actually - * loaded use GetLoadedLanguagesAsVector. - * The returned string should NOT be deleted. - */ - const char *GetInitLanguagesAsString() const; - - /** - * Returns the loaded languages in the vector of std::string. - * Includes all languages loaded by the last Init, including those loaded - * as dependencies of other loaded languages. - */ - void GetLoadedLanguagesAsVector(std::vector *langs) const; - - /** - * Returns the available languages in the sorted vector of std::string. - */ - void GetAvailableLanguagesAsVector(std::vector *langs) const; - - /** - * Init only for page layout analysis. Use only for calls to SetImage and - * AnalysePage. Calls that attempt recognition will generate an error. - */ - void InitForAnalysePage(); - - /** - * Read a "config" file containing a set of param, value pairs. - * Searches the standard places: tessdata/configs, tessdata/tessconfigs - * and also accepts a relative or absolute path name. - * Note: only non-init params will be set (init params are set by Init()). - */ - void ReadConfigFile(const char *filename); - /** Same as above, but only set debug params from the given config file. */ - void ReadDebugConfigFile(const char *filename); - - /** - * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. - * The mode is stored as an IntParam so it can also be modified by - * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). - */ - void SetPageSegMode(PageSegMode mode); - - /** Return the current page segmentation mode. */ - PageSegMode GetPageSegMode() const; - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. - * Currently has no error checking. - * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. - * Palette color images will not work properly and must be converted to - * 24 bit. - * Binary images of 1 bit per pixel may also be given but they must be - * byte packed with the MSB of the first byte being the first pixel, and a - * 1 represents WHITE. For binary images set bytes_per_pixel=0. - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * - * Note that TesseractRect is the simplified convenience interface. - * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, - * and one or more of the Get*Text functions below. - */ - char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, int width, - int height); - - /** - * Call between pages or documents etc to free up memory and forget - * adaptive data. - */ - void ClearAdaptiveClassifier(); - - /** - * @defgroup AdvancedAPI Advanced API - * The following methods break TesseractRect into pieces, so you can - * get hold of the thresholded image, get the text in different formats, - * get bounding boxes, confidences etc. - */ - /* @{ */ - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Copies the image buffer and converts to Pix. - * SetImage clears all recognition results, and sets the rectangle to the - * full image, so it may be followed immediately by a GetUTF8Text, and it - * will automatically perform recognition. - */ - void SetImage(const unsigned char *imagedata, int width, int height, - int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract takes its own copy of the image, so it need not persist until - * after Recognize. - * Pix vs raw, which to use? - * Use Pix where possible. Tesseract uses Pix as its internal representation - * and it is therefore more efficient to provide a Pix directly. - */ - void SetImage(Pix *pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after SetImage(). - */ - void SetSourceResolution(int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after SetImage. - * Each SetRectangle clears the recogntion results so multiple rectangles - * can be recognized with the same image. - */ - void SetRectangle(int left, int top, int width, int height); - - /** - * Get a copy of the internal thresholded image from Tesseract. - * Caller takes ownership of the Pix and must pixDestroy it. - * May be called any time after SetImage, or after TesseractRect. - */ - Pix *GetThresholdedImage(); - - /** - * Get the result of page layout analysis as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetRegions(Pixa **pixa); - - /** - * Get the textlines as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If raw_image is true, then extract from the original image instead of the - * thresholded image and pad by raw_padding pixels. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. If paraids is not - * nullptr, the paragraph-id of each line within its block is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - /* - Helper method to extract from the thresholded image. (most common usage) -*/ - Boxa *GetTextlines(Pixa **pixa, int **blockids) { - return GetTextlines(false, 0, pixa, blockids, nullptr); - } - - /** - * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa - * pair, in reading order. Enables downstream handling of non-rectangular - * regions. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each line is also returned as - * an array of one element per line. delete [] after use. - */ - Boxa *GetStrips(Pixa **pixa, int **blockids); - - /** - * Get the words as a leptonica-style - * Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - */ - Boxa *GetWords(Pixa **pixa); - - /** - * Gets the individual connected (text) components (created - * after pages segmentation step, but before recognition) - * as a leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * Note: the caller is responsible for calling boxaDestroy() - * on the returned Boxa array and pixaDestroy() on cc array. - */ - Boxa *GetConnectedComponents(Pixa **cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * leptonica-style Boxa, Pixa pair, in reading order. - * Can be called before or after Recognize. - * If blockids is not nullptr, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. - * If blockids is not nullptr, the paragraph-id of each component with its - * block is also returned as an array of one element per component. delete [] - * after use. If raw_image is true, then portions of the original image are - * extracted instead of the thresholded image and padded with raw_padding. If - * text_only is true, then only text components are returned. - */ - Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, - bool raw_image, int raw_padding, Pixa **pixa, - int **blockids, int **paraids); - // Helper function to get binary images with no padding (most common usage). - Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, - Pixa **pixa, int **blockids) { - return GetComponentImages(level, text_only, false, 0, pixa, blockids, - nullptr); - } - - /** - * Returns the scale factor of the thresholded image that would be returned by - * GetThresholdedImage() and the various GetX() methods that call - * GetComponentImages(). - * Returns 0 if no thresholder has been set. - */ - int GetThresholdedImageScaleFactor() const; - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to just - * the page layout results. Returns an iterator to the results. - * If merge_similar_words is true, words are combined where suitable for use - * with a line recognizer. Use if you want to use AnalyseLayout to find the - * textlines, and then want to process textline fragments with an external - * line recognizer. - * Returns nullptr on error or an empty page. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - PageIterator *AnalyseLayout(); - PageIterator *AnalyseLayout(bool merge_similar_words); - - /** - * Recognize the image from SetAndThresholdImage, generating Tesseract - * internal structures. Returns 0 on success. - * Optional. The Get*Text functions below will call Recognize if needed. - * After Recognize, the output is kept internally until the next SetImage. - */ - int Recognize(ETEXT_DESC *monitor); - - /** - * Methods to retrieve information after SetAndThresholdImage(), - * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) - */ - - /** - * Turns images into symbolic text. - * - * filename can point to a single image, a multi-page TIFF, - * or a plain text list of image filenames. - * - * retry_config is useful for debugging. If not nullptr, you can fall - * back to an alternate configuration if a page fails for some - * reason. - * - * timeout_millisec terminates processing if any single page - * takes too long. Set to 0 for unlimited time. - * - * renderer is responible for creating the output. For example, - * use the TessTextRenderer if you want plaintext output, or - * the TessPDFRender to produce searchable PDF. - * - * If tessedit_page_number is non-negative, will only process that - * single page. Works for multi-page tiff file, or filelist. - * - * Returns true if successful, false on error. - */ - bool ProcessPages(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - // Does the real work of ProcessPages. - bool ProcessPagesInternal(const char *filename, const char *retry_config, - int timeout_millisec, TessResultRenderer *renderer); - - /** - * Turn a single image into symbolic text. - * - * The pix is the image processed. filename and page_index are - * metadata used by side-effect processes, such as reading a box - * file or formatting as hOCR. - * - * See ProcessPages for descriptions of other parameters. - */ - bool ProcessPage(Pix *pix, int page_index, const char *filename, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - ResultIterator *GetIterator(); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. - * The returned iterator must be deleted after use. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - MutableIterator *GetMutableIterator(); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - */ - char *GetUTF8Text(); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * monitor can be used to - * cancel the recognition - * receive progress callbacks - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(ETEXT_DESC *monitor, int page_number); - - /** - * Make a HTML-formatted string with hOCR markup from the internal - * data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetHOCRText(int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(ETEXT_DESC *monitor, int page_number); - - /** - * Make an XML-formatted string with Alto markup from the internal - * data structures. - */ - char *GetAltoText(int page_number); - - /** - * Make a TSV-formatted string from the internal data structures. - * page_number is 0-based but will appear in the output as 1-based. - * Returned string must be freed with the delete [] operator. - */ - char *GetTSVText(int page_number); - - /** - * Make a box file for LSTM training from the internal data structures. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetLSTMBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a box file used in training. - * Constructs coordinates in the original image - not just the rectangle. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a WordStr box file used in training. - * page_number is a 0-based page index that will appear in the box file. - * Returned string must be freed with the delete [] operator. - */ - char *GetWordStrBoxText(int page_number); - - /** - * The recognized text is returned as a char* which is coded - * as UNLV format Latin-1 with specific reject and suspect codes. - * Returned string must be freed with the delete [] operator. - */ - char *GetUNLVText(); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in degrees - * (0, 90, 180, 270) - * orient_conf is the confidence (15.0 is reasonably confident) - * script_name is an ASCII string, the name of the script, e.g. "Latin" - * script_conf is confidence level in the script - * Returns true on success and writes values to each parameter as an output - */ - bool DetectOrientationScript(int *orient_deg, float *orient_conf, - const char **script_name, float *script_conf); - - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * page_number is a 0-based page index that will appear in the osd file. - */ - char *GetOsdText(int page_number); - - /** Returns the (average) confidence value between 0 and 100. */ - int MeanTextConf(); - /** - * Returns all word confidences (between 0 and 100) in an array, terminated - * by -1. The calling function must delete [] after use. - * The number of confidences should correspond to the number of space- - * delimited words in GetUTF8Text. - */ - int *AllWordConfidences(); - -#ifndef DISABLED_LEGACY_ENGINE - /** - * Applies the given word to the adaptive classifier if possible. - * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can - * tell the boundaries of the graphemes. - * Assumes that SetImage/SetRectangle have been used to set the image - * to the given word. The mode arg should be PSM_SINGLE_WORD or - * PSM_CIRCLE_WORD, as that will be used to control layout analysis. - * The currently set PageSegMode is preserved. - * Returns false if adaption was not possible for some reason. - */ - bool AdaptToWordStr(PageSegMode mode, const char *wordstr); -#endif // ndef DISABLED_LEGACY_ENGINE - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or TesseractRect before doing - * any Recognize or Get* operation. - */ - void Clear(); - - /** - * Close down tesseract and free up all memory. End() is equivalent to - * destructing and reconstructing your TessBaseAPI. - * Once End() has been used, none of the other API functions may be used - * other than Init and anything declared above it in the class definition. - */ - void End(); - - /** - * Clear any library-level memory caches. - * There are a variety of expensive-to-load constant data structures (mostly - * language dictionaries) that are cached globally -- surviving the Init() - * and End() of individual TessBaseAPI's. This function allows the clearing - * of these caches. - **/ - static void ClearPersistentCache(); - - /** - * Check whether a word is valid according to Tesseract's language model - * @return 0 if the word is invalid, non-zero if valid. - * @warning temporary! This function will be removed from here and placed - * in a separate API at some future time. - */ - int IsValidWord(const char *word) const; - // Returns true if utf8_character is defined in the UniCharset. - bool IsValidCharacter(const char *utf8_character) const; - - bool GetTextDirection(int *out_offset, float *out_slope); - - /** Sets Dict::letter_is_okay_ function to point to the given function. */ - void SetDictFunc(DictFunc f); - - /** Sets Dict::probability_in_context_ function to point to the given - * function. - */ - void SetProbabilityInContextFunc(ProbabilityInContextFunc f); - - /** - * Estimates the Orientation And Script of the image. - * @return true if the image was processed successfully. - */ - bool DetectOS(OSResults *); - - /** - * Return text orientation of each block as determined by an earlier run - * of layout analysis. - */ - void GetBlockTextOrientations(int **block_orientation, - bool **vertical_writing); - - /** This method returns the string form of the specified unichar. */ - const char *GetUnichar(int unichar_id) const; - - /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ - const Dawg *GetDawg(int i) const; - - /** Return the number of dawgs loaded into tesseract_ object. */ - int NumDawgs() const; - - Tesseract *tesseract() const { - return tesseract_; - } - - OcrEngineMode oem() const { - return last_oem_requested_; - } - - void set_min_orientation_margin(double margin); - /* @} */ - -protected: - /** Common code for setting the image. Returns true if Init has been called. - */ - bool InternalSetImage(); - - /** - * Run the thresholder to make the thresholded image. If pix is not nullptr, - * the source is thresholded to pix instead of the internal IMAGE. - */ - virtual bool Threshold(Pix **pix); - - /** - * Find lines from the image making the BLOCK_LIST. - * @return 0 on success. - */ - int FindLines(); - - /** Delete the pageres and block list ready for a new page. */ - void ClearResults(); - - /** - * Return an LTR Result Iterator -- used only for training, as we really want - * to ignore all BiDi smarts at that point. - * delete once you're done with it. - */ - LTRResultIterator *GetLTRIterator(); - - /** - * Return the length of the output text string, as UTF8, assuming - * one newline per line and one per block, with a terminator, - * and assuming a single character reject marker for each rejected character. - * Also return the number of recognized blobs in blob_count. - */ - int TextLength(int *blob_count) const; - - //// paragraphs.cpp //////////////////////////////////////////////////// - void DetectParagraphs(bool after_text_recognition); - - const PAGE_RES *GetPageRes() const { - return page_res_; - } - -protected: - Tesseract *tesseract_; ///< The underlying data object. - Tesseract *osd_tesseract_; ///< For orientation & script detection. - EquationDetect *equ_detect_; ///< The equation detector. - FileReader reader_; ///< Reads files from any filesystem. - ImageThresholder *thresholder_; ///< Image thresholding module. - std::vector *paragraph_models_; - BLOCK_LIST *block_list_; ///< The page layout. - PAGE_RES *page_res_; ///< The page-level data. - std::string input_file_; ///< Name used by training code. - std::string output_file_; ///< Name used by debug code. - std::string datapath_; ///< Current location of tessdata. - std::string language_; ///< Last initialized language. - OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested. - bool recognition_done_; ///< page_res_ contains recognition data. - - /** - * @defgroup ThresholderParams Thresholder Parameters - * Parameters saved from the Thresholder. Needed to rebuild coordinates. - */ - /* @{ */ - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; - int image_width_; - int image_height_; - /* @} */ - -private: - // A list of image filenames gets special consideration - bool ProcessPagesFileList(FILE *fp, std::string *buf, - const char *retry_config, int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); - // TIFF supports multipage so gets special consideration. - bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, - const char *filename, const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer, - int tessedit_page_number); -}; // class TessBaseAPI. - -/** Escape a char string - remove &<>"' with HTML codes. */ -std::string HOcrEscape(const char *text); - -} // namespace tesseract - -#endif // TESSERACT_API_BASEAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/capi.h deleted file mode 100644 index 40f4856a..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/capi.h +++ /dev/null @@ -1,484 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: capi.h -// Description: C-API TessBaseAPI -// -// (C) Copyright 2012, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_CAPI_H_ -#define API_CAPI_H_ - -#include "export.h" - -#ifdef __cplusplus -# include -# include -# include -# include -# include -#endif - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef BOOL -# define BOOL int -# define TRUE 1 -# define FALSE 0 -#endif - -#ifdef __cplusplus -typedef tesseract::TessResultRenderer TessResultRenderer; -typedef tesseract::TessBaseAPI TessBaseAPI; -typedef tesseract::PageIterator TessPageIterator; -typedef tesseract::ResultIterator TessResultIterator; -typedef tesseract::MutableIterator TessMutableIterator; -typedef tesseract::ChoiceIterator TessChoiceIterator; -typedef tesseract::OcrEngineMode TessOcrEngineMode; -typedef tesseract::PageSegMode TessPageSegMode; -typedef tesseract::PageIteratorLevel TessPageIteratorLevel; -typedef tesseract::Orientation TessOrientation; -typedef tesseract::ParagraphJustification TessParagraphJustification; -typedef tesseract::WritingDirection TessWritingDirection; -typedef tesseract::TextlineOrder TessTextlineOrder; -typedef tesseract::PolyBlockType TessPolyBlockType; -typedef tesseract::ETEXT_DESC ETEXT_DESC; -#else -typedef struct TessResultRenderer TessResultRenderer; -typedef struct TessBaseAPI TessBaseAPI; -typedef struct TessPageIterator TessPageIterator; -typedef struct TessResultIterator TessResultIterator; -typedef struct TessMutableIterator TessMutableIterator; -typedef struct TessChoiceIterator TessChoiceIterator; -typedef enum TessOcrEngineMode { - OEM_TESSERACT_ONLY, - OEM_LSTM_ONLY, - OEM_TESSERACT_LSTM_COMBINED, - OEM_DEFAULT -} TessOcrEngineMode; -typedef enum TessPageSegMode { - PSM_OSD_ONLY, - PSM_AUTO_OSD, - PSM_AUTO_ONLY, - PSM_AUTO, - PSM_SINGLE_COLUMN, - PSM_SINGLE_BLOCK_VERT_TEXT, - PSM_SINGLE_BLOCK, - PSM_SINGLE_LINE, - PSM_SINGLE_WORD, - PSM_CIRCLE_WORD, - PSM_SINGLE_CHAR, - PSM_SPARSE_TEXT, - PSM_SPARSE_TEXT_OSD, - PSM_RAW_LINE, - PSM_COUNT -} TessPageSegMode; -typedef enum TessPageIteratorLevel { - RIL_BLOCK, - RIL_PARA, - RIL_TEXTLINE, - RIL_WORD, - RIL_SYMBOL -} TessPageIteratorLevel; -typedef enum TessPolyBlockType { - PT_UNKNOWN, - PT_FLOWING_TEXT, - PT_HEADING_TEXT, - PT_PULLOUT_TEXT, - PT_EQUATION, - PT_INLINE_EQUATION, - PT_TABLE, - PT_VERTICAL_TEXT, - PT_CAPTION_TEXT, - PT_FLOWING_IMAGE, - PT_HEADING_IMAGE, - PT_PULLOUT_IMAGE, - PT_HORZ_LINE, - PT_VERT_LINE, - PT_NOISE, - PT_COUNT -} TessPolyBlockType; -typedef enum TessOrientation { - ORIENTATION_PAGE_UP, - ORIENTATION_PAGE_RIGHT, - ORIENTATION_PAGE_DOWN, - ORIENTATION_PAGE_LEFT -} TessOrientation; -typedef enum TessParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT -} TessParagraphJustification; -typedef enum TessWritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT, - WRITING_DIRECTION_RIGHT_TO_LEFT, - WRITING_DIRECTION_TOP_TO_BOTTOM -} TessWritingDirection; -typedef enum TessTextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT, - TEXTLINE_ORDER_RIGHT_TO_LEFT, - TEXTLINE_ORDER_TOP_TO_BOTTOM -} TessTextlineOrder; -typedef struct ETEXT_DESC ETEXT_DESC; -#endif - -typedef bool (*TessCancelFunc)(void *cancel_this, int words); -typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top, - int bottom); - -struct Pix; -struct Boxa; -struct Pixa; - -/* General free functions */ - -TESS_API const char *TessVersion(); -TESS_API void TessDeleteText(const char *text); -TESS_API void TessDeleteTextArray(char **arr); -TESS_API void TessDeleteIntArray(const int *arr); - -/* Renderer API */ -TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, - BOOL font_info); -TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase, - const char *datadir, - BOOL textonly); -TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase); -TESS_API TessResultRenderer *TessWordStrBoxRendererCreate( - const char *outputbase); - -TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer); -TESS_API void TessResultRendererInsert(TessResultRenderer *renderer, - TessResultRenderer *next); -TESS_API TessResultRenderer *TessResultRendererNext( - TessResultRenderer *renderer); -TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, - const char *title); -TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer, - TessBaseAPI *api); -TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer); - -TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer); -TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer); -TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer); - -/* Base API */ - -TESS_API TessBaseAPI *TessBaseAPICreate(); -TESS_API void TessBaseAPIDelete(TessBaseAPI *handle); - -TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device); - -TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name); -TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix); -TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle); - -TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle); -TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name); - -TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, - const char *value); -TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, - const char *value); - -TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, - const char *name, int *value); -TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, - const char *name, BOOL *value); -TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, - const char *name, double *value); -TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, - const char *name); - -TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp); -TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, - const char *filename); - -TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem, - char **configs, int configs_size); -TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode oem); -TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, - const char *language); - -TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size, - const char *language, TessOcrEngineMode mode, - char **configs, int configs_size, char **vars_vec, - char **vars_values, size_t vars_vec_size, - BOOL set_only_non_debug_params); - -TESS_API const char *TessBaseAPIGetInitLanguagesAsString( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector( - const TessBaseAPI *handle); -TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector( - const TessBaseAPI *handle); - -TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle); - -TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle, - const char *filename); -TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, - const char *filename); - -TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle, - TessPageSegMode mode); -TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle); - -TESS_API char *TessBaseAPIRect(TessBaseAPI *handle, - const unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int left, int top, int width, int height); - -TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle); - -TESS_API void TessBaseAPISetImage(TessBaseAPI *handle, - const unsigned char *imagedata, int width, - int height, int bytes_per_pixel, - int bytes_per_line); -TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix); - -TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi); - -TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, - int width, int height); - -TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle); -TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, - BOOL raw_image, int raw_padding, - struct Pixa **pixa, - int **blockids, int **paraids); -TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, - struct Pixa **pixa, int **blockids); -TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, - struct Pixa **pixa); -TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, - struct Pixa **cc); -TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, - TessPageIteratorLevel level, - BOOL text_only, - struct Pixa **pixa, - int **blockids); -TESS_API struct Boxa *TessBaseAPIGetComponentImages1( - TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only, - BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids, - int **paraids); - -TESS_API int TessBaseAPIGetThresholdedImageScaleFactor( - const TessBaseAPI *handle); - -TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle); - -TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor); - -TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); -TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, - int page_index, const char *filename, - const char *retry_config, - int timeout_millisec, - TessResultRenderer *renderer); - -TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle); -TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator( - TessBaseAPI *handle); - -TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle); -TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number); - -TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number); -TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, - int page_number); - -TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle); -TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle); - -TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE -TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, - TessPageSegMode mode, - const char *wordstr); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPIClear(TessBaseAPI *handle); -TESS_API void TessBaseAPIEnd(TessBaseAPI *handle); - -TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word); -TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, - float *out_slope); - -TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id); - -TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle); - -#ifndef DISABLED_LEGACY_ENGINE - -// Call TessDeleteText(*best_script_name) to free memory allocated by this -// function -TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, - int *orient_deg, - float *orient_conf, - const char **script_name, - float *script_conf); -#endif // #ifndef DISABLED_LEGACY_ENGINE - -TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, - double margin); - -TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle); - -TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle); - -TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, - int **block_orientation, - bool **vertical_writing); - -/* Page iterator */ - -TESS_API void TessPageIteratorDelete(TessPageIterator *handle); - -TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle); - -TESS_API void TessPageIteratorBegin(TessPageIterator *handle); - -TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, - TessPageIteratorLevel level); - -TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, - TessPageIteratorLevel level, - TessPageIteratorLevel element); - -TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, - TessPageIteratorLevel level, - int *left, int *top, int *right, - int *bottom); - -TESS_API TessPolyBlockType -TessPageIteratorBlockType(const TessPageIterator *handle); - -TESS_API struct Pix *TessPageIteratorGetBinaryImage( - const TessPageIterator *handle, TessPageIteratorLevel level); - -TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, - TessPageIteratorLevel level, - int padding, - struct Pix *original_image, - int *left, int *top); - -TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle, - TessPageIteratorLevel level, int *x1, - int *y1, int *x2, int *y2); - -TESS_API void TessPageIteratorOrientation( - TessPageIterator *handle, TessOrientation *orientation, - TessWritingDirection *writing_direction, TessTextlineOrder *textline_order, - float *deskew_angle); - -TESS_API void TessPageIteratorParagraphInfo( - TessPageIterator *handle, TessParagraphJustification *justification, - BOOL *is_list_item, BOOL *is_crown, int *first_line_indent); - -/* Result iterator */ - -TESS_API void TessResultIteratorDelete(TessResultIterator *handle); -TESS_API TessResultIterator *TessResultIteratorCopy( - const TessResultIterator *handle); -TESS_API TessPageIterator *TessResultIteratorGetPageIterator( - TessResultIterator *handle); -TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst( - const TessResultIterator *handle); -TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator( - const TessResultIterator *handle); - -TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle, - TessPageIteratorLevel level); -TESS_API const char *TessResultIteratorWordRecognitionLanguage( - const TessResultIterator *handle); -TESS_API const char *TessResultIteratorWordFontAttributes( - const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic, - BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps, - int *pointsize, int *font_id); - -TESS_API BOOL -TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle); -TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle); -TESS_API BOOL -TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle); - -TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle); -TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle); -TESS_API const char *TessChoiceIteratorGetUTF8Text( - const TessChoiceIterator *handle); -TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle); - -/* Progress monitor */ - -TESS_API ETEXT_DESC *TessMonitorCreate(); -TESS_API void TessMonitorDelete(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, - TessCancelFunc cancelFunc); -TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis); -TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, - TessProgressFunc progressFunc); -TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor); -TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline); - -#ifdef __cplusplus -} -#endif - -#endif // API_CAPI_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/export.h deleted file mode 100644 index d238b628..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/export.h +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: export.h -// Description: Place holder -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_PLATFORM_H_ -#define TESSERACT_PLATFORM_H_ - -#ifndef TESS_API -# if defined(_WIN32) || defined(__CYGWIN__) -# if defined(TESS_EXPORTS) -# define TESS_API __declspec(dllexport) -# elif defined(TESS_IMPORTS) -# define TESS_API __declspec(dllimport) -# else -# define TESS_API -# endif -# else -# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS) -# define TESS_API __attribute__((visibility("default"))) -# else -# define TESS_API -# endif -# endif -#endif - -#endif // TESSERACT_PLATFORM_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/ltrresultiterator.h deleted file mode 100644 index 6ca0a98e..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/ltrresultiterator.h +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: ltrresultiterator.h -// Description: Iterator for tesseract results in strict left-to-right -// order that avoids using tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API -#include "pageiterator.h" // for PageIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -namespace tesseract { - -class BLOB_CHOICE_IT; -class PAGE_RES; -class WERD_RES; - -class Tesseract; - -// Class to iterate over tesseract results, providing access to all levels -// of the page hierarchy, without including any tesseract headers or having -// to handle any tesseract structures. -// WARNING! This class points to data held within the TessBaseAPI class, and -// therefore can only be used while the TessBaseAPI class still exists and -// has not been subjected to a call of Init, SetImage, Recognize, Clear, End -// DetectOS, or anything else that changes the internal PAGE_RES. -// See tesseract/publictypes.h for the definition of PageIteratorLevel. -// See also base class PageIterator, which contains the bulk of the interface. -// LTRResultIterator adds text-specific methods for access to OCR output. - -class TESS_API LTRResultIterator : public PageIterator { - friend class ChoiceIterator; - -public: - // page_res and tesseract come directly from the BaseAPI. - // The rectangle parameters are copied indirectly from the Thresholder, - // via the BaseAPI. They represent the coordinates of some rectangle in an - // original image (in top-left-origin coordinates) and therefore the top-left - // needs to be added to any output boxes in order to specify coordinates - // in the original image. See TessBaseAPI::SetRectangle. - // The scale and scaled_yres are in case the Thresholder scaled the image - // rectangle prior to thresholding. Any coordinates in tesseract's image - // must be divided by scale before adding (rect_left, rect_top). - // The scaled_yres indicates the effective resolution of the binary image - // that tesseract has been given by the Thresholder. - // After the constructor, Begin has already been called. - LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, - int rect_width, int rect_height); - - ~LTRResultIterator() override; - - // LTRResultIterators may be copied! This makes it possible to iterate over - // all the objects at a lower level, while maintaining an iterator to - // objects at a higher level. These constructors DO NOT CALL Begin, so - // iterations will continue from the location of src. - // TODO: For now the copy constructor and operator= only need the base class - // versions, but if new data members are added, don't forget to add them! - - // ============= Moving around within the page ============. - - // See PageIterator. - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // object at the given level. Use delete [] to free after use. - char *GetUTF8Text(PageIteratorLevel level) const; - - // Set the string inserted at the end of each text line. "\n" by default. - void SetLineSeparator(const char *new_line); - - // Set the string inserted at the end of each paragraph. "\n" by default. - void SetParagraphSeparator(const char *new_para); - - // Returns the mean confidence of the current object at the given level. - // The number should be interpreted as a percent probability. (0.0f-100.0f) - float Confidence(PageIteratorLevel level) const; - - // ============= Functions that refer to words only ============. - - // Returns the font attributes of the current word. If iterating at a higher - // level object than words, eg textlines, then this will return the - // attributes of the first word in that textline. - // The actual return value is a string representing a font name. It points - // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as - // the iterator itself, ie rendered invalid by various members of - // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. - // Pointsize is returned in printers points (1/72 inch.) - const char *WordFontAttributes(bool *is_bold, bool *is_italic, - bool *is_underlined, bool *is_monospace, - bool *is_serif, bool *is_smallcaps, - int *pointsize, int *font_id) const; - - // Return the name of the language used to recognize this word. - // On error, nullptr. Do not delete this pointer. - const char *WordRecognitionLanguage() const; - - // Return the overall directionality of this word. - StrongScriptDirection WordDirection() const; - - // Returns true if the current word was found in a dictionary. - bool WordIsFromDictionary() const; - - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // Returns true if the current word is numeric. - bool WordIsNumeric() const; - - // Returns true if the word contains blamer information. - bool HasBlamerInfo() const; - - // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle - // of the current word. - const void *GetParamsTrainingBundle() const; - - // Returns a pointer to the string with blamer information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerDebug() const; - - // Returns a pointer to the string with misadaption information for this word. - // Assumes that the word's blamer_bundle is not nullptr. - const char *GetBlamerMisadaptionDebug() const; - - // Returns true if a truth string was recorded for the current word. - bool HasTruthString() const; - - // Returns true if the given string is equivalent to the truth string for - // the current word. - bool EquivalentToTruth(const char *str) const; - - // Returns a null terminated UTF-8 encoded truth string for the current word. - // Use delete [] to free after use. - char *WordTruthUTF8Text() const; - - // Returns a null terminated UTF-8 encoded normalized OCR string for the - // current word. Use delete [] to free after use. - char *WordNormedUTF8Text() const; - - // Returns a pointer to serialized choice lattice. - // Fills lattice_size with the number of bytes in lattice data. - const char *WordLattice(int *lattice_size) const; - - // ============= Functions that refer to symbols only ============. - - // Returns true if the current symbol is a superscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSuperscript() const; - // Returns true if the current symbol is a subscript. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsSubscript() const; - // Returns true if the current symbol is a dropcap. - // If iterating at a higher level object than symbols, eg words, then - // this will return the attributes of the first symbol in that word. - bool SymbolIsDropcap() const; - -protected: - const char *line_separator_; - const char *paragraph_separator_; -}; - -// Class to iterate over the classifier choices for a single RIL_SYMBOL. -class TESS_API ChoiceIterator { -public: - // Construction is from a LTRResultIterator that points to the symbol of - // interest. The ChoiceIterator allows a one-shot iteration over the - // choices for this symbol and after that it is useless. - explicit ChoiceIterator(const LTRResultIterator &result_it); - ~ChoiceIterator(); - - // Moves to the next choice for the symbol and returns false if there - // are none left. - bool Next(); - - // ============= Accessing data ==============. - - // Returns the null terminated UTF-8 encoded text string for the current - // choice. - // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an - // internal structure and should NOT be delete[]ed to free after use. - const char *GetUTF8Text() const; - - // Returns the confidence of the current choice depending on the used language - // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All - // choices for one symbol should roughly add up to 1.0f. - // If only traineddata of the legacy engine is used, the number should be - // interpreted as a percent probability. (0.0f-100.0f) In this case - // probabilities won't add up to 100. Each one stands on its own. - float Confidence() const; - - // Returns a vector containing all timesteps, which belong to the currently - // selected symbol. A timestep is a vector containing pairs of symbols and - // floating point numbers. The number states the probability for the - // corresponding symbol. - std::vector>> *Timesteps() const; - -private: - // clears the remaining spaces out of the results and adapt the probabilities - void filterSpaces(); - // Pointer to the WERD_RES object owned by the API. - WERD_RES *word_res_; - // Iterator over the blob choices. - BLOB_CHOICE_IT *choice_it_; - std::vector> *LSTM_choices_ = nullptr; - std::vector>::iterator LSTM_choice_it_; - - const int *tstep_index_; - // regulates the rating granularity - double rating_coefficient_; - // leading blanks - int blanks_before_word_; - // true when there is lstm engine related trained data - bool oemLSTM_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/ocrclass.h deleted file mode 100644 index a55e6528..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/ocrclass.h +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/********************************************************************** - * File: ocrclass.h - * Description: Class definitions and constants for the OCR API. - * Author: Hewlett-Packard Co - * - * (C) Copyright 1996, Hewlett-Packard Co. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -/********************************************************************** - * This file contains typedefs for all the structures used by - * the HP OCR interface. - * The structures are designed to allow them to be used with any - * structure alignment up to 8. - **********************************************************************/ - -#ifndef CCUTIL_OCRCLASS_H_ -#define CCUTIL_OCRCLASS_H_ - -#include -#include - -namespace tesseract { - -/********************************************************************** - * EANYCODE_CHAR - * Description of a single character. The character code is defined by - * the character set of the current font. - * Output text is sent as an array of these structures. - * Spaces and line endings in the output are represented in the - * structures of the surrounding characters. They are not directly - * represented as characters. - * The first character in a word has a positive value of blanks. - * Missing information should be set to the defaults in the comments. - * If word bounds are known, but not character bounds, then the top and - * bottom of each character should be those of the word. The left of the - * first and right of the last char in each word should be set. All other - * lefts and rights should be set to -1. - * If set, the values of right and bottom are left+width and top+height. - * Most of the members come directly from the parameters to ocr_append_char. - * The formatting member uses the enhancement parameter and combines the - * line direction stuff into the top 3 bits. - * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para, - * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what - * the coding is, only that it is backwards compatible with the previous - * version. - **********************************************************************/ - -struct EANYCODE_CHAR { /*single character */ - // It should be noted that the format for char_code for version 2.0 and beyond - // is UTF8 which means that ASCII characters will come out as one structure - // but other characters will be returned in two or more instances of this - // structure with a single byte of the UTF8 code in each, but each will have - // the same bounding box. Programs which want to handle languagues with - // different characters sets will need to handle extended characters - // appropriately, but *all* code needs to be prepared to receive UTF8 coded - // characters for characters such as bullet and fancy quotes. - uint16_t char_code; /*character itself */ - int16_t left; /*of char (-1) */ - int16_t right; /*of char (-1) */ - int16_t top; /*of char (-1) */ - int16_t bottom; /*of char (-1) */ - int16_t font_index; /*what font (0) */ - uint8_t confidence; /*0=perfect, 100=reject (0/100) */ - uint8_t point_size; /*of char, 72=i inch, (10) */ - int8_t blanks; /*no of spaces before this char (1) */ - uint8_t formatting; /*char formatting (0) */ -}; - -/********************************************************************** - * ETEXT_DESC - * Description of the output of the OCR engine. - * This structure is used as both a progress monitor and the final - * output header, since it needs to be a valid progress monitor while - * the OCR engine is storing its output to shared memory. - * During progress, all the buffer info is -1. - * Progress starts at 0 and increases to 100 during OCR. No other constraint. - * Additionally the progress callback contains the bounding box of the word that - * is currently being processed. - * Every progress callback, the OCR engine must set ocr_alive to 1. - * The HP side will set ocr_alive to 0. Repeated failure to reset - * to 1 indicates that the OCR engine is dead. - * If the cancel function is not null then it is called with the number of - * user words found. If it returns true then operation is cancelled. - **********************************************************************/ -class ETEXT_DESC; - -using CANCEL_FUNC = bool (*)(void *, int); -using PROGRESS_FUNC = bool (*)(int, int, int, int, int); -using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int); - -class ETEXT_DESC { // output header -public: - int16_t count{0}; /// chars in this buffer(0) - int16_t progress{0}; /// percent complete increasing (0-100) - /** Progress monitor covers word recognition and it does not cover layout - * analysis. - * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */ - int8_t more_to_come{0}; /// true if not last - volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0 - int8_t err_code{0}; /// for errcode use - CANCEL_FUNC cancel{nullptr}; /// returns true to cancel - PROGRESS_FUNC progress_callback{ - nullptr}; /// called whenever progress increases - PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback - void *cancel_this{nullptr}; /// this or other data for cancel - std::chrono::steady_clock::time_point end_time; - /// Time to stop. Expected to be set only - /// by call to set_deadline_msecs(). - EANYCODE_CHAR text[1]{}; /// character data - - ETEXT_DESC() : progress_callback2(&default_progress_func) { - end_time = std::chrono::time_point(); - } - - // Sets the end time to be deadline_msecs milliseconds from now. - void set_deadline_msecs(int32_t deadline_msecs) { - if (deadline_msecs > 0) { - end_time = std::chrono::steady_clock::now() + - std::chrono::milliseconds(deadline_msecs); - } - } - - // Returns false if we've not passed the end_time, or have not set a deadline. - bool deadline_exceeded() const { - if (end_time.time_since_epoch() == - std::chrono::steady_clock::duration::zero()) { - return false; - } - auto now = std::chrono::steady_clock::now(); - return (now > end_time); - } - -private: - static bool default_progress_func(ETEXT_DESC *ths, int left, int right, - int top, int bottom) { - if (ths->progress_callback != nullptr) { - return (*(ths->progress_callback))(ths->progress, left, right, top, - bottom); - } - return true; - } -}; - -} // namespace tesseract - -#endif // CCUTIL_OCRCLASS_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/osdetect.h deleted file mode 100644 index 34bfb557..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/osdetect.h +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: osdetect.h -// Description: Orientation and script detection. -// Author: Samuel Charron -// Ranjith Unnikrishnan -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_OSDETECT_H_ -#define TESSERACT_CCMAIN_OSDETECT_H_ - -#include "export.h" // for TESS_API - -#include // for std::vector - -namespace tesseract { - -class BLOBNBOX; -class BLOBNBOX_CLIST; -class BLOB_CHOICE_LIST; -class TO_BLOCK_LIST; -class UNICHARSET; - -class Tesseract; - -// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur -const int kMaxNumberOfScripts = 116 + 1 + 2 + 1; - -struct OSBestResult { - OSBestResult() - : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {} - int orientation_id; - int script_id; - float sconfidence; - float oconfidence; -}; - -struct OSResults { - OSResults() : unicharset(nullptr) { - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < kMaxNumberOfScripts; ++j) { - scripts_na[i][j] = 0; - } - orientations[i] = 0; - } - } - void update_best_orientation(); - // Set the estimate of the orientation to the given id. - void set_best_orientation(int orientation_id); - // Update/Compute the best estimate of the script assuming the given - // orientation id. - void update_best_script(int orientation_id); - // Return the index of the script with the highest score for this orientation. - TESS_API int get_best_script(int orientation_id) const; - // Accumulate scores with given OSResults instance and update the best script. - void accumulate(const OSResults &osr); - - // Print statistics. - void print_scores(void) const; - void print_scores(int orientation_id) const; - - // Array holding scores for each orientation id [0,3]. - // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the - // page respectively, where the values refer to the amount of clockwise - // rotation to be applied to the page for the text to be upright and readable. - float orientations[4]; - // Script confidence scores for each of 4 possible orientations. - float scripts_na[4][kMaxNumberOfScripts]; - - UNICHARSET *unicharset; - OSBestResult best_result; -}; - -class OrientationDetector { -public: - OrientationDetector(const std::vector *allowed_scripts, - OSResults *results); - bool detect_blob(BLOB_CHOICE_LIST *scores); - int get_orientation(); - -private: - OSResults *osr_; - const std::vector *allowed_scripts_; -}; - -class ScriptDetector { -public: - ScriptDetector(const std::vector *allowed_scripts, OSResults *osr, - tesseract::Tesseract *tess); - void detect_blob(BLOB_CHOICE_LIST *scores); - bool must_stop(int orientation) const; - -private: - OSResults *osr_; - static const char *korean_script_; - static const char *japanese_script_; - static const char *fraktur_script_; - int korean_id_; - int japanese_id_; - int katakana_id_; - int hiragana_id_; - int han_id_; - int hangul_id_; - int latin_id_; - int fraktur_id_; - tesseract::Tesseract *tess_; - const std::vector *allowed_scripts_; -}; - -int orientation_and_script_detection(const char *filename, OSResults *, - tesseract::Tesseract *); - -int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, - tesseract::Tesseract *tess); - -int os_detect_blobs(const std::vector *allowed_scripts, - BLOBNBOX_CLIST *blob_list, OSResults *osr, - tesseract::Tesseract *tess); - -bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, - OSResults *, tesseract::Tesseract *tess); - -// Helper method to convert an orientation index to its value in degrees. -// The value represents the amount of clockwise rotation in degrees that must be -// applied for the text to be upright (readable). -TESS_API int OrientationIdToValue(const int &id); - -} // namespace tesseract - -#endif // TESSERACT_CCMAIN_OSDETECT_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/pageiterator.h deleted file mode 100644 index 68739715..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/pageiterator.h +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: pageiterator.h -// Description: Iterator for tesseract page structure that avoids using -// tesseract internal data structures. -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ -#define TESSERACT_CCMAIN_PAGEITERATOR_H_ - -#include "export.h" -#include "publictypes.h" - -struct Pix; -struct Pta; - -namespace tesseract { - -struct BlamerBundle; -class C_BLOB_IT; -class PAGE_RES; -class PAGE_RES_IT; -class WERD; - -class Tesseract; - -/** - * Class to iterate over tesseract page structure, providing access to all - * levels of the page hierarchy, without including any tesseract headers or - * having to handle any tesseract structures. - * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - * See tesseract/publictypes.h for the definition of PageIteratorLevel. - * See also ResultIterator, derived from PageIterator, which adds in the - * ability to access OCR output with text-specific methods. - */ - -class TESS_API PageIterator { -public: - /** - * page_res and tesseract come directly from the BaseAPI. - * The rectangle parameters are copied indirectly from the Thresholder, - * via the BaseAPI. They represent the coordinates of some rectangle in an - * original image (in top-left-origin coordinates) and therefore the top-left - * needs to be added to any output boxes in order to specify coordinates - * in the original image. See TessBaseAPI::SetRectangle. - * The scale and scaled_yres are in case the Thresholder scaled the image - * rectangle prior to thresholding. Any coordinates in tesseract's image - * must be divided by scale before adding (rect_left, rect_top). - * The scaled_yres indicates the effective resolution of the binary image - * that tesseract has been given by the Thresholder. - * After the constructor, Begin has already been called. - */ - PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, - int scaled_yres, int rect_left, int rect_top, int rect_width, - int rect_height); - virtual ~PageIterator(); - - /** - * Page/ResultIterators may be copied! This makes it possible to iterate over - * all the objects at a lower level, while maintaining an iterator to - * objects at a higher level. These constructors DO NOT CALL Begin, so - * iterations will continue from the location of src. - */ - PageIterator(const PageIterator &src); - const PageIterator &operator=(const PageIterator &src); - - /** Are we positioned at the same location as other? */ - bool PositionedAtSameWord(const PAGE_RES_IT *other) const; - - // ============= Moving around within the page ============. - - /** - * Moves the iterator to point to the start of the page to begin an - * iteration. - */ - virtual void Begin(); - - /** - * Moves the iterator to the beginning of the paragraph. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word on the first row of the paragraph. - */ - virtual void RestartParagraph(); - - /** - * Return whether this iterator points anywhere in the first textline of a - * paragraph. - */ - bool IsWithinFirstTextlineOfParagraph() const; - - /** - * Moves the iterator to the beginning of the text line. - * This class implements this functionality by moving it to the zero indexed - * blob of the first (leftmost) word of the row. - */ - virtual void RestartRow(); - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy, and returns false if the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - virtual bool Next(PageIteratorLevel level); - - /** - * Returns true if the iterator is at the start of an object at the given - * level. - * - * For instance, suppose an iterator it is pointed to the first symbol of the - * first word of the third line of the second paragraph of the first block in - * a page, then: - * it.IsAtBeginningOf(RIL_BLOCK) = false - * it.IsAtBeginningOf(RIL_PARA) = false - * it.IsAtBeginningOf(RIL_TEXTLINE) = true - * it.IsAtBeginningOf(RIL_WORD) = true - * it.IsAtBeginningOf(RIL_SYMBOL) = true - */ - virtual bool IsAtBeginningOf(PageIteratorLevel level) const; - - /** - * Returns whether the iterator is positioned at the last element in a - * given level. (e.g. the last word in a line, the last line in a block) - * - * Here's some two-paragraph example - * text. It starts off innocuously - * enough but quickly turns bizarre. - * The author inserts a cornucopia - * of words to guard against confused - * references. - * - * Now take an iterator it pointed to the start of "bizarre." - * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false - * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true - * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false - */ - virtual bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const; - - /** - * Returns whether this iterator is positioned - * before other: -1 - * equal to other: 0 - * after other: 1 - */ - int Cmp(const PageIterator &other) const; - - // ============= Accessing data ==============. - // Coordinate system: - // Integer coordinates are at the cracks between the pixels. - // The top-left corner of the top-left pixel in the image is at (0,0). - // The bottom-right corner of the bottom-right pixel in the image is at - // (width, height). - // Every bounding box goes from the top-left of the top-left contained - // pixel to the bottom-right of the bottom-right contained pixel, so - // the bounding box of the single top-left pixel in the image is: - // (0,0)->(1,1). - // If an image rectangle has been set in the API, then returned coordinates - // relate to the original (full) image, rather than the rectangle. - - /** - * Controls what to include in a bounding box. Bounding boxes of all levels - * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. - * Between layout analysis and recognition, it isn't known where all - * diacritics belong, so this control is used to include or exclude some - * diacritics that are above or below the main body of the word. In most cases - * where the placement is obvious, and after recognition, it doesn't make as - * much difference, as the diacritics will already be included in the word. - */ - void SetBoundingBoxComponents(bool include_upper_dots, - bool include_lower_dots) { - include_upper_dots_ = include_upper_dots; - include_lower_dots_ = include_lower_dots; - } - - /** - * Returns the bounding rectangle of the current object at the given level. - * See comment on coordinate system above. - * Returns false if there is no such object at the current position. - * The returned bounding box is guaranteed to match the size and position - * of the image returned by GetBinaryImage, but may clip foreground pixels - * from a grey image. The padding argument to GetImage can be used to expand - * the image to include more foreground pixels. See GetImage below. - */ - bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, - int *bottom) const; - bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, - int *right, int *bottom) const; - /** - * Returns the bounding rectangle of the object in a coordinate system of the - * working image rectangle having its origin at (rect_left_, rect_top_) with - * respect to the original image and is scaled by a factor scale_. - */ - bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, - int *right, int *bottom) const; - - /** Returns whether there is no object of a given level. */ - bool Empty(PageIteratorLevel level) const; - - /** - * Returns the type of the current block. - * See tesseract/publictypes.h for PolyBlockType. - */ - PolyBlockType BlockType() const; - - /** - * Returns the polygon outline of the current block. The returned Pta must - * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices - * of the polygon, and the last edge is the line segment between the last - * point and the first point. nullptr will be returned if the iterator is - * at the end of the document or layout analysis was not used. - */ - Pta *BlockPolygon() const; - - /** - * Returns a binary image of the current object at the given level. - * The position and size match the return from BoundingBoxInternal, and so - * this could be upscaled with respect to the original input image. - * Use pixDestroy to delete the image after use. - */ - Pix *GetBinaryImage(PageIteratorLevel level) const; - - /** - * Returns an image of the current object at the given level in greyscale - * if available in the input. To guarantee a binary image use BinaryImage. - * NOTE that in order to give the best possible image, the bounds are - * expanded slightly over the binary connected component, by the supplied - * padding, so the top-left position of the returned image is returned - * in (left,top). These will most likely not match the coordinates - * returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. - * Use pixDestroy to delete the image after use. - */ - Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, - int *left, int *top) const; - - /** - * Returns the baseline of the current object at the given level. - * The baseline is the line that passes through (x1, y1) and (x2, y2). - * WARNING: with vertical text, baselines may be vertical! - * Returns false if there is no baseline at the current position. - */ - bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, - int *y2) const; - - // Returns the attributes of the current row. - void RowAttributes(float *row_height, float *descenders, - float *ascenders) const; - - /** - * Returns orientation for the block the iterator points to. - * orientation, writing_direction, textline_order: see publictypes.h - * deskew_angle: after rotating the block so the text orientation is - * upright, how many radians does one have to rotate the - * block anti-clockwise for it to be level? - * -Pi/4 <= deskew_angle <= Pi/4 - */ - void Orientation(tesseract::Orientation *orientation, - tesseract::WritingDirection *writing_direction, - tesseract::TextlineOrder *textline_order, - float *deskew_angle) const; - - /** - * Returns information about the current paragraph, if available. - * - * justification - - * LEFT if ragged right, or fully justified and script is left-to-right. - * RIGHT if ragged left, or fully justified and script is right-to-left. - * unknown if it looks like source code or we have very few lines. - * is_list_item - - * true if we believe this is a member of an ordered or unordered list. - * is_crown - - * true if the first line of the paragraph is aligned with the other - * lines of the paragraph even though subsequent paragraphs have first - * line indents. This typically indicates that this is the continuation - * of a previous paragraph or that it is the very first paragraph in - * the chapter. - * first_line_indent - - * For LEFT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the left edge of the - * rest of the paragraph. - * for RIGHT aligned paragraphs, the first text line of paragraphs of - * this kind are indented this many pixels from the right edge of the - * rest of the paragraph. - * NOTE 1: This value may be negative. - * NOTE 2: if *is_crown == true, the first line of this paragraph is - * actually flush, and first_line_indent is set to the "common" - * first_line_indent for subsequent paragraphs in this block - * of text. - */ - void ParagraphInfo(tesseract::ParagraphJustification *justification, - bool *is_list_item, bool *is_crown, - int *first_line_indent) const; - - // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle - // of the current word to the given pointer (takes ownership of the pointer) - // and returns true. - // Can only be used when iterating on the word level. - bool SetWordBlamerBundle(BlamerBundle *blamer_bundle); - -protected: - /** - * Sets up the internal data for iterating the blobs of a new word, then - * moves the iterator to the given offset. - */ - void BeginWord(int offset); - - /** Pointer to the page_res owned by the API. */ - PAGE_RES *page_res_; - /** Pointer to the Tesseract object owned by the API. */ - Tesseract *tesseract_; - /** - * The iterator to the page_res_. Owned by this ResultIterator. - * A pointer just to avoid dragging in Tesseract includes. - */ - PAGE_RES_IT *it_; - /** - * The current input WERD being iterated. If there is an output from OCR, - * then word_ is nullptr. Owned by the API - */ - WERD *word_; - /** The length of the current word_. */ - int word_length_; - /** The current blob index within the word. */ - int blob_index_; - /** - * Iterator to the blobs within the word. If nullptr, then we are iterating - * OCR results in the box_word. - * Owned by this ResultIterator. - */ - C_BLOB_IT *cblob_it_; - /** Control over what to include in bounding boxes. */ - bool include_upper_dots_; - bool include_lower_dots_; - /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ - int scale_; - int scaled_yres_; - int rect_left_; - int rect_top_; - int rect_width_; - int rect_height_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/publictypes.h deleted file mode 100644 index 0069cf28..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/publictypes.h +++ /dev/null @@ -1,281 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: publictypes.h -// Description: Types used in both the API and internally -// Author: Ray Smith -// -// (C) Copyright 2010, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_ -#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_ - -namespace tesseract { - -// This file contains types that are used both by the API and internally -// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic -// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT. -// Restated: It is OK for low-level Tesseract files to include publictypes.h, -// but not for the low-level tesseract code to include top-level API code. -// This file should not use other Tesseract types, as that would drag -// their includes into the API-level. - -/** Number of printers' points in an inch. The unit of the pointsize return. */ -constexpr int kPointsPerInch = 72; -/** - * Minimum believable resolution. Used as a default if there is no other - * information, as it is safer to under-estimate than over-estimate. - */ -constexpr int kMinCredibleResolution = 70; -/** Maximum believable resolution. */ -constexpr int kMaxCredibleResolution = 2400; -/** - * Ratio between median blob size and likely resolution. Used to estimate - * resolution when none is provided. This is basically 1/usual text size in - * inches. */ -constexpr int kResolutionEstimationFactor = 10; - -/** - * Possible types for a POLY_BLOCK or ColPartition. - * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions - * below, as well as kPolyBlockNames in layout_test.cc. - * Used extensively by ColPartition, and POLY_BLOCK. - */ -enum PolyBlockType { - PT_UNKNOWN, // Type is not yet known. Keep as the first element. - PT_FLOWING_TEXT, // Text that lives inside a column. - PT_HEADING_TEXT, // Text that spans more than one column. - PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region. - PT_EQUATION, // Partition belonging to an equation region. - PT_INLINE_EQUATION, // Partition has inline equation. - PT_TABLE, // Partition belonging to a table region. - PT_VERTICAL_TEXT, // Text-line runs vertically. - PT_CAPTION_TEXT, // Text that belongs to an image. - PT_FLOWING_IMAGE, // Image that lives inside a column. - PT_HEADING_IMAGE, // Image that spans more than one column. - PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region. - PT_HORZ_LINE, // Horizontal Line. - PT_VERT_LINE, // Vertical Line. - PT_NOISE, // Lies outside of any column. - PT_COUNT -}; - -/** Returns true if PolyBlockType is of horizontal line type */ -inline bool PTIsLineType(PolyBlockType type) { - return type == PT_HORZ_LINE || type == PT_VERT_LINE; -} -/** Returns true if PolyBlockType is of image type */ -inline bool PTIsImageType(PolyBlockType type) { - return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE || - type == PT_PULLOUT_IMAGE; -} -/** Returns true if PolyBlockType is of text type */ -inline bool PTIsTextType(PolyBlockType type) { - return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT || - type == PT_PULLOUT_TEXT || type == PT_TABLE || - type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT || - type == PT_INLINE_EQUATION; -} -// Returns true if PolyBlockType is of pullout(inter-column) type -inline bool PTIsPulloutType(PolyBlockType type) { - return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT; -} - -/** - * +------------------+ Orientation Example: - * | 1 Aaaa Aaaa Aaaa | ==================== - * | Aaa aa aaa aa | To left is a diagram of some (1) English and - * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit. - * | 2 | - * | ####### c c C | Upright Latin characters are represented as A and a. - * | ####### c c c | '<' represents a latin character rotated - * | < ####### c c c | anti-clockwise 90 degrees. - * | < ####### c c | - * | < ####### . c | Upright Chinese characters are represented C and c. - * | 3 ####### c | - * +------------------+ NOTA BENE: enum values here should match goodoc.proto - - * If you orient your head so that "up" aligns with Orientation, - * then the characters will appear "right side up" and readable. - * - * In the example above, both the English and Chinese paragraphs are oriented - * so their "up" is the top of the page (page up). The photo credit is read - * with one's head turned leftward ("up" is to page left). - * - * The values of this enum match the convention of Tesseract's osdetect.h -*/ -enum Orientation { - ORIENTATION_PAGE_UP = 0, - ORIENTATION_PAGE_RIGHT = 1, - ORIENTATION_PAGE_DOWN = 2, - ORIENTATION_PAGE_LEFT = 3, -}; - -/** - * The grapheme clusters within a line of text are laid out logically - * in this direction, judged when looking at the text line rotated so that - * its Orientation is "page up". - * - * For English text, the writing direction is left-to-right. For the - * Chinese text in the above example, the writing direction is top-to-bottom. - */ -enum WritingDirection { - WRITING_DIRECTION_LEFT_TO_RIGHT = 0, - WRITING_DIRECTION_RIGHT_TO_LEFT = 1, - WRITING_DIRECTION_TOP_TO_BOTTOM = 2, -}; - -/** - * The text lines are read in the given sequence. - * - * In English, the order is top-to-bottom. - * In Chinese, vertical text lines are read right-to-left. Mongolian is - * written in vertical columns top to bottom like Chinese, but the lines - * order left-to right. - * - * Note that only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM - */ -enum TextlineOrder { - TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, - TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, - TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, -}; - -/** - * Possible modes for page layout analysis. These *must* be kept in order - * of decreasing amount of layout analysis to be done, except for OSD_ONLY, - * so that the inequality test macros below work. - */ -enum PageSegMode { - PSM_OSD_ONLY = 0, ///< Orientation and script detection only. - PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and - ///< script detection. (OSD) - PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR. - PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD. - PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes. - PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of - ///< vertically aligned text. - PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.) - PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line. - PSM_SINGLE_WORD = 8, ///< Treat the image as a single word. - PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle. - PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character. - PSM_SPARSE_TEXT = - 11, ///< Find as much text as possible in no particular order. - PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det. - PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing - ///< hacks that are Tesseract-specific. - - PSM_COUNT ///< Number of enum entries. -}; - -/** - * Inline functions that act on a PageSegMode to determine whether components of - * layout analysis are enabled. - * *Depend critically on the order of elements of PageSegMode.* - * NOTE that arg is an int for compatibility with INT_PARAM. - */ -inline bool PSM_OSD_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) { - return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO; -} -inline bool PSM_SPARSE(int pageseg_mode) { - return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} -inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN; -} -inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) { - return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK; -} -inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) { - return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) || - pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD; -} - -/** - * enum of the elements of the page hierarchy, used in ResultIterator - * to provide functions that operate on each level without having to - * have 5x as many functions. - */ -enum PageIteratorLevel { - RIL_BLOCK, // Block of text/image/separator line. - RIL_PARA, // Paragraph within a block. - RIL_TEXTLINE, // Line within a paragraph. - RIL_WORD, // Word within a textline. - RIL_SYMBOL // Symbol/character within a word. -}; - -/** - * JUSTIFICATION_UNKNOWN - * The alignment is not clearly one of the other options. This could happen - * for example if there are only one or two lines of text or the text looks - * like source code or poetry. - * - * NOTA BENE: Fully justified paragraphs (text aligned to both left and right - * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text - * is written with a left-to-right script and with JUSTIFICATION_RIGHT if - * their text is written in a right-to-left script. - * - * Interpretation for text read in vertical lines: - * "Left" is wherever the starting reading position is. - * - * JUSTIFICATION_LEFT - * Each line, except possibly the first, is flush to the same left tab stop. - * - * JUSTIFICATION_CENTER - * The text lines of the paragraph are centered about a line going - * down through their middle of the text lines. - * - * JUSTIFICATION_RIGHT - * Each line, except possibly the first, is flush to the same right tab stop. - */ -enum ParagraphJustification { - JUSTIFICATION_UNKNOWN, - JUSTIFICATION_LEFT, - JUSTIFICATION_CENTER, - JUSTIFICATION_RIGHT, -}; - -/** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the combiner. - * The preference of which engine to use is stored in tessedit_ocr_engine_mode. - * - * ATTENTION: When modifying this enum, please make sure to make the - * appropriate changes to all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ -enum OcrEngineMode { - OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated - OEM_LSTM_ONLY, // Run just the LSTM line recognizer. - OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback - // to Tesseract when things get difficult. - // deprecated - OEM_DEFAULT, // Specify this mode when calling init_*(), - // to indicate that any of the above modes - // should be automatically inferred from the - // variables in the language-specific config, - // command-line configs, or if not specified - // in any of the above should be set to the - // default OEM_TESSERACT_ONLY. - OEM_COUNT // Number of OEMs -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/renderer.h deleted file mode 100644 index 6f405233..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/renderer.h +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: renderer.h -// Description: Rendering interface to inject into TessBaseAPI -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_RENDERER_H_ -#define TESSERACT_API_RENDERER_H_ - -#include "export.h" - -// To avoid collision with other typenames include the ABSOLUTE MINIMUM -// complexity of includes here. Use forward declarations wherever possible -// and hide includes of complex types in baseapi.cpp. -#include -#include // for std::string -#include // for std::vector - -struct Pix; - -namespace tesseract { - -class TessBaseAPI; - -/** - * Interface for rendering tesseract results into a document, such as text, - * HOCR or pdf. This class is abstract. Specific classes handle individual - * formats. This interface is then used to inject the renderer class into - * tesseract when processing images. - * - * For simplicity implementing this with tesseract version 3.01, - * the renderer contains document state that is cleared from document - * to document just as the TessBaseAPI is. This way the base API can just - * delegate its rendering functionality to injected renderers, and the - * renderers can manage the associated state needed for the specific formats - * in addition to the heuristics for producing it. - */ -class TESS_API TessResultRenderer { -public: - virtual ~TessResultRenderer(); - - // Takes ownership of pointer so must be new'd instance. - // Renderers aren't ordered, but appends the sequences of next parameter - // and existing next(). The renderers should be unique across both lists. - void insert(TessResultRenderer *next); - - // Returns the next renderer or nullptr. - TessResultRenderer *next() { - return next_; - } - - /** - * Starts a new document with the given title. - * This clears the contents of the output data. - * Title should use UTF-8 encoding. - */ - bool BeginDocument(const char *title); - - /** - * Adds the recognized text from the source image to the current document. - * Invalid if BeginDocument not yet called. - * - * Note that this API is a bit weird but is designed to fit into the - * current TessBaseAPI implementation where the api has lots of state - * information that we might want to add in. - */ - bool AddImage(TessBaseAPI *api); - - /** - * Finishes the document and finalizes the output data - * Invalid if BeginDocument not yet called. - */ - bool EndDocument(); - - const char *file_extension() const { - return file_extension_; - } - const char *title() const { - return title_.c_str(); - } - - // Is everything fine? Otherwise something went wrong. - bool happy() const { - return happy_; - } - - /** - * Returns the index of the last image given to AddImage - * (i.e. images are incremented whether the image succeeded or not) - * - * This is always defined. It means either the number of the - * current image, the last image ended, or in the completed document - * depending on when in the document lifecycle you are looking at it. - * Will return -1 if a document was never started. - */ - int imagenum() const { - return imagenum_; - } - -protected: - /** - * Called by concrete classes. - * - * outputbase is the name of the output file excluding - * extension. For example, "/path/to/chocolate-chip-cookie-recipe" - * - * extension indicates the file extension to be used for output - * files. For example "pdf" will produce a .pdf file, and "hocr" - * will produce .hocr files. - */ - TessResultRenderer(const char *outputbase, const char *extension); - - // Hook for specialized handling in BeginDocument() - virtual bool BeginDocumentHandler(); - - // This must be overridden to render the OCR'd results - virtual bool AddImageHandler(TessBaseAPI *api) = 0; - - // Hook for specialized handling in EndDocument() - virtual bool EndDocumentHandler(); - - // Renderers can call this to append '\0' terminated strings into - // the output string returned by GetOutput. - // This method will grow the output buffer if needed. - void AppendString(const char *s); - - // Renderers can call this to append binary byte sequences into - // the output string returned by GetOutput. Note that s is not necessarily - // '\0' terminated (and can contain '\0' within it). - // This method will grow the output buffer if needed. - void AppendData(const char *s, int len); - -private: - TessResultRenderer *next_; // Can link multiple renderers together - FILE *fout_; // output file pointer - const char *file_extension_; // standard extension for generated output - std::string title_; // title of document being rendered - int imagenum_; // index of last image added - bool happy_; // I get grumpy when the disk fills up, etc. -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessTextRenderer : public TessResultRenderer { -public: - explicit TessTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into an hocr text string - */ -class TESS_API TessHOcrRenderer : public TessResultRenderer { -public: - explicit TessHOcrRenderer(const char *outputbase, bool font_info); - explicit TessHOcrRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into an alto text string - */ -class TESS_API TessAltoRenderer : public TessResultRenderer { -public: - explicit TessAltoRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool begin_document; -}; - -/** - * Renders Tesseract output into a TSV string - */ -class TESS_API TessTsvRenderer : public TessResultRenderer { -public: - explicit TessTsvRenderer(const char *outputbase, bool font_info); - explicit TessTsvRenderer(const char *outputbase); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - bool font_info_; // whether to print font information -}; - -/** - * Renders tesseract output into searchable PDF - */ -class TESS_API TessPDFRenderer : public TessResultRenderer { -public: - // datadir is the location of the TESSDATA. We need it because - // we load a custom PDF font from this location. - TessPDFRenderer(const char *outputbase, const char *datadir, - bool textonly = false); - -protected: - bool BeginDocumentHandler() override; - bool AddImageHandler(TessBaseAPI *api) override; - bool EndDocumentHandler() override; - -private: - // We don't want to have every image in memory at once, - // so we store some metadata as we go along producing - // PDFs one page at a time. At the end, that metadata is - // used to make everything that isn't easily handled in a - // streaming fashion. - long int obj_; // counter for PDF objects - std::vector offsets_; // offset of every PDF object in bytes - std::vector pages_; // object number for every /Page object - std::string datadir_; // where to find the custom font - bool textonly_; // skip images if set - // Bookkeeping only. DIY = Do It Yourself. - void AppendPDFObjectDIY(size_t objectsize); - // Bookkeeping + emit data. - void AppendPDFObject(const char *data); - // Create the /Contents object for an entire page. - char *GetPDFTextObjects(TessBaseAPI *api, double width, double height); - // Turn an image into a PDF object. Only transcode if we have to. - static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum, - char **pdf_object, long int *pdf_object_size, - int jpg_quality); -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessUnlvRenderer : public TessResultRenderer { -public: - explicit TessUnlvRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string for LSTMBox - */ -class TESS_API TessLSTMBoxRenderer : public TessResultRenderer { -public: - explicit TessLSTMBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string - */ -class TESS_API TessBoxTextRenderer : public TessResultRenderer { -public: - explicit TessBoxTextRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -/** - * Renders tesseract output into a plain UTF-8 text string in WordStr format - */ -class TESS_API TessWordStrBoxRenderer : public TessResultRenderer { -public: - explicit TessWordStrBoxRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#ifndef DISABLED_LEGACY_ENGINE - -/** - * Renders tesseract output into an osd text string - */ -class TESS_API TessOsdRenderer : public TessResultRenderer { -public: - explicit TessOsdRenderer(const char *outputbase); - -protected: - bool AddImageHandler(TessBaseAPI *api) override; -}; - -#endif // ndef DISABLED_LEGACY_ENGINE - -} // namespace tesseract. - -#endif // TESSERACT_API_RENDERER_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/resultiterator.h deleted file mode 100644 index 3e4d5807..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/resultiterator.h +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: resultiterator.h -// Description: Iterator for tesseract results that is capable of -// iterating in proper reading order over Bi Directional -// (e.g. mixed Hebrew and English) text. -// Author: David Eger -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ -#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ - -#include "export.h" // for TESS_API, TESS_LOCAL -#include "ltrresultiterator.h" // for LTRResultIterator -#include "publictypes.h" // for PageIteratorLevel -#include "unichar.h" // for StrongScriptDirection - -#include // for std::pair -#include // for std::vector - -namespace tesseract { - -class TESS_API ResultIterator : public LTRResultIterator { -public: - static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); - - /** - * ResultIterator is copy constructible! - * The default copy constructor works just fine for us. - */ - ~ResultIterator() override = default; - - // ============= Moving around within the page ============. - /** - * Moves the iterator to point to the start of the page to begin - * an iteration. - */ - void Begin() override; - - /** - * Moves to the start of the next object at the given level in the - * page hierarchy in the appropriate reading order and returns false if - * the end of the page was reached. - * NOTE that RIL_SYMBOL will skip non-text blocks, but all other - * PageIteratorLevel level values will visit each non-text block once. - * Think of non text blocks as containing a single para, with a single line, - * with a single imaginary word. - * Calls to Next with different levels may be freely intermixed. - * This function iterates words in right-to-left scripts correctly, if - * the appropriate language has been loaded into Tesseract. - */ - bool Next(PageIteratorLevel level) override; - - /** - * IsAtBeginningOf() returns whether we're at the logical beginning of the - * given level. (as opposed to ResultIterator's left-to-right top-to-bottom - * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). - * For a full description, see pageiterator.h - */ - bool IsAtBeginningOf(PageIteratorLevel level) const override; - - /** - * Implement PageIterator's IsAtFinalElement correctly in a BiDi context. - * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we - * point at the last word in a paragraph. See PageIterator for full comment. - */ - bool IsAtFinalElement(PageIteratorLevel level, - PageIteratorLevel element) const override; - - // ============= Functions that refer to words only ============. - // Returns the number of blanks before the current word. - int BlanksBeforeWord() const; - - // ============= Accessing data ==============. - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - */ - virtual char *GetUTF8Text(PageIteratorLevel level) const; - - /** - * Returns the LSTM choices for every LSTM timestep for the current word. - */ - virtual std::vector>>> - *GetRawLSTMTimesteps() const; - virtual std::vector>> - *GetBestLSTMSymbolChoices() const; - - /** - * Return whether the current paragraph's dominant reading direction - * is left-to-right (as opposed to right-to-left). - */ - bool ParagraphIsLtr() const; - - // ============= Exposed only for testing =============. - - /** - * Yields the reading order as a sequence of indices and (optional) - * meta-marks for a set of words (given left-to-right). - * The meta marks are passed as negative values: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The next indexed word contains both left-to-right and - * right-to-left characters and was treated as neutral. - * - * For example, suppose we have five words in a text line, - * indexed [0,1,2,3,4] from the leftmost side of the text line. - * The following are all believable reading_orders: - * - * Left-to-Right (in ltr paragraph): - * { 0, 1, 2, 3, 4 } - * Left-to-Right (in rtl paragraph): - * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } - * Right-to-Left (in rtl paragraph): - * { 4, 3, 2, 1, 0 } - * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: - * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 } - */ - static void CalculateTextlineOrder( - bool paragraph_is_ltr, - const std::vector &word_dirs, - std::vector *reading_order); - - static const int kMinorRunStart; - static const int kMinorRunEnd; - static const int kComplexWord; - -protected: - /** - * We presume the data associated with the given iterator will outlive us. - * NB: This is private because it does something that is non-obvious: - * it resets to the beginning of the paragraph instead of staying wherever - * resit might have pointed. - */ - explicit ResultIterator(const LTRResultIterator &resit); - -private: - /** - * Calculates the current paragraph's dominant writing direction. - * Typically, members should use current_paragraph_ltr_ instead. - */ - bool CurrentParagraphIsLtr() const; - - /** - * Returns word indices as measured from resit->RestartRow() = index 0 - * for the reading order of words within a textline given an iterator - * into the middle of the text line. - * In addition to non-negative word indices, the following negative values - * may be inserted: - * kMinorRunStart Start of minor direction text. - * kMinorRunEnd End of minor direction text. - * kComplexWord The previous word contains both left-to-right and - * right-to-left characters and was treated as neutral. - */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *indices) const; - /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */ - void CalculateTextlineOrder(bool paragraph_is_ltr, - const LTRResultIterator &resit, - std::vector *ssd, - std::vector *indices) const; - - /** - * What is the index of the current word in a strict left-to-right reading - * of the row? - */ - int LTRWordIndex() const; - - /** - * Given an iterator pointing at a word, returns the logical reading order - * of blob indices for the word. - */ - void CalculateBlobOrder(std::vector *blob_indices) const; - - /** Precondition: current_paragraph_is_ltr_ is set. */ - void MoveToLogicalStartOfTextline(); - - /** - * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ - * are set. - */ - void MoveToLogicalStartOfWord(); - - /** Are we pointing at the final (reading order) symbol of the word? */ - bool IsAtFinalSymbolOfWord() const; - - /** Are we pointing at the first (reading order) symbol of the word? */ - bool IsAtFirstSymbolOfWord() const; - - /** - * Append any extra marks that should be appended to this word when printed. - * Mostly, these are Unicode BiDi control characters. - */ - void AppendSuffixMarks(std::string *text) const; - - /** Appends the current word in reading order to the given buffer.*/ - void AppendUTF8WordText(std::string *text) const; - - /** - * Appends the text of the current text line, *assuming this iterator is - * positioned at the beginning of the text line* This function - * updates the iterator to point to the first position past the text line. - * Each textline is terminated in a single newline character. - * If the textline ends a paragraph, it gets a second terminal newline. - */ - void IterateAndAppendUTF8TextlineText(std::string *text); - - /** - * Appends the text of the current paragraph in reading order - * to the given buffer. - * Each textline is terminated in a single newline character, and the - * paragraph gets an extra newline at the end. - */ - void AppendUTF8ParagraphText(std::string *text) const; - - /** Returns whether the bidi_debug flag is set to at least min_level. */ - bool BidiDebug(int min_level) const; - - bool current_paragraph_is_ltr_; - - /** - * Is the currently pointed-at character at the beginning of - * a minor-direction run? - */ - bool at_beginning_of_minor_run_; - - /** Is the currently pointed-at character in a minor-direction sequence? */ - bool in_minor_direction_; - - /** - * Should detected inter-word spaces be preserved, or "compressed" to a single - * space character (default behavior). - */ - bool preserve_interword_spaces_; -}; - -} // namespace tesseract. - -#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/unichar.h deleted file mode 100644 index 015109d7..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/unichar.h +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: unichar.h -// Description: Unicode character/ligature class. -// Author: Ray Smith -// -// (C) Copyright 2006, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_CCUTIL_UNICHAR_H_ -#define TESSERACT_CCUTIL_UNICHAR_H_ - -#include "export.h" - -#include -#include -#include -#include - -namespace tesseract { - -// Maximum number of characters that can be stored in a UNICHAR. Must be -// at least 4. Must not exceed 31 without changing the coding of length. -#define UNICHAR_LEN 30 - -// A UNICHAR_ID is the unique id of a unichar. -using UNICHAR_ID = int; - -// A variable to indicate an invalid or uninitialized unichar id. -static const int INVALID_UNICHAR_ID = -1; -// A special unichar that corresponds to INVALID_UNICHAR_ID. -static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__"; - -enum StrongScriptDirection { - DIR_NEUTRAL = 0, // Text contains only neutral characters. - DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters. - DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters. - DIR_MIX = 3, // Text contains a mixture of left-to-right - // and right-to-left characters. -}; - -using char32 = signed int; - -// The UNICHAR class holds a single classification result. This may be -// a single Unicode character (stored as between 1 and 4 utf8 bytes) or -// multiple Unicode characters representing the NFKC expansion of a ligature -// such as fi, ffl etc. These are also stored as utf8. -class TESS_API UNICHAR { -public: - UNICHAR() { - memset(chars, 0, UNICHAR_LEN); - } - - // Construct from a utf8 string. If len<0 then the string is null terminated. - // If the string is too long to fit in the UNICHAR then it takes only what - // will fit. - UNICHAR(const char *utf8_str, int len); - - // Construct from a single UCS4 character. - explicit UNICHAR(int unicode); - - // Default copy constructor and operator= are OK. - - // Get the first character as UCS-4. - int first_uni() const; - - // Get the length of the UTF8 string. - int utf8_len() const { - int len = chars[UNICHAR_LEN - 1]; - return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; - } - - // Get a UTF8 string, but NOT nullptr terminated. - const char *utf8() const { - return chars; - } - - // Get a terminated UTF8 string: Must delete[] it after use. - char *utf8_str() const; - - // Get the number of bytes in the first character of the given utf8 string. - static int utf8_step(const char *utf8_str); - - // A class to simplify iterating over and accessing elements of a UTF8 - // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or - // take ownership of the underlying byte array. It also does not permit - // modification of the array (as the name suggests). - // - // Example: - // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len); - // it != UNICHAR::end(str, len); - // ++it) { - // printf("UCS-4 symbol code = %d\n", *it); - // char buf[5]; - // int char_len = it.get_utf8(buf); buf[char_len] = '\0'; - // printf("Char = %s\n", buf); - // } - class TESS_API const_iterator { - using CI = const_iterator; - - public: - // Step to the next UTF8 character. - // If the current position is at an illegal UTF8 character, then print an - // error message and step by one byte. If the current position is at a - // nullptr value, don't step past it. - const_iterator &operator++(); - - // Return the UCS-4 value at the current position. - // If the current position is at an illegal UTF8 value, return a single - // space character. - int operator*() const; - - // Store the UTF-8 encoding of the current codepoint into buf, which must be - // at least 4 bytes long. Return the number of bytes written. - // If the current position is at an illegal UTF8 value, writes a single - // space character and returns 1. - // Note that this method does not null-terminate the buffer. - int get_utf8(char *buf) const; - // Returns the number of bytes of the current codepoint. Returns 1 if the - // current position is at an illegal UTF8 value. - int utf8_len() const; - // Returns true if the UTF-8 encoding at the current position is legal. - bool is_legal() const; - - // Return the pointer into the string at the current position. - const char *utf8_data() const { - return it_; - } - - // Iterator equality operators. - friend bool operator==(const CI &lhs, const CI &rhs) { - return lhs.it_ == rhs.it_; - } - friend bool operator!=(const CI &lhs, const CI &rhs) { - return !(lhs == rhs); - } - - private: - friend class UNICHAR; - explicit const_iterator(const char *it) : it_(it) {} - - const char *it_; // Pointer into the string. - }; - - // Create a start/end iterator pointing to a string. Note that these methods - // are static and do NOT create a copy or take ownership of the underlying - // array. - static const_iterator begin(const char *utf8_str, int byte_length); - static const_iterator end(const char *utf8_str, int byte_length); - - // Converts a utf-8 string to a vector of unicodes. - // Returns an empty vector if the input contains invalid UTF-8. - static std::vector UTF8ToUTF32(const char *utf8_str); - // Converts a vector of unicodes to a utf8 string. - // Returns an empty string if the input contains an invalid unicode. - static std::string UTF32ToUTF8(const std::vector &str32); - -private: - // A UTF-8 representation of 1 or more Unicode characters. - // The last element (chars[UNICHAR_LEN - 1]) is a length if - // its value < UNICHAR_LEN, otherwise it is a genuine character. - char chars[UNICHAR_LEN]{}; -}; - -} // namespace tesseract - -#endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/version.h deleted file mode 100644 index 6bac5d66..00000000 --- a/third_party/ocr/tesseract-ocr/uos/mips64/include/tesseract/version.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// File: version.h -// Description: Version information -// -// (C) Copyright 2018, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef TESSERACT_API_VERSION_H_ -#define TESSERACT_API_VERSION_H_ - -// clang-format off - -#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ -#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ -#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ - -#define TESSERACT_VERSION \ - (TESSERACT_MAJOR_VERSION << 16 | \ - TESSERACT_MINOR_VERSION << 8 | \ - TESSERACT_MICRO_VERSION) - -#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" - -// clang-format on - -#endif // TESSERACT_API_VERSION_H_