diff --git a/build/linux/HGImgProc/HGImgProc.cbp b/build/linux/HGImgProc/HGImgProc.cbp
index 58319482..2713d21f 100644
--- a/build/linux/HGImgProc/HGImgProc.cbp
+++ b/build/linux/HGImgProc/HGImgProc.cbp
@@ -29,8 +29,6 @@
-
-
@@ -61,8 +59,6 @@
-
-
@@ -495,8 +491,6 @@
-
-
diff --git a/modules/imgproc/HGOCR.cpp b/modules/imgproc/HGOCR.cpp
index 5422b225..bcaf36c1 100644
--- a/modules/imgproc/HGOCR.cpp
+++ b/modules/imgproc/HGOCR.cpp
@@ -1,7 +1,9 @@
#include "HGOCR.h"
#include "HGOCRBase.hpp"
#include "HGOCRHanvon.hpp"
+#if defined(HG_CMP_MSC)
#include "HGOCRTesseract.hpp"
+#endif
#include "HGOCRRetImpl.hpp"
HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr)
@@ -13,6 +15,7 @@ HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr)
if (HGIMGPROC_OCRALGO_DEFAULT == algo)
{
+#if defined(HG_CMP_MSC)
HGOCRBase* ocrMgrImpl = new HGOCRHanvon;
HGResult ret = ocrMgrImpl->Init();
if (HGBASE_ERR_OK != ret)
@@ -26,6 +29,15 @@ HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr)
return ret;
}
}
+#else
+ HGOCRBase* ocrMgrImpl = new HGOCRHanvon;
+ HGResult ret = ocrMgrImpl->Init();
+ if (HGBASE_ERR_OK != ret)
+ {
+ delete ocrMgrImpl;
+ return ret;
+ }
+#endif
*ocrMgr = (HGOCRMgr)ocrMgrImpl;
return HGBASE_ERR_OK;
@@ -45,6 +57,7 @@ HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr)
}
else if (HGIMGPROC_OCRALGO_TESSERACT == algo)
{
+#if defined(HG_CMP_MSC)
HGOCRBase* ocrMgrImpl = new HGOCRTesseract;
HGResult ret = ocrMgrImpl->Init();
if (HGBASE_ERR_OK != ret)
@@ -55,6 +68,10 @@ HGResult HGAPI HGImgProc_CreateOCRMgr(HGUInt algo, HGOCRMgr* ocrMgr)
*ocrMgr = (HGOCRMgr)ocrMgrImpl;
return HGBASE_ERR_OK;
+#else
+ return HGBASE_ERR_INVALIDARG;
+#endif
+
}
return HGBASE_ERR_INVALIDARG;
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/baseapi.h
deleted file mode 100644
index 5e1e4830..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/baseapi.h
+++ /dev/null
@@ -1,812 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: baseapi.h
-// Description: Simple API for calling tesseract.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_BASEAPI_H_
-#define TESSERACT_API_BASEAPI_H_
-
-#ifdef HAVE_CONFIG_H
-# include "config_auto.h" // DISABLED_LEGACY_ENGINE
-#endif
-
-#include "export.h"
-#include "pageiterator.h"
-#include "publictypes.h"
-#include "resultiterator.h"
-#include "unichar.h"
-
-#include "version.h"
-
-#include
-#include // for std::vector
-
-struct Pix;
-struct Pixa;
-struct Boxa;
-
-namespace tesseract {
-
-class PAGE_RES;
-class ParagraphModel;
-class BLOCK_LIST;
-class ETEXT_DESC;
-struct OSResults;
-class UNICHARSET;
-
-class Dawg;
-class Dict;
-class EquationDetect;
-class PageIterator;
-class ImageThresholder;
-class LTRResultIterator;
-class ResultIterator;
-class MutableIterator;
-class TessResultRenderer;
-class Tesseract;
-
-// Function to read a std::vector from a whole file.
-// Returns false on failure.
-using FileReader = bool (*)(const char *filename, std::vector *data);
-
-using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,
- bool) const;
-using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,
- int, const char *, int);
-
-/**
- * Base class for all tesseract APIs.
- * Specific classes can add ability to work on different inputs or produce
- * different outputs.
- * This class is mostly an interface layer on top of the Tesseract instance
- * class to hide the data types so that users of this class don't have to
- * include any other Tesseract headers.
- */
-class TESS_API TessBaseAPI {
-public:
- TessBaseAPI();
- virtual ~TessBaseAPI();
- // Copy constructor and assignment operator are currently unsupported.
- TessBaseAPI(TessBaseAPI const &) = delete;
- TessBaseAPI &operator=(TessBaseAPI const &) = delete;
-
- /**
- * Returns the version identifier as a static string. Do not delete.
- */
- static const char *Version();
-
- /**
- * If compiled with OpenCL AND an available OpenCL
- * device is deemed faster than serial code, then
- * "device" is populated with the cl_device_id
- * and returns sizeof(cl_device_id)
- * otherwise *device=nullptr and returns 0.
- */
- static size_t getOpenCLDevice(void **device);
-
- /**
- * Set the name of the input file. Needed for training and
- * reading a UNLV zone file, and for searchable PDF output.
- */
- void SetInputName(const char *name);
- /**
- * These functions are required for searchable PDF output.
- * We need our hands on the input file so that we can include
- * it in the PDF without transcoding. If that is not possible,
- * we need the original image. Finally, resolution metadata
- * is stored in the PDF so we need that as well.
- */
- const char *GetInputName();
- // Takes ownership of the input pix.
- void SetInputImage(Pix *pix);
- Pix *GetInputImage();
- int GetSourceYResolution();
- const char *GetDatapath();
-
- /** Set the name of the bonus output files. Needed only for debugging. */
- void SetOutputName(const char *name);
-
- /**
- * Set the value of an internal "parameter."
- * Supply the name of the parameter and the value as a string, just as
- * you would in a config file.
- * Returns false if the name lookup failed.
- * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
- * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
- * SetVariable may be used before Init, but settings will revert to
- * defaults on End().
- *
- * Note: Must be called after Init(). Only works for non-init variables
- * (init variables should be passed to Init()).
- */
- bool SetVariable(const char *name, const char *value);
- bool SetDebugVariable(const char *name, const char *value);
-
- /**
- * Returns true if the parameter was found among Tesseract parameters.
- * Fills in value with the value of the parameter.
- */
- bool GetIntVariable(const char *name, int *value) const;
- bool GetBoolVariable(const char *name, bool *value) const;
- bool GetDoubleVariable(const char *name, double *value) const;
-
- /**
- * Returns the pointer to the string that represents the value of the
- * parameter if it was found among Tesseract parameters.
- */
- const char *GetStringVariable(const char *name) const;
-
-#ifndef DISABLED_LEGACY_ENGINE
-
- /**
- * Print Tesseract fonts table to the given file.
- */
- void PrintFontsTable(FILE *fp) const;
-
-#endif
-
- /**
- * Print Tesseract parameters to the given file.
- */
- void PrintVariables(FILE *fp) const;
-
- /**
- * Get value of named variable as a string, if it exists.
- */
- bool GetVariableAsString(const char *name, std::string *val) const;
-
- /**
- * Instances are now mostly thread-safe and totally independent,
- * but some global parameters remain. Basically it is safe to use multiple
- * TessBaseAPIs in different threads in parallel, UNLESS:
- * you use SetVariable on some of the Params in classify and textord.
- * If you do, then the effect will be to change it for all your instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure.
- * NOTE that the only members that may be called before Init are those
- * listed above here in the class definition.
- *
- * The datapath must be the name of the tessdata directory.
- * The language is (usually) an ISO 639-3 string or nullptr will default to
- * eng. It is entirely safe (and eventually will be efficient too) to call
- * Init multiple times on the same instance to change language, or just
- * to reset the classifier.
- * The language may be a string of the form [~][+[~]]* indicating
- * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
- * English. Languages may specify internally that they want to be loaded
- * with one or more other languages, so the ~ sign is available to override
- * that. Eg if hin were set to load eng by default, then hin+~eng would force
- * loading only hin. The number of loaded languages is limited only by
- * memory, with the caveat that loading additional languages will impact
- * both speed and accuracy, as there is more work to do to decide on the
- * applicable language, and there is more chance of hallucinating incorrect
- * words.
- * WARNING: On changing languages, all Tesseract parameters are reset
- * back to their default values. (Which may vary between languages.)
- * If you have a rare need to set a Variable that controls
- * initialization for a second call to Init you should explicitly
- * call End() and then use SetVariable before Init. This is only a very
- * rare use case, since there are very few uses that require any parameters
- * to be set before Init.
- *
- * If set_only_non_debug_params is true, only params that do not contain
- * "debug" in the name will be set.
- */
- int Init(const char *datapath, const char *language, OcrEngineMode mode,
- char **configs, int configs_size,
- const std::vector *vars_vec,
- const std::vector *vars_values,
- bool set_only_non_debug_params);
- int Init(const char *datapath, const char *language, OcrEngineMode oem) {
- return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
- }
- int Init(const char *datapath, const char *language) {
- return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
- false);
- }
- // In-memory version reads the traineddata file directly from the given
- // data[data_size] array, and/or reads data via a FileReader.
- int Init(const char *data, int data_size, const char *language,
- OcrEngineMode mode, char **configs, int configs_size,
- const std::vector *vars_vec,
- const std::vector *vars_values,
- bool set_only_non_debug_params, FileReader reader);
-
- /**
- * Returns the languages string used in the last valid initialization.
- * If the last initialization specified "deu+hin" then that will be
- * returned. If hin loaded eng automatically as well, then that will
- * not be included in this list. To find the languages actually
- * loaded use GetLoadedLanguagesAsVector.
- * The returned string should NOT be deleted.
- */
- const char *GetInitLanguagesAsString() const;
-
- /**
- * Returns the loaded languages in the vector of std::string.
- * Includes all languages loaded by the last Init, including those loaded
- * as dependencies of other loaded languages.
- */
- void GetLoadedLanguagesAsVector(std::vector *langs) const;
-
- /**
- * Returns the available languages in the sorted vector of std::string.
- */
- void GetAvailableLanguagesAsVector(std::vector *langs) const;
-
- /**
- * Init only for page layout analysis. Use only for calls to SetImage and
- * AnalysePage. Calls that attempt recognition will generate an error.
- */
- void InitForAnalysePage();
-
- /**
- * Read a "config" file containing a set of param, value pairs.
- * Searches the standard places: tessdata/configs, tessdata/tessconfigs
- * and also accepts a relative or absolute path name.
- * Note: only non-init params will be set (init params are set by Init()).
- */
- void ReadConfigFile(const char *filename);
- /** Same as above, but only set debug params from the given config file. */
- void ReadDebugConfigFile(const char *filename);
-
- /**
- * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
- * The mode is stored as an IntParam so it can also be modified by
- * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
- */
- void SetPageSegMode(PageSegMode mode);
-
- /** Return the current page segmentation mode. */
- PageSegMode GetPageSegMode() const;
-
- /**
- * Recognize a rectangle from an image and return the result as a string.
- * May be called many times for a single Init.
- * Currently has no error checking.
- * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
- * Palette color images will not work properly and must be converted to
- * 24 bit.
- * Binary images of 1 bit per pixel may also be given but they must be
- * byte packed with the MSB of the first byte being the first pixel, and a
- * 1 represents WHITE. For binary images set bytes_per_pixel=0.
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience interface.
- * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
- * and one or more of the Get*Text functions below.
- */
- char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
- int bytes_per_line, int left, int top, int width,
- int height);
-
- /**
- * Call between pages or documents etc to free up memory and forget
- * adaptive data.
- */
- void ClearAdaptiveClassifier();
-
- /**
- * @defgroup AdvancedAPI Advanced API
- * The following methods break TesseractRect into pieces, so you can
- * get hold of the thresholded image, get the text in different formats,
- * get bounding boxes, confidences etc.
- */
- /* @{ */
-
- /**
- * Provide an image for Tesseract to recognize. Format is as
- * TesseractRect above. Copies the image buffer and converts to Pix.
- * SetImage clears all recognition results, and sets the rectangle to the
- * full image, so it may be followed immediately by a GetUTF8Text, and it
- * will automatically perform recognition.
- */
- void SetImage(const unsigned char *imagedata, int width, int height,
- int bytes_per_pixel, int bytes_per_line);
-
- /**
- * Provide an image for Tesseract to recognize. As with SetImage above,
- * Tesseract takes its own copy of the image, so it need not persist until
- * after Recognize.
- * Pix vs raw, which to use?
- * Use Pix where possible. Tesseract uses Pix as its internal representation
- * and it is therefore more efficient to provide a Pix directly.
- */
- void SetImage(Pix *pix);
-
- /**
- * Set the resolution of the source image in pixels per inch so font size
- * information can be calculated in results. Call this after SetImage().
- */
- void SetSourceResolution(int ppi);
-
- /**
- * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
- * Each SetRectangle clears the recogntion results so multiple rectangles
- * can be recognized with the same image.
- */
- void SetRectangle(int left, int top, int width, int height);
-
- /**
- * Get a copy of the internal thresholded image from Tesseract.
- * Caller takes ownership of the Pix and must pixDestroy it.
- * May be called any time after SetImage, or after TesseractRect.
- */
- Pix *GetThresholdedImage();
-
- /**
- * Get the result of page layout analysis as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetRegions(Pixa **pixa);
-
- /**
- * Get the textlines as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If raw_image is true, then extract from the original image instead of the
- * thresholded image and pad by raw_padding pixels.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use. If paraids is not
- * nullptr, the paragraph-id of each line within its block is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- /*
- Helper method to extract from the thresholded image. (most common usage)
-*/
- Boxa *GetTextlines(Pixa **pixa, int **blockids) {
- return GetTextlines(false, 0, pixa, blockids, nullptr);
- }
-
- /**
- * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
- * pair, in reading order. Enables downstream handling of non-rectangular
- * regions.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetStrips(Pixa **pixa, int **blockids);
-
- /**
- * Get the words as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetWords(Pixa **pixa);
-
- /**
- * Gets the individual connected (text) components (created
- * after pages segmentation step, but before recognition)
- * as a leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * Note: the caller is responsible for calling boxaDestroy()
- * on the returned Boxa array and pixaDestroy() on cc array.
- */
- Boxa *GetConnectedComponents(Pixa **cc);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use.
- * If blockids is not nullptr, the paragraph-id of each component with its
- * block is also returned as an array of one element per component. delete []
- * after use. If raw_image is true, then portions of the original image are
- * extracted instead of the thresholded image and padded with raw_padding. If
- * text_only is true, then only text components are returned.
- */
- Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
- bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- // Helper function to get binary images with no padding (most common usage).
- Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
- Pixa **pixa, int **blockids) {
- return GetComponentImages(level, text_only, false, 0, pixa, blockids,
- nullptr);
- }
-
- /**
- * Returns the scale factor of the thresholded image that would be returned by
- * GetThresholdedImage() and the various GetX() methods that call
- * GetComponentImages().
- * Returns 0 if no thresholder has been set.
- */
- int GetThresholdedImageScaleFactor() const;
-
- /**
- * Runs page layout analysis in the mode set by SetPageSegMode.
- * May optionally be called prior to Recognize to get access to just
- * the page layout results. Returns an iterator to the results.
- * If merge_similar_words is true, words are combined where suitable for use
- * with a line recognizer. Use if you want to use AnalyseLayout to find the
- * textlines, and then want to process textline fragments with an external
- * line recognizer.
- * Returns nullptr on error or an empty page.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- PageIterator *AnalyseLayout();
- PageIterator *AnalyseLayout(bool merge_similar_words);
-
- /**
- * Recognize the image from SetAndThresholdImage, generating Tesseract
- * internal structures. Returns 0 on success.
- * Optional. The Get*Text functions below will call Recognize if needed.
- * After Recognize, the output is kept internally until the next SetImage.
- */
- int Recognize(ETEXT_DESC *monitor);
-
- /**
- * Methods to retrieve information after SetAndThresholdImage(),
- * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
- */
-
- /**
- * Turns images into symbolic text.
- *
- * filename can point to a single image, a multi-page TIFF,
- * or a plain text list of image filenames.
- *
- * retry_config is useful for debugging. If not nullptr, you can fall
- * back to an alternate configuration if a page fails for some
- * reason.
- *
- * timeout_millisec terminates processing if any single page
- * takes too long. Set to 0 for unlimited time.
- *
- * renderer is responible for creating the output. For example,
- * use the TessTextRenderer if you want plaintext output, or
- * the TessPDFRender to produce searchable PDF.
- *
- * If tessedit_page_number is non-negative, will only process that
- * single page. Works for multi-page tiff file, or filelist.
- *
- * Returns true if successful, false on error.
- */
- bool ProcessPages(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
- // Does the real work of ProcessPages.
- bool ProcessPagesInternal(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
-
- /**
- * Turn a single image into symbolic text.
- *
- * The pix is the image processed. filename and page_index are
- * metadata used by side-effect processes, such as reading a box
- * file or formatting as hOCR.
- *
- * See ProcessPages for descriptions of other parameters.
- */
- bool ProcessPage(Pix *pix, int page_index, const char *filename,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer);
-
- /**
- * Get a reading-order iterator to the results of LayoutAnalysis and/or
- * Recognize. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- ResultIterator *GetIterator();
-
- /**
- * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- MutableIterator *GetMutableIterator();
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- */
- char *GetUTF8Text();
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * monitor can be used to
- * cancel the recognition
- * receive progress callbacks
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(int page_number);
-
- /**
- * Make a TSV-formatted string from the internal data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetTSVText(int page_number);
-
- /**
- * Make a box file for LSTM training from the internal data structures.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetLSTMBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a box file used in training.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a WordStr box file used in training.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetWordStrBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UNLV format Latin-1 with specific reject and suspect codes.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetUNLVText();
-
- /**
- * Detect the orientation of the input image and apparent script (alphabet).
- * orient_deg is the detected clockwise rotation of the input image in degrees
- * (0, 90, 180, 270)
- * orient_conf is the confidence (15.0 is reasonably confident)
- * script_name is an ASCII string, the name of the script, e.g. "Latin"
- * script_conf is confidence level in the script
- * Returns true on success and writes values to each parameter as an output
- */
- bool DetectOrientationScript(int *orient_deg, float *orient_conf,
- const char **script_name, float *script_conf);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- * page_number is a 0-based page index that will appear in the osd file.
- */
- char *GetOsdText(int page_number);
-
- /** Returns the (average) confidence value between 0 and 100. */
- int MeanTextConf();
- /**
- * Returns all word confidences (between 0 and 100) in an array, terminated
- * by -1. The calling function must delete [] after use.
- * The number of confidences should correspond to the number of space-
- * delimited words in GetUTF8Text.
- */
- int *AllWordConfidences();
-
-#ifndef DISABLED_LEGACY_ENGINE
- /**
- * Applies the given word to the adaptive classifier if possible.
- * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
- * tell the boundaries of the graphemes.
- * Assumes that SetImage/SetRectangle have been used to set the image
- * to the given word. The mode arg should be PSM_SINGLE_WORD or
- * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
- * The currently set PageSegMode is preserved.
- * Returns false if adaption was not possible for some reason.
- */
- bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
-#endif // ndef DISABLED_LEGACY_ENGINE
-
- /**
- * Free up recognition results and any stored image data, without actually
- * freeing any recognition data that would be time-consuming to reload.
- * Afterwards, you must call SetImage or TesseractRect before doing
- * any Recognize or Get* operation.
- */
- void Clear();
-
- /**
- * Close down tesseract and free up all memory. End() is equivalent to
- * destructing and reconstructing your TessBaseAPI.
- * Once End() has been used, none of the other API functions may be used
- * other than Init and anything declared above it in the class definition.
- */
- void End();
-
- /**
- * Clear any library-level memory caches.
- * There are a variety of expensive-to-load constant data structures (mostly
- * language dictionaries) that are cached globally -- surviving the Init()
- * and End() of individual TessBaseAPI's. This function allows the clearing
- * of these caches.
- **/
- static void ClearPersistentCache();
-
- /**
- * Check whether a word is valid according to Tesseract's language model
- * @return 0 if the word is invalid, non-zero if valid.
- * @warning temporary! This function will be removed from here and placed
- * in a separate API at some future time.
- */
- int IsValidWord(const char *word) const;
- // Returns true if utf8_character is defined in the UniCharset.
- bool IsValidCharacter(const char *utf8_character) const;
-
- bool GetTextDirection(int *out_offset, float *out_slope);
-
- /** Sets Dict::letter_is_okay_ function to point to the given function. */
- void SetDictFunc(DictFunc f);
-
- /** Sets Dict::probability_in_context_ function to point to the given
- * function.
- */
- void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
-
- /**
- * Estimates the Orientation And Script of the image.
- * @return true if the image was processed successfully.
- */
- bool DetectOS(OSResults *);
-
- /**
- * Return text orientation of each block as determined by an earlier run
- * of layout analysis.
- */
- void GetBlockTextOrientations(int **block_orientation,
- bool **vertical_writing);
-
- /** This method returns the string form of the specified unichar. */
- const char *GetUnichar(int unichar_id) const;
-
- /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
- const Dawg *GetDawg(int i) const;
-
- /** Return the number of dawgs loaded into tesseract_ object. */
- int NumDawgs() const;
-
- Tesseract *tesseract() const {
- return tesseract_;
- }
-
- OcrEngineMode oem() const {
- return last_oem_requested_;
- }
-
- void set_min_orientation_margin(double margin);
- /* @} */
-
-protected:
- /** Common code for setting the image. Returns true if Init has been called.
- */
- bool InternalSetImage();
-
- /**
- * Run the thresholder to make the thresholded image. If pix is not nullptr,
- * the source is thresholded to pix instead of the internal IMAGE.
- */
- virtual bool Threshold(Pix **pix);
-
- /**
- * Find lines from the image making the BLOCK_LIST.
- * @return 0 on success.
- */
- int FindLines();
-
- /** Delete the pageres and block list ready for a new page. */
- void ClearResults();
-
- /**
- * Return an LTR Result Iterator -- used only for training, as we really want
- * to ignore all BiDi smarts at that point.
- * delete once you're done with it.
- */
- LTRResultIterator *GetLTRIterator();
-
- /**
- * Return the length of the output text string, as UTF8, assuming
- * one newline per line and one per block, with a terminator,
- * and assuming a single character reject marker for each rejected character.
- * Also return the number of recognized blobs in blob_count.
- */
- int TextLength(int *blob_count) const;
-
- //// paragraphs.cpp ////////////////////////////////////////////////////
- void DetectParagraphs(bool after_text_recognition);
-
- const PAGE_RES *GetPageRes() const {
- return page_res_;
- }
-
-protected:
- Tesseract *tesseract_; ///< The underlying data object.
- Tesseract *osd_tesseract_; ///< For orientation & script detection.
- EquationDetect *equ_detect_; ///< The equation detector.
- FileReader reader_; ///< Reads files from any filesystem.
- ImageThresholder *thresholder_; ///< Image thresholding module.
- std::vector *paragraph_models_;
- BLOCK_LIST *block_list_; ///< The page layout.
- PAGE_RES *page_res_; ///< The page-level data.
- std::string input_file_; ///< Name used by training code.
- std::string output_file_; ///< Name used by debug code.
- std::string datapath_; ///< Current location of tessdata.
- std::string language_; ///< Last initialized language.
- OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
- bool recognition_done_; ///< page_res_ contains recognition data.
-
- /**
- * @defgroup ThresholderParams Thresholder Parameters
- * Parameters saved from the Thresholder. Needed to rebuild coordinates.
- */
- /* @{ */
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
- int image_width_;
- int image_height_;
- /* @} */
-
-private:
- // A list of image filenames gets special consideration
- bool ProcessPagesFileList(FILE *fp, std::string *buf,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
- // TIFF supports multipage so gets special consideration.
- bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
- const char *filename, const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
-}; // class TessBaseAPI.
-
-/** Escape a char string - remove &<>"' with HTML codes. */
-std::string HOcrEscape(const char *text);
-
-} // namespace tesseract
-
-#endif // TESSERACT_API_BASEAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/capi.h
deleted file mode 100644
index 40f4856a..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/capi.h
+++ /dev/null
@@ -1,484 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: capi.h
-// Description: C-API TessBaseAPI
-//
-// (C) Copyright 2012, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef API_CAPI_H_
-#define API_CAPI_H_
-
-#include "export.h"
-
-#ifdef __cplusplus
-# include
-# include
-# include
-# include
-# include
-#endif
-
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef BOOL
-# define BOOL int
-# define TRUE 1
-# define FALSE 0
-#endif
-
-#ifdef __cplusplus
-typedef tesseract::TessResultRenderer TessResultRenderer;
-typedef tesseract::TessBaseAPI TessBaseAPI;
-typedef tesseract::PageIterator TessPageIterator;
-typedef tesseract::ResultIterator TessResultIterator;
-typedef tesseract::MutableIterator TessMutableIterator;
-typedef tesseract::ChoiceIterator TessChoiceIterator;
-typedef tesseract::OcrEngineMode TessOcrEngineMode;
-typedef tesseract::PageSegMode TessPageSegMode;
-typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
-typedef tesseract::Orientation TessOrientation;
-typedef tesseract::ParagraphJustification TessParagraphJustification;
-typedef tesseract::WritingDirection TessWritingDirection;
-typedef tesseract::TextlineOrder TessTextlineOrder;
-typedef tesseract::PolyBlockType TessPolyBlockType;
-typedef tesseract::ETEXT_DESC ETEXT_DESC;
-#else
-typedef struct TessResultRenderer TessResultRenderer;
-typedef struct TessBaseAPI TessBaseAPI;
-typedef struct TessPageIterator TessPageIterator;
-typedef struct TessResultIterator TessResultIterator;
-typedef struct TessMutableIterator TessMutableIterator;
-typedef struct TessChoiceIterator TessChoiceIterator;
-typedef enum TessOcrEngineMode {
- OEM_TESSERACT_ONLY,
- OEM_LSTM_ONLY,
- OEM_TESSERACT_LSTM_COMBINED,
- OEM_DEFAULT
-} TessOcrEngineMode;
-typedef enum TessPageSegMode {
- PSM_OSD_ONLY,
- PSM_AUTO_OSD,
- PSM_AUTO_ONLY,
- PSM_AUTO,
- PSM_SINGLE_COLUMN,
- PSM_SINGLE_BLOCK_VERT_TEXT,
- PSM_SINGLE_BLOCK,
- PSM_SINGLE_LINE,
- PSM_SINGLE_WORD,
- PSM_CIRCLE_WORD,
- PSM_SINGLE_CHAR,
- PSM_SPARSE_TEXT,
- PSM_SPARSE_TEXT_OSD,
- PSM_RAW_LINE,
- PSM_COUNT
-} TessPageSegMode;
-typedef enum TessPageIteratorLevel {
- RIL_BLOCK,
- RIL_PARA,
- RIL_TEXTLINE,
- RIL_WORD,
- RIL_SYMBOL
-} TessPageIteratorLevel;
-typedef enum TessPolyBlockType {
- PT_UNKNOWN,
- PT_FLOWING_TEXT,
- PT_HEADING_TEXT,
- PT_PULLOUT_TEXT,
- PT_EQUATION,
- PT_INLINE_EQUATION,
- PT_TABLE,
- PT_VERTICAL_TEXT,
- PT_CAPTION_TEXT,
- PT_FLOWING_IMAGE,
- PT_HEADING_IMAGE,
- PT_PULLOUT_IMAGE,
- PT_HORZ_LINE,
- PT_VERT_LINE,
- PT_NOISE,
- PT_COUNT
-} TessPolyBlockType;
-typedef enum TessOrientation {
- ORIENTATION_PAGE_UP,
- ORIENTATION_PAGE_RIGHT,
- ORIENTATION_PAGE_DOWN,
- ORIENTATION_PAGE_LEFT
-} TessOrientation;
-typedef enum TessParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT
-} TessParagraphJustification;
-typedef enum TessWritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT,
- WRITING_DIRECTION_RIGHT_TO_LEFT,
- WRITING_DIRECTION_TOP_TO_BOTTOM
-} TessWritingDirection;
-typedef enum TessTextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT,
- TEXTLINE_ORDER_RIGHT_TO_LEFT,
- TEXTLINE_ORDER_TOP_TO_BOTTOM
-} TessTextlineOrder;
-typedef struct ETEXT_DESC ETEXT_DESC;
-#endif
-
-typedef bool (*TessCancelFunc)(void *cancel_this, int words);
-typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,
- int bottom);
-
-struct Pix;
-struct Boxa;
-struct Pixa;
-
-/* General free functions */
-
-TESS_API const char *TessVersion();
-TESS_API void TessDeleteText(const char *text);
-TESS_API void TessDeleteTextArray(char **arr);
-TESS_API void TessDeleteIntArray(const int *arr);
-
-/* Renderer API */
-TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,
- BOOL font_info);
-TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,
- const char *datadir,
- BOOL textonly);
-TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(
- const char *outputbase);
-
-TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
-TESS_API void TessResultRendererInsert(TessResultRenderer *renderer,
- TessResultRenderer *next);
-TESS_API TessResultRenderer *TessResultRendererNext(
- TessResultRenderer *renderer);
-TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,
- const char *title);
-TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,
- TessBaseAPI *api);
-TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);
-
-TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);
-TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
-TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);
-
-/* Base API */
-
-TESS_API TessBaseAPI *TessBaseAPICreate();
-TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);
-
-TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);
-
-TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
-TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
-TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
-TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);
-
-TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-
-TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,
- const char *name, int *value);
-TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,
- const char *name, BOOL *value);
-TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,
- const char *name, double *value);
-TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,
- const char *name);
-
-TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);
-TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,
- const char *filename);
-
-TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem,
- char **configs, int configs_size);
-TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem);
-TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,
- const char *language);
-
-TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API const char *TessBaseAPIGetInitLanguagesAsString(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(
- const TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,
- const char *filename);
-TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,
- const char *filename);
-
-TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,
- TessPageSegMode mode);
-TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIRect(TessBaseAPI *handle,
- const unsigned char *imagedata,
- int bytes_per_pixel, int bytes_per_line,
- int left, int top, int width, int height);
-
-TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetImage(TessBaseAPI *handle,
- const unsigned char *imagedata, int width,
- int height, int bytes_per_pixel,
- int bytes_per_line);
-TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);
-
-TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);
-
-TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,
- int width, int height);
-
-TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
-TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,
- BOOL raw_image, int raw_padding,
- struct Pixa **pixa,
- int **blockids, int **paraids);
-TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,
- struct Pixa **pixa, int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,
- struct Pixa **cc);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
- TessPageIteratorLevel level,
- BOOL text_only,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages1(
- TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,
- BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,
- int **paraids);
-
-TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
- const TessBaseAPI *handle);
-
-TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);
-
-TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,
- int page_index, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-
-TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
-TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(
- TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);
-TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,
- int page_number);
-
-TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
-TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
-
-TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,
- TessPageSegMode mode,
- const char *wordstr);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
-TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
-TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,
- float *out_slope);
-
-TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);
-
-TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-// Call TessDeleteText(*best_script_name) to free memory allocated by this
-// function
-TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,
- int *orient_deg,
- float *orient_conf,
- const char **script_name,
- float *script_conf);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,
- double margin);
-
-TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);
-
-TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);
-
-TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,
- int **block_orientation,
- bool **vertical_writing);
-
-/* Page iterator */
-
-TESS_API void TessPageIteratorDelete(TessPageIterator *handle);
-
-TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);
-
-TESS_API void TessPageIteratorBegin(TessPageIterator *handle);
-
-TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- TessPageIteratorLevel element);
-
-TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int *left, int *top, int *right,
- int *bottom);
-
-TESS_API TessPolyBlockType
-TessPageIteratorBlockType(const TessPageIterator *handle);
-
-TESS_API struct Pix *TessPageIteratorGetBinaryImage(
- const TessPageIterator *handle, TessPageIteratorLevel level);
-
-TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int padding,
- struct Pix *original_image,
- int *left, int *top);
-
-TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,
- TessPageIteratorLevel level, int *x1,
- int *y1, int *x2, int *y2);
-
-TESS_API void TessPageIteratorOrientation(
- TessPageIterator *handle, TessOrientation *orientation,
- TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,
- float *deskew_angle);
-
-TESS_API void TessPageIteratorParagraphInfo(
- TessPageIterator *handle, TessParagraphJustification *justification,
- BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
-
-/* Result iterator */
-
-TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
-TESS_API TessResultIterator *TessResultIteratorCopy(
- const TessResultIterator *handle);
-TESS_API TessPageIterator *TessResultIteratorGetPageIterator(
- TessResultIterator *handle);
-TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
- const TessResultIterator *handle);
-TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(
- const TessResultIterator *handle);
-
-TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API const char *TessResultIteratorWordRecognitionLanguage(
- const TessResultIterator *handle);
-TESS_API const char *TessResultIteratorWordFontAttributes(
- const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic,
- BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,
- int *pointsize, int *font_id);
-
-TESS_API BOOL
-TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
-TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);
-
-TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);
-TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
-TESS_API const char *TessChoiceIteratorGetUTF8Text(
- const TessChoiceIterator *handle);
-TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);
-
-/* Progress monitor */
-
-TESS_API ETEXT_DESC *TessMonitorCreate();
-TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,
- TessCancelFunc cancelFunc);
-TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
-TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,
- TessProgressFunc progressFunc);
-TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // API_CAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/export.h
deleted file mode 100644
index d238b628..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/export.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: export.h
-// Description: Place holder
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_PLATFORM_H_
-#define TESSERACT_PLATFORM_H_
-
-#ifndef TESS_API
-# if defined(_WIN32) || defined(__CYGWIN__)
-# if defined(TESS_EXPORTS)
-# define TESS_API __declspec(dllexport)
-# elif defined(TESS_IMPORTS)
-# define TESS_API __declspec(dllimport)
-# else
-# define TESS_API
-# endif
-# else
-# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
-# define TESS_API __attribute__((visibility("default")))
-# else
-# define TESS_API
-# endif
-# endif
-#endif
-
-#endif // TESSERACT_PLATFORM_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ltrresultiterator.h
deleted file mode 100644
index 6ca0a98e..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ltrresultiterator.h
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: ltrresultiterator.h
-// Description: Iterator for tesseract results in strict left-to-right
-// order that avoids using tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API
-#include "pageiterator.h" // for PageIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-namespace tesseract {
-
-class BLOB_CHOICE_IT;
-class PAGE_RES;
-class WERD_RES;
-
-class Tesseract;
-
-// Class to iterate over tesseract results, providing access to all levels
-// of the page hierarchy, without including any tesseract headers or having
-// to handle any tesseract structures.
-// WARNING! This class points to data held within the TessBaseAPI class, and
-// therefore can only be used while the TessBaseAPI class still exists and
-// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
-// DetectOS, or anything else that changes the internal PAGE_RES.
-// See tesseract/publictypes.h for the definition of PageIteratorLevel.
-// See also base class PageIterator, which contains the bulk of the interface.
-// LTRResultIterator adds text-specific methods for access to OCR output.
-
-class TESS_API LTRResultIterator : public PageIterator {
- friend class ChoiceIterator;
-
-public:
- // page_res and tesseract come directly from the BaseAPI.
- // The rectangle parameters are copied indirectly from the Thresholder,
- // via the BaseAPI. They represent the coordinates of some rectangle in an
- // original image (in top-left-origin coordinates) and therefore the top-left
- // needs to be added to any output boxes in order to specify coordinates
- // in the original image. See TessBaseAPI::SetRectangle.
- // The scale and scaled_yres are in case the Thresholder scaled the image
- // rectangle prior to thresholding. Any coordinates in tesseract's image
- // must be divided by scale before adding (rect_left, rect_top).
- // The scaled_yres indicates the effective resolution of the binary image
- // that tesseract has been given by the Thresholder.
- // After the constructor, Begin has already been called.
- LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top,
- int rect_width, int rect_height);
-
- ~LTRResultIterator() override;
-
- // LTRResultIterators may be copied! This makes it possible to iterate over
- // all the objects at a lower level, while maintaining an iterator to
- // objects at a higher level. These constructors DO NOT CALL Begin, so
- // iterations will continue from the location of src.
- // TODO: For now the copy constructor and operator= only need the base class
- // versions, but if new data members are added, don't forget to add them!
-
- // ============= Moving around within the page ============.
-
- // See PageIterator.
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // object at the given level. Use delete [] to free after use.
- char *GetUTF8Text(PageIteratorLevel level) const;
-
- // Set the string inserted at the end of each text line. "\n" by default.
- void SetLineSeparator(const char *new_line);
-
- // Set the string inserted at the end of each paragraph. "\n" by default.
- void SetParagraphSeparator(const char *new_para);
-
- // Returns the mean confidence of the current object at the given level.
- // The number should be interpreted as a percent probability. (0.0f-100.0f)
- float Confidence(PageIteratorLevel level) const;
-
- // ============= Functions that refer to words only ============.
-
- // Returns the font attributes of the current word. If iterating at a higher
- // level object than words, eg textlines, then this will return the
- // attributes of the first word in that textline.
- // The actual return value is a string representing a font name. It points
- // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
- // the iterator itself, ie rendered invalid by various members of
- // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
- // Pointsize is returned in printers points (1/72 inch.)
- const char *WordFontAttributes(bool *is_bold, bool *is_italic,
- bool *is_underlined, bool *is_monospace,
- bool *is_serif, bool *is_smallcaps,
- int *pointsize, int *font_id) const;
-
- // Return the name of the language used to recognize this word.
- // On error, nullptr. Do not delete this pointer.
- const char *WordRecognitionLanguage() const;
-
- // Return the overall directionality of this word.
- StrongScriptDirection WordDirection() const;
-
- // Returns true if the current word was found in a dictionary.
- bool WordIsFromDictionary() const;
-
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // Returns true if the current word is numeric.
- bool WordIsNumeric() const;
-
- // Returns true if the word contains blamer information.
- bool HasBlamerInfo() const;
-
- // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
- // of the current word.
- const void *GetParamsTrainingBundle() const;
-
- // Returns a pointer to the string with blamer information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerDebug() const;
-
- // Returns a pointer to the string with misadaption information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerMisadaptionDebug() const;
-
- // Returns true if a truth string was recorded for the current word.
- bool HasTruthString() const;
-
- // Returns true if the given string is equivalent to the truth string for
- // the current word.
- bool EquivalentToTruth(const char *str) const;
-
- // Returns a null terminated UTF-8 encoded truth string for the current word.
- // Use delete [] to free after use.
- char *WordTruthUTF8Text() const;
-
- // Returns a null terminated UTF-8 encoded normalized OCR string for the
- // current word. Use delete [] to free after use.
- char *WordNormedUTF8Text() const;
-
- // Returns a pointer to serialized choice lattice.
- // Fills lattice_size with the number of bytes in lattice data.
- const char *WordLattice(int *lattice_size) const;
-
- // ============= Functions that refer to symbols only ============.
-
- // Returns true if the current symbol is a superscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSuperscript() const;
- // Returns true if the current symbol is a subscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSubscript() const;
- // Returns true if the current symbol is a dropcap.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsDropcap() const;
-
-protected:
- const char *line_separator_;
- const char *paragraph_separator_;
-};
-
-// Class to iterate over the classifier choices for a single RIL_SYMBOL.
-class TESS_API ChoiceIterator {
-public:
- // Construction is from a LTRResultIterator that points to the symbol of
- // interest. The ChoiceIterator allows a one-shot iteration over the
- // choices for this symbol and after that it is useless.
- explicit ChoiceIterator(const LTRResultIterator &result_it);
- ~ChoiceIterator();
-
- // Moves to the next choice for the symbol and returns false if there
- // are none left.
- bool Next();
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // choice.
- // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
- // internal structure and should NOT be delete[]ed to free after use.
- const char *GetUTF8Text() const;
-
- // Returns the confidence of the current choice depending on the used language
- // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
- // choices for one symbol should roughly add up to 1.0f.
- // If only traineddata of the legacy engine is used, the number should be
- // interpreted as a percent probability. (0.0f-100.0f) In this case
- // probabilities won't add up to 100. Each one stands on its own.
- float Confidence() const;
-
- // Returns a vector containing all timesteps, which belong to the currently
- // selected symbol. A timestep is a vector containing pairs of symbols and
- // floating point numbers. The number states the probability for the
- // corresponding symbol.
- std::vector>> *Timesteps() const;
-
-private:
- // clears the remaining spaces out of the results and adapt the probabilities
- void filterSpaces();
- // Pointer to the WERD_RES object owned by the API.
- WERD_RES *word_res_;
- // Iterator over the blob choices.
- BLOB_CHOICE_IT *choice_it_;
- std::vector> *LSTM_choices_ = nullptr;
- std::vector>::iterator LSTM_choice_it_;
-
- const int *tstep_index_;
- // regulates the rating granularity
- double rating_coefficient_;
- // leading blanks
- int blanks_before_word_;
- // true when there is lstm engine related trained data
- bool oemLSTM_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ocrclass.h
deleted file mode 100644
index a55e6528..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/ocrclass.h
+++ /dev/null
@@ -1,158 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-/**********************************************************************
- * File: ocrclass.h
- * Description: Class definitions and constants for the OCR API.
- * Author: Hewlett-Packard Co
- *
- * (C) Copyright 1996, Hewlett-Packard Co.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-/**********************************************************************
- * This file contains typedefs for all the structures used by
- * the HP OCR interface.
- * The structures are designed to allow them to be used with any
- * structure alignment up to 8.
- **********************************************************************/
-
-#ifndef CCUTIL_OCRCLASS_H_
-#define CCUTIL_OCRCLASS_H_
-
-#include
-#include
-
-namespace tesseract {
-
-/**********************************************************************
- * EANYCODE_CHAR
- * Description of a single character. The character code is defined by
- * the character set of the current font.
- * Output text is sent as an array of these structures.
- * Spaces and line endings in the output are represented in the
- * structures of the surrounding characters. They are not directly
- * represented as characters.
- * The first character in a word has a positive value of blanks.
- * Missing information should be set to the defaults in the comments.
- * If word bounds are known, but not character bounds, then the top and
- * bottom of each character should be those of the word. The left of the
- * first and right of the last char in each word should be set. All other
- * lefts and rights should be set to -1.
- * If set, the values of right and bottom are left+width and top+height.
- * Most of the members come directly from the parameters to ocr_append_char.
- * The formatting member uses the enhancement parameter and combines the
- * line direction stuff into the top 3 bits.
- * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
- * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
- * the coding is, only that it is backwards compatible with the previous
- * version.
- **********************************************************************/
-
-struct EANYCODE_CHAR { /*single character */
- // It should be noted that the format for char_code for version 2.0 and beyond
- // is UTF8 which means that ASCII characters will come out as one structure
- // but other characters will be returned in two or more instances of this
- // structure with a single byte of the UTF8 code in each, but each will have
- // the same bounding box. Programs which want to handle languagues with
- // different characters sets will need to handle extended characters
- // appropriately, but *all* code needs to be prepared to receive UTF8 coded
- // characters for characters such as bullet and fancy quotes.
- uint16_t char_code; /*character itself */
- int16_t left; /*of char (-1) */
- int16_t right; /*of char (-1) */
- int16_t top; /*of char (-1) */
- int16_t bottom; /*of char (-1) */
- int16_t font_index; /*what font (0) */
- uint8_t confidence; /*0=perfect, 100=reject (0/100) */
- uint8_t point_size; /*of char, 72=i inch, (10) */
- int8_t blanks; /*no of spaces before this char (1) */
- uint8_t formatting; /*char formatting (0) */
-};
-
-/**********************************************************************
- * ETEXT_DESC
- * Description of the output of the OCR engine.
- * This structure is used as both a progress monitor and the final
- * output header, since it needs to be a valid progress monitor while
- * the OCR engine is storing its output to shared memory.
- * During progress, all the buffer info is -1.
- * Progress starts at 0 and increases to 100 during OCR. No other constraint.
- * Additionally the progress callback contains the bounding box of the word that
- * is currently being processed.
- * Every progress callback, the OCR engine must set ocr_alive to 1.
- * The HP side will set ocr_alive to 0. Repeated failure to reset
- * to 1 indicates that the OCR engine is dead.
- * If the cancel function is not null then it is called with the number of
- * user words found. If it returns true then operation is cancelled.
- **********************************************************************/
-class ETEXT_DESC;
-
-using CANCEL_FUNC = bool (*)(void *, int);
-using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
-using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);
-
-class ETEXT_DESC { // output header
-public:
- int16_t count{0}; /// chars in this buffer(0)
- int16_t progress{0}; /// percent complete increasing (0-100)
- /** Progress monitor covers word recognition and it does not cover layout
- * analysis.
- * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */
- int8_t more_to_come{0}; /// true if not last
- volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0
- int8_t err_code{0}; /// for errcode use
- CANCEL_FUNC cancel{nullptr}; /// returns true to cancel
- PROGRESS_FUNC progress_callback{
- nullptr}; /// called whenever progress increases
- PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback
- void *cancel_this{nullptr}; /// this or other data for cancel
- std::chrono::steady_clock::time_point end_time;
- /// Time to stop. Expected to be set only
- /// by call to set_deadline_msecs().
- EANYCODE_CHAR text[1]{}; /// character data
-
- ETEXT_DESC() : progress_callback2(&default_progress_func) {
- end_time = std::chrono::time_point();
- }
-
- // Sets the end time to be deadline_msecs milliseconds from now.
- void set_deadline_msecs(int32_t deadline_msecs) {
- if (deadline_msecs > 0) {
- end_time = std::chrono::steady_clock::now() +
- std::chrono::milliseconds(deadline_msecs);
- }
- }
-
- // Returns false if we've not passed the end_time, or have not set a deadline.
- bool deadline_exceeded() const {
- if (end_time.time_since_epoch() ==
- std::chrono::steady_clock::duration::zero()) {
- return false;
- }
- auto now = std::chrono::steady_clock::now();
- return (now > end_time);
- }
-
-private:
- static bool default_progress_func(ETEXT_DESC *ths, int left, int right,
- int top, int bottom) {
- if (ths->progress_callback != nullptr) {
- return (*(ths->progress_callback))(ths->progress, left, right, top,
- bottom);
- }
- return true;
- }
-};
-
-} // namespace tesseract
-
-#endif // CCUTIL_OCRCLASS_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/osdetect.h
deleted file mode 100644
index 34bfb557..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/osdetect.h
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: osdetect.h
-// Description: Orientation and script detection.
-// Author: Samuel Charron
-// Ranjith Unnikrishnan
-//
-// (C) Copyright 2008, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_OSDETECT_H_
-#define TESSERACT_CCMAIN_OSDETECT_H_
-
-#include "export.h" // for TESS_API
-
-#include // for std::vector
-
-namespace tesseract {
-
-class BLOBNBOX;
-class BLOBNBOX_CLIST;
-class BLOB_CHOICE_LIST;
-class TO_BLOCK_LIST;
-class UNICHARSET;
-
-class Tesseract;
-
-// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur
-const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;
-
-struct OSBestResult {
- OSBestResult()
- : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}
- int orientation_id;
- int script_id;
- float sconfidence;
- float oconfidence;
-};
-
-struct OSResults {
- OSResults() : unicharset(nullptr) {
- for (int i = 0; i < 4; ++i) {
- for (int j = 0; j < kMaxNumberOfScripts; ++j) {
- scripts_na[i][j] = 0;
- }
- orientations[i] = 0;
- }
- }
- void update_best_orientation();
- // Set the estimate of the orientation to the given id.
- void set_best_orientation(int orientation_id);
- // Update/Compute the best estimate of the script assuming the given
- // orientation id.
- void update_best_script(int orientation_id);
- // Return the index of the script with the highest score for this orientation.
- TESS_API int get_best_script(int orientation_id) const;
- // Accumulate scores with given OSResults instance and update the best script.
- void accumulate(const OSResults &osr);
-
- // Print statistics.
- void print_scores(void) const;
- void print_scores(int orientation_id) const;
-
- // Array holding scores for each orientation id [0,3].
- // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
- // page respectively, where the values refer to the amount of clockwise
- // rotation to be applied to the page for the text to be upright and readable.
- float orientations[4];
- // Script confidence scores for each of 4 possible orientations.
- float scripts_na[4][kMaxNumberOfScripts];
-
- UNICHARSET *unicharset;
- OSBestResult best_result;
-};
-
-class OrientationDetector {
-public:
- OrientationDetector(const std::vector *allowed_scripts,
- OSResults *results);
- bool detect_blob(BLOB_CHOICE_LIST *scores);
- int get_orientation();
-
-private:
- OSResults *osr_;
- const std::vector *allowed_scripts_;
-};
-
-class ScriptDetector {
-public:
- ScriptDetector(const std::vector *allowed_scripts, OSResults *osr,
- tesseract::Tesseract *tess);
- void detect_blob(BLOB_CHOICE_LIST *scores);
- bool must_stop(int orientation) const;
-
-private:
- OSResults *osr_;
- static const char *korean_script_;
- static const char *japanese_script_;
- static const char *fraktur_script_;
- int korean_id_;
- int japanese_id_;
- int katakana_id_;
- int hiragana_id_;
- int han_id_;
- int hangul_id_;
- int latin_id_;
- int fraktur_id_;
- tesseract::Tesseract *tess_;
- const std::vector *allowed_scripts_;
-};
-
-int orientation_and_script_detection(const char *filename, OSResults *,
- tesseract::Tesseract *);
-
-int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,
- tesseract::Tesseract *tess);
-
-int os_detect_blobs(const std::vector *allowed_scripts,
- BLOBNBOX_CLIST *blob_list, OSResults *osr,
- tesseract::Tesseract *tess);
-
-bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,
- OSResults *, tesseract::Tesseract *tess);
-
-// Helper method to convert an orientation index to its value in degrees.
-// The value represents the amount of clockwise rotation in degrees that must be
-// applied for the text to be upright (readable).
-TESS_API int OrientationIdToValue(const int &id);
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCMAIN_OSDETECT_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/pageiterator.h
deleted file mode 100644
index 68739715..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/pageiterator.h
+++ /dev/null
@@ -1,364 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: pageiterator.h
-// Description: Iterator for tesseract page structure that avoids using
-// tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
-#define TESSERACT_CCMAIN_PAGEITERATOR_H_
-
-#include "export.h"
-#include "publictypes.h"
-
-struct Pix;
-struct Pta;
-
-namespace tesseract {
-
-struct BlamerBundle;
-class C_BLOB_IT;
-class PAGE_RES;
-class PAGE_RES_IT;
-class WERD;
-
-class Tesseract;
-
-/**
- * Class to iterate over tesseract page structure, providing access to all
- * levels of the page hierarchy, without including any tesseract headers or
- * having to handle any tesseract structures.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- * See tesseract/publictypes.h for the definition of PageIteratorLevel.
- * See also ResultIterator, derived from PageIterator, which adds in the
- * ability to access OCR output with text-specific methods.
- */
-
-class TESS_API PageIterator {
-public:
- /**
- * page_res and tesseract come directly from the BaseAPI.
- * The rectangle parameters are copied indirectly from the Thresholder,
- * via the BaseAPI. They represent the coordinates of some rectangle in an
- * original image (in top-left-origin coordinates) and therefore the top-left
- * needs to be added to any output boxes in order to specify coordinates
- * in the original image. See TessBaseAPI::SetRectangle.
- * The scale and scaled_yres are in case the Thresholder scaled the image
- * rectangle prior to thresholding. Any coordinates in tesseract's image
- * must be divided by scale before adding (rect_left, rect_top).
- * The scaled_yres indicates the effective resolution of the binary image
- * that tesseract has been given by the Thresholder.
- * After the constructor, Begin has already been called.
- */
- PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top, int rect_width,
- int rect_height);
- virtual ~PageIterator();
-
- /**
- * Page/ResultIterators may be copied! This makes it possible to iterate over
- * all the objects at a lower level, while maintaining an iterator to
- * objects at a higher level. These constructors DO NOT CALL Begin, so
- * iterations will continue from the location of src.
- */
- PageIterator(const PageIterator &src);
- const PageIterator &operator=(const PageIterator &src);
-
- /** Are we positioned at the same location as other? */
- bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
-
- // ============= Moving around within the page ============.
-
- /**
- * Moves the iterator to point to the start of the page to begin an
- * iteration.
- */
- virtual void Begin();
-
- /**
- * Moves the iterator to the beginning of the paragraph.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word on the first row of the paragraph.
- */
- virtual void RestartParagraph();
-
- /**
- * Return whether this iterator points anywhere in the first textline of a
- * paragraph.
- */
- bool IsWithinFirstTextlineOfParagraph() const;
-
- /**
- * Moves the iterator to the beginning of the text line.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word of the row.
- */
- virtual void RestartRow();
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy, and returns false if the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- virtual bool Next(PageIteratorLevel level);
-
- /**
- * Returns true if the iterator is at the start of an object at the given
- * level.
- *
- * For instance, suppose an iterator it is pointed to the first symbol of the
- * first word of the third line of the second paragraph of the first block in
- * a page, then:
- * it.IsAtBeginningOf(RIL_BLOCK) = false
- * it.IsAtBeginningOf(RIL_PARA) = false
- * it.IsAtBeginningOf(RIL_TEXTLINE) = true
- * it.IsAtBeginningOf(RIL_WORD) = true
- * it.IsAtBeginningOf(RIL_SYMBOL) = true
- */
- virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
-
- /**
- * Returns whether the iterator is positioned at the last element in a
- * given level. (e.g. the last word in a line, the last line in a block)
- *
- * Here's some two-paragraph example
- * text. It starts off innocuously
- * enough but quickly turns bizarre.
- * The author inserts a cornucopia
- * of words to guard against confused
- * references.
- *
- * Now take an iterator it pointed to the start of "bizarre."
- * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
- * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
- * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
- */
- virtual bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const;
-
- /**
- * Returns whether this iterator is positioned
- * before other: -1
- * equal to other: 0
- * after other: 1
- */
- int Cmp(const PageIterator &other) const;
-
- // ============= Accessing data ==============.
- // Coordinate system:
- // Integer coordinates are at the cracks between the pixels.
- // The top-left corner of the top-left pixel in the image is at (0,0).
- // The bottom-right corner of the bottom-right pixel in the image is at
- // (width, height).
- // Every bounding box goes from the top-left of the top-left contained
- // pixel to the bottom-right of the bottom-right contained pixel, so
- // the bounding box of the single top-left pixel in the image is:
- // (0,0)->(1,1).
- // If an image rectangle has been set in the API, then returned coordinates
- // relate to the original (full) image, rather than the rectangle.
-
- /**
- * Controls what to include in a bounding box. Bounding boxes of all levels
- * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
- * Between layout analysis and recognition, it isn't known where all
- * diacritics belong, so this control is used to include or exclude some
- * diacritics that are above or below the main body of the word. In most cases
- * where the placement is obvious, and after recognition, it doesn't make as
- * much difference, as the diacritics will already be included in the word.
- */
- void SetBoundingBoxComponents(bool include_upper_dots,
- bool include_lower_dots) {
- include_upper_dots_ = include_upper_dots;
- include_lower_dots_ = include_lower_dots;
- }
-
- /**
- * Returns the bounding rectangle of the current object at the given level.
- * See comment on coordinate system above.
- * Returns false if there is no such object at the current position.
- * The returned bounding box is guaranteed to match the size and position
- * of the image returned by GetBinaryImage, but may clip foreground pixels
- * from a grey image. The padding argument to GetImage can be used to expand
- * the image to include more foreground pixels. See GetImage below.
- */
- bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
- int *bottom) const;
- bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
- int *right, int *bottom) const;
- /**
- * Returns the bounding rectangle of the object in a coordinate system of the
- * working image rectangle having its origin at (rect_left_, rect_top_) with
- * respect to the original image and is scaled by a factor scale_.
- */
- bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
- int *right, int *bottom) const;
-
- /** Returns whether there is no object of a given level. */
- bool Empty(PageIteratorLevel level) const;
-
- /**
- * Returns the type of the current block.
- * See tesseract/publictypes.h for PolyBlockType.
- */
- PolyBlockType BlockType() const;
-
- /**
- * Returns the polygon outline of the current block. The returned Pta must
- * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
- * of the polygon, and the last edge is the line segment between the last
- * point and the first point. nullptr will be returned if the iterator is
- * at the end of the document or layout analysis was not used.
- */
- Pta *BlockPolygon() const;
-
- /**
- * Returns a binary image of the current object at the given level.
- * The position and size match the return from BoundingBoxInternal, and so
- * this could be upscaled with respect to the original input image.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetBinaryImage(PageIteratorLevel level) const;
-
- /**
- * Returns an image of the current object at the given level in greyscale
- * if available in the input. To guarantee a binary image use BinaryImage.
- * NOTE that in order to give the best possible image, the bounds are
- * expanded slightly over the binary connected component, by the supplied
- * padding, so the top-left position of the returned image is returned
- * in (left,top). These will most likely not match the coordinates
- * returned by BoundingBox.
- * If you do not supply an original image, you will get a binary one.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
- int *left, int *top) const;
-
- /**
- * Returns the baseline of the current object at the given level.
- * The baseline is the line that passes through (x1, y1) and (x2, y2).
- * WARNING: with vertical text, baselines may be vertical!
- * Returns false if there is no baseline at the current position.
- */
- bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
- int *y2) const;
-
- // Returns the attributes of the current row.
- void RowAttributes(float *row_height, float *descenders,
- float *ascenders) const;
-
- /**
- * Returns orientation for the block the iterator points to.
- * orientation, writing_direction, textline_order: see publictypes.h
- * deskew_angle: after rotating the block so the text orientation is
- * upright, how many radians does one have to rotate the
- * block anti-clockwise for it to be level?
- * -Pi/4 <= deskew_angle <= Pi/4
- */
- void Orientation(tesseract::Orientation *orientation,
- tesseract::WritingDirection *writing_direction,
- tesseract::TextlineOrder *textline_order,
- float *deskew_angle) const;
-
- /**
- * Returns information about the current paragraph, if available.
- *
- * justification -
- * LEFT if ragged right, or fully justified and script is left-to-right.
- * RIGHT if ragged left, or fully justified and script is right-to-left.
- * unknown if it looks like source code or we have very few lines.
- * is_list_item -
- * true if we believe this is a member of an ordered or unordered list.
- * is_crown -
- * true if the first line of the paragraph is aligned with the other
- * lines of the paragraph even though subsequent paragraphs have first
- * line indents. This typically indicates that this is the continuation
- * of a previous paragraph or that it is the very first paragraph in
- * the chapter.
- * first_line_indent -
- * For LEFT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the left edge of the
- * rest of the paragraph.
- * for RIGHT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the right edge of the
- * rest of the paragraph.
- * NOTE 1: This value may be negative.
- * NOTE 2: if *is_crown == true, the first line of this paragraph is
- * actually flush, and first_line_indent is set to the "common"
- * first_line_indent for subsequent paragraphs in this block
- * of text.
- */
- void ParagraphInfo(tesseract::ParagraphJustification *justification,
- bool *is_list_item, bool *is_crown,
- int *first_line_indent) const;
-
- // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
- // of the current word to the given pointer (takes ownership of the pointer)
- // and returns true.
- // Can only be used when iterating on the word level.
- bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
-
-protected:
- /**
- * Sets up the internal data for iterating the blobs of a new word, then
- * moves the iterator to the given offset.
- */
- void BeginWord(int offset);
-
- /** Pointer to the page_res owned by the API. */
- PAGE_RES *page_res_;
- /** Pointer to the Tesseract object owned by the API. */
- Tesseract *tesseract_;
- /**
- * The iterator to the page_res_. Owned by this ResultIterator.
- * A pointer just to avoid dragging in Tesseract includes.
- */
- PAGE_RES_IT *it_;
- /**
- * The current input WERD being iterated. If there is an output from OCR,
- * then word_ is nullptr. Owned by the API
- */
- WERD *word_;
- /** The length of the current word_. */
- int word_length_;
- /** The current blob index within the word. */
- int blob_index_;
- /**
- * Iterator to the blobs within the word. If nullptr, then we are iterating
- * OCR results in the box_word.
- * Owned by this ResultIterator.
- */
- C_BLOB_IT *cblob_it_;
- /** Control over what to include in bounding boxes. */
- bool include_upper_dots_;
- bool include_lower_dots_;
- /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
- int scale_;
- int scaled_yres_;
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/publictypes.h
deleted file mode 100644
index 0069cf28..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/publictypes.h
+++ /dev/null
@@ -1,281 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: publictypes.h
-// Description: Types used in both the API and internally
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-
-namespace tesseract {
-
-// This file contains types that are used both by the API and internally
-// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
-// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
-// Restated: It is OK for low-level Tesseract files to include publictypes.h,
-// but not for the low-level tesseract code to include top-level API code.
-// This file should not use other Tesseract types, as that would drag
-// their includes into the API-level.
-
-/** Number of printers' points in an inch. The unit of the pointsize return. */
-constexpr int kPointsPerInch = 72;
-/**
- * Minimum believable resolution. Used as a default if there is no other
- * information, as it is safer to under-estimate than over-estimate.
- */
-constexpr int kMinCredibleResolution = 70;
-/** Maximum believable resolution. */
-constexpr int kMaxCredibleResolution = 2400;
-/**
- * Ratio between median blob size and likely resolution. Used to estimate
- * resolution when none is provided. This is basically 1/usual text size in
- * inches. */
-constexpr int kResolutionEstimationFactor = 10;
-
-/**
- * Possible types for a POLY_BLOCK or ColPartition.
- * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions
- * below, as well as kPolyBlockNames in layout_test.cc.
- * Used extensively by ColPartition, and POLY_BLOCK.
- */
-enum PolyBlockType {
- PT_UNKNOWN, // Type is not yet known. Keep as the first element.
- PT_FLOWING_TEXT, // Text that lives inside a column.
- PT_HEADING_TEXT, // Text that spans more than one column.
- PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region.
- PT_EQUATION, // Partition belonging to an equation region.
- PT_INLINE_EQUATION, // Partition has inline equation.
- PT_TABLE, // Partition belonging to a table region.
- PT_VERTICAL_TEXT, // Text-line runs vertically.
- PT_CAPTION_TEXT, // Text that belongs to an image.
- PT_FLOWING_IMAGE, // Image that lives inside a column.
- PT_HEADING_IMAGE, // Image that spans more than one column.
- PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region.
- PT_HORZ_LINE, // Horizontal Line.
- PT_VERT_LINE, // Vertical Line.
- PT_NOISE, // Lies outside of any column.
- PT_COUNT
-};
-
-/** Returns true if PolyBlockType is of horizontal line type */
-inline bool PTIsLineType(PolyBlockType type) {
- return type == PT_HORZ_LINE || type == PT_VERT_LINE;
-}
-/** Returns true if PolyBlockType is of image type */
-inline bool PTIsImageType(PolyBlockType type) {
- return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE ||
- type == PT_PULLOUT_IMAGE;
-}
-/** Returns true if PolyBlockType is of text type */
-inline bool PTIsTextType(PolyBlockType type) {
- return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||
- type == PT_PULLOUT_TEXT || type == PT_TABLE ||
- type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
- type == PT_INLINE_EQUATION;
-}
-// Returns true if PolyBlockType is of pullout(inter-column) type
-inline bool PTIsPulloutType(PolyBlockType type) {
- return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT;
-}
-
-/**
- * +------------------+ Orientation Example:
- * | 1 Aaaa Aaaa Aaaa | ====================
- * | Aaa aa aaa aa | To left is a diagram of some (1) English and
- * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit.
- * | 2 |
- * | ####### c c C | Upright Latin characters are represented as A and a.
- * | ####### c c c | '<' represents a latin character rotated
- * | < ####### c c c | anti-clockwise 90 degrees.
- * | < ####### c c |
- * | < ####### . c | Upright Chinese characters are represented C and c.
- * | 3 ####### c |
- * +------------------+ NOTA BENE: enum values here should match goodoc.proto
-
- * If you orient your head so that "up" aligns with Orientation,
- * then the characters will appear "right side up" and readable.
- *
- * In the example above, both the English and Chinese paragraphs are oriented
- * so their "up" is the top of the page (page up). The photo credit is read
- * with one's head turned leftward ("up" is to page left).
- *
- * The values of this enum match the convention of Tesseract's osdetect.h
-*/
-enum Orientation {
- ORIENTATION_PAGE_UP = 0,
- ORIENTATION_PAGE_RIGHT = 1,
- ORIENTATION_PAGE_DOWN = 2,
- ORIENTATION_PAGE_LEFT = 3,
-};
-
-/**
- * The grapheme clusters within a line of text are laid out logically
- * in this direction, judged when looking at the text line rotated so that
- * its Orientation is "page up".
- *
- * For English text, the writing direction is left-to-right. For the
- * Chinese text in the above example, the writing direction is top-to-bottom.
- */
-enum WritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
- WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
- WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * The text lines are read in the given sequence.
- *
- * In English, the order is top-to-bottom.
- * In Chinese, vertical text lines are read right-to-left. Mongolian is
- * written in vertical columns top to bottom like Chinese, but the lines
- * order left-to right.
- *
- * Note that only some combinations make sense. For example,
- * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
- */
-enum TextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
- TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
- TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * Possible modes for page layout analysis. These *must* be kept in order
- * of decreasing amount of layout analysis to be done, except for OSD_ONLY,
- * so that the inequality test macros below work.
- */
-enum PageSegMode {
- PSM_OSD_ONLY = 0, ///< Orientation and script detection only.
- PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and
- ///< script detection. (OSD)
- PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR.
- PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD.
- PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes.
- PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of
- ///< vertically aligned text.
- PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.)
- PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line.
- PSM_SINGLE_WORD = 8, ///< Treat the image as a single word.
- PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle.
- PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character.
- PSM_SPARSE_TEXT =
- 11, ///< Find as much text as possible in no particular order.
- PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det.
- PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing
- ///< hacks that are Tesseract-specific.
-
- PSM_COUNT ///< Number of enum entries.
-};
-
-/**
- * Inline functions that act on a PageSegMode to determine whether components of
- * layout analysis are enabled.
- * *Depend critically on the order of elements of PageSegMode.*
- * NOTE that arg is an int for compatibility with INT_PARAM.
- */
-inline bool PSM_OSD_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
-}
-inline bool PSM_SPARSE(int pageseg_mode) {
- return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
-}
-inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
-}
-inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
- return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
- pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-
-/**
- * enum of the elements of the page hierarchy, used in ResultIterator
- * to provide functions that operate on each level without having to
- * have 5x as many functions.
- */
-enum PageIteratorLevel {
- RIL_BLOCK, // Block of text/image/separator line.
- RIL_PARA, // Paragraph within a block.
- RIL_TEXTLINE, // Line within a paragraph.
- RIL_WORD, // Word within a textline.
- RIL_SYMBOL // Symbol/character within a word.
-};
-
-/**
- * JUSTIFICATION_UNKNOWN
- * The alignment is not clearly one of the other options. This could happen
- * for example if there are only one or two lines of text or the text looks
- * like source code or poetry.
- *
- * NOTA BENE: Fully justified paragraphs (text aligned to both left and right
- * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
- * is written with a left-to-right script and with JUSTIFICATION_RIGHT if
- * their text is written in a right-to-left script.
- *
- * Interpretation for text read in vertical lines:
- * "Left" is wherever the starting reading position is.
- *
- * JUSTIFICATION_LEFT
- * Each line, except possibly the first, is flush to the same left tab stop.
- *
- * JUSTIFICATION_CENTER
- * The text lines of the paragraph are centered about a line going
- * down through their middle of the text lines.
- *
- * JUSTIFICATION_RIGHT
- * Each line, except possibly the first, is flush to the same right tab stop.
- */
-enum ParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT,
-};
-
-/**
- * When Tesseract/Cube is initialized we can choose to instantiate/load/run
- * only the Tesseract part, only the Cube part or both along with the combiner.
- * The preference of which engine to use is stored in tessedit_ocr_engine_mode.
- *
- * ATTENTION: When modifying this enum, please make sure to make the
- * appropriate changes to all the enums mirroring it (e.g. OCREngine in
- * cityblock/workflow/detection/detection_storage.proto). Such enums will
- * mention the connection to OcrEngineMode in the comments.
- */
-enum OcrEngineMode {
- OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated
- OEM_LSTM_ONLY, // Run just the LSTM line recognizer.
- OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback
- // to Tesseract when things get difficult.
- // deprecated
- OEM_DEFAULT, // Specify this mode when calling init_*(),
- // to indicate that any of the above modes
- // should be automatically inferred from the
- // variables in the language-specific config,
- // command-line configs, or if not specified
- // in any of the above should be set to the
- // default OEM_TESSERACT_ONLY.
- OEM_COUNT // Number of OEMs
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/renderer.h
deleted file mode 100644
index 6f405233..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/renderer.h
+++ /dev/null
@@ -1,311 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: renderer.h
-// Description: Rendering interface to inject into TessBaseAPI
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_RENDERER_H_
-#define TESSERACT_API_RENDERER_H_
-
-#include "export.h"
-
-// To avoid collision with other typenames include the ABSOLUTE MINIMUM
-// complexity of includes here. Use forward declarations wherever possible
-// and hide includes of complex types in baseapi.cpp.
-#include
-#include // for std::string
-#include // for std::vector
-
-struct Pix;
-
-namespace tesseract {
-
-class TessBaseAPI;
-
-/**
- * Interface for rendering tesseract results into a document, such as text,
- * HOCR or pdf. This class is abstract. Specific classes handle individual
- * formats. This interface is then used to inject the renderer class into
- * tesseract when processing images.
- *
- * For simplicity implementing this with tesseract version 3.01,
- * the renderer contains document state that is cleared from document
- * to document just as the TessBaseAPI is. This way the base API can just
- * delegate its rendering functionality to injected renderers, and the
- * renderers can manage the associated state needed for the specific formats
- * in addition to the heuristics for producing it.
- */
-class TESS_API TessResultRenderer {
-public:
- virtual ~TessResultRenderer();
-
- // Takes ownership of pointer so must be new'd instance.
- // Renderers aren't ordered, but appends the sequences of next parameter
- // and existing next(). The renderers should be unique across both lists.
- void insert(TessResultRenderer *next);
-
- // Returns the next renderer or nullptr.
- TessResultRenderer *next() {
- return next_;
- }
-
- /**
- * Starts a new document with the given title.
- * This clears the contents of the output data.
- * Title should use UTF-8 encoding.
- */
- bool BeginDocument(const char *title);
-
- /**
- * Adds the recognized text from the source image to the current document.
- * Invalid if BeginDocument not yet called.
- *
- * Note that this API is a bit weird but is designed to fit into the
- * current TessBaseAPI implementation where the api has lots of state
- * information that we might want to add in.
- */
- bool AddImage(TessBaseAPI *api);
-
- /**
- * Finishes the document and finalizes the output data
- * Invalid if BeginDocument not yet called.
- */
- bool EndDocument();
-
- const char *file_extension() const {
- return file_extension_;
- }
- const char *title() const {
- return title_.c_str();
- }
-
- // Is everything fine? Otherwise something went wrong.
- bool happy() const {
- return happy_;
- }
-
- /**
- * Returns the index of the last image given to AddImage
- * (i.e. images are incremented whether the image succeeded or not)
- *
- * This is always defined. It means either the number of the
- * current image, the last image ended, or in the completed document
- * depending on when in the document lifecycle you are looking at it.
- * Will return -1 if a document was never started.
- */
- int imagenum() const {
- return imagenum_;
- }
-
-protected:
- /**
- * Called by concrete classes.
- *
- * outputbase is the name of the output file excluding
- * extension. For example, "/path/to/chocolate-chip-cookie-recipe"
- *
- * extension indicates the file extension to be used for output
- * files. For example "pdf" will produce a .pdf file, and "hocr"
- * will produce .hocr files.
- */
- TessResultRenderer(const char *outputbase, const char *extension);
-
- // Hook for specialized handling in BeginDocument()
- virtual bool BeginDocumentHandler();
-
- // This must be overridden to render the OCR'd results
- virtual bool AddImageHandler(TessBaseAPI *api) = 0;
-
- // Hook for specialized handling in EndDocument()
- virtual bool EndDocumentHandler();
-
- // Renderers can call this to append '\0' terminated strings into
- // the output string returned by GetOutput.
- // This method will grow the output buffer if needed.
- void AppendString(const char *s);
-
- // Renderers can call this to append binary byte sequences into
- // the output string returned by GetOutput. Note that s is not necessarily
- // '\0' terminated (and can contain '\0' within it).
- // This method will grow the output buffer if needed.
- void AppendData(const char *s, int len);
-
-private:
- TessResultRenderer *next_; // Can link multiple renderers together
- FILE *fout_; // output file pointer
- const char *file_extension_; // standard extension for generated output
- std::string title_; // title of document being rendered
- int imagenum_; // index of last image added
- bool happy_; // I get grumpy when the disk fills up, etc.
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessTextRenderer : public TessResultRenderer {
-public:
- explicit TessTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into an hocr text string
- */
-class TESS_API TessHOcrRenderer : public TessResultRenderer {
-public:
- explicit TessHOcrRenderer(const char *outputbase, bool font_info);
- explicit TessHOcrRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into an alto text string
- */
-class TESS_API TessAltoRenderer : public TessResultRenderer {
-public:
- explicit TessAltoRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool begin_document;
-};
-
-/**
- * Renders Tesseract output into a TSV string
- */
-class TESS_API TessTsvRenderer : public TessResultRenderer {
-public:
- explicit TessTsvRenderer(const char *outputbase, bool font_info);
- explicit TessTsvRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into searchable PDF
- */
-class TESS_API TessPDFRenderer : public TessResultRenderer {
-public:
- // datadir is the location of the TESSDATA. We need it because
- // we load a custom PDF font from this location.
- TessPDFRenderer(const char *outputbase, const char *datadir,
- bool textonly = false);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- // We don't want to have every image in memory at once,
- // so we store some metadata as we go along producing
- // PDFs one page at a time. At the end, that metadata is
- // used to make everything that isn't easily handled in a
- // streaming fashion.
- long int obj_; // counter for PDF objects
- std::vector offsets_; // offset of every PDF object in bytes
- std::vector pages_; // object number for every /Page object
- std::string datadir_; // where to find the custom font
- bool textonly_; // skip images if set
- // Bookkeeping only. DIY = Do It Yourself.
- void AppendPDFObjectDIY(size_t objectsize);
- // Bookkeeping + emit data.
- void AppendPDFObject(const char *data);
- // Create the /Contents object for an entire page.
- char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);
- // Turn an image into a PDF object. Only transcode if we have to.
- static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,
- char **pdf_object, long int *pdf_object_size,
- int jpg_quality);
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessUnlvRenderer : public TessResultRenderer {
-public:
- explicit TessUnlvRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string for LSTMBox
- */
-class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
-public:
- explicit TessLSTMBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessBoxTextRenderer : public TessResultRenderer {
-public:
- explicit TessBoxTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string in WordStr format
- */
-class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
-public:
- explicit TessWordStrBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-/**
- * Renders tesseract output into an osd text string
- */
-class TESS_API TessOsdRenderer : public TessResultRenderer {
-public:
- explicit TessOsdRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#endif // ndef DISABLED_LEGACY_ENGINE
-
-} // namespace tesseract.
-
-#endif // TESSERACT_API_RENDERER_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/resultiterator.h
deleted file mode 100644
index 3e4d5807..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/resultiterator.h
+++ /dev/null
@@ -1,250 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: resultiterator.h
-// Description: Iterator for tesseract results that is capable of
-// iterating in proper reading order over Bi Directional
-// (e.g. mixed Hebrew and English) text.
-// Author: David Eger
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API, TESS_LOCAL
-#include "ltrresultiterator.h" // for LTRResultIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-#include // for std::pair
-#include // for std::vector
-
-namespace tesseract {
-
-class TESS_API ResultIterator : public LTRResultIterator {
-public:
- static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
-
- /**
- * ResultIterator is copy constructible!
- * The default copy constructor works just fine for us.
- */
- ~ResultIterator() override = default;
-
- // ============= Moving around within the page ============.
- /**
- * Moves the iterator to point to the start of the page to begin
- * an iteration.
- */
- void Begin() override;
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy in the appropriate reading order and returns false if
- * the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- bool Next(PageIteratorLevel level) override;
-
- /**
- * IsAtBeginningOf() returns whether we're at the logical beginning of the
- * given level. (as opposed to ResultIterator's left-to-right top-to-bottom
- * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
- * For a full description, see pageiterator.h
- */
- bool IsAtBeginningOf(PageIteratorLevel level) const override;
-
- /**
- * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
- * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
- * point at the last word in a paragraph. See PageIterator for full comment.
- */
- bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const override;
-
- // ============= Functions that refer to words only ============.
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // ============= Accessing data ==============.
-
- /**
- * Returns the null terminated UTF-8 encoded text string for the current
- * object at the given level. Use delete [] to free after use.
- */
- virtual char *GetUTF8Text(PageIteratorLevel level) const;
-
- /**
- * Returns the LSTM choices for every LSTM timestep for the current word.
- */
- virtual std::vector>>>
- *GetRawLSTMTimesteps() const;
- virtual std::vector>>
- *GetBestLSTMSymbolChoices() const;
-
- /**
- * Return whether the current paragraph's dominant reading direction
- * is left-to-right (as opposed to right-to-left).
- */
- bool ParagraphIsLtr() const;
-
- // ============= Exposed only for testing =============.
-
- /**
- * Yields the reading order as a sequence of indices and (optional)
- * meta-marks for a set of words (given left-to-right).
- * The meta marks are passed as negative values:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The next indexed word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- *
- * For example, suppose we have five words in a text line,
- * indexed [0,1,2,3,4] from the leftmost side of the text line.
- * The following are all believable reading_orders:
- *
- * Left-to-Right (in ltr paragraph):
- * { 0, 1, 2, 3, 4 }
- * Left-to-Right (in rtl paragraph):
- * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- * Right-to-Left (in rtl paragraph):
- * { 4, 3, 2, 1, 0 }
- * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
- * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
- */
- static void CalculateTextlineOrder(
- bool paragraph_is_ltr,
- const std::vector &word_dirs,
- std::vector *reading_order);
-
- static const int kMinorRunStart;
- static const int kMinorRunEnd;
- static const int kComplexWord;
-
-protected:
- /**
- * We presume the data associated with the given iterator will outlive us.
- * NB: This is private because it does something that is non-obvious:
- * it resets to the beginning of the paragraph instead of staying wherever
- * resit might have pointed.
- */
- explicit ResultIterator(const LTRResultIterator &resit);
-
-private:
- /**
- * Calculates the current paragraph's dominant writing direction.
- * Typically, members should use current_paragraph_ltr_ instead.
- */
- bool CurrentParagraphIsLtr() const;
-
- /**
- * Returns word indices as measured from resit->RestartRow() = index 0
- * for the reading order of words within a textline given an iterator
- * into the middle of the text line.
- * In addition to non-negative word indices, the following negative values
- * may be inserted:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The previous word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- std::vector *indices) const;
- /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- std::vector *ssd,
- std::vector *indices) const;
-
- /**
- * What is the index of the current word in a strict left-to-right reading
- * of the row?
- */
- int LTRWordIndex() const;
-
- /**
- * Given an iterator pointing at a word, returns the logical reading order
- * of blob indices for the word.
- */
- void CalculateBlobOrder(std::vector *blob_indices) const;
-
- /** Precondition: current_paragraph_is_ltr_ is set. */
- void MoveToLogicalStartOfTextline();
-
- /**
- * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
- * are set.
- */
- void MoveToLogicalStartOfWord();
-
- /** Are we pointing at the final (reading order) symbol of the word? */
- bool IsAtFinalSymbolOfWord() const;
-
- /** Are we pointing at the first (reading order) symbol of the word? */
- bool IsAtFirstSymbolOfWord() const;
-
- /**
- * Append any extra marks that should be appended to this word when printed.
- * Mostly, these are Unicode BiDi control characters.
- */
- void AppendSuffixMarks(std::string *text) const;
-
- /** Appends the current word in reading order to the given buffer.*/
- void AppendUTF8WordText(std::string *text) const;
-
- /**
- * Appends the text of the current text line, *assuming this iterator is
- * positioned at the beginning of the text line* This function
- * updates the iterator to point to the first position past the text line.
- * Each textline is terminated in a single newline character.
- * If the textline ends a paragraph, it gets a second terminal newline.
- */
- void IterateAndAppendUTF8TextlineText(std::string *text);
-
- /**
- * Appends the text of the current paragraph in reading order
- * to the given buffer.
- * Each textline is terminated in a single newline character, and the
- * paragraph gets an extra newline at the end.
- */
- void AppendUTF8ParagraphText(std::string *text) const;
-
- /** Returns whether the bidi_debug flag is set to at least min_level. */
- bool BidiDebug(int min_level) const;
-
- bool current_paragraph_is_ltr_;
-
- /**
- * Is the currently pointed-at character at the beginning of
- * a minor-direction run?
- */
- bool at_beginning_of_minor_run_;
-
- /** Is the currently pointed-at character in a minor-direction sequence? */
- bool in_minor_direction_;
-
- /**
- * Should detected inter-word spaces be preserved, or "compressed" to a single
- * space character (default behavior).
- */
- bool preserve_interword_spaces_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/unichar.h
deleted file mode 100644
index 015109d7..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/unichar.h
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: unichar.h
-// Description: Unicode character/ligature class.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCUTIL_UNICHAR_H_
-#define TESSERACT_CCUTIL_UNICHAR_H_
-
-#include "export.h"
-
-#include
-#include
-#include
-#include
-
-namespace tesseract {
-
-// Maximum number of characters that can be stored in a UNICHAR. Must be
-// at least 4. Must not exceed 31 without changing the coding of length.
-#define UNICHAR_LEN 30
-
-// A UNICHAR_ID is the unique id of a unichar.
-using UNICHAR_ID = int;
-
-// A variable to indicate an invalid or uninitialized unichar id.
-static const int INVALID_UNICHAR_ID = -1;
-// A special unichar that corresponds to INVALID_UNICHAR_ID.
-static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
-
-enum StrongScriptDirection {
- DIR_NEUTRAL = 0, // Text contains only neutral characters.
- DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
- DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
- DIR_MIX = 3, // Text contains a mixture of left-to-right
- // and right-to-left characters.
-};
-
-using char32 = signed int;
-
-// The UNICHAR class holds a single classification result. This may be
-// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
-// multiple Unicode characters representing the NFKC expansion of a ligature
-// such as fi, ffl etc. These are also stored as utf8.
-class TESS_API UNICHAR {
-public:
- UNICHAR() {
- memset(chars, 0, UNICHAR_LEN);
- }
-
- // Construct from a utf8 string. If len<0 then the string is null terminated.
- // If the string is too long to fit in the UNICHAR then it takes only what
- // will fit.
- UNICHAR(const char *utf8_str, int len);
-
- // Construct from a single UCS4 character.
- explicit UNICHAR(int unicode);
-
- // Default copy constructor and operator= are OK.
-
- // Get the first character as UCS-4.
- int first_uni() const;
-
- // Get the length of the UTF8 string.
- int utf8_len() const {
- int len = chars[UNICHAR_LEN - 1];
- return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
- }
-
- // Get a UTF8 string, but NOT nullptr terminated.
- const char *utf8() const {
- return chars;
- }
-
- // Get a terminated UTF8 string: Must delete[] it after use.
- char *utf8_str() const;
-
- // Get the number of bytes in the first character of the given utf8 string.
- static int utf8_step(const char *utf8_str);
-
- // A class to simplify iterating over and accessing elements of a UTF8
- // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
- // take ownership of the underlying byte array. It also does not permit
- // modification of the array (as the name suggests).
- //
- // Example:
- // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
- // it != UNICHAR::end(str, len);
- // ++it) {
- // printf("UCS-4 symbol code = %d\n", *it);
- // char buf[5];
- // int char_len = it.get_utf8(buf); buf[char_len] = '\0';
- // printf("Char = %s\n", buf);
- // }
- class TESS_API const_iterator {
- using CI = const_iterator;
-
- public:
- // Step to the next UTF8 character.
- // If the current position is at an illegal UTF8 character, then print an
- // error message and step by one byte. If the current position is at a
- // nullptr value, don't step past it.
- const_iterator &operator++();
-
- // Return the UCS-4 value at the current position.
- // If the current position is at an illegal UTF8 value, return a single
- // space character.
- int operator*() const;
-
- // Store the UTF-8 encoding of the current codepoint into buf, which must be
- // at least 4 bytes long. Return the number of bytes written.
- // If the current position is at an illegal UTF8 value, writes a single
- // space character and returns 1.
- // Note that this method does not null-terminate the buffer.
- int get_utf8(char *buf) const;
- // Returns the number of bytes of the current codepoint. Returns 1 if the
- // current position is at an illegal UTF8 value.
- int utf8_len() const;
- // Returns true if the UTF-8 encoding at the current position is legal.
- bool is_legal() const;
-
- // Return the pointer into the string at the current position.
- const char *utf8_data() const {
- return it_;
- }
-
- // Iterator equality operators.
- friend bool operator==(const CI &lhs, const CI &rhs) {
- return lhs.it_ == rhs.it_;
- }
- friend bool operator!=(const CI &lhs, const CI &rhs) {
- return !(lhs == rhs);
- }
-
- private:
- friend class UNICHAR;
- explicit const_iterator(const char *it) : it_(it) {}
-
- const char *it_; // Pointer into the string.
- };
-
- // Create a start/end iterator pointing to a string. Note that these methods
- // are static and do NOT create a copy or take ownership of the underlying
- // array.
- static const_iterator begin(const char *utf8_str, int byte_length);
- static const_iterator end(const char *utf8_str, int byte_length);
-
- // Converts a utf-8 string to a vector of unicodes.
- // Returns an empty vector if the input contains invalid UTF-8.
- static std::vector UTF8ToUTF32(const char *utf8_str);
- // Converts a vector of unicodes to a utf8 string.
- // Returns an empty string if the input contains an invalid unicode.
- static std::string UTF32ToUTF8(const std::vector &str32);
-
-private:
- // A UTF-8 representation of 1 or more Unicode characters.
- // The last element (chars[UNICHAR_LEN - 1]) is a length if
- // its value < UNICHAR_LEN, otherwise it is a genuine character.
- char chars[UNICHAR_LEN]{};
-};
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCUTIL_UNICHAR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/version.h
deleted file mode 100644
index 6bac5d66..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/aarch64/include/tesseract/version.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: version.h
-// Description: Version information
-//
-// (C) Copyright 2018, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_VERSION_H_
-#define TESSERACT_API_VERSION_H_
-
-// clang-format off
-
-#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@
-#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@
-#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@
-
-#define TESSERACT_VERSION \
- (TESSERACT_MAJOR_VERSION << 16 | \
- TESSERACT_MINOR_VERSION << 8 | \
- TESSERACT_MICRO_VERSION)
-
-#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@"
-
-// clang-format on
-
-#endif // TESSERACT_API_VERSION_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/baseapi.h
deleted file mode 100644
index 5e1e4830..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/baseapi.h
+++ /dev/null
@@ -1,812 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: baseapi.h
-// Description: Simple API for calling tesseract.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_BASEAPI_H_
-#define TESSERACT_API_BASEAPI_H_
-
-#ifdef HAVE_CONFIG_H
-# include "config_auto.h" // DISABLED_LEGACY_ENGINE
-#endif
-
-#include "export.h"
-#include "pageiterator.h"
-#include "publictypes.h"
-#include "resultiterator.h"
-#include "unichar.h"
-
-#include "version.h"
-
-#include
-#include // for std::vector
-
-struct Pix;
-struct Pixa;
-struct Boxa;
-
-namespace tesseract {
-
-class PAGE_RES;
-class ParagraphModel;
-class BLOCK_LIST;
-class ETEXT_DESC;
-struct OSResults;
-class UNICHARSET;
-
-class Dawg;
-class Dict;
-class EquationDetect;
-class PageIterator;
-class ImageThresholder;
-class LTRResultIterator;
-class ResultIterator;
-class MutableIterator;
-class TessResultRenderer;
-class Tesseract;
-
-// Function to read a std::vector from a whole file.
-// Returns false on failure.
-using FileReader = bool (*)(const char *filename, std::vector *data);
-
-using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,
- bool) const;
-using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,
- int, const char *, int);
-
-/**
- * Base class for all tesseract APIs.
- * Specific classes can add ability to work on different inputs or produce
- * different outputs.
- * This class is mostly an interface layer on top of the Tesseract instance
- * class to hide the data types so that users of this class don't have to
- * include any other Tesseract headers.
- */
-class TESS_API TessBaseAPI {
-public:
- TessBaseAPI();
- virtual ~TessBaseAPI();
- // Copy constructor and assignment operator are currently unsupported.
- TessBaseAPI(TessBaseAPI const &) = delete;
- TessBaseAPI &operator=(TessBaseAPI const &) = delete;
-
- /**
- * Returns the version identifier as a static string. Do not delete.
- */
- static const char *Version();
-
- /**
- * If compiled with OpenCL AND an available OpenCL
- * device is deemed faster than serial code, then
- * "device" is populated with the cl_device_id
- * and returns sizeof(cl_device_id)
- * otherwise *device=nullptr and returns 0.
- */
- static size_t getOpenCLDevice(void **device);
-
- /**
- * Set the name of the input file. Needed for training and
- * reading a UNLV zone file, and for searchable PDF output.
- */
- void SetInputName(const char *name);
- /**
- * These functions are required for searchable PDF output.
- * We need our hands on the input file so that we can include
- * it in the PDF without transcoding. If that is not possible,
- * we need the original image. Finally, resolution metadata
- * is stored in the PDF so we need that as well.
- */
- const char *GetInputName();
- // Takes ownership of the input pix.
- void SetInputImage(Pix *pix);
- Pix *GetInputImage();
- int GetSourceYResolution();
- const char *GetDatapath();
-
- /** Set the name of the bonus output files. Needed only for debugging. */
- void SetOutputName(const char *name);
-
- /**
- * Set the value of an internal "parameter."
- * Supply the name of the parameter and the value as a string, just as
- * you would in a config file.
- * Returns false if the name lookup failed.
- * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
- * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
- * SetVariable may be used before Init, but settings will revert to
- * defaults on End().
- *
- * Note: Must be called after Init(). Only works for non-init variables
- * (init variables should be passed to Init()).
- */
- bool SetVariable(const char *name, const char *value);
- bool SetDebugVariable(const char *name, const char *value);
-
- /**
- * Returns true if the parameter was found among Tesseract parameters.
- * Fills in value with the value of the parameter.
- */
- bool GetIntVariable(const char *name, int *value) const;
- bool GetBoolVariable(const char *name, bool *value) const;
- bool GetDoubleVariable(const char *name, double *value) const;
-
- /**
- * Returns the pointer to the string that represents the value of the
- * parameter if it was found among Tesseract parameters.
- */
- const char *GetStringVariable(const char *name) const;
-
-#ifndef DISABLED_LEGACY_ENGINE
-
- /**
- * Print Tesseract fonts table to the given file.
- */
- void PrintFontsTable(FILE *fp) const;
-
-#endif
-
- /**
- * Print Tesseract parameters to the given file.
- */
- void PrintVariables(FILE *fp) const;
-
- /**
- * Get value of named variable as a string, if it exists.
- */
- bool GetVariableAsString(const char *name, std::string *val) const;
-
- /**
- * Instances are now mostly thread-safe and totally independent,
- * but some global parameters remain. Basically it is safe to use multiple
- * TessBaseAPIs in different threads in parallel, UNLESS:
- * you use SetVariable on some of the Params in classify and textord.
- * If you do, then the effect will be to change it for all your instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure.
- * NOTE that the only members that may be called before Init are those
- * listed above here in the class definition.
- *
- * The datapath must be the name of the tessdata directory.
- * The language is (usually) an ISO 639-3 string or nullptr will default to
- * eng. It is entirely safe (and eventually will be efficient too) to call
- * Init multiple times on the same instance to change language, or just
- * to reset the classifier.
- * The language may be a string of the form [~][+[~]]* indicating
- * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
- * English. Languages may specify internally that they want to be loaded
- * with one or more other languages, so the ~ sign is available to override
- * that. Eg if hin were set to load eng by default, then hin+~eng would force
- * loading only hin. The number of loaded languages is limited only by
- * memory, with the caveat that loading additional languages will impact
- * both speed and accuracy, as there is more work to do to decide on the
- * applicable language, and there is more chance of hallucinating incorrect
- * words.
- * WARNING: On changing languages, all Tesseract parameters are reset
- * back to their default values. (Which may vary between languages.)
- * If you have a rare need to set a Variable that controls
- * initialization for a second call to Init you should explicitly
- * call End() and then use SetVariable before Init. This is only a very
- * rare use case, since there are very few uses that require any parameters
- * to be set before Init.
- *
- * If set_only_non_debug_params is true, only params that do not contain
- * "debug" in the name will be set.
- */
- int Init(const char *datapath, const char *language, OcrEngineMode mode,
- char **configs, int configs_size,
- const std::vector *vars_vec,
- const std::vector *vars_values,
- bool set_only_non_debug_params);
- int Init(const char *datapath, const char *language, OcrEngineMode oem) {
- return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
- }
- int Init(const char *datapath, const char *language) {
- return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
- false);
- }
- // In-memory version reads the traineddata file directly from the given
- // data[data_size] array, and/or reads data via a FileReader.
- int Init(const char *data, int data_size, const char *language,
- OcrEngineMode mode, char **configs, int configs_size,
- const std::vector *vars_vec,
- const std::vector *vars_values,
- bool set_only_non_debug_params, FileReader reader);
-
- /**
- * Returns the languages string used in the last valid initialization.
- * If the last initialization specified "deu+hin" then that will be
- * returned. If hin loaded eng automatically as well, then that will
- * not be included in this list. To find the languages actually
- * loaded use GetLoadedLanguagesAsVector.
- * The returned string should NOT be deleted.
- */
- const char *GetInitLanguagesAsString() const;
-
- /**
- * Returns the loaded languages in the vector of std::string.
- * Includes all languages loaded by the last Init, including those loaded
- * as dependencies of other loaded languages.
- */
- void GetLoadedLanguagesAsVector(std::vector *langs) const;
-
- /**
- * Returns the available languages in the sorted vector of std::string.
- */
- void GetAvailableLanguagesAsVector(std::vector *langs) const;
-
- /**
- * Init only for page layout analysis. Use only for calls to SetImage and
- * AnalysePage. Calls that attempt recognition will generate an error.
- */
- void InitForAnalysePage();
-
- /**
- * Read a "config" file containing a set of param, value pairs.
- * Searches the standard places: tessdata/configs, tessdata/tessconfigs
- * and also accepts a relative or absolute path name.
- * Note: only non-init params will be set (init params are set by Init()).
- */
- void ReadConfigFile(const char *filename);
- /** Same as above, but only set debug params from the given config file. */
- void ReadDebugConfigFile(const char *filename);
-
- /**
- * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
- * The mode is stored as an IntParam so it can also be modified by
- * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
- */
- void SetPageSegMode(PageSegMode mode);
-
- /** Return the current page segmentation mode. */
- PageSegMode GetPageSegMode() const;
-
- /**
- * Recognize a rectangle from an image and return the result as a string.
- * May be called many times for a single Init.
- * Currently has no error checking.
- * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
- * Palette color images will not work properly and must be converted to
- * 24 bit.
- * Binary images of 1 bit per pixel may also be given but they must be
- * byte packed with the MSB of the first byte being the first pixel, and a
- * 1 represents WHITE. For binary images set bytes_per_pixel=0.
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience interface.
- * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
- * and one or more of the Get*Text functions below.
- */
- char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
- int bytes_per_line, int left, int top, int width,
- int height);
-
- /**
- * Call between pages or documents etc to free up memory and forget
- * adaptive data.
- */
- void ClearAdaptiveClassifier();
-
- /**
- * @defgroup AdvancedAPI Advanced API
- * The following methods break TesseractRect into pieces, so you can
- * get hold of the thresholded image, get the text in different formats,
- * get bounding boxes, confidences etc.
- */
- /* @{ */
-
- /**
- * Provide an image for Tesseract to recognize. Format is as
- * TesseractRect above. Copies the image buffer and converts to Pix.
- * SetImage clears all recognition results, and sets the rectangle to the
- * full image, so it may be followed immediately by a GetUTF8Text, and it
- * will automatically perform recognition.
- */
- void SetImage(const unsigned char *imagedata, int width, int height,
- int bytes_per_pixel, int bytes_per_line);
-
- /**
- * Provide an image for Tesseract to recognize. As with SetImage above,
- * Tesseract takes its own copy of the image, so it need not persist until
- * after Recognize.
- * Pix vs raw, which to use?
- * Use Pix where possible. Tesseract uses Pix as its internal representation
- * and it is therefore more efficient to provide a Pix directly.
- */
- void SetImage(Pix *pix);
-
- /**
- * Set the resolution of the source image in pixels per inch so font size
- * information can be calculated in results. Call this after SetImage().
- */
- void SetSourceResolution(int ppi);
-
- /**
- * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
- * Each SetRectangle clears the recogntion results so multiple rectangles
- * can be recognized with the same image.
- */
- void SetRectangle(int left, int top, int width, int height);
-
- /**
- * Get a copy of the internal thresholded image from Tesseract.
- * Caller takes ownership of the Pix and must pixDestroy it.
- * May be called any time after SetImage, or after TesseractRect.
- */
- Pix *GetThresholdedImage();
-
- /**
- * Get the result of page layout analysis as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetRegions(Pixa **pixa);
-
- /**
- * Get the textlines as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If raw_image is true, then extract from the original image instead of the
- * thresholded image and pad by raw_padding pixels.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use. If paraids is not
- * nullptr, the paragraph-id of each line within its block is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- /*
- Helper method to extract from the thresholded image. (most common usage)
-*/
- Boxa *GetTextlines(Pixa **pixa, int **blockids) {
- return GetTextlines(false, 0, pixa, blockids, nullptr);
- }
-
- /**
- * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
- * pair, in reading order. Enables downstream handling of non-rectangular
- * regions.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetStrips(Pixa **pixa, int **blockids);
-
- /**
- * Get the words as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetWords(Pixa **pixa);
-
- /**
- * Gets the individual connected (text) components (created
- * after pages segmentation step, but before recognition)
- * as a leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * Note: the caller is responsible for calling boxaDestroy()
- * on the returned Boxa array and pixaDestroy() on cc array.
- */
- Boxa *GetConnectedComponents(Pixa **cc);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use.
- * If blockids is not nullptr, the paragraph-id of each component with its
- * block is also returned as an array of one element per component. delete []
- * after use. If raw_image is true, then portions of the original image are
- * extracted instead of the thresholded image and padded with raw_padding. If
- * text_only is true, then only text components are returned.
- */
- Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
- bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- // Helper function to get binary images with no padding (most common usage).
- Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
- Pixa **pixa, int **blockids) {
- return GetComponentImages(level, text_only, false, 0, pixa, blockids,
- nullptr);
- }
-
- /**
- * Returns the scale factor of the thresholded image that would be returned by
- * GetThresholdedImage() and the various GetX() methods that call
- * GetComponentImages().
- * Returns 0 if no thresholder has been set.
- */
- int GetThresholdedImageScaleFactor() const;
-
- /**
- * Runs page layout analysis in the mode set by SetPageSegMode.
- * May optionally be called prior to Recognize to get access to just
- * the page layout results. Returns an iterator to the results.
- * If merge_similar_words is true, words are combined where suitable for use
- * with a line recognizer. Use if you want to use AnalyseLayout to find the
- * textlines, and then want to process textline fragments with an external
- * line recognizer.
- * Returns nullptr on error or an empty page.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- PageIterator *AnalyseLayout();
- PageIterator *AnalyseLayout(bool merge_similar_words);
-
- /**
- * Recognize the image from SetAndThresholdImage, generating Tesseract
- * internal structures. Returns 0 on success.
- * Optional. The Get*Text functions below will call Recognize if needed.
- * After Recognize, the output is kept internally until the next SetImage.
- */
- int Recognize(ETEXT_DESC *monitor);
-
- /**
- * Methods to retrieve information after SetAndThresholdImage(),
- * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
- */
-
- /**
- * Turns images into symbolic text.
- *
- * filename can point to a single image, a multi-page TIFF,
- * or a plain text list of image filenames.
- *
- * retry_config is useful for debugging. If not nullptr, you can fall
- * back to an alternate configuration if a page fails for some
- * reason.
- *
- * timeout_millisec terminates processing if any single page
- * takes too long. Set to 0 for unlimited time.
- *
- * renderer is responible for creating the output. For example,
- * use the TessTextRenderer if you want plaintext output, or
- * the TessPDFRender to produce searchable PDF.
- *
- * If tessedit_page_number is non-negative, will only process that
- * single page. Works for multi-page tiff file, or filelist.
- *
- * Returns true if successful, false on error.
- */
- bool ProcessPages(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
- // Does the real work of ProcessPages.
- bool ProcessPagesInternal(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
-
- /**
- * Turn a single image into symbolic text.
- *
- * The pix is the image processed. filename and page_index are
- * metadata used by side-effect processes, such as reading a box
- * file or formatting as hOCR.
- *
- * See ProcessPages for descriptions of other parameters.
- */
- bool ProcessPage(Pix *pix, int page_index, const char *filename,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer);
-
- /**
- * Get a reading-order iterator to the results of LayoutAnalysis and/or
- * Recognize. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- ResultIterator *GetIterator();
-
- /**
- * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- MutableIterator *GetMutableIterator();
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- */
- char *GetUTF8Text();
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * monitor can be used to
- * cancel the recognition
- * receive progress callbacks
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(int page_number);
-
- /**
- * Make a TSV-formatted string from the internal data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetTSVText(int page_number);
-
- /**
- * Make a box file for LSTM training from the internal data structures.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetLSTMBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a box file used in training.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a WordStr box file used in training.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetWordStrBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UNLV format Latin-1 with specific reject and suspect codes.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetUNLVText();
-
- /**
- * Detect the orientation of the input image and apparent script (alphabet).
- * orient_deg is the detected clockwise rotation of the input image in degrees
- * (0, 90, 180, 270)
- * orient_conf is the confidence (15.0 is reasonably confident)
- * script_name is an ASCII string, the name of the script, e.g. "Latin"
- * script_conf is confidence level in the script
- * Returns true on success and writes values to each parameter as an output
- */
- bool DetectOrientationScript(int *orient_deg, float *orient_conf,
- const char **script_name, float *script_conf);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- * page_number is a 0-based page index that will appear in the osd file.
- */
- char *GetOsdText(int page_number);
-
- /** Returns the (average) confidence value between 0 and 100. */
- int MeanTextConf();
- /**
- * Returns all word confidences (between 0 and 100) in an array, terminated
- * by -1. The calling function must delete [] after use.
- * The number of confidences should correspond to the number of space-
- * delimited words in GetUTF8Text.
- */
- int *AllWordConfidences();
-
-#ifndef DISABLED_LEGACY_ENGINE
- /**
- * Applies the given word to the adaptive classifier if possible.
- * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
- * tell the boundaries of the graphemes.
- * Assumes that SetImage/SetRectangle have been used to set the image
- * to the given word. The mode arg should be PSM_SINGLE_WORD or
- * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
- * The currently set PageSegMode is preserved.
- * Returns false if adaption was not possible for some reason.
- */
- bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
-#endif // ndef DISABLED_LEGACY_ENGINE
-
- /**
- * Free up recognition results and any stored image data, without actually
- * freeing any recognition data that would be time-consuming to reload.
- * Afterwards, you must call SetImage or TesseractRect before doing
- * any Recognize or Get* operation.
- */
- void Clear();
-
- /**
- * Close down tesseract and free up all memory. End() is equivalent to
- * destructing and reconstructing your TessBaseAPI.
- * Once End() has been used, none of the other API functions may be used
- * other than Init and anything declared above it in the class definition.
- */
- void End();
-
- /**
- * Clear any library-level memory caches.
- * There are a variety of expensive-to-load constant data structures (mostly
- * language dictionaries) that are cached globally -- surviving the Init()
- * and End() of individual TessBaseAPI's. This function allows the clearing
- * of these caches.
- **/
- static void ClearPersistentCache();
-
- /**
- * Check whether a word is valid according to Tesseract's language model
- * @return 0 if the word is invalid, non-zero if valid.
- * @warning temporary! This function will be removed from here and placed
- * in a separate API at some future time.
- */
- int IsValidWord(const char *word) const;
- // Returns true if utf8_character is defined in the UniCharset.
- bool IsValidCharacter(const char *utf8_character) const;
-
- bool GetTextDirection(int *out_offset, float *out_slope);
-
- /** Sets Dict::letter_is_okay_ function to point to the given function. */
- void SetDictFunc(DictFunc f);
-
- /** Sets Dict::probability_in_context_ function to point to the given
- * function.
- */
- void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
-
- /**
- * Estimates the Orientation And Script of the image.
- * @return true if the image was processed successfully.
- */
- bool DetectOS(OSResults *);
-
- /**
- * Return text orientation of each block as determined by an earlier run
- * of layout analysis.
- */
- void GetBlockTextOrientations(int **block_orientation,
- bool **vertical_writing);
-
- /** This method returns the string form of the specified unichar. */
- const char *GetUnichar(int unichar_id) const;
-
- /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
- const Dawg *GetDawg(int i) const;
-
- /** Return the number of dawgs loaded into tesseract_ object. */
- int NumDawgs() const;
-
- Tesseract *tesseract() const {
- return tesseract_;
- }
-
- OcrEngineMode oem() const {
- return last_oem_requested_;
- }
-
- void set_min_orientation_margin(double margin);
- /* @} */
-
-protected:
- /** Common code for setting the image. Returns true if Init has been called.
- */
- bool InternalSetImage();
-
- /**
- * Run the thresholder to make the thresholded image. If pix is not nullptr,
- * the source is thresholded to pix instead of the internal IMAGE.
- */
- virtual bool Threshold(Pix **pix);
-
- /**
- * Find lines from the image making the BLOCK_LIST.
- * @return 0 on success.
- */
- int FindLines();
-
- /** Delete the pageres and block list ready for a new page. */
- void ClearResults();
-
- /**
- * Return an LTR Result Iterator -- used only for training, as we really want
- * to ignore all BiDi smarts at that point.
- * delete once you're done with it.
- */
- LTRResultIterator *GetLTRIterator();
-
- /**
- * Return the length of the output text string, as UTF8, assuming
- * one newline per line and one per block, with a terminator,
- * and assuming a single character reject marker for each rejected character.
- * Also return the number of recognized blobs in blob_count.
- */
- int TextLength(int *blob_count) const;
-
- //// paragraphs.cpp ////////////////////////////////////////////////////
- void DetectParagraphs(bool after_text_recognition);
-
- const PAGE_RES *GetPageRes() const {
- return page_res_;
- }
-
-protected:
- Tesseract *tesseract_; ///< The underlying data object.
- Tesseract *osd_tesseract_; ///< For orientation & script detection.
- EquationDetect *equ_detect_; ///< The equation detector.
- FileReader reader_; ///< Reads files from any filesystem.
- ImageThresholder *thresholder_; ///< Image thresholding module.
- std::vector *paragraph_models_;
- BLOCK_LIST *block_list_; ///< The page layout.
- PAGE_RES *page_res_; ///< The page-level data.
- std::string input_file_; ///< Name used by training code.
- std::string output_file_; ///< Name used by debug code.
- std::string datapath_; ///< Current location of tessdata.
- std::string language_; ///< Last initialized language.
- OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
- bool recognition_done_; ///< page_res_ contains recognition data.
-
- /**
- * @defgroup ThresholderParams Thresholder Parameters
- * Parameters saved from the Thresholder. Needed to rebuild coordinates.
- */
- /* @{ */
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
- int image_width_;
- int image_height_;
- /* @} */
-
-private:
- // A list of image filenames gets special consideration
- bool ProcessPagesFileList(FILE *fp, std::string *buf,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
- // TIFF supports multipage so gets special consideration.
- bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
- const char *filename, const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
-}; // class TessBaseAPI.
-
-/** Escape a char string - remove &<>"' with HTML codes. */
-std::string HOcrEscape(const char *text);
-
-} // namespace tesseract
-
-#endif // TESSERACT_API_BASEAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/capi.h
deleted file mode 100644
index 40f4856a..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/capi.h
+++ /dev/null
@@ -1,484 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: capi.h
-// Description: C-API TessBaseAPI
-//
-// (C) Copyright 2012, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef API_CAPI_H_
-#define API_CAPI_H_
-
-#include "export.h"
-
-#ifdef __cplusplus
-# include
-# include
-# include
-# include
-# include
-#endif
-
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef BOOL
-# define BOOL int
-# define TRUE 1
-# define FALSE 0
-#endif
-
-#ifdef __cplusplus
-typedef tesseract::TessResultRenderer TessResultRenderer;
-typedef tesseract::TessBaseAPI TessBaseAPI;
-typedef tesseract::PageIterator TessPageIterator;
-typedef tesseract::ResultIterator TessResultIterator;
-typedef tesseract::MutableIterator TessMutableIterator;
-typedef tesseract::ChoiceIterator TessChoiceIterator;
-typedef tesseract::OcrEngineMode TessOcrEngineMode;
-typedef tesseract::PageSegMode TessPageSegMode;
-typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
-typedef tesseract::Orientation TessOrientation;
-typedef tesseract::ParagraphJustification TessParagraphJustification;
-typedef tesseract::WritingDirection TessWritingDirection;
-typedef tesseract::TextlineOrder TessTextlineOrder;
-typedef tesseract::PolyBlockType TessPolyBlockType;
-typedef tesseract::ETEXT_DESC ETEXT_DESC;
-#else
-typedef struct TessResultRenderer TessResultRenderer;
-typedef struct TessBaseAPI TessBaseAPI;
-typedef struct TessPageIterator TessPageIterator;
-typedef struct TessResultIterator TessResultIterator;
-typedef struct TessMutableIterator TessMutableIterator;
-typedef struct TessChoiceIterator TessChoiceIterator;
-typedef enum TessOcrEngineMode {
- OEM_TESSERACT_ONLY,
- OEM_LSTM_ONLY,
- OEM_TESSERACT_LSTM_COMBINED,
- OEM_DEFAULT
-} TessOcrEngineMode;
-typedef enum TessPageSegMode {
- PSM_OSD_ONLY,
- PSM_AUTO_OSD,
- PSM_AUTO_ONLY,
- PSM_AUTO,
- PSM_SINGLE_COLUMN,
- PSM_SINGLE_BLOCK_VERT_TEXT,
- PSM_SINGLE_BLOCK,
- PSM_SINGLE_LINE,
- PSM_SINGLE_WORD,
- PSM_CIRCLE_WORD,
- PSM_SINGLE_CHAR,
- PSM_SPARSE_TEXT,
- PSM_SPARSE_TEXT_OSD,
- PSM_RAW_LINE,
- PSM_COUNT
-} TessPageSegMode;
-typedef enum TessPageIteratorLevel {
- RIL_BLOCK,
- RIL_PARA,
- RIL_TEXTLINE,
- RIL_WORD,
- RIL_SYMBOL
-} TessPageIteratorLevel;
-typedef enum TessPolyBlockType {
- PT_UNKNOWN,
- PT_FLOWING_TEXT,
- PT_HEADING_TEXT,
- PT_PULLOUT_TEXT,
- PT_EQUATION,
- PT_INLINE_EQUATION,
- PT_TABLE,
- PT_VERTICAL_TEXT,
- PT_CAPTION_TEXT,
- PT_FLOWING_IMAGE,
- PT_HEADING_IMAGE,
- PT_PULLOUT_IMAGE,
- PT_HORZ_LINE,
- PT_VERT_LINE,
- PT_NOISE,
- PT_COUNT
-} TessPolyBlockType;
-typedef enum TessOrientation {
- ORIENTATION_PAGE_UP,
- ORIENTATION_PAGE_RIGHT,
- ORIENTATION_PAGE_DOWN,
- ORIENTATION_PAGE_LEFT
-} TessOrientation;
-typedef enum TessParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT
-} TessParagraphJustification;
-typedef enum TessWritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT,
- WRITING_DIRECTION_RIGHT_TO_LEFT,
- WRITING_DIRECTION_TOP_TO_BOTTOM
-} TessWritingDirection;
-typedef enum TessTextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT,
- TEXTLINE_ORDER_RIGHT_TO_LEFT,
- TEXTLINE_ORDER_TOP_TO_BOTTOM
-} TessTextlineOrder;
-typedef struct ETEXT_DESC ETEXT_DESC;
-#endif
-
-typedef bool (*TessCancelFunc)(void *cancel_this, int words);
-typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,
- int bottom);
-
-struct Pix;
-struct Boxa;
-struct Pixa;
-
-/* General free functions */
-
-TESS_API const char *TessVersion();
-TESS_API void TessDeleteText(const char *text);
-TESS_API void TessDeleteTextArray(char **arr);
-TESS_API void TessDeleteIntArray(const int *arr);
-
-/* Renderer API */
-TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,
- BOOL font_info);
-TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,
- const char *datadir,
- BOOL textonly);
-TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(
- const char *outputbase);
-
-TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
-TESS_API void TessResultRendererInsert(TessResultRenderer *renderer,
- TessResultRenderer *next);
-TESS_API TessResultRenderer *TessResultRendererNext(
- TessResultRenderer *renderer);
-TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,
- const char *title);
-TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,
- TessBaseAPI *api);
-TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);
-
-TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);
-TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
-TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);
-
-/* Base API */
-
-TESS_API TessBaseAPI *TessBaseAPICreate();
-TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);
-
-TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);
-
-TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
-TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
-TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
-TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);
-
-TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-
-TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,
- const char *name, int *value);
-TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,
- const char *name, BOOL *value);
-TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,
- const char *name, double *value);
-TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,
- const char *name);
-
-TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);
-TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,
- const char *filename);
-
-TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem,
- char **configs, int configs_size);
-TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem);
-TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,
- const char *language);
-
-TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API const char *TessBaseAPIGetInitLanguagesAsString(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(
- const TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,
- const char *filename);
-TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,
- const char *filename);
-
-TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,
- TessPageSegMode mode);
-TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIRect(TessBaseAPI *handle,
- const unsigned char *imagedata,
- int bytes_per_pixel, int bytes_per_line,
- int left, int top, int width, int height);
-
-TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetImage(TessBaseAPI *handle,
- const unsigned char *imagedata, int width,
- int height, int bytes_per_pixel,
- int bytes_per_line);
-TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);
-
-TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);
-
-TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,
- int width, int height);
-
-TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
-TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,
- BOOL raw_image, int raw_padding,
- struct Pixa **pixa,
- int **blockids, int **paraids);
-TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,
- struct Pixa **pixa, int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,
- struct Pixa **cc);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
- TessPageIteratorLevel level,
- BOOL text_only,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages1(
- TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,
- BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,
- int **paraids);
-
-TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
- const TessBaseAPI *handle);
-
-TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);
-
-TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,
- int page_index, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-
-TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
-TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(
- TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);
-TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,
- int page_number);
-
-TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
-TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
-
-TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,
- TessPageSegMode mode,
- const char *wordstr);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
-TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
-TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,
- float *out_slope);
-
-TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);
-
-TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-// Call TessDeleteText(*best_script_name) to free memory allocated by this
-// function
-TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,
- int *orient_deg,
- float *orient_conf,
- const char **script_name,
- float *script_conf);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,
- double margin);
-
-TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);
-
-TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);
-
-TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,
- int **block_orientation,
- bool **vertical_writing);
-
-/* Page iterator */
-
-TESS_API void TessPageIteratorDelete(TessPageIterator *handle);
-
-TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);
-
-TESS_API void TessPageIteratorBegin(TessPageIterator *handle);
-
-TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- TessPageIteratorLevel element);
-
-TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int *left, int *top, int *right,
- int *bottom);
-
-TESS_API TessPolyBlockType
-TessPageIteratorBlockType(const TessPageIterator *handle);
-
-TESS_API struct Pix *TessPageIteratorGetBinaryImage(
- const TessPageIterator *handle, TessPageIteratorLevel level);
-
-TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int padding,
- struct Pix *original_image,
- int *left, int *top);
-
-TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,
- TessPageIteratorLevel level, int *x1,
- int *y1, int *x2, int *y2);
-
-TESS_API void TessPageIteratorOrientation(
- TessPageIterator *handle, TessOrientation *orientation,
- TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,
- float *deskew_angle);
-
-TESS_API void TessPageIteratorParagraphInfo(
- TessPageIterator *handle, TessParagraphJustification *justification,
- BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
-
-/* Result iterator */
-
-TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
-TESS_API TessResultIterator *TessResultIteratorCopy(
- const TessResultIterator *handle);
-TESS_API TessPageIterator *TessResultIteratorGetPageIterator(
- TessResultIterator *handle);
-TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
- const TessResultIterator *handle);
-TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(
- const TessResultIterator *handle);
-
-TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API const char *TessResultIteratorWordRecognitionLanguage(
- const TessResultIterator *handle);
-TESS_API const char *TessResultIteratorWordFontAttributes(
- const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic,
- BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,
- int *pointsize, int *font_id);
-
-TESS_API BOOL
-TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
-TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);
-
-TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);
-TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
-TESS_API const char *TessChoiceIteratorGetUTF8Text(
- const TessChoiceIterator *handle);
-TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);
-
-/* Progress monitor */
-
-TESS_API ETEXT_DESC *TessMonitorCreate();
-TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,
- TessCancelFunc cancelFunc);
-TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
-TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,
- TessProgressFunc progressFunc);
-TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // API_CAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/export.h
deleted file mode 100644
index d238b628..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/export.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: export.h
-// Description: Place holder
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_PLATFORM_H_
-#define TESSERACT_PLATFORM_H_
-
-#ifndef TESS_API
-# if defined(_WIN32) || defined(__CYGWIN__)
-# if defined(TESS_EXPORTS)
-# define TESS_API __declspec(dllexport)
-# elif defined(TESS_IMPORTS)
-# define TESS_API __declspec(dllimport)
-# else
-# define TESS_API
-# endif
-# else
-# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
-# define TESS_API __attribute__((visibility("default")))
-# else
-# define TESS_API
-# endif
-# endif
-#endif
-
-#endif // TESSERACT_PLATFORM_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ltrresultiterator.h
deleted file mode 100644
index 6ca0a98e..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ltrresultiterator.h
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: ltrresultiterator.h
-// Description: Iterator for tesseract results in strict left-to-right
-// order that avoids using tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API
-#include "pageiterator.h" // for PageIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-namespace tesseract {
-
-class BLOB_CHOICE_IT;
-class PAGE_RES;
-class WERD_RES;
-
-class Tesseract;
-
-// Class to iterate over tesseract results, providing access to all levels
-// of the page hierarchy, without including any tesseract headers or having
-// to handle any tesseract structures.
-// WARNING! This class points to data held within the TessBaseAPI class, and
-// therefore can only be used while the TessBaseAPI class still exists and
-// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
-// DetectOS, or anything else that changes the internal PAGE_RES.
-// See tesseract/publictypes.h for the definition of PageIteratorLevel.
-// See also base class PageIterator, which contains the bulk of the interface.
-// LTRResultIterator adds text-specific methods for access to OCR output.
-
-class TESS_API LTRResultIterator : public PageIterator {
- friend class ChoiceIterator;
-
-public:
- // page_res and tesseract come directly from the BaseAPI.
- // The rectangle parameters are copied indirectly from the Thresholder,
- // via the BaseAPI. They represent the coordinates of some rectangle in an
- // original image (in top-left-origin coordinates) and therefore the top-left
- // needs to be added to any output boxes in order to specify coordinates
- // in the original image. See TessBaseAPI::SetRectangle.
- // The scale and scaled_yres are in case the Thresholder scaled the image
- // rectangle prior to thresholding. Any coordinates in tesseract's image
- // must be divided by scale before adding (rect_left, rect_top).
- // The scaled_yres indicates the effective resolution of the binary image
- // that tesseract has been given by the Thresholder.
- // After the constructor, Begin has already been called.
- LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top,
- int rect_width, int rect_height);
-
- ~LTRResultIterator() override;
-
- // LTRResultIterators may be copied! This makes it possible to iterate over
- // all the objects at a lower level, while maintaining an iterator to
- // objects at a higher level. These constructors DO NOT CALL Begin, so
- // iterations will continue from the location of src.
- // TODO: For now the copy constructor and operator= only need the base class
- // versions, but if new data members are added, don't forget to add them!
-
- // ============= Moving around within the page ============.
-
- // See PageIterator.
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // object at the given level. Use delete [] to free after use.
- char *GetUTF8Text(PageIteratorLevel level) const;
-
- // Set the string inserted at the end of each text line. "\n" by default.
- void SetLineSeparator(const char *new_line);
-
- // Set the string inserted at the end of each paragraph. "\n" by default.
- void SetParagraphSeparator(const char *new_para);
-
- // Returns the mean confidence of the current object at the given level.
- // The number should be interpreted as a percent probability. (0.0f-100.0f)
- float Confidence(PageIteratorLevel level) const;
-
- // ============= Functions that refer to words only ============.
-
- // Returns the font attributes of the current word. If iterating at a higher
- // level object than words, eg textlines, then this will return the
- // attributes of the first word in that textline.
- // The actual return value is a string representing a font name. It points
- // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
- // the iterator itself, ie rendered invalid by various members of
- // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
- // Pointsize is returned in printers points (1/72 inch.)
- const char *WordFontAttributes(bool *is_bold, bool *is_italic,
- bool *is_underlined, bool *is_monospace,
- bool *is_serif, bool *is_smallcaps,
- int *pointsize, int *font_id) const;
-
- // Return the name of the language used to recognize this word.
- // On error, nullptr. Do not delete this pointer.
- const char *WordRecognitionLanguage() const;
-
- // Return the overall directionality of this word.
- StrongScriptDirection WordDirection() const;
-
- // Returns true if the current word was found in a dictionary.
- bool WordIsFromDictionary() const;
-
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // Returns true if the current word is numeric.
- bool WordIsNumeric() const;
-
- // Returns true if the word contains blamer information.
- bool HasBlamerInfo() const;
-
- // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
- // of the current word.
- const void *GetParamsTrainingBundle() const;
-
- // Returns a pointer to the string with blamer information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerDebug() const;
-
- // Returns a pointer to the string with misadaption information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerMisadaptionDebug() const;
-
- // Returns true if a truth string was recorded for the current word.
- bool HasTruthString() const;
-
- // Returns true if the given string is equivalent to the truth string for
- // the current word.
- bool EquivalentToTruth(const char *str) const;
-
- // Returns a null terminated UTF-8 encoded truth string for the current word.
- // Use delete [] to free after use.
- char *WordTruthUTF8Text() const;
-
- // Returns a null terminated UTF-8 encoded normalized OCR string for the
- // current word. Use delete [] to free after use.
- char *WordNormedUTF8Text() const;
-
- // Returns a pointer to serialized choice lattice.
- // Fills lattice_size with the number of bytes in lattice data.
- const char *WordLattice(int *lattice_size) const;
-
- // ============= Functions that refer to symbols only ============.
-
- // Returns true if the current symbol is a superscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSuperscript() const;
- // Returns true if the current symbol is a subscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSubscript() const;
- // Returns true if the current symbol is a dropcap.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsDropcap() const;
-
-protected:
- const char *line_separator_;
- const char *paragraph_separator_;
-};
-
-// Class to iterate over the classifier choices for a single RIL_SYMBOL.
-class TESS_API ChoiceIterator {
-public:
- // Construction is from a LTRResultIterator that points to the symbol of
- // interest. The ChoiceIterator allows a one-shot iteration over the
- // choices for this symbol and after that it is useless.
- explicit ChoiceIterator(const LTRResultIterator &result_it);
- ~ChoiceIterator();
-
- // Moves to the next choice for the symbol and returns false if there
- // are none left.
- bool Next();
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // choice.
- // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
- // internal structure and should NOT be delete[]ed to free after use.
- const char *GetUTF8Text() const;
-
- // Returns the confidence of the current choice depending on the used language
- // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
- // choices for one symbol should roughly add up to 1.0f.
- // If only traineddata of the legacy engine is used, the number should be
- // interpreted as a percent probability. (0.0f-100.0f) In this case
- // probabilities won't add up to 100. Each one stands on its own.
- float Confidence() const;
-
- // Returns a vector containing all timesteps, which belong to the currently
- // selected symbol. A timestep is a vector containing pairs of symbols and
- // floating point numbers. The number states the probability for the
- // corresponding symbol.
- std::vector>> *Timesteps() const;
-
-private:
- // clears the remaining spaces out of the results and adapt the probabilities
- void filterSpaces();
- // Pointer to the WERD_RES object owned by the API.
- WERD_RES *word_res_;
- // Iterator over the blob choices.
- BLOB_CHOICE_IT *choice_it_;
- std::vector> *LSTM_choices_ = nullptr;
- std::vector>::iterator LSTM_choice_it_;
-
- const int *tstep_index_;
- // regulates the rating granularity
- double rating_coefficient_;
- // leading blanks
- int blanks_before_word_;
- // true when there is lstm engine related trained data
- bool oemLSTM_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ocrclass.h
deleted file mode 100644
index a55e6528..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/ocrclass.h
+++ /dev/null
@@ -1,158 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-/**********************************************************************
- * File: ocrclass.h
- * Description: Class definitions and constants for the OCR API.
- * Author: Hewlett-Packard Co
- *
- * (C) Copyright 1996, Hewlett-Packard Co.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-/**********************************************************************
- * This file contains typedefs for all the structures used by
- * the HP OCR interface.
- * The structures are designed to allow them to be used with any
- * structure alignment up to 8.
- **********************************************************************/
-
-#ifndef CCUTIL_OCRCLASS_H_
-#define CCUTIL_OCRCLASS_H_
-
-#include
-#include
-
-namespace tesseract {
-
-/**********************************************************************
- * EANYCODE_CHAR
- * Description of a single character. The character code is defined by
- * the character set of the current font.
- * Output text is sent as an array of these structures.
- * Spaces and line endings in the output are represented in the
- * structures of the surrounding characters. They are not directly
- * represented as characters.
- * The first character in a word has a positive value of blanks.
- * Missing information should be set to the defaults in the comments.
- * If word bounds are known, but not character bounds, then the top and
- * bottom of each character should be those of the word. The left of the
- * first and right of the last char in each word should be set. All other
- * lefts and rights should be set to -1.
- * If set, the values of right and bottom are left+width and top+height.
- * Most of the members come directly from the parameters to ocr_append_char.
- * The formatting member uses the enhancement parameter and combines the
- * line direction stuff into the top 3 bits.
- * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
- * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
- * the coding is, only that it is backwards compatible with the previous
- * version.
- **********************************************************************/
-
-struct EANYCODE_CHAR { /*single character */
- // It should be noted that the format for char_code for version 2.0 and beyond
- // is UTF8 which means that ASCII characters will come out as one structure
- // but other characters will be returned in two or more instances of this
- // structure with a single byte of the UTF8 code in each, but each will have
- // the same bounding box. Programs which want to handle languagues with
- // different characters sets will need to handle extended characters
- // appropriately, but *all* code needs to be prepared to receive UTF8 coded
- // characters for characters such as bullet and fancy quotes.
- uint16_t char_code; /*character itself */
- int16_t left; /*of char (-1) */
- int16_t right; /*of char (-1) */
- int16_t top; /*of char (-1) */
- int16_t bottom; /*of char (-1) */
- int16_t font_index; /*what font (0) */
- uint8_t confidence; /*0=perfect, 100=reject (0/100) */
- uint8_t point_size; /*of char, 72=i inch, (10) */
- int8_t blanks; /*no of spaces before this char (1) */
- uint8_t formatting; /*char formatting (0) */
-};
-
-/**********************************************************************
- * ETEXT_DESC
- * Description of the output of the OCR engine.
- * This structure is used as both a progress monitor and the final
- * output header, since it needs to be a valid progress monitor while
- * the OCR engine is storing its output to shared memory.
- * During progress, all the buffer info is -1.
- * Progress starts at 0 and increases to 100 during OCR. No other constraint.
- * Additionally the progress callback contains the bounding box of the word that
- * is currently being processed.
- * Every progress callback, the OCR engine must set ocr_alive to 1.
- * The HP side will set ocr_alive to 0. Repeated failure to reset
- * to 1 indicates that the OCR engine is dead.
- * If the cancel function is not null then it is called with the number of
- * user words found. If it returns true then operation is cancelled.
- **********************************************************************/
-class ETEXT_DESC;
-
-using CANCEL_FUNC = bool (*)(void *, int);
-using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
-using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);
-
-class ETEXT_DESC { // output header
-public:
- int16_t count{0}; /// chars in this buffer(0)
- int16_t progress{0}; /// percent complete increasing (0-100)
- /** Progress monitor covers word recognition and it does not cover layout
- * analysis.
- * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */
- int8_t more_to_come{0}; /// true if not last
- volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0
- int8_t err_code{0}; /// for errcode use
- CANCEL_FUNC cancel{nullptr}; /// returns true to cancel
- PROGRESS_FUNC progress_callback{
- nullptr}; /// called whenever progress increases
- PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback
- void *cancel_this{nullptr}; /// this or other data for cancel
- std::chrono::steady_clock::time_point end_time;
- /// Time to stop. Expected to be set only
- /// by call to set_deadline_msecs().
- EANYCODE_CHAR text[1]{}; /// character data
-
- ETEXT_DESC() : progress_callback2(&default_progress_func) {
- end_time = std::chrono::time_point();
- }
-
- // Sets the end time to be deadline_msecs milliseconds from now.
- void set_deadline_msecs(int32_t deadline_msecs) {
- if (deadline_msecs > 0) {
- end_time = std::chrono::steady_clock::now() +
- std::chrono::milliseconds(deadline_msecs);
- }
- }
-
- // Returns false if we've not passed the end_time, or have not set a deadline.
- bool deadline_exceeded() const {
- if (end_time.time_since_epoch() ==
- std::chrono::steady_clock::duration::zero()) {
- return false;
- }
- auto now = std::chrono::steady_clock::now();
- return (now > end_time);
- }
-
-private:
- static bool default_progress_func(ETEXT_DESC *ths, int left, int right,
- int top, int bottom) {
- if (ths->progress_callback != nullptr) {
- return (*(ths->progress_callback))(ths->progress, left, right, top,
- bottom);
- }
- return true;
- }
-};
-
-} // namespace tesseract
-
-#endif // CCUTIL_OCRCLASS_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/osdetect.h
deleted file mode 100644
index 34bfb557..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/osdetect.h
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: osdetect.h
-// Description: Orientation and script detection.
-// Author: Samuel Charron
-// Ranjith Unnikrishnan
-//
-// (C) Copyright 2008, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_OSDETECT_H_
-#define TESSERACT_CCMAIN_OSDETECT_H_
-
-#include "export.h" // for TESS_API
-
-#include // for std::vector
-
-namespace tesseract {
-
-class BLOBNBOX;
-class BLOBNBOX_CLIST;
-class BLOB_CHOICE_LIST;
-class TO_BLOCK_LIST;
-class UNICHARSET;
-
-class Tesseract;
-
-// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur
-const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;
-
-struct OSBestResult {
- OSBestResult()
- : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}
- int orientation_id;
- int script_id;
- float sconfidence;
- float oconfidence;
-};
-
-struct OSResults {
- OSResults() : unicharset(nullptr) {
- for (int i = 0; i < 4; ++i) {
- for (int j = 0; j < kMaxNumberOfScripts; ++j) {
- scripts_na[i][j] = 0;
- }
- orientations[i] = 0;
- }
- }
- void update_best_orientation();
- // Set the estimate of the orientation to the given id.
- void set_best_orientation(int orientation_id);
- // Update/Compute the best estimate of the script assuming the given
- // orientation id.
- void update_best_script(int orientation_id);
- // Return the index of the script with the highest score for this orientation.
- TESS_API int get_best_script(int orientation_id) const;
- // Accumulate scores with given OSResults instance and update the best script.
- void accumulate(const OSResults &osr);
-
- // Print statistics.
- void print_scores(void) const;
- void print_scores(int orientation_id) const;
-
- // Array holding scores for each orientation id [0,3].
- // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
- // page respectively, where the values refer to the amount of clockwise
- // rotation to be applied to the page for the text to be upright and readable.
- float orientations[4];
- // Script confidence scores for each of 4 possible orientations.
- float scripts_na[4][kMaxNumberOfScripts];
-
- UNICHARSET *unicharset;
- OSBestResult best_result;
-};
-
-class OrientationDetector {
-public:
- OrientationDetector(const std::vector *allowed_scripts,
- OSResults *results);
- bool detect_blob(BLOB_CHOICE_LIST *scores);
- int get_orientation();
-
-private:
- OSResults *osr_;
- const std::vector *allowed_scripts_;
-};
-
-class ScriptDetector {
-public:
- ScriptDetector(const std::vector *allowed_scripts, OSResults *osr,
- tesseract::Tesseract *tess);
- void detect_blob(BLOB_CHOICE_LIST *scores);
- bool must_stop(int orientation) const;
-
-private:
- OSResults *osr_;
- static const char *korean_script_;
- static const char *japanese_script_;
- static const char *fraktur_script_;
- int korean_id_;
- int japanese_id_;
- int katakana_id_;
- int hiragana_id_;
- int han_id_;
- int hangul_id_;
- int latin_id_;
- int fraktur_id_;
- tesseract::Tesseract *tess_;
- const std::vector *allowed_scripts_;
-};
-
-int orientation_and_script_detection(const char *filename, OSResults *,
- tesseract::Tesseract *);
-
-int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,
- tesseract::Tesseract *tess);
-
-int os_detect_blobs(const std::vector *allowed_scripts,
- BLOBNBOX_CLIST *blob_list, OSResults *osr,
- tesseract::Tesseract *tess);
-
-bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,
- OSResults *, tesseract::Tesseract *tess);
-
-// Helper method to convert an orientation index to its value in degrees.
-// The value represents the amount of clockwise rotation in degrees that must be
-// applied for the text to be upright (readable).
-TESS_API int OrientationIdToValue(const int &id);
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCMAIN_OSDETECT_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/pageiterator.h
deleted file mode 100644
index 68739715..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/pageiterator.h
+++ /dev/null
@@ -1,364 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: pageiterator.h
-// Description: Iterator for tesseract page structure that avoids using
-// tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
-#define TESSERACT_CCMAIN_PAGEITERATOR_H_
-
-#include "export.h"
-#include "publictypes.h"
-
-struct Pix;
-struct Pta;
-
-namespace tesseract {
-
-struct BlamerBundle;
-class C_BLOB_IT;
-class PAGE_RES;
-class PAGE_RES_IT;
-class WERD;
-
-class Tesseract;
-
-/**
- * Class to iterate over tesseract page structure, providing access to all
- * levels of the page hierarchy, without including any tesseract headers or
- * having to handle any tesseract structures.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- * See tesseract/publictypes.h for the definition of PageIteratorLevel.
- * See also ResultIterator, derived from PageIterator, which adds in the
- * ability to access OCR output with text-specific methods.
- */
-
-class TESS_API PageIterator {
-public:
- /**
- * page_res and tesseract come directly from the BaseAPI.
- * The rectangle parameters are copied indirectly from the Thresholder,
- * via the BaseAPI. They represent the coordinates of some rectangle in an
- * original image (in top-left-origin coordinates) and therefore the top-left
- * needs to be added to any output boxes in order to specify coordinates
- * in the original image. See TessBaseAPI::SetRectangle.
- * The scale and scaled_yres are in case the Thresholder scaled the image
- * rectangle prior to thresholding. Any coordinates in tesseract's image
- * must be divided by scale before adding (rect_left, rect_top).
- * The scaled_yres indicates the effective resolution of the binary image
- * that tesseract has been given by the Thresholder.
- * After the constructor, Begin has already been called.
- */
- PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top, int rect_width,
- int rect_height);
- virtual ~PageIterator();
-
- /**
- * Page/ResultIterators may be copied! This makes it possible to iterate over
- * all the objects at a lower level, while maintaining an iterator to
- * objects at a higher level. These constructors DO NOT CALL Begin, so
- * iterations will continue from the location of src.
- */
- PageIterator(const PageIterator &src);
- const PageIterator &operator=(const PageIterator &src);
-
- /** Are we positioned at the same location as other? */
- bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
-
- // ============= Moving around within the page ============.
-
- /**
- * Moves the iterator to point to the start of the page to begin an
- * iteration.
- */
- virtual void Begin();
-
- /**
- * Moves the iterator to the beginning of the paragraph.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word on the first row of the paragraph.
- */
- virtual void RestartParagraph();
-
- /**
- * Return whether this iterator points anywhere in the first textline of a
- * paragraph.
- */
- bool IsWithinFirstTextlineOfParagraph() const;
-
- /**
- * Moves the iterator to the beginning of the text line.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word of the row.
- */
- virtual void RestartRow();
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy, and returns false if the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- virtual bool Next(PageIteratorLevel level);
-
- /**
- * Returns true if the iterator is at the start of an object at the given
- * level.
- *
- * For instance, suppose an iterator it is pointed to the first symbol of the
- * first word of the third line of the second paragraph of the first block in
- * a page, then:
- * it.IsAtBeginningOf(RIL_BLOCK) = false
- * it.IsAtBeginningOf(RIL_PARA) = false
- * it.IsAtBeginningOf(RIL_TEXTLINE) = true
- * it.IsAtBeginningOf(RIL_WORD) = true
- * it.IsAtBeginningOf(RIL_SYMBOL) = true
- */
- virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
-
- /**
- * Returns whether the iterator is positioned at the last element in a
- * given level. (e.g. the last word in a line, the last line in a block)
- *
- * Here's some two-paragraph example
- * text. It starts off innocuously
- * enough but quickly turns bizarre.
- * The author inserts a cornucopia
- * of words to guard against confused
- * references.
- *
- * Now take an iterator it pointed to the start of "bizarre."
- * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
- * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
- * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
- */
- virtual bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const;
-
- /**
- * Returns whether this iterator is positioned
- * before other: -1
- * equal to other: 0
- * after other: 1
- */
- int Cmp(const PageIterator &other) const;
-
- // ============= Accessing data ==============.
- // Coordinate system:
- // Integer coordinates are at the cracks between the pixels.
- // The top-left corner of the top-left pixel in the image is at (0,0).
- // The bottom-right corner of the bottom-right pixel in the image is at
- // (width, height).
- // Every bounding box goes from the top-left of the top-left contained
- // pixel to the bottom-right of the bottom-right contained pixel, so
- // the bounding box of the single top-left pixel in the image is:
- // (0,0)->(1,1).
- // If an image rectangle has been set in the API, then returned coordinates
- // relate to the original (full) image, rather than the rectangle.
-
- /**
- * Controls what to include in a bounding box. Bounding boxes of all levels
- * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
- * Between layout analysis and recognition, it isn't known where all
- * diacritics belong, so this control is used to include or exclude some
- * diacritics that are above or below the main body of the word. In most cases
- * where the placement is obvious, and after recognition, it doesn't make as
- * much difference, as the diacritics will already be included in the word.
- */
- void SetBoundingBoxComponents(bool include_upper_dots,
- bool include_lower_dots) {
- include_upper_dots_ = include_upper_dots;
- include_lower_dots_ = include_lower_dots;
- }
-
- /**
- * Returns the bounding rectangle of the current object at the given level.
- * See comment on coordinate system above.
- * Returns false if there is no such object at the current position.
- * The returned bounding box is guaranteed to match the size and position
- * of the image returned by GetBinaryImage, but may clip foreground pixels
- * from a grey image. The padding argument to GetImage can be used to expand
- * the image to include more foreground pixels. See GetImage below.
- */
- bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
- int *bottom) const;
- bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
- int *right, int *bottom) const;
- /**
- * Returns the bounding rectangle of the object in a coordinate system of the
- * working image rectangle having its origin at (rect_left_, rect_top_) with
- * respect to the original image and is scaled by a factor scale_.
- */
- bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
- int *right, int *bottom) const;
-
- /** Returns whether there is no object of a given level. */
- bool Empty(PageIteratorLevel level) const;
-
- /**
- * Returns the type of the current block.
- * See tesseract/publictypes.h for PolyBlockType.
- */
- PolyBlockType BlockType() const;
-
- /**
- * Returns the polygon outline of the current block. The returned Pta must
- * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
- * of the polygon, and the last edge is the line segment between the last
- * point and the first point. nullptr will be returned if the iterator is
- * at the end of the document or layout analysis was not used.
- */
- Pta *BlockPolygon() const;
-
- /**
- * Returns a binary image of the current object at the given level.
- * The position and size match the return from BoundingBoxInternal, and so
- * this could be upscaled with respect to the original input image.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetBinaryImage(PageIteratorLevel level) const;
-
- /**
- * Returns an image of the current object at the given level in greyscale
- * if available in the input. To guarantee a binary image use BinaryImage.
- * NOTE that in order to give the best possible image, the bounds are
- * expanded slightly over the binary connected component, by the supplied
- * padding, so the top-left position of the returned image is returned
- * in (left,top). These will most likely not match the coordinates
- * returned by BoundingBox.
- * If you do not supply an original image, you will get a binary one.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
- int *left, int *top) const;
-
- /**
- * Returns the baseline of the current object at the given level.
- * The baseline is the line that passes through (x1, y1) and (x2, y2).
- * WARNING: with vertical text, baselines may be vertical!
- * Returns false if there is no baseline at the current position.
- */
- bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
- int *y2) const;
-
- // Returns the attributes of the current row.
- void RowAttributes(float *row_height, float *descenders,
- float *ascenders) const;
-
- /**
- * Returns orientation for the block the iterator points to.
- * orientation, writing_direction, textline_order: see publictypes.h
- * deskew_angle: after rotating the block so the text orientation is
- * upright, how many radians does one have to rotate the
- * block anti-clockwise for it to be level?
- * -Pi/4 <= deskew_angle <= Pi/4
- */
- void Orientation(tesseract::Orientation *orientation,
- tesseract::WritingDirection *writing_direction,
- tesseract::TextlineOrder *textline_order,
- float *deskew_angle) const;
-
- /**
- * Returns information about the current paragraph, if available.
- *
- * justification -
- * LEFT if ragged right, or fully justified and script is left-to-right.
- * RIGHT if ragged left, or fully justified and script is right-to-left.
- * unknown if it looks like source code or we have very few lines.
- * is_list_item -
- * true if we believe this is a member of an ordered or unordered list.
- * is_crown -
- * true if the first line of the paragraph is aligned with the other
- * lines of the paragraph even though subsequent paragraphs have first
- * line indents. This typically indicates that this is the continuation
- * of a previous paragraph or that it is the very first paragraph in
- * the chapter.
- * first_line_indent -
- * For LEFT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the left edge of the
- * rest of the paragraph.
- * for RIGHT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the right edge of the
- * rest of the paragraph.
- * NOTE 1: This value may be negative.
- * NOTE 2: if *is_crown == true, the first line of this paragraph is
- * actually flush, and first_line_indent is set to the "common"
- * first_line_indent for subsequent paragraphs in this block
- * of text.
- */
- void ParagraphInfo(tesseract::ParagraphJustification *justification,
- bool *is_list_item, bool *is_crown,
- int *first_line_indent) const;
-
- // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
- // of the current word to the given pointer (takes ownership of the pointer)
- // and returns true.
- // Can only be used when iterating on the word level.
- bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
-
-protected:
- /**
- * Sets up the internal data for iterating the blobs of a new word, then
- * moves the iterator to the given offset.
- */
- void BeginWord(int offset);
-
- /** Pointer to the page_res owned by the API. */
- PAGE_RES *page_res_;
- /** Pointer to the Tesseract object owned by the API. */
- Tesseract *tesseract_;
- /**
- * The iterator to the page_res_. Owned by this ResultIterator.
- * A pointer just to avoid dragging in Tesseract includes.
- */
- PAGE_RES_IT *it_;
- /**
- * The current input WERD being iterated. If there is an output from OCR,
- * then word_ is nullptr. Owned by the API
- */
- WERD *word_;
- /** The length of the current word_. */
- int word_length_;
- /** The current blob index within the word. */
- int blob_index_;
- /**
- * Iterator to the blobs within the word. If nullptr, then we are iterating
- * OCR results in the box_word.
- * Owned by this ResultIterator.
- */
- C_BLOB_IT *cblob_it_;
- /** Control over what to include in bounding boxes. */
- bool include_upper_dots_;
- bool include_lower_dots_;
- /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
- int scale_;
- int scaled_yres_;
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/publictypes.h
deleted file mode 100644
index 0069cf28..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/publictypes.h
+++ /dev/null
@@ -1,281 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: publictypes.h
-// Description: Types used in both the API and internally
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-
-namespace tesseract {
-
-// This file contains types that are used both by the API and internally
-// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
-// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
-// Restated: It is OK for low-level Tesseract files to include publictypes.h,
-// but not for the low-level tesseract code to include top-level API code.
-// This file should not use other Tesseract types, as that would drag
-// their includes into the API-level.
-
-/** Number of printers' points in an inch. The unit of the pointsize return. */
-constexpr int kPointsPerInch = 72;
-/**
- * Minimum believable resolution. Used as a default if there is no other
- * information, as it is safer to under-estimate than over-estimate.
- */
-constexpr int kMinCredibleResolution = 70;
-/** Maximum believable resolution. */
-constexpr int kMaxCredibleResolution = 2400;
-/**
- * Ratio between median blob size and likely resolution. Used to estimate
- * resolution when none is provided. This is basically 1/usual text size in
- * inches. */
-constexpr int kResolutionEstimationFactor = 10;
-
-/**
- * Possible types for a POLY_BLOCK or ColPartition.
- * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions
- * below, as well as kPolyBlockNames in layout_test.cc.
- * Used extensively by ColPartition, and POLY_BLOCK.
- */
-enum PolyBlockType {
- PT_UNKNOWN, // Type is not yet known. Keep as the first element.
- PT_FLOWING_TEXT, // Text that lives inside a column.
- PT_HEADING_TEXT, // Text that spans more than one column.
- PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region.
- PT_EQUATION, // Partition belonging to an equation region.
- PT_INLINE_EQUATION, // Partition has inline equation.
- PT_TABLE, // Partition belonging to a table region.
- PT_VERTICAL_TEXT, // Text-line runs vertically.
- PT_CAPTION_TEXT, // Text that belongs to an image.
- PT_FLOWING_IMAGE, // Image that lives inside a column.
- PT_HEADING_IMAGE, // Image that spans more than one column.
- PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region.
- PT_HORZ_LINE, // Horizontal Line.
- PT_VERT_LINE, // Vertical Line.
- PT_NOISE, // Lies outside of any column.
- PT_COUNT
-};
-
-/** Returns true if PolyBlockType is of horizontal line type */
-inline bool PTIsLineType(PolyBlockType type) {
- return type == PT_HORZ_LINE || type == PT_VERT_LINE;
-}
-/** Returns true if PolyBlockType is of image type */
-inline bool PTIsImageType(PolyBlockType type) {
- return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE ||
- type == PT_PULLOUT_IMAGE;
-}
-/** Returns true if PolyBlockType is of text type */
-inline bool PTIsTextType(PolyBlockType type) {
- return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||
- type == PT_PULLOUT_TEXT || type == PT_TABLE ||
- type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
- type == PT_INLINE_EQUATION;
-}
-// Returns true if PolyBlockType is of pullout(inter-column) type
-inline bool PTIsPulloutType(PolyBlockType type) {
- return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT;
-}
-
-/**
- * +------------------+ Orientation Example:
- * | 1 Aaaa Aaaa Aaaa | ====================
- * | Aaa aa aaa aa | To left is a diagram of some (1) English and
- * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit.
- * | 2 |
- * | ####### c c C | Upright Latin characters are represented as A and a.
- * | ####### c c c | '<' represents a latin character rotated
- * | < ####### c c c | anti-clockwise 90 degrees.
- * | < ####### c c |
- * | < ####### . c | Upright Chinese characters are represented C and c.
- * | 3 ####### c |
- * +------------------+ NOTA BENE: enum values here should match goodoc.proto
-
- * If you orient your head so that "up" aligns with Orientation,
- * then the characters will appear "right side up" and readable.
- *
- * In the example above, both the English and Chinese paragraphs are oriented
- * so their "up" is the top of the page (page up). The photo credit is read
- * with one's head turned leftward ("up" is to page left).
- *
- * The values of this enum match the convention of Tesseract's osdetect.h
-*/
-enum Orientation {
- ORIENTATION_PAGE_UP = 0,
- ORIENTATION_PAGE_RIGHT = 1,
- ORIENTATION_PAGE_DOWN = 2,
- ORIENTATION_PAGE_LEFT = 3,
-};
-
-/**
- * The grapheme clusters within a line of text are laid out logically
- * in this direction, judged when looking at the text line rotated so that
- * its Orientation is "page up".
- *
- * For English text, the writing direction is left-to-right. For the
- * Chinese text in the above example, the writing direction is top-to-bottom.
- */
-enum WritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
- WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
- WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * The text lines are read in the given sequence.
- *
- * In English, the order is top-to-bottom.
- * In Chinese, vertical text lines are read right-to-left. Mongolian is
- * written in vertical columns top to bottom like Chinese, but the lines
- * order left-to right.
- *
- * Note that only some combinations make sense. For example,
- * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
- */
-enum TextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
- TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
- TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * Possible modes for page layout analysis. These *must* be kept in order
- * of decreasing amount of layout analysis to be done, except for OSD_ONLY,
- * so that the inequality test macros below work.
- */
-enum PageSegMode {
- PSM_OSD_ONLY = 0, ///< Orientation and script detection only.
- PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and
- ///< script detection. (OSD)
- PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR.
- PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD.
- PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes.
- PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of
- ///< vertically aligned text.
- PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.)
- PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line.
- PSM_SINGLE_WORD = 8, ///< Treat the image as a single word.
- PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle.
- PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character.
- PSM_SPARSE_TEXT =
- 11, ///< Find as much text as possible in no particular order.
- PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det.
- PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing
- ///< hacks that are Tesseract-specific.
-
- PSM_COUNT ///< Number of enum entries.
-};
-
-/**
- * Inline functions that act on a PageSegMode to determine whether components of
- * layout analysis are enabled.
- * *Depend critically on the order of elements of PageSegMode.*
- * NOTE that arg is an int for compatibility with INT_PARAM.
- */
-inline bool PSM_OSD_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
-}
-inline bool PSM_SPARSE(int pageseg_mode) {
- return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
-}
-inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
-}
-inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
- return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
- pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-
-/**
- * enum of the elements of the page hierarchy, used in ResultIterator
- * to provide functions that operate on each level without having to
- * have 5x as many functions.
- */
-enum PageIteratorLevel {
- RIL_BLOCK, // Block of text/image/separator line.
- RIL_PARA, // Paragraph within a block.
- RIL_TEXTLINE, // Line within a paragraph.
- RIL_WORD, // Word within a textline.
- RIL_SYMBOL // Symbol/character within a word.
-};
-
-/**
- * JUSTIFICATION_UNKNOWN
- * The alignment is not clearly one of the other options. This could happen
- * for example if there are only one or two lines of text or the text looks
- * like source code or poetry.
- *
- * NOTA BENE: Fully justified paragraphs (text aligned to both left and right
- * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
- * is written with a left-to-right script and with JUSTIFICATION_RIGHT if
- * their text is written in a right-to-left script.
- *
- * Interpretation for text read in vertical lines:
- * "Left" is wherever the starting reading position is.
- *
- * JUSTIFICATION_LEFT
- * Each line, except possibly the first, is flush to the same left tab stop.
- *
- * JUSTIFICATION_CENTER
- * The text lines of the paragraph are centered about a line going
- * down through their middle of the text lines.
- *
- * JUSTIFICATION_RIGHT
- * Each line, except possibly the first, is flush to the same right tab stop.
- */
-enum ParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT,
-};
-
-/**
- * When Tesseract/Cube is initialized we can choose to instantiate/load/run
- * only the Tesseract part, only the Cube part or both along with the combiner.
- * The preference of which engine to use is stored in tessedit_ocr_engine_mode.
- *
- * ATTENTION: When modifying this enum, please make sure to make the
- * appropriate changes to all the enums mirroring it (e.g. OCREngine in
- * cityblock/workflow/detection/detection_storage.proto). Such enums will
- * mention the connection to OcrEngineMode in the comments.
- */
-enum OcrEngineMode {
- OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated
- OEM_LSTM_ONLY, // Run just the LSTM line recognizer.
- OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback
- // to Tesseract when things get difficult.
- // deprecated
- OEM_DEFAULT, // Specify this mode when calling init_*(),
- // to indicate that any of the above modes
- // should be automatically inferred from the
- // variables in the language-specific config,
- // command-line configs, or if not specified
- // in any of the above should be set to the
- // default OEM_TESSERACT_ONLY.
- OEM_COUNT // Number of OEMs
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/renderer.h
deleted file mode 100644
index 6f405233..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/renderer.h
+++ /dev/null
@@ -1,311 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: renderer.h
-// Description: Rendering interface to inject into TessBaseAPI
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_RENDERER_H_
-#define TESSERACT_API_RENDERER_H_
-
-#include "export.h"
-
-// To avoid collision with other typenames include the ABSOLUTE MINIMUM
-// complexity of includes here. Use forward declarations wherever possible
-// and hide includes of complex types in baseapi.cpp.
-#include
-#include // for std::string
-#include // for std::vector
-
-struct Pix;
-
-namespace tesseract {
-
-class TessBaseAPI;
-
-/**
- * Interface for rendering tesseract results into a document, such as text,
- * HOCR or pdf. This class is abstract. Specific classes handle individual
- * formats. This interface is then used to inject the renderer class into
- * tesseract when processing images.
- *
- * For simplicity implementing this with tesseract version 3.01,
- * the renderer contains document state that is cleared from document
- * to document just as the TessBaseAPI is. This way the base API can just
- * delegate its rendering functionality to injected renderers, and the
- * renderers can manage the associated state needed for the specific formats
- * in addition to the heuristics for producing it.
- */
-class TESS_API TessResultRenderer {
-public:
- virtual ~TessResultRenderer();
-
- // Takes ownership of pointer so must be new'd instance.
- // Renderers aren't ordered, but appends the sequences of next parameter
- // and existing next(). The renderers should be unique across both lists.
- void insert(TessResultRenderer *next);
-
- // Returns the next renderer or nullptr.
- TessResultRenderer *next() {
- return next_;
- }
-
- /**
- * Starts a new document with the given title.
- * This clears the contents of the output data.
- * Title should use UTF-8 encoding.
- */
- bool BeginDocument(const char *title);
-
- /**
- * Adds the recognized text from the source image to the current document.
- * Invalid if BeginDocument not yet called.
- *
- * Note that this API is a bit weird but is designed to fit into the
- * current TessBaseAPI implementation where the api has lots of state
- * information that we might want to add in.
- */
- bool AddImage(TessBaseAPI *api);
-
- /**
- * Finishes the document and finalizes the output data
- * Invalid if BeginDocument not yet called.
- */
- bool EndDocument();
-
- const char *file_extension() const {
- return file_extension_;
- }
- const char *title() const {
- return title_.c_str();
- }
-
- // Is everything fine? Otherwise something went wrong.
- bool happy() const {
- return happy_;
- }
-
- /**
- * Returns the index of the last image given to AddImage
- * (i.e. images are incremented whether the image succeeded or not)
- *
- * This is always defined. It means either the number of the
- * current image, the last image ended, or in the completed document
- * depending on when in the document lifecycle you are looking at it.
- * Will return -1 if a document was never started.
- */
- int imagenum() const {
- return imagenum_;
- }
-
-protected:
- /**
- * Called by concrete classes.
- *
- * outputbase is the name of the output file excluding
- * extension. For example, "/path/to/chocolate-chip-cookie-recipe"
- *
- * extension indicates the file extension to be used for output
- * files. For example "pdf" will produce a .pdf file, and "hocr"
- * will produce .hocr files.
- */
- TessResultRenderer(const char *outputbase, const char *extension);
-
- // Hook for specialized handling in BeginDocument()
- virtual bool BeginDocumentHandler();
-
- // This must be overridden to render the OCR'd results
- virtual bool AddImageHandler(TessBaseAPI *api) = 0;
-
- // Hook for specialized handling in EndDocument()
- virtual bool EndDocumentHandler();
-
- // Renderers can call this to append '\0' terminated strings into
- // the output string returned by GetOutput.
- // This method will grow the output buffer if needed.
- void AppendString(const char *s);
-
- // Renderers can call this to append binary byte sequences into
- // the output string returned by GetOutput. Note that s is not necessarily
- // '\0' terminated (and can contain '\0' within it).
- // This method will grow the output buffer if needed.
- void AppendData(const char *s, int len);
-
-private:
- TessResultRenderer *next_; // Can link multiple renderers together
- FILE *fout_; // output file pointer
- const char *file_extension_; // standard extension for generated output
- std::string title_; // title of document being rendered
- int imagenum_; // index of last image added
- bool happy_; // I get grumpy when the disk fills up, etc.
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessTextRenderer : public TessResultRenderer {
-public:
- explicit TessTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into an hocr text string
- */
-class TESS_API TessHOcrRenderer : public TessResultRenderer {
-public:
- explicit TessHOcrRenderer(const char *outputbase, bool font_info);
- explicit TessHOcrRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into an alto text string
- */
-class TESS_API TessAltoRenderer : public TessResultRenderer {
-public:
- explicit TessAltoRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool begin_document;
-};
-
-/**
- * Renders Tesseract output into a TSV string
- */
-class TESS_API TessTsvRenderer : public TessResultRenderer {
-public:
- explicit TessTsvRenderer(const char *outputbase, bool font_info);
- explicit TessTsvRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into searchable PDF
- */
-class TESS_API TessPDFRenderer : public TessResultRenderer {
-public:
- // datadir is the location of the TESSDATA. We need it because
- // we load a custom PDF font from this location.
- TessPDFRenderer(const char *outputbase, const char *datadir,
- bool textonly = false);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- // We don't want to have every image in memory at once,
- // so we store some metadata as we go along producing
- // PDFs one page at a time. At the end, that metadata is
- // used to make everything that isn't easily handled in a
- // streaming fashion.
- long int obj_; // counter for PDF objects
- std::vector offsets_; // offset of every PDF object in bytes
- std::vector pages_; // object number for every /Page object
- std::string datadir_; // where to find the custom font
- bool textonly_; // skip images if set
- // Bookkeeping only. DIY = Do It Yourself.
- void AppendPDFObjectDIY(size_t objectsize);
- // Bookkeeping + emit data.
- void AppendPDFObject(const char *data);
- // Create the /Contents object for an entire page.
- char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);
- // Turn an image into a PDF object. Only transcode if we have to.
- static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,
- char **pdf_object, long int *pdf_object_size,
- int jpg_quality);
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessUnlvRenderer : public TessResultRenderer {
-public:
- explicit TessUnlvRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string for LSTMBox
- */
-class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
-public:
- explicit TessLSTMBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessBoxTextRenderer : public TessResultRenderer {
-public:
- explicit TessBoxTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string in WordStr format
- */
-class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
-public:
- explicit TessWordStrBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-/**
- * Renders tesseract output into an osd text string
- */
-class TESS_API TessOsdRenderer : public TessResultRenderer {
-public:
- explicit TessOsdRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#endif // ndef DISABLED_LEGACY_ENGINE
-
-} // namespace tesseract.
-
-#endif // TESSERACT_API_RENDERER_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/resultiterator.h
deleted file mode 100644
index 3e4d5807..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/resultiterator.h
+++ /dev/null
@@ -1,250 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: resultiterator.h
-// Description: Iterator for tesseract results that is capable of
-// iterating in proper reading order over Bi Directional
-// (e.g. mixed Hebrew and English) text.
-// Author: David Eger
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API, TESS_LOCAL
-#include "ltrresultiterator.h" // for LTRResultIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-#include // for std::pair
-#include // for std::vector
-
-namespace tesseract {
-
-class TESS_API ResultIterator : public LTRResultIterator {
-public:
- static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
-
- /**
- * ResultIterator is copy constructible!
- * The default copy constructor works just fine for us.
- */
- ~ResultIterator() override = default;
-
- // ============= Moving around within the page ============.
- /**
- * Moves the iterator to point to the start of the page to begin
- * an iteration.
- */
- void Begin() override;
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy in the appropriate reading order and returns false if
- * the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- bool Next(PageIteratorLevel level) override;
-
- /**
- * IsAtBeginningOf() returns whether we're at the logical beginning of the
- * given level. (as opposed to ResultIterator's left-to-right top-to-bottom
- * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
- * For a full description, see pageiterator.h
- */
- bool IsAtBeginningOf(PageIteratorLevel level) const override;
-
- /**
- * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
- * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
- * point at the last word in a paragraph. See PageIterator for full comment.
- */
- bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const override;
-
- // ============= Functions that refer to words only ============.
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // ============= Accessing data ==============.
-
- /**
- * Returns the null terminated UTF-8 encoded text string for the current
- * object at the given level. Use delete [] to free after use.
- */
- virtual char *GetUTF8Text(PageIteratorLevel level) const;
-
- /**
- * Returns the LSTM choices for every LSTM timestep for the current word.
- */
- virtual std::vector>>>
- *GetRawLSTMTimesteps() const;
- virtual std::vector>>
- *GetBestLSTMSymbolChoices() const;
-
- /**
- * Return whether the current paragraph's dominant reading direction
- * is left-to-right (as opposed to right-to-left).
- */
- bool ParagraphIsLtr() const;
-
- // ============= Exposed only for testing =============.
-
- /**
- * Yields the reading order as a sequence of indices and (optional)
- * meta-marks for a set of words (given left-to-right).
- * The meta marks are passed as negative values:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The next indexed word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- *
- * For example, suppose we have five words in a text line,
- * indexed [0,1,2,3,4] from the leftmost side of the text line.
- * The following are all believable reading_orders:
- *
- * Left-to-Right (in ltr paragraph):
- * { 0, 1, 2, 3, 4 }
- * Left-to-Right (in rtl paragraph):
- * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- * Right-to-Left (in rtl paragraph):
- * { 4, 3, 2, 1, 0 }
- * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
- * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
- */
- static void CalculateTextlineOrder(
- bool paragraph_is_ltr,
- const std::vector &word_dirs,
- std::vector *reading_order);
-
- static const int kMinorRunStart;
- static const int kMinorRunEnd;
- static const int kComplexWord;
-
-protected:
- /**
- * We presume the data associated with the given iterator will outlive us.
- * NB: This is private because it does something that is non-obvious:
- * it resets to the beginning of the paragraph instead of staying wherever
- * resit might have pointed.
- */
- explicit ResultIterator(const LTRResultIterator &resit);
-
-private:
- /**
- * Calculates the current paragraph's dominant writing direction.
- * Typically, members should use current_paragraph_ltr_ instead.
- */
- bool CurrentParagraphIsLtr() const;
-
- /**
- * Returns word indices as measured from resit->RestartRow() = index 0
- * for the reading order of words within a textline given an iterator
- * into the middle of the text line.
- * In addition to non-negative word indices, the following negative values
- * may be inserted:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The previous word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- std::vector *indices) const;
- /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- std::vector *ssd,
- std::vector *indices) const;
-
- /**
- * What is the index of the current word in a strict left-to-right reading
- * of the row?
- */
- int LTRWordIndex() const;
-
- /**
- * Given an iterator pointing at a word, returns the logical reading order
- * of blob indices for the word.
- */
- void CalculateBlobOrder(std::vector *blob_indices) const;
-
- /** Precondition: current_paragraph_is_ltr_ is set. */
- void MoveToLogicalStartOfTextline();
-
- /**
- * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
- * are set.
- */
- void MoveToLogicalStartOfWord();
-
- /** Are we pointing at the final (reading order) symbol of the word? */
- bool IsAtFinalSymbolOfWord() const;
-
- /** Are we pointing at the first (reading order) symbol of the word? */
- bool IsAtFirstSymbolOfWord() const;
-
- /**
- * Append any extra marks that should be appended to this word when printed.
- * Mostly, these are Unicode BiDi control characters.
- */
- void AppendSuffixMarks(std::string *text) const;
-
- /** Appends the current word in reading order to the given buffer.*/
- void AppendUTF8WordText(std::string *text) const;
-
- /**
- * Appends the text of the current text line, *assuming this iterator is
- * positioned at the beginning of the text line* This function
- * updates the iterator to point to the first position past the text line.
- * Each textline is terminated in a single newline character.
- * If the textline ends a paragraph, it gets a second terminal newline.
- */
- void IterateAndAppendUTF8TextlineText(std::string *text);
-
- /**
- * Appends the text of the current paragraph in reading order
- * to the given buffer.
- * Each textline is terminated in a single newline character, and the
- * paragraph gets an extra newline at the end.
- */
- void AppendUTF8ParagraphText(std::string *text) const;
-
- /** Returns whether the bidi_debug flag is set to at least min_level. */
- bool BidiDebug(int min_level) const;
-
- bool current_paragraph_is_ltr_;
-
- /**
- * Is the currently pointed-at character at the beginning of
- * a minor-direction run?
- */
- bool at_beginning_of_minor_run_;
-
- /** Is the currently pointed-at character in a minor-direction sequence? */
- bool in_minor_direction_;
-
- /**
- * Should detected inter-word spaces be preserved, or "compressed" to a single
- * space character (default behavior).
- */
- bool preserve_interword_spaces_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/unichar.h
deleted file mode 100644
index 015109d7..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/unichar.h
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: unichar.h
-// Description: Unicode character/ligature class.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCUTIL_UNICHAR_H_
-#define TESSERACT_CCUTIL_UNICHAR_H_
-
-#include "export.h"
-
-#include
-#include
-#include
-#include
-
-namespace tesseract {
-
-// Maximum number of characters that can be stored in a UNICHAR. Must be
-// at least 4. Must not exceed 31 without changing the coding of length.
-#define UNICHAR_LEN 30
-
-// A UNICHAR_ID is the unique id of a unichar.
-using UNICHAR_ID = int;
-
-// A variable to indicate an invalid or uninitialized unichar id.
-static const int INVALID_UNICHAR_ID = -1;
-// A special unichar that corresponds to INVALID_UNICHAR_ID.
-static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
-
-enum StrongScriptDirection {
- DIR_NEUTRAL = 0, // Text contains only neutral characters.
- DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
- DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
- DIR_MIX = 3, // Text contains a mixture of left-to-right
- // and right-to-left characters.
-};
-
-using char32 = signed int;
-
-// The UNICHAR class holds a single classification result. This may be
-// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
-// multiple Unicode characters representing the NFKC expansion of a ligature
-// such as fi, ffl etc. These are also stored as utf8.
-class TESS_API UNICHAR {
-public:
- UNICHAR() {
- memset(chars, 0, UNICHAR_LEN);
- }
-
- // Construct from a utf8 string. If len<0 then the string is null terminated.
- // If the string is too long to fit in the UNICHAR then it takes only what
- // will fit.
- UNICHAR(const char *utf8_str, int len);
-
- // Construct from a single UCS4 character.
- explicit UNICHAR(int unicode);
-
- // Default copy constructor and operator= are OK.
-
- // Get the first character as UCS-4.
- int first_uni() const;
-
- // Get the length of the UTF8 string.
- int utf8_len() const {
- int len = chars[UNICHAR_LEN - 1];
- return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
- }
-
- // Get a UTF8 string, but NOT nullptr terminated.
- const char *utf8() const {
- return chars;
- }
-
- // Get a terminated UTF8 string: Must delete[] it after use.
- char *utf8_str() const;
-
- // Get the number of bytes in the first character of the given utf8 string.
- static int utf8_step(const char *utf8_str);
-
- // A class to simplify iterating over and accessing elements of a UTF8
- // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
- // take ownership of the underlying byte array. It also does not permit
- // modification of the array (as the name suggests).
- //
- // Example:
- // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
- // it != UNICHAR::end(str, len);
- // ++it) {
- // printf("UCS-4 symbol code = %d\n", *it);
- // char buf[5];
- // int char_len = it.get_utf8(buf); buf[char_len] = '\0';
- // printf("Char = %s\n", buf);
- // }
- class TESS_API const_iterator {
- using CI = const_iterator;
-
- public:
- // Step to the next UTF8 character.
- // If the current position is at an illegal UTF8 character, then print an
- // error message and step by one byte. If the current position is at a
- // nullptr value, don't step past it.
- const_iterator &operator++();
-
- // Return the UCS-4 value at the current position.
- // If the current position is at an illegal UTF8 value, return a single
- // space character.
- int operator*() const;
-
- // Store the UTF-8 encoding of the current codepoint into buf, which must be
- // at least 4 bytes long. Return the number of bytes written.
- // If the current position is at an illegal UTF8 value, writes a single
- // space character and returns 1.
- // Note that this method does not null-terminate the buffer.
- int get_utf8(char *buf) const;
- // Returns the number of bytes of the current codepoint. Returns 1 if the
- // current position is at an illegal UTF8 value.
- int utf8_len() const;
- // Returns true if the UTF-8 encoding at the current position is legal.
- bool is_legal() const;
-
- // Return the pointer into the string at the current position.
- const char *utf8_data() const {
- return it_;
- }
-
- // Iterator equality operators.
- friend bool operator==(const CI &lhs, const CI &rhs) {
- return lhs.it_ == rhs.it_;
- }
- friend bool operator!=(const CI &lhs, const CI &rhs) {
- return !(lhs == rhs);
- }
-
- private:
- friend class UNICHAR;
- explicit const_iterator(const char *it) : it_(it) {}
-
- const char *it_; // Pointer into the string.
- };
-
- // Create a start/end iterator pointing to a string. Note that these methods
- // are static and do NOT create a copy or take ownership of the underlying
- // array.
- static const_iterator begin(const char *utf8_str, int byte_length);
- static const_iterator end(const char *utf8_str, int byte_length);
-
- // Converts a utf-8 string to a vector of unicodes.
- // Returns an empty vector if the input contains invalid UTF-8.
- static std::vector UTF8ToUTF32(const char *utf8_str);
- // Converts a vector of unicodes to a utf8 string.
- // Returns an empty string if the input contains an invalid unicode.
- static std::string UTF32ToUTF8(const std::vector &str32);
-
-private:
- // A UTF-8 representation of 1 or more Unicode characters.
- // The last element (chars[UNICHAR_LEN - 1]) is a length if
- // its value < UNICHAR_LEN, otherwise it is a genuine character.
- char chars[UNICHAR_LEN]{};
-};
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCUTIL_UNICHAR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/version.h
deleted file mode 100644
index 6bac5d66..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/amd64/include/tesseract/version.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: version.h
-// Description: Version information
-//
-// (C) Copyright 2018, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_VERSION_H_
-#define TESSERACT_API_VERSION_H_
-
-// clang-format off
-
-#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@
-#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@
-#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@
-
-#define TESSERACT_VERSION \
- (TESSERACT_MAJOR_VERSION << 16 | \
- TESSERACT_MINOR_VERSION << 8 | \
- TESSERACT_MICRO_VERSION)
-
-#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@"
-
-// clang-format on
-
-#endif // TESSERACT_API_VERSION_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/baseapi.h
deleted file mode 100644
index 5e1e4830..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/baseapi.h
+++ /dev/null
@@ -1,812 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: baseapi.h
-// Description: Simple API for calling tesseract.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_BASEAPI_H_
-#define TESSERACT_API_BASEAPI_H_
-
-#ifdef HAVE_CONFIG_H
-# include "config_auto.h" // DISABLED_LEGACY_ENGINE
-#endif
-
-#include "export.h"
-#include "pageiterator.h"
-#include "publictypes.h"
-#include "resultiterator.h"
-#include "unichar.h"
-
-#include "version.h"
-
-#include
-#include // for std::vector
-
-struct Pix;
-struct Pixa;
-struct Boxa;
-
-namespace tesseract {
-
-class PAGE_RES;
-class ParagraphModel;
-class BLOCK_LIST;
-class ETEXT_DESC;
-struct OSResults;
-class UNICHARSET;
-
-class Dawg;
-class Dict;
-class EquationDetect;
-class PageIterator;
-class ImageThresholder;
-class LTRResultIterator;
-class ResultIterator;
-class MutableIterator;
-class TessResultRenderer;
-class Tesseract;
-
-// Function to read a std::vector from a whole file.
-// Returns false on failure.
-using FileReader = bool (*)(const char *filename, std::vector *data);
-
-using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,
- bool) const;
-using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,
- int, const char *, int);
-
-/**
- * Base class for all tesseract APIs.
- * Specific classes can add ability to work on different inputs or produce
- * different outputs.
- * This class is mostly an interface layer on top of the Tesseract instance
- * class to hide the data types so that users of this class don't have to
- * include any other Tesseract headers.
- */
-class TESS_API TessBaseAPI {
-public:
- TessBaseAPI();
- virtual ~TessBaseAPI();
- // Copy constructor and assignment operator are currently unsupported.
- TessBaseAPI(TessBaseAPI const &) = delete;
- TessBaseAPI &operator=(TessBaseAPI const &) = delete;
-
- /**
- * Returns the version identifier as a static string. Do not delete.
- */
- static const char *Version();
-
- /**
- * If compiled with OpenCL AND an available OpenCL
- * device is deemed faster than serial code, then
- * "device" is populated with the cl_device_id
- * and returns sizeof(cl_device_id)
- * otherwise *device=nullptr and returns 0.
- */
- static size_t getOpenCLDevice(void **device);
-
- /**
- * Set the name of the input file. Needed for training and
- * reading a UNLV zone file, and for searchable PDF output.
- */
- void SetInputName(const char *name);
- /**
- * These functions are required for searchable PDF output.
- * We need our hands on the input file so that we can include
- * it in the PDF without transcoding. If that is not possible,
- * we need the original image. Finally, resolution metadata
- * is stored in the PDF so we need that as well.
- */
- const char *GetInputName();
- // Takes ownership of the input pix.
- void SetInputImage(Pix *pix);
- Pix *GetInputImage();
- int GetSourceYResolution();
- const char *GetDatapath();
-
- /** Set the name of the bonus output files. Needed only for debugging. */
- void SetOutputName(const char *name);
-
- /**
- * Set the value of an internal "parameter."
- * Supply the name of the parameter and the value as a string, just as
- * you would in a config file.
- * Returns false if the name lookup failed.
- * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
- * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
- * SetVariable may be used before Init, but settings will revert to
- * defaults on End().
- *
- * Note: Must be called after Init(). Only works for non-init variables
- * (init variables should be passed to Init()).
- */
- bool SetVariable(const char *name, const char *value);
- bool SetDebugVariable(const char *name, const char *value);
-
- /**
- * Returns true if the parameter was found among Tesseract parameters.
- * Fills in value with the value of the parameter.
- */
- bool GetIntVariable(const char *name, int *value) const;
- bool GetBoolVariable(const char *name, bool *value) const;
- bool GetDoubleVariable(const char *name, double *value) const;
-
- /**
- * Returns the pointer to the string that represents the value of the
- * parameter if it was found among Tesseract parameters.
- */
- const char *GetStringVariable(const char *name) const;
-
-#ifndef DISABLED_LEGACY_ENGINE
-
- /**
- * Print Tesseract fonts table to the given file.
- */
- void PrintFontsTable(FILE *fp) const;
-
-#endif
-
- /**
- * Print Tesseract parameters to the given file.
- */
- void PrintVariables(FILE *fp) const;
-
- /**
- * Get value of named variable as a string, if it exists.
- */
- bool GetVariableAsString(const char *name, std::string *val) const;
-
- /**
- * Instances are now mostly thread-safe and totally independent,
- * but some global parameters remain. Basically it is safe to use multiple
- * TessBaseAPIs in different threads in parallel, UNLESS:
- * you use SetVariable on some of the Params in classify and textord.
- * If you do, then the effect will be to change it for all your instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure.
- * NOTE that the only members that may be called before Init are those
- * listed above here in the class definition.
- *
- * The datapath must be the name of the tessdata directory.
- * The language is (usually) an ISO 639-3 string or nullptr will default to
- * eng. It is entirely safe (and eventually will be efficient too) to call
- * Init multiple times on the same instance to change language, or just
- * to reset the classifier.
- * The language may be a string of the form [~][+[~]]* indicating
- * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
- * English. Languages may specify internally that they want to be loaded
- * with one or more other languages, so the ~ sign is available to override
- * that. Eg if hin were set to load eng by default, then hin+~eng would force
- * loading only hin. The number of loaded languages is limited only by
- * memory, with the caveat that loading additional languages will impact
- * both speed and accuracy, as there is more work to do to decide on the
- * applicable language, and there is more chance of hallucinating incorrect
- * words.
- * WARNING: On changing languages, all Tesseract parameters are reset
- * back to their default values. (Which may vary between languages.)
- * If you have a rare need to set a Variable that controls
- * initialization for a second call to Init you should explicitly
- * call End() and then use SetVariable before Init. This is only a very
- * rare use case, since there are very few uses that require any parameters
- * to be set before Init.
- *
- * If set_only_non_debug_params is true, only params that do not contain
- * "debug" in the name will be set.
- */
- int Init(const char *datapath, const char *language, OcrEngineMode mode,
- char **configs, int configs_size,
- const std::vector *vars_vec,
- const std::vector *vars_values,
- bool set_only_non_debug_params);
- int Init(const char *datapath, const char *language, OcrEngineMode oem) {
- return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
- }
- int Init(const char *datapath, const char *language) {
- return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
- false);
- }
- // In-memory version reads the traineddata file directly from the given
- // data[data_size] array, and/or reads data via a FileReader.
- int Init(const char *data, int data_size, const char *language,
- OcrEngineMode mode, char **configs, int configs_size,
- const std::vector *vars_vec,
- const std::vector *vars_values,
- bool set_only_non_debug_params, FileReader reader);
-
- /**
- * Returns the languages string used in the last valid initialization.
- * If the last initialization specified "deu+hin" then that will be
- * returned. If hin loaded eng automatically as well, then that will
- * not be included in this list. To find the languages actually
- * loaded use GetLoadedLanguagesAsVector.
- * The returned string should NOT be deleted.
- */
- const char *GetInitLanguagesAsString() const;
-
- /**
- * Returns the loaded languages in the vector of std::string.
- * Includes all languages loaded by the last Init, including those loaded
- * as dependencies of other loaded languages.
- */
- void GetLoadedLanguagesAsVector(std::vector *langs) const;
-
- /**
- * Returns the available languages in the sorted vector of std::string.
- */
- void GetAvailableLanguagesAsVector(std::vector *langs) const;
-
- /**
- * Init only for page layout analysis. Use only for calls to SetImage and
- * AnalysePage. Calls that attempt recognition will generate an error.
- */
- void InitForAnalysePage();
-
- /**
- * Read a "config" file containing a set of param, value pairs.
- * Searches the standard places: tessdata/configs, tessdata/tessconfigs
- * and also accepts a relative or absolute path name.
- * Note: only non-init params will be set (init params are set by Init()).
- */
- void ReadConfigFile(const char *filename);
- /** Same as above, but only set debug params from the given config file. */
- void ReadDebugConfigFile(const char *filename);
-
- /**
- * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
- * The mode is stored as an IntParam so it can also be modified by
- * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
- */
- void SetPageSegMode(PageSegMode mode);
-
- /** Return the current page segmentation mode. */
- PageSegMode GetPageSegMode() const;
-
- /**
- * Recognize a rectangle from an image and return the result as a string.
- * May be called many times for a single Init.
- * Currently has no error checking.
- * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
- * Palette color images will not work properly and must be converted to
- * 24 bit.
- * Binary images of 1 bit per pixel may also be given but they must be
- * byte packed with the MSB of the first byte being the first pixel, and a
- * 1 represents WHITE. For binary images set bytes_per_pixel=0.
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience interface.
- * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
- * and one or more of the Get*Text functions below.
- */
- char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
- int bytes_per_line, int left, int top, int width,
- int height);
-
- /**
- * Call between pages or documents etc to free up memory and forget
- * adaptive data.
- */
- void ClearAdaptiveClassifier();
-
- /**
- * @defgroup AdvancedAPI Advanced API
- * The following methods break TesseractRect into pieces, so you can
- * get hold of the thresholded image, get the text in different formats,
- * get bounding boxes, confidences etc.
- */
- /* @{ */
-
- /**
- * Provide an image for Tesseract to recognize. Format is as
- * TesseractRect above. Copies the image buffer and converts to Pix.
- * SetImage clears all recognition results, and sets the rectangle to the
- * full image, so it may be followed immediately by a GetUTF8Text, and it
- * will automatically perform recognition.
- */
- void SetImage(const unsigned char *imagedata, int width, int height,
- int bytes_per_pixel, int bytes_per_line);
-
- /**
- * Provide an image for Tesseract to recognize. As with SetImage above,
- * Tesseract takes its own copy of the image, so it need not persist until
- * after Recognize.
- * Pix vs raw, which to use?
- * Use Pix where possible. Tesseract uses Pix as its internal representation
- * and it is therefore more efficient to provide a Pix directly.
- */
- void SetImage(Pix *pix);
-
- /**
- * Set the resolution of the source image in pixels per inch so font size
- * information can be calculated in results. Call this after SetImage().
- */
- void SetSourceResolution(int ppi);
-
- /**
- * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
- * Each SetRectangle clears the recogntion results so multiple rectangles
- * can be recognized with the same image.
- */
- void SetRectangle(int left, int top, int width, int height);
-
- /**
- * Get a copy of the internal thresholded image from Tesseract.
- * Caller takes ownership of the Pix and must pixDestroy it.
- * May be called any time after SetImage, or after TesseractRect.
- */
- Pix *GetThresholdedImage();
-
- /**
- * Get the result of page layout analysis as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetRegions(Pixa **pixa);
-
- /**
- * Get the textlines as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If raw_image is true, then extract from the original image instead of the
- * thresholded image and pad by raw_padding pixels.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use. If paraids is not
- * nullptr, the paragraph-id of each line within its block is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- /*
- Helper method to extract from the thresholded image. (most common usage)
-*/
- Boxa *GetTextlines(Pixa **pixa, int **blockids) {
- return GetTextlines(false, 0, pixa, blockids, nullptr);
- }
-
- /**
- * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
- * pair, in reading order. Enables downstream handling of non-rectangular
- * regions.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetStrips(Pixa **pixa, int **blockids);
-
- /**
- * Get the words as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetWords(Pixa **pixa);
-
- /**
- * Gets the individual connected (text) components (created
- * after pages segmentation step, but before recognition)
- * as a leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * Note: the caller is responsible for calling boxaDestroy()
- * on the returned Boxa array and pixaDestroy() on cc array.
- */
- Boxa *GetConnectedComponents(Pixa **cc);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use.
- * If blockids is not nullptr, the paragraph-id of each component with its
- * block is also returned as an array of one element per component. delete []
- * after use. If raw_image is true, then portions of the original image are
- * extracted instead of the thresholded image and padded with raw_padding. If
- * text_only is true, then only text components are returned.
- */
- Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
- bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- // Helper function to get binary images with no padding (most common usage).
- Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
- Pixa **pixa, int **blockids) {
- return GetComponentImages(level, text_only, false, 0, pixa, blockids,
- nullptr);
- }
-
- /**
- * Returns the scale factor of the thresholded image that would be returned by
- * GetThresholdedImage() and the various GetX() methods that call
- * GetComponentImages().
- * Returns 0 if no thresholder has been set.
- */
- int GetThresholdedImageScaleFactor() const;
-
- /**
- * Runs page layout analysis in the mode set by SetPageSegMode.
- * May optionally be called prior to Recognize to get access to just
- * the page layout results. Returns an iterator to the results.
- * If merge_similar_words is true, words are combined where suitable for use
- * with a line recognizer. Use if you want to use AnalyseLayout to find the
- * textlines, and then want to process textline fragments with an external
- * line recognizer.
- * Returns nullptr on error or an empty page.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- PageIterator *AnalyseLayout();
- PageIterator *AnalyseLayout(bool merge_similar_words);
-
- /**
- * Recognize the image from SetAndThresholdImage, generating Tesseract
- * internal structures. Returns 0 on success.
- * Optional. The Get*Text functions below will call Recognize if needed.
- * After Recognize, the output is kept internally until the next SetImage.
- */
- int Recognize(ETEXT_DESC *monitor);
-
- /**
- * Methods to retrieve information after SetAndThresholdImage(),
- * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
- */
-
- /**
- * Turns images into symbolic text.
- *
- * filename can point to a single image, a multi-page TIFF,
- * or a plain text list of image filenames.
- *
- * retry_config is useful for debugging. If not nullptr, you can fall
- * back to an alternate configuration if a page fails for some
- * reason.
- *
- * timeout_millisec terminates processing if any single page
- * takes too long. Set to 0 for unlimited time.
- *
- * renderer is responible for creating the output. For example,
- * use the TessTextRenderer if you want plaintext output, or
- * the TessPDFRender to produce searchable PDF.
- *
- * If tessedit_page_number is non-negative, will only process that
- * single page. Works for multi-page tiff file, or filelist.
- *
- * Returns true if successful, false on error.
- */
- bool ProcessPages(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
- // Does the real work of ProcessPages.
- bool ProcessPagesInternal(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
-
- /**
- * Turn a single image into symbolic text.
- *
- * The pix is the image processed. filename and page_index are
- * metadata used by side-effect processes, such as reading a box
- * file or formatting as hOCR.
- *
- * See ProcessPages for descriptions of other parameters.
- */
- bool ProcessPage(Pix *pix, int page_index, const char *filename,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer);
-
- /**
- * Get a reading-order iterator to the results of LayoutAnalysis and/or
- * Recognize. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- ResultIterator *GetIterator();
-
- /**
- * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- MutableIterator *GetMutableIterator();
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- */
- char *GetUTF8Text();
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * monitor can be used to
- * cancel the recognition
- * receive progress callbacks
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(int page_number);
-
- /**
- * Make a TSV-formatted string from the internal data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetTSVText(int page_number);
-
- /**
- * Make a box file for LSTM training from the internal data structures.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetLSTMBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a box file used in training.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a WordStr box file used in training.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetWordStrBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UNLV format Latin-1 with specific reject and suspect codes.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetUNLVText();
-
- /**
- * Detect the orientation of the input image and apparent script (alphabet).
- * orient_deg is the detected clockwise rotation of the input image in degrees
- * (0, 90, 180, 270)
- * orient_conf is the confidence (15.0 is reasonably confident)
- * script_name is an ASCII string, the name of the script, e.g. "Latin"
- * script_conf is confidence level in the script
- * Returns true on success and writes values to each parameter as an output
- */
- bool DetectOrientationScript(int *orient_deg, float *orient_conf,
- const char **script_name, float *script_conf);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- * page_number is a 0-based page index that will appear in the osd file.
- */
- char *GetOsdText(int page_number);
-
- /** Returns the (average) confidence value between 0 and 100. */
- int MeanTextConf();
- /**
- * Returns all word confidences (between 0 and 100) in an array, terminated
- * by -1. The calling function must delete [] after use.
- * The number of confidences should correspond to the number of space-
- * delimited words in GetUTF8Text.
- */
- int *AllWordConfidences();
-
-#ifndef DISABLED_LEGACY_ENGINE
- /**
- * Applies the given word to the adaptive classifier if possible.
- * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
- * tell the boundaries of the graphemes.
- * Assumes that SetImage/SetRectangle have been used to set the image
- * to the given word. The mode arg should be PSM_SINGLE_WORD or
- * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
- * The currently set PageSegMode is preserved.
- * Returns false if adaption was not possible for some reason.
- */
- bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
-#endif // ndef DISABLED_LEGACY_ENGINE
-
- /**
- * Free up recognition results and any stored image data, without actually
- * freeing any recognition data that would be time-consuming to reload.
- * Afterwards, you must call SetImage or TesseractRect before doing
- * any Recognize or Get* operation.
- */
- void Clear();
-
- /**
- * Close down tesseract and free up all memory. End() is equivalent to
- * destructing and reconstructing your TessBaseAPI.
- * Once End() has been used, none of the other API functions may be used
- * other than Init and anything declared above it in the class definition.
- */
- void End();
-
- /**
- * Clear any library-level memory caches.
- * There are a variety of expensive-to-load constant data structures (mostly
- * language dictionaries) that are cached globally -- surviving the Init()
- * and End() of individual TessBaseAPI's. This function allows the clearing
- * of these caches.
- **/
- static void ClearPersistentCache();
-
- /**
- * Check whether a word is valid according to Tesseract's language model
- * @return 0 if the word is invalid, non-zero if valid.
- * @warning temporary! This function will be removed from here and placed
- * in a separate API at some future time.
- */
- int IsValidWord(const char *word) const;
- // Returns true if utf8_character is defined in the UniCharset.
- bool IsValidCharacter(const char *utf8_character) const;
-
- bool GetTextDirection(int *out_offset, float *out_slope);
-
- /** Sets Dict::letter_is_okay_ function to point to the given function. */
- void SetDictFunc(DictFunc f);
-
- /** Sets Dict::probability_in_context_ function to point to the given
- * function.
- */
- void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
-
- /**
- * Estimates the Orientation And Script of the image.
- * @return true if the image was processed successfully.
- */
- bool DetectOS(OSResults *);
-
- /**
- * Return text orientation of each block as determined by an earlier run
- * of layout analysis.
- */
- void GetBlockTextOrientations(int **block_orientation,
- bool **vertical_writing);
-
- /** This method returns the string form of the specified unichar. */
- const char *GetUnichar(int unichar_id) const;
-
- /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
- const Dawg *GetDawg(int i) const;
-
- /** Return the number of dawgs loaded into tesseract_ object. */
- int NumDawgs() const;
-
- Tesseract *tesseract() const {
- return tesseract_;
- }
-
- OcrEngineMode oem() const {
- return last_oem_requested_;
- }
-
- void set_min_orientation_margin(double margin);
- /* @} */
-
-protected:
- /** Common code for setting the image. Returns true if Init has been called.
- */
- bool InternalSetImage();
-
- /**
- * Run the thresholder to make the thresholded image. If pix is not nullptr,
- * the source is thresholded to pix instead of the internal IMAGE.
- */
- virtual bool Threshold(Pix **pix);
-
- /**
- * Find lines from the image making the BLOCK_LIST.
- * @return 0 on success.
- */
- int FindLines();
-
- /** Delete the pageres and block list ready for a new page. */
- void ClearResults();
-
- /**
- * Return an LTR Result Iterator -- used only for training, as we really want
- * to ignore all BiDi smarts at that point.
- * delete once you're done with it.
- */
- LTRResultIterator *GetLTRIterator();
-
- /**
- * Return the length of the output text string, as UTF8, assuming
- * one newline per line and one per block, with a terminator,
- * and assuming a single character reject marker for each rejected character.
- * Also return the number of recognized blobs in blob_count.
- */
- int TextLength(int *blob_count) const;
-
- //// paragraphs.cpp ////////////////////////////////////////////////////
- void DetectParagraphs(bool after_text_recognition);
-
- const PAGE_RES *GetPageRes() const {
- return page_res_;
- }
-
-protected:
- Tesseract *tesseract_; ///< The underlying data object.
- Tesseract *osd_tesseract_; ///< For orientation & script detection.
- EquationDetect *equ_detect_; ///< The equation detector.
- FileReader reader_; ///< Reads files from any filesystem.
- ImageThresholder *thresholder_; ///< Image thresholding module.
- std::vector *paragraph_models_;
- BLOCK_LIST *block_list_; ///< The page layout.
- PAGE_RES *page_res_; ///< The page-level data.
- std::string input_file_; ///< Name used by training code.
- std::string output_file_; ///< Name used by debug code.
- std::string datapath_; ///< Current location of tessdata.
- std::string language_; ///< Last initialized language.
- OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
- bool recognition_done_; ///< page_res_ contains recognition data.
-
- /**
- * @defgroup ThresholderParams Thresholder Parameters
- * Parameters saved from the Thresholder. Needed to rebuild coordinates.
- */
- /* @{ */
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
- int image_width_;
- int image_height_;
- /* @} */
-
-private:
- // A list of image filenames gets special consideration
- bool ProcessPagesFileList(FILE *fp, std::string *buf,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
- // TIFF supports multipage so gets special consideration.
- bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
- const char *filename, const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
-}; // class TessBaseAPI.
-
-/** Escape a char string - remove &<>"' with HTML codes. */
-std::string HOcrEscape(const char *text);
-
-} // namespace tesseract
-
-#endif // TESSERACT_API_BASEAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/capi.h
deleted file mode 100644
index 40f4856a..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/capi.h
+++ /dev/null
@@ -1,484 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: capi.h
-// Description: C-API TessBaseAPI
-//
-// (C) Copyright 2012, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef API_CAPI_H_
-#define API_CAPI_H_
-
-#include "export.h"
-
-#ifdef __cplusplus
-# include
-# include
-# include
-# include
-# include
-#endif
-
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef BOOL
-# define BOOL int
-# define TRUE 1
-# define FALSE 0
-#endif
-
-#ifdef __cplusplus
-typedef tesseract::TessResultRenderer TessResultRenderer;
-typedef tesseract::TessBaseAPI TessBaseAPI;
-typedef tesseract::PageIterator TessPageIterator;
-typedef tesseract::ResultIterator TessResultIterator;
-typedef tesseract::MutableIterator TessMutableIterator;
-typedef tesseract::ChoiceIterator TessChoiceIterator;
-typedef tesseract::OcrEngineMode TessOcrEngineMode;
-typedef tesseract::PageSegMode TessPageSegMode;
-typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
-typedef tesseract::Orientation TessOrientation;
-typedef tesseract::ParagraphJustification TessParagraphJustification;
-typedef tesseract::WritingDirection TessWritingDirection;
-typedef tesseract::TextlineOrder TessTextlineOrder;
-typedef tesseract::PolyBlockType TessPolyBlockType;
-typedef tesseract::ETEXT_DESC ETEXT_DESC;
-#else
-typedef struct TessResultRenderer TessResultRenderer;
-typedef struct TessBaseAPI TessBaseAPI;
-typedef struct TessPageIterator TessPageIterator;
-typedef struct TessResultIterator TessResultIterator;
-typedef struct TessMutableIterator TessMutableIterator;
-typedef struct TessChoiceIterator TessChoiceIterator;
-typedef enum TessOcrEngineMode {
- OEM_TESSERACT_ONLY,
- OEM_LSTM_ONLY,
- OEM_TESSERACT_LSTM_COMBINED,
- OEM_DEFAULT
-} TessOcrEngineMode;
-typedef enum TessPageSegMode {
- PSM_OSD_ONLY,
- PSM_AUTO_OSD,
- PSM_AUTO_ONLY,
- PSM_AUTO,
- PSM_SINGLE_COLUMN,
- PSM_SINGLE_BLOCK_VERT_TEXT,
- PSM_SINGLE_BLOCK,
- PSM_SINGLE_LINE,
- PSM_SINGLE_WORD,
- PSM_CIRCLE_WORD,
- PSM_SINGLE_CHAR,
- PSM_SPARSE_TEXT,
- PSM_SPARSE_TEXT_OSD,
- PSM_RAW_LINE,
- PSM_COUNT
-} TessPageSegMode;
-typedef enum TessPageIteratorLevel {
- RIL_BLOCK,
- RIL_PARA,
- RIL_TEXTLINE,
- RIL_WORD,
- RIL_SYMBOL
-} TessPageIteratorLevel;
-typedef enum TessPolyBlockType {
- PT_UNKNOWN,
- PT_FLOWING_TEXT,
- PT_HEADING_TEXT,
- PT_PULLOUT_TEXT,
- PT_EQUATION,
- PT_INLINE_EQUATION,
- PT_TABLE,
- PT_VERTICAL_TEXT,
- PT_CAPTION_TEXT,
- PT_FLOWING_IMAGE,
- PT_HEADING_IMAGE,
- PT_PULLOUT_IMAGE,
- PT_HORZ_LINE,
- PT_VERT_LINE,
- PT_NOISE,
- PT_COUNT
-} TessPolyBlockType;
-typedef enum TessOrientation {
- ORIENTATION_PAGE_UP,
- ORIENTATION_PAGE_RIGHT,
- ORIENTATION_PAGE_DOWN,
- ORIENTATION_PAGE_LEFT
-} TessOrientation;
-typedef enum TessParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT
-} TessParagraphJustification;
-typedef enum TessWritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT,
- WRITING_DIRECTION_RIGHT_TO_LEFT,
- WRITING_DIRECTION_TOP_TO_BOTTOM
-} TessWritingDirection;
-typedef enum TessTextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT,
- TEXTLINE_ORDER_RIGHT_TO_LEFT,
- TEXTLINE_ORDER_TOP_TO_BOTTOM
-} TessTextlineOrder;
-typedef struct ETEXT_DESC ETEXT_DESC;
-#endif
-
-typedef bool (*TessCancelFunc)(void *cancel_this, int words);
-typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,
- int bottom);
-
-struct Pix;
-struct Boxa;
-struct Pixa;
-
-/* General free functions */
-
-TESS_API const char *TessVersion();
-TESS_API void TessDeleteText(const char *text);
-TESS_API void TessDeleteTextArray(char **arr);
-TESS_API void TessDeleteIntArray(const int *arr);
-
-/* Renderer API */
-TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,
- BOOL font_info);
-TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,
- const char *datadir,
- BOOL textonly);
-TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(
- const char *outputbase);
-
-TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
-TESS_API void TessResultRendererInsert(TessResultRenderer *renderer,
- TessResultRenderer *next);
-TESS_API TessResultRenderer *TessResultRendererNext(
- TessResultRenderer *renderer);
-TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,
- const char *title);
-TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,
- TessBaseAPI *api);
-TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);
-
-TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);
-TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
-TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);
-
-/* Base API */
-
-TESS_API TessBaseAPI *TessBaseAPICreate();
-TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);
-
-TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);
-
-TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
-TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
-TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
-TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);
-
-TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-
-TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,
- const char *name, int *value);
-TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,
- const char *name, BOOL *value);
-TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,
- const char *name, double *value);
-TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,
- const char *name);
-
-TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);
-TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,
- const char *filename);
-
-TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem,
- char **configs, int configs_size);
-TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem);
-TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,
- const char *language);
-
-TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API const char *TessBaseAPIGetInitLanguagesAsString(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(
- const TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,
- const char *filename);
-TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,
- const char *filename);
-
-TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,
- TessPageSegMode mode);
-TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIRect(TessBaseAPI *handle,
- const unsigned char *imagedata,
- int bytes_per_pixel, int bytes_per_line,
- int left, int top, int width, int height);
-
-TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetImage(TessBaseAPI *handle,
- const unsigned char *imagedata, int width,
- int height, int bytes_per_pixel,
- int bytes_per_line);
-TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);
-
-TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);
-
-TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,
- int width, int height);
-
-TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
-TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,
- BOOL raw_image, int raw_padding,
- struct Pixa **pixa,
- int **blockids, int **paraids);
-TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,
- struct Pixa **pixa, int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,
- struct Pixa **cc);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
- TessPageIteratorLevel level,
- BOOL text_only,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages1(
- TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,
- BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,
- int **paraids);
-
-TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
- const TessBaseAPI *handle);
-
-TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);
-
-TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,
- int page_index, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-
-TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
-TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(
- TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);
-TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,
- int page_number);
-
-TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
-TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
-
-TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,
- TessPageSegMode mode,
- const char *wordstr);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
-TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
-TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,
- float *out_slope);
-
-TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);
-
-TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-// Call TessDeleteText(*best_script_name) to free memory allocated by this
-// function
-TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,
- int *orient_deg,
- float *orient_conf,
- const char **script_name,
- float *script_conf);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,
- double margin);
-
-TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);
-
-TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);
-
-TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,
- int **block_orientation,
- bool **vertical_writing);
-
-/* Page iterator */
-
-TESS_API void TessPageIteratorDelete(TessPageIterator *handle);
-
-TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);
-
-TESS_API void TessPageIteratorBegin(TessPageIterator *handle);
-
-TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- TessPageIteratorLevel element);
-
-TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int *left, int *top, int *right,
- int *bottom);
-
-TESS_API TessPolyBlockType
-TessPageIteratorBlockType(const TessPageIterator *handle);
-
-TESS_API struct Pix *TessPageIteratorGetBinaryImage(
- const TessPageIterator *handle, TessPageIteratorLevel level);
-
-TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int padding,
- struct Pix *original_image,
- int *left, int *top);
-
-TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,
- TessPageIteratorLevel level, int *x1,
- int *y1, int *x2, int *y2);
-
-TESS_API void TessPageIteratorOrientation(
- TessPageIterator *handle, TessOrientation *orientation,
- TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,
- float *deskew_angle);
-
-TESS_API void TessPageIteratorParagraphInfo(
- TessPageIterator *handle, TessParagraphJustification *justification,
- BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
-
-/* Result iterator */
-
-TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
-TESS_API TessResultIterator *TessResultIteratorCopy(
- const TessResultIterator *handle);
-TESS_API TessPageIterator *TessResultIteratorGetPageIterator(
- TessResultIterator *handle);
-TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
- const TessResultIterator *handle);
-TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(
- const TessResultIterator *handle);
-
-TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API const char *TessResultIteratorWordRecognitionLanguage(
- const TessResultIterator *handle);
-TESS_API const char *TessResultIteratorWordFontAttributes(
- const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic,
- BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,
- int *pointsize, int *font_id);
-
-TESS_API BOOL
-TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
-TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);
-
-TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);
-TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
-TESS_API const char *TessChoiceIteratorGetUTF8Text(
- const TessChoiceIterator *handle);
-TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);
-
-/* Progress monitor */
-
-TESS_API ETEXT_DESC *TessMonitorCreate();
-TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,
- TessCancelFunc cancelFunc);
-TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
-TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,
- TessProgressFunc progressFunc);
-TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // API_CAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/export.h
deleted file mode 100644
index d238b628..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/export.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: export.h
-// Description: Place holder
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_PLATFORM_H_
-#define TESSERACT_PLATFORM_H_
-
-#ifndef TESS_API
-# if defined(_WIN32) || defined(__CYGWIN__)
-# if defined(TESS_EXPORTS)
-# define TESS_API __declspec(dllexport)
-# elif defined(TESS_IMPORTS)
-# define TESS_API __declspec(dllimport)
-# else
-# define TESS_API
-# endif
-# else
-# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
-# define TESS_API __attribute__((visibility("default")))
-# else
-# define TESS_API
-# endif
-# endif
-#endif
-
-#endif // TESSERACT_PLATFORM_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ltrresultiterator.h
deleted file mode 100644
index 6ca0a98e..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ltrresultiterator.h
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: ltrresultiterator.h
-// Description: Iterator for tesseract results in strict left-to-right
-// order that avoids using tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API
-#include "pageiterator.h" // for PageIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-namespace tesseract {
-
-class BLOB_CHOICE_IT;
-class PAGE_RES;
-class WERD_RES;
-
-class Tesseract;
-
-// Class to iterate over tesseract results, providing access to all levels
-// of the page hierarchy, without including any tesseract headers or having
-// to handle any tesseract structures.
-// WARNING! This class points to data held within the TessBaseAPI class, and
-// therefore can only be used while the TessBaseAPI class still exists and
-// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
-// DetectOS, or anything else that changes the internal PAGE_RES.
-// See tesseract/publictypes.h for the definition of PageIteratorLevel.
-// See also base class PageIterator, which contains the bulk of the interface.
-// LTRResultIterator adds text-specific methods for access to OCR output.
-
-class TESS_API LTRResultIterator : public PageIterator {
- friend class ChoiceIterator;
-
-public:
- // page_res and tesseract come directly from the BaseAPI.
- // The rectangle parameters are copied indirectly from the Thresholder,
- // via the BaseAPI. They represent the coordinates of some rectangle in an
- // original image (in top-left-origin coordinates) and therefore the top-left
- // needs to be added to any output boxes in order to specify coordinates
- // in the original image. See TessBaseAPI::SetRectangle.
- // The scale and scaled_yres are in case the Thresholder scaled the image
- // rectangle prior to thresholding. Any coordinates in tesseract's image
- // must be divided by scale before adding (rect_left, rect_top).
- // The scaled_yres indicates the effective resolution of the binary image
- // that tesseract has been given by the Thresholder.
- // After the constructor, Begin has already been called.
- LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top,
- int rect_width, int rect_height);
-
- ~LTRResultIterator() override;
-
- // LTRResultIterators may be copied! This makes it possible to iterate over
- // all the objects at a lower level, while maintaining an iterator to
- // objects at a higher level. These constructors DO NOT CALL Begin, so
- // iterations will continue from the location of src.
- // TODO: For now the copy constructor and operator= only need the base class
- // versions, but if new data members are added, don't forget to add them!
-
- // ============= Moving around within the page ============.
-
- // See PageIterator.
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // object at the given level. Use delete [] to free after use.
- char *GetUTF8Text(PageIteratorLevel level) const;
-
- // Set the string inserted at the end of each text line. "\n" by default.
- void SetLineSeparator(const char *new_line);
-
- // Set the string inserted at the end of each paragraph. "\n" by default.
- void SetParagraphSeparator(const char *new_para);
-
- // Returns the mean confidence of the current object at the given level.
- // The number should be interpreted as a percent probability. (0.0f-100.0f)
- float Confidence(PageIteratorLevel level) const;
-
- // ============= Functions that refer to words only ============.
-
- // Returns the font attributes of the current word. If iterating at a higher
- // level object than words, eg textlines, then this will return the
- // attributes of the first word in that textline.
- // The actual return value is a string representing a font name. It points
- // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
- // the iterator itself, ie rendered invalid by various members of
- // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
- // Pointsize is returned in printers points (1/72 inch.)
- const char *WordFontAttributes(bool *is_bold, bool *is_italic,
- bool *is_underlined, bool *is_monospace,
- bool *is_serif, bool *is_smallcaps,
- int *pointsize, int *font_id) const;
-
- // Return the name of the language used to recognize this word.
- // On error, nullptr. Do not delete this pointer.
- const char *WordRecognitionLanguage() const;
-
- // Return the overall directionality of this word.
- StrongScriptDirection WordDirection() const;
-
- // Returns true if the current word was found in a dictionary.
- bool WordIsFromDictionary() const;
-
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // Returns true if the current word is numeric.
- bool WordIsNumeric() const;
-
- // Returns true if the word contains blamer information.
- bool HasBlamerInfo() const;
-
- // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
- // of the current word.
- const void *GetParamsTrainingBundle() const;
-
- // Returns a pointer to the string with blamer information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerDebug() const;
-
- // Returns a pointer to the string with misadaption information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerMisadaptionDebug() const;
-
- // Returns true if a truth string was recorded for the current word.
- bool HasTruthString() const;
-
- // Returns true if the given string is equivalent to the truth string for
- // the current word.
- bool EquivalentToTruth(const char *str) const;
-
- // Returns a null terminated UTF-8 encoded truth string for the current word.
- // Use delete [] to free after use.
- char *WordTruthUTF8Text() const;
-
- // Returns a null terminated UTF-8 encoded normalized OCR string for the
- // current word. Use delete [] to free after use.
- char *WordNormedUTF8Text() const;
-
- // Returns a pointer to serialized choice lattice.
- // Fills lattice_size with the number of bytes in lattice data.
- const char *WordLattice(int *lattice_size) const;
-
- // ============= Functions that refer to symbols only ============.
-
- // Returns true if the current symbol is a superscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSuperscript() const;
- // Returns true if the current symbol is a subscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSubscript() const;
- // Returns true if the current symbol is a dropcap.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsDropcap() const;
-
-protected:
- const char *line_separator_;
- const char *paragraph_separator_;
-};
-
-// Class to iterate over the classifier choices for a single RIL_SYMBOL.
-class TESS_API ChoiceIterator {
-public:
- // Construction is from a LTRResultIterator that points to the symbol of
- // interest. The ChoiceIterator allows a one-shot iteration over the
- // choices for this symbol and after that it is useless.
- explicit ChoiceIterator(const LTRResultIterator &result_it);
- ~ChoiceIterator();
-
- // Moves to the next choice for the symbol and returns false if there
- // are none left.
- bool Next();
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // choice.
- // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
- // internal structure and should NOT be delete[]ed to free after use.
- const char *GetUTF8Text() const;
-
- // Returns the confidence of the current choice depending on the used language
- // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
- // choices for one symbol should roughly add up to 1.0f.
- // If only traineddata of the legacy engine is used, the number should be
- // interpreted as a percent probability. (0.0f-100.0f) In this case
- // probabilities won't add up to 100. Each one stands on its own.
- float Confidence() const;
-
- // Returns a vector containing all timesteps, which belong to the currently
- // selected symbol. A timestep is a vector containing pairs of symbols and
- // floating point numbers. The number states the probability for the
- // corresponding symbol.
- std::vector>> *Timesteps() const;
-
-private:
- // clears the remaining spaces out of the results and adapt the probabilities
- void filterSpaces();
- // Pointer to the WERD_RES object owned by the API.
- WERD_RES *word_res_;
- // Iterator over the blob choices.
- BLOB_CHOICE_IT *choice_it_;
- std::vector> *LSTM_choices_ = nullptr;
- std::vector>::iterator LSTM_choice_it_;
-
- const int *tstep_index_;
- // regulates the rating granularity
- double rating_coefficient_;
- // leading blanks
- int blanks_before_word_;
- // true when there is lstm engine related trained data
- bool oemLSTM_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ocrclass.h
deleted file mode 100644
index a55e6528..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/ocrclass.h
+++ /dev/null
@@ -1,158 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-/**********************************************************************
- * File: ocrclass.h
- * Description: Class definitions and constants for the OCR API.
- * Author: Hewlett-Packard Co
- *
- * (C) Copyright 1996, Hewlett-Packard Co.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-/**********************************************************************
- * This file contains typedefs for all the structures used by
- * the HP OCR interface.
- * The structures are designed to allow them to be used with any
- * structure alignment up to 8.
- **********************************************************************/
-
-#ifndef CCUTIL_OCRCLASS_H_
-#define CCUTIL_OCRCLASS_H_
-
-#include
-#include
-
-namespace tesseract {
-
-/**********************************************************************
- * EANYCODE_CHAR
- * Description of a single character. The character code is defined by
- * the character set of the current font.
- * Output text is sent as an array of these structures.
- * Spaces and line endings in the output are represented in the
- * structures of the surrounding characters. They are not directly
- * represented as characters.
- * The first character in a word has a positive value of blanks.
- * Missing information should be set to the defaults in the comments.
- * If word bounds are known, but not character bounds, then the top and
- * bottom of each character should be those of the word. The left of the
- * first and right of the last char in each word should be set. All other
- * lefts and rights should be set to -1.
- * If set, the values of right and bottom are left+width and top+height.
- * Most of the members come directly from the parameters to ocr_append_char.
- * The formatting member uses the enhancement parameter and combines the
- * line direction stuff into the top 3 bits.
- * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
- * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
- * the coding is, only that it is backwards compatible with the previous
- * version.
- **********************************************************************/
-
-struct EANYCODE_CHAR { /*single character */
- // It should be noted that the format for char_code for version 2.0 and beyond
- // is UTF8 which means that ASCII characters will come out as one structure
- // but other characters will be returned in two or more instances of this
- // structure with a single byte of the UTF8 code in each, but each will have
- // the same bounding box. Programs which want to handle languagues with
- // different characters sets will need to handle extended characters
- // appropriately, but *all* code needs to be prepared to receive UTF8 coded
- // characters for characters such as bullet and fancy quotes.
- uint16_t char_code; /*character itself */
- int16_t left; /*of char (-1) */
- int16_t right; /*of char (-1) */
- int16_t top; /*of char (-1) */
- int16_t bottom; /*of char (-1) */
- int16_t font_index; /*what font (0) */
- uint8_t confidence; /*0=perfect, 100=reject (0/100) */
- uint8_t point_size; /*of char, 72=i inch, (10) */
- int8_t blanks; /*no of spaces before this char (1) */
- uint8_t formatting; /*char formatting (0) */
-};
-
-/**********************************************************************
- * ETEXT_DESC
- * Description of the output of the OCR engine.
- * This structure is used as both a progress monitor and the final
- * output header, since it needs to be a valid progress monitor while
- * the OCR engine is storing its output to shared memory.
- * During progress, all the buffer info is -1.
- * Progress starts at 0 and increases to 100 during OCR. No other constraint.
- * Additionally the progress callback contains the bounding box of the word that
- * is currently being processed.
- * Every progress callback, the OCR engine must set ocr_alive to 1.
- * The HP side will set ocr_alive to 0. Repeated failure to reset
- * to 1 indicates that the OCR engine is dead.
- * If the cancel function is not null then it is called with the number of
- * user words found. If it returns true then operation is cancelled.
- **********************************************************************/
-class ETEXT_DESC;
-
-using CANCEL_FUNC = bool (*)(void *, int);
-using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
-using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);
-
-class ETEXT_DESC { // output header
-public:
- int16_t count{0}; /// chars in this buffer(0)
- int16_t progress{0}; /// percent complete increasing (0-100)
- /** Progress monitor covers word recognition and it does not cover layout
- * analysis.
- * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */
- int8_t more_to_come{0}; /// true if not last
- volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0
- int8_t err_code{0}; /// for errcode use
- CANCEL_FUNC cancel{nullptr}; /// returns true to cancel
- PROGRESS_FUNC progress_callback{
- nullptr}; /// called whenever progress increases
- PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback
- void *cancel_this{nullptr}; /// this or other data for cancel
- std::chrono::steady_clock::time_point end_time;
- /// Time to stop. Expected to be set only
- /// by call to set_deadline_msecs().
- EANYCODE_CHAR text[1]{}; /// character data
-
- ETEXT_DESC() : progress_callback2(&default_progress_func) {
- end_time = std::chrono::time_point();
- }
-
- // Sets the end time to be deadline_msecs milliseconds from now.
- void set_deadline_msecs(int32_t deadline_msecs) {
- if (deadline_msecs > 0) {
- end_time = std::chrono::steady_clock::now() +
- std::chrono::milliseconds(deadline_msecs);
- }
- }
-
- // Returns false if we've not passed the end_time, or have not set a deadline.
- bool deadline_exceeded() const {
- if (end_time.time_since_epoch() ==
- std::chrono::steady_clock::duration::zero()) {
- return false;
- }
- auto now = std::chrono::steady_clock::now();
- return (now > end_time);
- }
-
-private:
- static bool default_progress_func(ETEXT_DESC *ths, int left, int right,
- int top, int bottom) {
- if (ths->progress_callback != nullptr) {
- return (*(ths->progress_callback))(ths->progress, left, right, top,
- bottom);
- }
- return true;
- }
-};
-
-} // namespace tesseract
-
-#endif // CCUTIL_OCRCLASS_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/osdetect.h
deleted file mode 100644
index 34bfb557..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/osdetect.h
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: osdetect.h
-// Description: Orientation and script detection.
-// Author: Samuel Charron
-// Ranjith Unnikrishnan
-//
-// (C) Copyright 2008, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_OSDETECT_H_
-#define TESSERACT_CCMAIN_OSDETECT_H_
-
-#include "export.h" // for TESS_API
-
-#include // for std::vector
-
-namespace tesseract {
-
-class BLOBNBOX;
-class BLOBNBOX_CLIST;
-class BLOB_CHOICE_LIST;
-class TO_BLOCK_LIST;
-class UNICHARSET;
-
-class Tesseract;
-
-// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur
-const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;
-
-struct OSBestResult {
- OSBestResult()
- : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}
- int orientation_id;
- int script_id;
- float sconfidence;
- float oconfidence;
-};
-
-struct OSResults {
- OSResults() : unicharset(nullptr) {
- for (int i = 0; i < 4; ++i) {
- for (int j = 0; j < kMaxNumberOfScripts; ++j) {
- scripts_na[i][j] = 0;
- }
- orientations[i] = 0;
- }
- }
- void update_best_orientation();
- // Set the estimate of the orientation to the given id.
- void set_best_orientation(int orientation_id);
- // Update/Compute the best estimate of the script assuming the given
- // orientation id.
- void update_best_script(int orientation_id);
- // Return the index of the script with the highest score for this orientation.
- TESS_API int get_best_script(int orientation_id) const;
- // Accumulate scores with given OSResults instance and update the best script.
- void accumulate(const OSResults &osr);
-
- // Print statistics.
- void print_scores(void) const;
- void print_scores(int orientation_id) const;
-
- // Array holding scores for each orientation id [0,3].
- // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
- // page respectively, where the values refer to the amount of clockwise
- // rotation to be applied to the page for the text to be upright and readable.
- float orientations[4];
- // Script confidence scores for each of 4 possible orientations.
- float scripts_na[4][kMaxNumberOfScripts];
-
- UNICHARSET *unicharset;
- OSBestResult best_result;
-};
-
-class OrientationDetector {
-public:
- OrientationDetector(const std::vector *allowed_scripts,
- OSResults *results);
- bool detect_blob(BLOB_CHOICE_LIST *scores);
- int get_orientation();
-
-private:
- OSResults *osr_;
- const std::vector *allowed_scripts_;
-};
-
-class ScriptDetector {
-public:
- ScriptDetector(const std::vector *allowed_scripts, OSResults *osr,
- tesseract::Tesseract *tess);
- void detect_blob(BLOB_CHOICE_LIST *scores);
- bool must_stop(int orientation) const;
-
-private:
- OSResults *osr_;
- static const char *korean_script_;
- static const char *japanese_script_;
- static const char *fraktur_script_;
- int korean_id_;
- int japanese_id_;
- int katakana_id_;
- int hiragana_id_;
- int han_id_;
- int hangul_id_;
- int latin_id_;
- int fraktur_id_;
- tesseract::Tesseract *tess_;
- const std::vector *allowed_scripts_;
-};
-
-int orientation_and_script_detection(const char *filename, OSResults *,
- tesseract::Tesseract *);
-
-int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,
- tesseract::Tesseract *tess);
-
-int os_detect_blobs(const std::vector *allowed_scripts,
- BLOBNBOX_CLIST *blob_list, OSResults *osr,
- tesseract::Tesseract *tess);
-
-bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,
- OSResults *, tesseract::Tesseract *tess);
-
-// Helper method to convert an orientation index to its value in degrees.
-// The value represents the amount of clockwise rotation in degrees that must be
-// applied for the text to be upright (readable).
-TESS_API int OrientationIdToValue(const int &id);
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCMAIN_OSDETECT_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/pageiterator.h
deleted file mode 100644
index 68739715..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/pageiterator.h
+++ /dev/null
@@ -1,364 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: pageiterator.h
-// Description: Iterator for tesseract page structure that avoids using
-// tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
-#define TESSERACT_CCMAIN_PAGEITERATOR_H_
-
-#include "export.h"
-#include "publictypes.h"
-
-struct Pix;
-struct Pta;
-
-namespace tesseract {
-
-struct BlamerBundle;
-class C_BLOB_IT;
-class PAGE_RES;
-class PAGE_RES_IT;
-class WERD;
-
-class Tesseract;
-
-/**
- * Class to iterate over tesseract page structure, providing access to all
- * levels of the page hierarchy, without including any tesseract headers or
- * having to handle any tesseract structures.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- * See tesseract/publictypes.h for the definition of PageIteratorLevel.
- * See also ResultIterator, derived from PageIterator, which adds in the
- * ability to access OCR output with text-specific methods.
- */
-
-class TESS_API PageIterator {
-public:
- /**
- * page_res and tesseract come directly from the BaseAPI.
- * The rectangle parameters are copied indirectly from the Thresholder,
- * via the BaseAPI. They represent the coordinates of some rectangle in an
- * original image (in top-left-origin coordinates) and therefore the top-left
- * needs to be added to any output boxes in order to specify coordinates
- * in the original image. See TessBaseAPI::SetRectangle.
- * The scale and scaled_yres are in case the Thresholder scaled the image
- * rectangle prior to thresholding. Any coordinates in tesseract's image
- * must be divided by scale before adding (rect_left, rect_top).
- * The scaled_yres indicates the effective resolution of the binary image
- * that tesseract has been given by the Thresholder.
- * After the constructor, Begin has already been called.
- */
- PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top, int rect_width,
- int rect_height);
- virtual ~PageIterator();
-
- /**
- * Page/ResultIterators may be copied! This makes it possible to iterate over
- * all the objects at a lower level, while maintaining an iterator to
- * objects at a higher level. These constructors DO NOT CALL Begin, so
- * iterations will continue from the location of src.
- */
- PageIterator(const PageIterator &src);
- const PageIterator &operator=(const PageIterator &src);
-
- /** Are we positioned at the same location as other? */
- bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
-
- // ============= Moving around within the page ============.
-
- /**
- * Moves the iterator to point to the start of the page to begin an
- * iteration.
- */
- virtual void Begin();
-
- /**
- * Moves the iterator to the beginning of the paragraph.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word on the first row of the paragraph.
- */
- virtual void RestartParagraph();
-
- /**
- * Return whether this iterator points anywhere in the first textline of a
- * paragraph.
- */
- bool IsWithinFirstTextlineOfParagraph() const;
-
- /**
- * Moves the iterator to the beginning of the text line.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word of the row.
- */
- virtual void RestartRow();
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy, and returns false if the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- virtual bool Next(PageIteratorLevel level);
-
- /**
- * Returns true if the iterator is at the start of an object at the given
- * level.
- *
- * For instance, suppose an iterator it is pointed to the first symbol of the
- * first word of the third line of the second paragraph of the first block in
- * a page, then:
- * it.IsAtBeginningOf(RIL_BLOCK) = false
- * it.IsAtBeginningOf(RIL_PARA) = false
- * it.IsAtBeginningOf(RIL_TEXTLINE) = true
- * it.IsAtBeginningOf(RIL_WORD) = true
- * it.IsAtBeginningOf(RIL_SYMBOL) = true
- */
- virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
-
- /**
- * Returns whether the iterator is positioned at the last element in a
- * given level. (e.g. the last word in a line, the last line in a block)
- *
- * Here's some two-paragraph example
- * text. It starts off innocuously
- * enough but quickly turns bizarre.
- * The author inserts a cornucopia
- * of words to guard against confused
- * references.
- *
- * Now take an iterator it pointed to the start of "bizarre."
- * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
- * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
- * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
- */
- virtual bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const;
-
- /**
- * Returns whether this iterator is positioned
- * before other: -1
- * equal to other: 0
- * after other: 1
- */
- int Cmp(const PageIterator &other) const;
-
- // ============= Accessing data ==============.
- // Coordinate system:
- // Integer coordinates are at the cracks between the pixels.
- // The top-left corner of the top-left pixel in the image is at (0,0).
- // The bottom-right corner of the bottom-right pixel in the image is at
- // (width, height).
- // Every bounding box goes from the top-left of the top-left contained
- // pixel to the bottom-right of the bottom-right contained pixel, so
- // the bounding box of the single top-left pixel in the image is:
- // (0,0)->(1,1).
- // If an image rectangle has been set in the API, then returned coordinates
- // relate to the original (full) image, rather than the rectangle.
-
- /**
- * Controls what to include in a bounding box. Bounding boxes of all levels
- * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
- * Between layout analysis and recognition, it isn't known where all
- * diacritics belong, so this control is used to include or exclude some
- * diacritics that are above or below the main body of the word. In most cases
- * where the placement is obvious, and after recognition, it doesn't make as
- * much difference, as the diacritics will already be included in the word.
- */
- void SetBoundingBoxComponents(bool include_upper_dots,
- bool include_lower_dots) {
- include_upper_dots_ = include_upper_dots;
- include_lower_dots_ = include_lower_dots;
- }
-
- /**
- * Returns the bounding rectangle of the current object at the given level.
- * See comment on coordinate system above.
- * Returns false if there is no such object at the current position.
- * The returned bounding box is guaranteed to match the size and position
- * of the image returned by GetBinaryImage, but may clip foreground pixels
- * from a grey image. The padding argument to GetImage can be used to expand
- * the image to include more foreground pixels. See GetImage below.
- */
- bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
- int *bottom) const;
- bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
- int *right, int *bottom) const;
- /**
- * Returns the bounding rectangle of the object in a coordinate system of the
- * working image rectangle having its origin at (rect_left_, rect_top_) with
- * respect to the original image and is scaled by a factor scale_.
- */
- bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
- int *right, int *bottom) const;
-
- /** Returns whether there is no object of a given level. */
- bool Empty(PageIteratorLevel level) const;
-
- /**
- * Returns the type of the current block.
- * See tesseract/publictypes.h for PolyBlockType.
- */
- PolyBlockType BlockType() const;
-
- /**
- * Returns the polygon outline of the current block. The returned Pta must
- * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
- * of the polygon, and the last edge is the line segment between the last
- * point and the first point. nullptr will be returned if the iterator is
- * at the end of the document or layout analysis was not used.
- */
- Pta *BlockPolygon() const;
-
- /**
- * Returns a binary image of the current object at the given level.
- * The position and size match the return from BoundingBoxInternal, and so
- * this could be upscaled with respect to the original input image.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetBinaryImage(PageIteratorLevel level) const;
-
- /**
- * Returns an image of the current object at the given level in greyscale
- * if available in the input. To guarantee a binary image use BinaryImage.
- * NOTE that in order to give the best possible image, the bounds are
- * expanded slightly over the binary connected component, by the supplied
- * padding, so the top-left position of the returned image is returned
- * in (left,top). These will most likely not match the coordinates
- * returned by BoundingBox.
- * If you do not supply an original image, you will get a binary one.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
- int *left, int *top) const;
-
- /**
- * Returns the baseline of the current object at the given level.
- * The baseline is the line that passes through (x1, y1) and (x2, y2).
- * WARNING: with vertical text, baselines may be vertical!
- * Returns false if there is no baseline at the current position.
- */
- bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
- int *y2) const;
-
- // Returns the attributes of the current row.
- void RowAttributes(float *row_height, float *descenders,
- float *ascenders) const;
-
- /**
- * Returns orientation for the block the iterator points to.
- * orientation, writing_direction, textline_order: see publictypes.h
- * deskew_angle: after rotating the block so the text orientation is
- * upright, how many radians does one have to rotate the
- * block anti-clockwise for it to be level?
- * -Pi/4 <= deskew_angle <= Pi/4
- */
- void Orientation(tesseract::Orientation *orientation,
- tesseract::WritingDirection *writing_direction,
- tesseract::TextlineOrder *textline_order,
- float *deskew_angle) const;
-
- /**
- * Returns information about the current paragraph, if available.
- *
- * justification -
- * LEFT if ragged right, or fully justified and script is left-to-right.
- * RIGHT if ragged left, or fully justified and script is right-to-left.
- * unknown if it looks like source code or we have very few lines.
- * is_list_item -
- * true if we believe this is a member of an ordered or unordered list.
- * is_crown -
- * true if the first line of the paragraph is aligned with the other
- * lines of the paragraph even though subsequent paragraphs have first
- * line indents. This typically indicates that this is the continuation
- * of a previous paragraph or that it is the very first paragraph in
- * the chapter.
- * first_line_indent -
- * For LEFT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the left edge of the
- * rest of the paragraph.
- * for RIGHT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the right edge of the
- * rest of the paragraph.
- * NOTE 1: This value may be negative.
- * NOTE 2: if *is_crown == true, the first line of this paragraph is
- * actually flush, and first_line_indent is set to the "common"
- * first_line_indent for subsequent paragraphs in this block
- * of text.
- */
- void ParagraphInfo(tesseract::ParagraphJustification *justification,
- bool *is_list_item, bool *is_crown,
- int *first_line_indent) const;
-
- // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
- // of the current word to the given pointer (takes ownership of the pointer)
- // and returns true.
- // Can only be used when iterating on the word level.
- bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
-
-protected:
- /**
- * Sets up the internal data for iterating the blobs of a new word, then
- * moves the iterator to the given offset.
- */
- void BeginWord(int offset);
-
- /** Pointer to the page_res owned by the API. */
- PAGE_RES *page_res_;
- /** Pointer to the Tesseract object owned by the API. */
- Tesseract *tesseract_;
- /**
- * The iterator to the page_res_. Owned by this ResultIterator.
- * A pointer just to avoid dragging in Tesseract includes.
- */
- PAGE_RES_IT *it_;
- /**
- * The current input WERD being iterated. If there is an output from OCR,
- * then word_ is nullptr. Owned by the API
- */
- WERD *word_;
- /** The length of the current word_. */
- int word_length_;
- /** The current blob index within the word. */
- int blob_index_;
- /**
- * Iterator to the blobs within the word. If nullptr, then we are iterating
- * OCR results in the box_word.
- * Owned by this ResultIterator.
- */
- C_BLOB_IT *cblob_it_;
- /** Control over what to include in bounding boxes. */
- bool include_upper_dots_;
- bool include_lower_dots_;
- /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
- int scale_;
- int scaled_yres_;
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/publictypes.h
deleted file mode 100644
index 0069cf28..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/publictypes.h
+++ /dev/null
@@ -1,281 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: publictypes.h
-// Description: Types used in both the API and internally
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-
-namespace tesseract {
-
-// This file contains types that are used both by the API and internally
-// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
-// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
-// Restated: It is OK for low-level Tesseract files to include publictypes.h,
-// but not for the low-level tesseract code to include top-level API code.
-// This file should not use other Tesseract types, as that would drag
-// their includes into the API-level.
-
-/** Number of printers' points in an inch. The unit of the pointsize return. */
-constexpr int kPointsPerInch = 72;
-/**
- * Minimum believable resolution. Used as a default if there is no other
- * information, as it is safer to under-estimate than over-estimate.
- */
-constexpr int kMinCredibleResolution = 70;
-/** Maximum believable resolution. */
-constexpr int kMaxCredibleResolution = 2400;
-/**
- * Ratio between median blob size and likely resolution. Used to estimate
- * resolution when none is provided. This is basically 1/usual text size in
- * inches. */
-constexpr int kResolutionEstimationFactor = 10;
-
-/**
- * Possible types for a POLY_BLOCK or ColPartition.
- * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions
- * below, as well as kPolyBlockNames in layout_test.cc.
- * Used extensively by ColPartition, and POLY_BLOCK.
- */
-enum PolyBlockType {
- PT_UNKNOWN, // Type is not yet known. Keep as the first element.
- PT_FLOWING_TEXT, // Text that lives inside a column.
- PT_HEADING_TEXT, // Text that spans more than one column.
- PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region.
- PT_EQUATION, // Partition belonging to an equation region.
- PT_INLINE_EQUATION, // Partition has inline equation.
- PT_TABLE, // Partition belonging to a table region.
- PT_VERTICAL_TEXT, // Text-line runs vertically.
- PT_CAPTION_TEXT, // Text that belongs to an image.
- PT_FLOWING_IMAGE, // Image that lives inside a column.
- PT_HEADING_IMAGE, // Image that spans more than one column.
- PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region.
- PT_HORZ_LINE, // Horizontal Line.
- PT_VERT_LINE, // Vertical Line.
- PT_NOISE, // Lies outside of any column.
- PT_COUNT
-};
-
-/** Returns true if PolyBlockType is of horizontal line type */
-inline bool PTIsLineType(PolyBlockType type) {
- return type == PT_HORZ_LINE || type == PT_VERT_LINE;
-}
-/** Returns true if PolyBlockType is of image type */
-inline bool PTIsImageType(PolyBlockType type) {
- return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE ||
- type == PT_PULLOUT_IMAGE;
-}
-/** Returns true if PolyBlockType is of text type */
-inline bool PTIsTextType(PolyBlockType type) {
- return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||
- type == PT_PULLOUT_TEXT || type == PT_TABLE ||
- type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
- type == PT_INLINE_EQUATION;
-}
-// Returns true if PolyBlockType is of pullout(inter-column) type
-inline bool PTIsPulloutType(PolyBlockType type) {
- return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT;
-}
-
-/**
- * +------------------+ Orientation Example:
- * | 1 Aaaa Aaaa Aaaa | ====================
- * | Aaa aa aaa aa | To left is a diagram of some (1) English and
- * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit.
- * | 2 |
- * | ####### c c C | Upright Latin characters are represented as A and a.
- * | ####### c c c | '<' represents a latin character rotated
- * | < ####### c c c | anti-clockwise 90 degrees.
- * | < ####### c c |
- * | < ####### . c | Upright Chinese characters are represented C and c.
- * | 3 ####### c |
- * +------------------+ NOTA BENE: enum values here should match goodoc.proto
-
- * If you orient your head so that "up" aligns with Orientation,
- * then the characters will appear "right side up" and readable.
- *
- * In the example above, both the English and Chinese paragraphs are oriented
- * so their "up" is the top of the page (page up). The photo credit is read
- * with one's head turned leftward ("up" is to page left).
- *
- * The values of this enum match the convention of Tesseract's osdetect.h
-*/
-enum Orientation {
- ORIENTATION_PAGE_UP = 0,
- ORIENTATION_PAGE_RIGHT = 1,
- ORIENTATION_PAGE_DOWN = 2,
- ORIENTATION_PAGE_LEFT = 3,
-};
-
-/**
- * The grapheme clusters within a line of text are laid out logically
- * in this direction, judged when looking at the text line rotated so that
- * its Orientation is "page up".
- *
- * For English text, the writing direction is left-to-right. For the
- * Chinese text in the above example, the writing direction is top-to-bottom.
- */
-enum WritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
- WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
- WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * The text lines are read in the given sequence.
- *
- * In English, the order is top-to-bottom.
- * In Chinese, vertical text lines are read right-to-left. Mongolian is
- * written in vertical columns top to bottom like Chinese, but the lines
- * order left-to right.
- *
- * Note that only some combinations make sense. For example,
- * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
- */
-enum TextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
- TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
- TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * Possible modes for page layout analysis. These *must* be kept in order
- * of decreasing amount of layout analysis to be done, except for OSD_ONLY,
- * so that the inequality test macros below work.
- */
-enum PageSegMode {
- PSM_OSD_ONLY = 0, ///< Orientation and script detection only.
- PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and
- ///< script detection. (OSD)
- PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR.
- PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD.
- PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes.
- PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of
- ///< vertically aligned text.
- PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.)
- PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line.
- PSM_SINGLE_WORD = 8, ///< Treat the image as a single word.
- PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle.
- PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character.
- PSM_SPARSE_TEXT =
- 11, ///< Find as much text as possible in no particular order.
- PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det.
- PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing
- ///< hacks that are Tesseract-specific.
-
- PSM_COUNT ///< Number of enum entries.
-};
-
-/**
- * Inline functions that act on a PageSegMode to determine whether components of
- * layout analysis are enabled.
- * *Depend critically on the order of elements of PageSegMode.*
- * NOTE that arg is an int for compatibility with INT_PARAM.
- */
-inline bool PSM_OSD_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
-}
-inline bool PSM_SPARSE(int pageseg_mode) {
- return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
-}
-inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
-}
-inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
- return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
- pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-
-/**
- * enum of the elements of the page hierarchy, used in ResultIterator
- * to provide functions that operate on each level without having to
- * have 5x as many functions.
- */
-enum PageIteratorLevel {
- RIL_BLOCK, // Block of text/image/separator line.
- RIL_PARA, // Paragraph within a block.
- RIL_TEXTLINE, // Line within a paragraph.
- RIL_WORD, // Word within a textline.
- RIL_SYMBOL // Symbol/character within a word.
-};
-
-/**
- * JUSTIFICATION_UNKNOWN
- * The alignment is not clearly one of the other options. This could happen
- * for example if there are only one or two lines of text or the text looks
- * like source code or poetry.
- *
- * NOTA BENE: Fully justified paragraphs (text aligned to both left and right
- * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
- * is written with a left-to-right script and with JUSTIFICATION_RIGHT if
- * their text is written in a right-to-left script.
- *
- * Interpretation for text read in vertical lines:
- * "Left" is wherever the starting reading position is.
- *
- * JUSTIFICATION_LEFT
- * Each line, except possibly the first, is flush to the same left tab stop.
- *
- * JUSTIFICATION_CENTER
- * The text lines of the paragraph are centered about a line going
- * down through their middle of the text lines.
- *
- * JUSTIFICATION_RIGHT
- * Each line, except possibly the first, is flush to the same right tab stop.
- */
-enum ParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT,
-};
-
-/**
- * When Tesseract/Cube is initialized we can choose to instantiate/load/run
- * only the Tesseract part, only the Cube part or both along with the combiner.
- * The preference of which engine to use is stored in tessedit_ocr_engine_mode.
- *
- * ATTENTION: When modifying this enum, please make sure to make the
- * appropriate changes to all the enums mirroring it (e.g. OCREngine in
- * cityblock/workflow/detection/detection_storage.proto). Such enums will
- * mention the connection to OcrEngineMode in the comments.
- */
-enum OcrEngineMode {
- OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated
- OEM_LSTM_ONLY, // Run just the LSTM line recognizer.
- OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback
- // to Tesseract when things get difficult.
- // deprecated
- OEM_DEFAULT, // Specify this mode when calling init_*(),
- // to indicate that any of the above modes
- // should be automatically inferred from the
- // variables in the language-specific config,
- // command-line configs, or if not specified
- // in any of the above should be set to the
- // default OEM_TESSERACT_ONLY.
- OEM_COUNT // Number of OEMs
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/renderer.h
deleted file mode 100644
index 6f405233..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/renderer.h
+++ /dev/null
@@ -1,311 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: renderer.h
-// Description: Rendering interface to inject into TessBaseAPI
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_RENDERER_H_
-#define TESSERACT_API_RENDERER_H_
-
-#include "export.h"
-
-// To avoid collision with other typenames include the ABSOLUTE MINIMUM
-// complexity of includes here. Use forward declarations wherever possible
-// and hide includes of complex types in baseapi.cpp.
-#include
-#include // for std::string
-#include // for std::vector
-
-struct Pix;
-
-namespace tesseract {
-
-class TessBaseAPI;
-
-/**
- * Interface for rendering tesseract results into a document, such as text,
- * HOCR or pdf. This class is abstract. Specific classes handle individual
- * formats. This interface is then used to inject the renderer class into
- * tesseract when processing images.
- *
- * For simplicity implementing this with tesseract version 3.01,
- * the renderer contains document state that is cleared from document
- * to document just as the TessBaseAPI is. This way the base API can just
- * delegate its rendering functionality to injected renderers, and the
- * renderers can manage the associated state needed for the specific formats
- * in addition to the heuristics for producing it.
- */
-class TESS_API TessResultRenderer {
-public:
- virtual ~TessResultRenderer();
-
- // Takes ownership of pointer so must be new'd instance.
- // Renderers aren't ordered, but appends the sequences of next parameter
- // and existing next(). The renderers should be unique across both lists.
- void insert(TessResultRenderer *next);
-
- // Returns the next renderer or nullptr.
- TessResultRenderer *next() {
- return next_;
- }
-
- /**
- * Starts a new document with the given title.
- * This clears the contents of the output data.
- * Title should use UTF-8 encoding.
- */
- bool BeginDocument(const char *title);
-
- /**
- * Adds the recognized text from the source image to the current document.
- * Invalid if BeginDocument not yet called.
- *
- * Note that this API is a bit weird but is designed to fit into the
- * current TessBaseAPI implementation where the api has lots of state
- * information that we might want to add in.
- */
- bool AddImage(TessBaseAPI *api);
-
- /**
- * Finishes the document and finalizes the output data
- * Invalid if BeginDocument not yet called.
- */
- bool EndDocument();
-
- const char *file_extension() const {
- return file_extension_;
- }
- const char *title() const {
- return title_.c_str();
- }
-
- // Is everything fine? Otherwise something went wrong.
- bool happy() const {
- return happy_;
- }
-
- /**
- * Returns the index of the last image given to AddImage
- * (i.e. images are incremented whether the image succeeded or not)
- *
- * This is always defined. It means either the number of the
- * current image, the last image ended, or in the completed document
- * depending on when in the document lifecycle you are looking at it.
- * Will return -1 if a document was never started.
- */
- int imagenum() const {
- return imagenum_;
- }
-
-protected:
- /**
- * Called by concrete classes.
- *
- * outputbase is the name of the output file excluding
- * extension. For example, "/path/to/chocolate-chip-cookie-recipe"
- *
- * extension indicates the file extension to be used for output
- * files. For example "pdf" will produce a .pdf file, and "hocr"
- * will produce .hocr files.
- */
- TessResultRenderer(const char *outputbase, const char *extension);
-
- // Hook for specialized handling in BeginDocument()
- virtual bool BeginDocumentHandler();
-
- // This must be overridden to render the OCR'd results
- virtual bool AddImageHandler(TessBaseAPI *api) = 0;
-
- // Hook for specialized handling in EndDocument()
- virtual bool EndDocumentHandler();
-
- // Renderers can call this to append '\0' terminated strings into
- // the output string returned by GetOutput.
- // This method will grow the output buffer if needed.
- void AppendString(const char *s);
-
- // Renderers can call this to append binary byte sequences into
- // the output string returned by GetOutput. Note that s is not necessarily
- // '\0' terminated (and can contain '\0' within it).
- // This method will grow the output buffer if needed.
- void AppendData(const char *s, int len);
-
-private:
- TessResultRenderer *next_; // Can link multiple renderers together
- FILE *fout_; // output file pointer
- const char *file_extension_; // standard extension for generated output
- std::string title_; // title of document being rendered
- int imagenum_; // index of last image added
- bool happy_; // I get grumpy when the disk fills up, etc.
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessTextRenderer : public TessResultRenderer {
-public:
- explicit TessTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into an hocr text string
- */
-class TESS_API TessHOcrRenderer : public TessResultRenderer {
-public:
- explicit TessHOcrRenderer(const char *outputbase, bool font_info);
- explicit TessHOcrRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into an alto text string
- */
-class TESS_API TessAltoRenderer : public TessResultRenderer {
-public:
- explicit TessAltoRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool begin_document;
-};
-
-/**
- * Renders Tesseract output into a TSV string
- */
-class TESS_API TessTsvRenderer : public TessResultRenderer {
-public:
- explicit TessTsvRenderer(const char *outputbase, bool font_info);
- explicit TessTsvRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into searchable PDF
- */
-class TESS_API TessPDFRenderer : public TessResultRenderer {
-public:
- // datadir is the location of the TESSDATA. We need it because
- // we load a custom PDF font from this location.
- TessPDFRenderer(const char *outputbase, const char *datadir,
- bool textonly = false);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- // We don't want to have every image in memory at once,
- // so we store some metadata as we go along producing
- // PDFs one page at a time. At the end, that metadata is
- // used to make everything that isn't easily handled in a
- // streaming fashion.
- long int obj_; // counter for PDF objects
- std::vector offsets_; // offset of every PDF object in bytes
- std::vector pages_; // object number for every /Page object
- std::string datadir_; // where to find the custom font
- bool textonly_; // skip images if set
- // Bookkeeping only. DIY = Do It Yourself.
- void AppendPDFObjectDIY(size_t objectsize);
- // Bookkeeping + emit data.
- void AppendPDFObject(const char *data);
- // Create the /Contents object for an entire page.
- char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);
- // Turn an image into a PDF object. Only transcode if we have to.
- static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,
- char **pdf_object, long int *pdf_object_size,
- int jpg_quality);
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessUnlvRenderer : public TessResultRenderer {
-public:
- explicit TessUnlvRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string for LSTMBox
- */
-class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
-public:
- explicit TessLSTMBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessBoxTextRenderer : public TessResultRenderer {
-public:
- explicit TessBoxTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string in WordStr format
- */
-class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
-public:
- explicit TessWordStrBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-/**
- * Renders tesseract output into an osd text string
- */
-class TESS_API TessOsdRenderer : public TessResultRenderer {
-public:
- explicit TessOsdRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#endif // ndef DISABLED_LEGACY_ENGINE
-
-} // namespace tesseract.
-
-#endif // TESSERACT_API_RENDERER_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/resultiterator.h
deleted file mode 100644
index 3e4d5807..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/resultiterator.h
+++ /dev/null
@@ -1,250 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: resultiterator.h
-// Description: Iterator for tesseract results that is capable of
-// iterating in proper reading order over Bi Directional
-// (e.g. mixed Hebrew and English) text.
-// Author: David Eger
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API, TESS_LOCAL
-#include "ltrresultiterator.h" // for LTRResultIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-#include // for std::pair
-#include // for std::vector
-
-namespace tesseract {
-
-class TESS_API ResultIterator : public LTRResultIterator {
-public:
- static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
-
- /**
- * ResultIterator is copy constructible!
- * The default copy constructor works just fine for us.
- */
- ~ResultIterator() override = default;
-
- // ============= Moving around within the page ============.
- /**
- * Moves the iterator to point to the start of the page to begin
- * an iteration.
- */
- void Begin() override;
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy in the appropriate reading order and returns false if
- * the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- bool Next(PageIteratorLevel level) override;
-
- /**
- * IsAtBeginningOf() returns whether we're at the logical beginning of the
- * given level. (as opposed to ResultIterator's left-to-right top-to-bottom
- * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
- * For a full description, see pageiterator.h
- */
- bool IsAtBeginningOf(PageIteratorLevel level) const override;
-
- /**
- * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
- * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
- * point at the last word in a paragraph. See PageIterator for full comment.
- */
- bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const override;
-
- // ============= Functions that refer to words only ============.
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // ============= Accessing data ==============.
-
- /**
- * Returns the null terminated UTF-8 encoded text string for the current
- * object at the given level. Use delete [] to free after use.
- */
- virtual char *GetUTF8Text(PageIteratorLevel level) const;
-
- /**
- * Returns the LSTM choices for every LSTM timestep for the current word.
- */
- virtual std::vector>>>
- *GetRawLSTMTimesteps() const;
- virtual std::vector>>
- *GetBestLSTMSymbolChoices() const;
-
- /**
- * Return whether the current paragraph's dominant reading direction
- * is left-to-right (as opposed to right-to-left).
- */
- bool ParagraphIsLtr() const;
-
- // ============= Exposed only for testing =============.
-
- /**
- * Yields the reading order as a sequence of indices and (optional)
- * meta-marks for a set of words (given left-to-right).
- * The meta marks are passed as negative values:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The next indexed word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- *
- * For example, suppose we have five words in a text line,
- * indexed [0,1,2,3,4] from the leftmost side of the text line.
- * The following are all believable reading_orders:
- *
- * Left-to-Right (in ltr paragraph):
- * { 0, 1, 2, 3, 4 }
- * Left-to-Right (in rtl paragraph):
- * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- * Right-to-Left (in rtl paragraph):
- * { 4, 3, 2, 1, 0 }
- * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
- * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
- */
- static void CalculateTextlineOrder(
- bool paragraph_is_ltr,
- const std::vector &word_dirs,
- std::vector *reading_order);
-
- static const int kMinorRunStart;
- static const int kMinorRunEnd;
- static const int kComplexWord;
-
-protected:
- /**
- * We presume the data associated with the given iterator will outlive us.
- * NB: This is private because it does something that is non-obvious:
- * it resets to the beginning of the paragraph instead of staying wherever
- * resit might have pointed.
- */
- explicit ResultIterator(const LTRResultIterator &resit);
-
-private:
- /**
- * Calculates the current paragraph's dominant writing direction.
- * Typically, members should use current_paragraph_ltr_ instead.
- */
- bool CurrentParagraphIsLtr() const;
-
- /**
- * Returns word indices as measured from resit->RestartRow() = index 0
- * for the reading order of words within a textline given an iterator
- * into the middle of the text line.
- * In addition to non-negative word indices, the following negative values
- * may be inserted:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The previous word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- std::vector *indices) const;
- /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- std::vector *ssd,
- std::vector *indices) const;
-
- /**
- * What is the index of the current word in a strict left-to-right reading
- * of the row?
- */
- int LTRWordIndex() const;
-
- /**
- * Given an iterator pointing at a word, returns the logical reading order
- * of blob indices for the word.
- */
- void CalculateBlobOrder(std::vector *blob_indices) const;
-
- /** Precondition: current_paragraph_is_ltr_ is set. */
- void MoveToLogicalStartOfTextline();
-
- /**
- * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
- * are set.
- */
- void MoveToLogicalStartOfWord();
-
- /** Are we pointing at the final (reading order) symbol of the word? */
- bool IsAtFinalSymbolOfWord() const;
-
- /** Are we pointing at the first (reading order) symbol of the word? */
- bool IsAtFirstSymbolOfWord() const;
-
- /**
- * Append any extra marks that should be appended to this word when printed.
- * Mostly, these are Unicode BiDi control characters.
- */
- void AppendSuffixMarks(std::string *text) const;
-
- /** Appends the current word in reading order to the given buffer.*/
- void AppendUTF8WordText(std::string *text) const;
-
- /**
- * Appends the text of the current text line, *assuming this iterator is
- * positioned at the beginning of the text line* This function
- * updates the iterator to point to the first position past the text line.
- * Each textline is terminated in a single newline character.
- * If the textline ends a paragraph, it gets a second terminal newline.
- */
- void IterateAndAppendUTF8TextlineText(std::string *text);
-
- /**
- * Appends the text of the current paragraph in reading order
- * to the given buffer.
- * Each textline is terminated in a single newline character, and the
- * paragraph gets an extra newline at the end.
- */
- void AppendUTF8ParagraphText(std::string *text) const;
-
- /** Returns whether the bidi_debug flag is set to at least min_level. */
- bool BidiDebug(int min_level) const;
-
- bool current_paragraph_is_ltr_;
-
- /**
- * Is the currently pointed-at character at the beginning of
- * a minor-direction run?
- */
- bool at_beginning_of_minor_run_;
-
- /** Is the currently pointed-at character in a minor-direction sequence? */
- bool in_minor_direction_;
-
- /**
- * Should detected inter-word spaces be preserved, or "compressed" to a single
- * space character (default behavior).
- */
- bool preserve_interword_spaces_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/unichar.h
deleted file mode 100644
index 015109d7..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/unichar.h
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: unichar.h
-// Description: Unicode character/ligature class.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCUTIL_UNICHAR_H_
-#define TESSERACT_CCUTIL_UNICHAR_H_
-
-#include "export.h"
-
-#include <memory.h>
-#include <cstring>
-#include <string>
-#include <vector>
-
-namespace tesseract {
-
-// Maximum number of characters that can be stored in a UNICHAR. Must be
-// at least 4. Must not exceed 31 without changing the coding of length.
-#define UNICHAR_LEN 30
-
-// A UNICHAR_ID is the unique id of a unichar.
-using UNICHAR_ID = int;
-
-// A variable to indicate an invalid or uninitialized unichar id.
-static const int INVALID_UNICHAR_ID = -1;
-// A special unichar that corresponds to INVALID_UNICHAR_ID.
-static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
-
-enum StrongScriptDirection {
- DIR_NEUTRAL = 0, // Text contains only neutral characters.
- DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
- DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
- DIR_MIX = 3, // Text contains a mixture of left-to-right
- // and right-to-left characters.
-};
-
-using char32 = signed int;
-
-// The UNICHAR class holds a single classification result. This may be
-// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
-// multiple Unicode characters representing the NFKC expansion of a ligature
-// such as fi, ffl etc. These are also stored as utf8.
-class TESS_API UNICHAR {
-public:
- UNICHAR() {
- memset(chars, 0, UNICHAR_LEN);
- }
-
- // Construct from a utf8 string. If len<0 then the string is null terminated.
- // If the string is too long to fit in the UNICHAR then it takes only what
- // will fit.
- UNICHAR(const char *utf8_str, int len);
-
- // Construct from a single UCS4 character.
- explicit UNICHAR(int unicode);
-
- // Default copy constructor and operator= are OK.
-
- // Get the first character as UCS-4.
- int first_uni() const;
-
- // Get the length of the UTF8 string.
- int utf8_len() const {
- int len = chars[UNICHAR_LEN - 1];
- return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
- }
-
- // Get a UTF8 string, but NOT nullptr terminated.
- const char *utf8() const {
- return chars;
- }
-
- // Get a terminated UTF8 string: Must delete[] it after use.
- char *utf8_str() const;
-
- // Get the number of bytes in the first character of the given utf8 string.
- static int utf8_step(const char *utf8_str);
-
- // A class to simplify iterating over and accessing elements of a UTF8
- // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
- // take ownership of the underlying byte array. It also does not permit
- // modification of the array (as the name suggests).
- //
- // Example:
- // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
- // it != UNICHAR::end(str, len);
- // ++it) {
- // printf("UCS-4 symbol code = %d\n", *it);
- // char buf[5];
- // int char_len = it.get_utf8(buf); buf[char_len] = '\0';
- // printf("Char = %s\n", buf);
- // }
- class TESS_API const_iterator {
- using CI = const_iterator;
-
- public:
- // Step to the next UTF8 character.
- // If the current position is at an illegal UTF8 character, then print an
- // error message and step by one byte. If the current position is at a
- // nullptr value, don't step past it.
- const_iterator &operator++();
-
- // Return the UCS-4 value at the current position.
- // If the current position is at an illegal UTF8 value, return a single
- // space character.
- int operator*() const;
-
- // Store the UTF-8 encoding of the current codepoint into buf, which must be
- // at least 4 bytes long. Return the number of bytes written.
- // If the current position is at an illegal UTF8 value, writes a single
- // space character and returns 1.
- // Note that this method does not null-terminate the buffer.
- int get_utf8(char *buf) const;
- // Returns the number of bytes of the current codepoint. Returns 1 if the
- // current position is at an illegal UTF8 value.
- int utf8_len() const;
- // Returns true if the UTF-8 encoding at the current position is legal.
- bool is_legal() const;
-
- // Return the pointer into the string at the current position.
- const char *utf8_data() const {
- return it_;
- }
-
- // Iterator equality operators.
- friend bool operator==(const CI &lhs, const CI &rhs) {
- return lhs.it_ == rhs.it_;
- }
- friend bool operator!=(const CI &lhs, const CI &rhs) {
- return !(lhs == rhs);
- }
-
- private:
- friend class UNICHAR;
- explicit const_iterator(const char *it) : it_(it) {}
-
- const char *it_; // Pointer into the string.
- };
-
- // Create a start/end iterator pointing to a string. Note that these methods
- // are static and do NOT create a copy or take ownership of the underlying
- // array.
- static const_iterator begin(const char *utf8_str, int byte_length);
- static const_iterator end(const char *utf8_str, int byte_length);
-
- // Converts a utf-8 string to a vector of unicodes.
- // Returns an empty vector if the input contains invalid UTF-8.
- static std::vector<char32> UTF8ToUTF32(const char *utf8_str);
- // Converts a vector of unicodes to a utf8 string.
- // Returns an empty string if the input contains an invalid unicode.
- static std::string UTF32ToUTF8(const std::vector<char32> &str32);
-
-private:
- // A UTF-8 representation of 1 or more Unicode characters.
- // The last element (chars[UNICHAR_LEN - 1]) is a length if
- // its value < UNICHAR_LEN, otherwise it is a genuine character.
- char chars[UNICHAR_LEN]{};
-};
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCUTIL_UNICHAR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/version.h
deleted file mode 100644
index 6bac5d66..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/loongarch64/include/tesseract/version.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: version.h
-// Description: Version information
-//
-// (C) Copyright 2018, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_VERSION_H_
-#define TESSERACT_API_VERSION_H_
-
-// clang-format off
-
-#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@
-#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@
-#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@
-
-#define TESSERACT_VERSION \
- (TESSERACT_MAJOR_VERSION << 16 | \
- TESSERACT_MINOR_VERSION << 8 | \
- TESSERACT_MICRO_VERSION)
-
-#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@"
-
-// clang-format on
-
-#endif // TESSERACT_API_VERSION_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/baseapi.h
deleted file mode 100644
index 5e1e4830..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/baseapi.h
+++ /dev/null
@@ -1,812 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: baseapi.h
-// Description: Simple API for calling tesseract.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_BASEAPI_H_
-#define TESSERACT_API_BASEAPI_H_
-
-#ifdef HAVE_CONFIG_H
-# include "config_auto.h" // DISABLED_LEGACY_ENGINE
-#endif
-
-#include "export.h"
-#include "pageiterator.h"
-#include "publictypes.h"
-#include "resultiterator.h"
-#include "unichar.h"
-
-#include "version.h"
-
-#include <cstdio>
-#include <vector> // for std::vector
-
-struct Pix;
-struct Pixa;
-struct Boxa;
-
-namespace tesseract {
-
-class PAGE_RES;
-class ParagraphModel;
-class BLOCK_LIST;
-class ETEXT_DESC;
-struct OSResults;
-class UNICHARSET;
-
-class Dawg;
-class Dict;
-class EquationDetect;
-class PageIterator;
-class ImageThresholder;
-class LTRResultIterator;
-class ResultIterator;
-class MutableIterator;
-class TessResultRenderer;
-class Tesseract;
-
-// Function to read a std::vector<char> from a whole file.
-// Returns false on failure.
-using FileReader = bool (*)(const char *filename, std::vector<char> *data);
-
-using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,
- bool) const;
-using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,
- int, const char *, int);
-
-/**
- * Base class for all tesseract APIs.
- * Specific classes can add ability to work on different inputs or produce
- * different outputs.
- * This class is mostly an interface layer on top of the Tesseract instance
- * class to hide the data types so that users of this class don't have to
- * include any other Tesseract headers.
- */
-class TESS_API TessBaseAPI {
-public:
- TessBaseAPI();
- virtual ~TessBaseAPI();
- // Copy constructor and assignment operator are currently unsupported.
- TessBaseAPI(TessBaseAPI const &) = delete;
- TessBaseAPI &operator=(TessBaseAPI const &) = delete;
-
- /**
- * Returns the version identifier as a static string. Do not delete.
- */
- static const char *Version();
-
- /**
- * If compiled with OpenCL AND an available OpenCL
- * device is deemed faster than serial code, then
- * "device" is populated with the cl_device_id
- * and returns sizeof(cl_device_id)
- * otherwise *device=nullptr and returns 0.
- */
- static size_t getOpenCLDevice(void **device);
-
- /**
- * Set the name of the input file. Needed for training and
- * reading a UNLV zone file, and for searchable PDF output.
- */
- void SetInputName(const char *name);
- /**
- * These functions are required for searchable PDF output.
- * We need our hands on the input file so that we can include
- * it in the PDF without transcoding. If that is not possible,
- * we need the original image. Finally, resolution metadata
- * is stored in the PDF so we need that as well.
- */
- const char *GetInputName();
- // Takes ownership of the input pix.
- void SetInputImage(Pix *pix);
- Pix *GetInputImage();
- int GetSourceYResolution();
- const char *GetDatapath();
-
- /** Set the name of the bonus output files. Needed only for debugging. */
- void SetOutputName(const char *name);
-
- /**
- * Set the value of an internal "parameter."
- * Supply the name of the parameter and the value as a string, just as
- * you would in a config file.
- * Returns false if the name lookup failed.
- * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
- * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
- * SetVariable may be used before Init, but settings will revert to
- * defaults on End().
- *
- * Note: Must be called after Init(). Only works for non-init variables
- * (init variables should be passed to Init()).
- */
- bool SetVariable(const char *name, const char *value);
- bool SetDebugVariable(const char *name, const char *value);
-
- /**
- * Returns true if the parameter was found among Tesseract parameters.
- * Fills in value with the value of the parameter.
- */
- bool GetIntVariable(const char *name, int *value) const;
- bool GetBoolVariable(const char *name, bool *value) const;
- bool GetDoubleVariable(const char *name, double *value) const;
-
- /**
- * Returns the pointer to the string that represents the value of the
- * parameter if it was found among Tesseract parameters.
- */
- const char *GetStringVariable(const char *name) const;
-
-#ifndef DISABLED_LEGACY_ENGINE
-
- /**
- * Print Tesseract fonts table to the given file.
- */
- void PrintFontsTable(FILE *fp) const;
-
-#endif
-
- /**
- * Print Tesseract parameters to the given file.
- */
- void PrintVariables(FILE *fp) const;
-
- /**
- * Get value of named variable as a string, if it exists.
- */
- bool GetVariableAsString(const char *name, std::string *val) const;
-
- /**
- * Instances are now mostly thread-safe and totally independent,
- * but some global parameters remain. Basically it is safe to use multiple
- * TessBaseAPIs in different threads in parallel, UNLESS:
- * you use SetVariable on some of the Params in classify and textord.
- * If you do, then the effect will be to change it for all your instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure.
- * NOTE that the only members that may be called before Init are those
- * listed above here in the class definition.
- *
- * The datapath must be the name of the tessdata directory.
- * The language is (usually) an ISO 639-3 string or nullptr will default to
- * eng. It is entirely safe (and eventually will be efficient too) to call
- * Init multiple times on the same instance to change language, or just
- * to reset the classifier.
- * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating
- * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
- * English. Languages may specify internally that they want to be loaded
- * with one or more other languages, so the ~ sign is available to override
- * that. Eg if hin were set to load eng by default, then hin+~eng would force
- * loading only hin. The number of loaded languages is limited only by
- * memory, with the caveat that loading additional languages will impact
- * both speed and accuracy, as there is more work to do to decide on the
- * applicable language, and there is more chance of hallucinating incorrect
- * words.
- * WARNING: On changing languages, all Tesseract parameters are reset
- * back to their default values. (Which may vary between languages.)
- * If you have a rare need to set a Variable that controls
- * initialization for a second call to Init you should explicitly
- * call End() and then use SetVariable before Init. This is only a very
- * rare use case, since there are very few uses that require any parameters
- * to be set before Init.
- *
- * If set_only_non_debug_params is true, only params that do not contain
- * "debug" in the name will be set.
- */
- int Init(const char *datapath, const char *language, OcrEngineMode mode,
- char **configs, int configs_size,
- const std::vector<std::string> *vars_vec,
- const std::vector<std::string> *vars_values,
- bool set_only_non_debug_params);
- int Init(const char *datapath, const char *language, OcrEngineMode oem) {
- return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
- }
- int Init(const char *datapath, const char *language) {
- return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
- false);
- }
- // In-memory version reads the traineddata file directly from the given
- // data[data_size] array, and/or reads data via a FileReader.
- int Init(const char *data, int data_size, const char *language,
- OcrEngineMode mode, char **configs, int configs_size,
- const std::vector<std::string> *vars_vec,
- const std::vector<std::string> *vars_values,
- bool set_only_non_debug_params, FileReader reader);
-
- /**
- * Returns the languages string used in the last valid initialization.
- * If the last initialization specified "deu+hin" then that will be
- * returned. If hin loaded eng automatically as well, then that will
- * not be included in this list. To find the languages actually
- * loaded use GetLoadedLanguagesAsVector.
- * The returned string should NOT be deleted.
- */
- const char *GetInitLanguagesAsString() const;
-
- /**
- * Returns the loaded languages in the vector of std::string.
- * Includes all languages loaded by the last Init, including those loaded
- * as dependencies of other loaded languages.
- */
- void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const;
-
- /**
- * Returns the available languages in the sorted vector of std::string.
- */
- void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const;
-
- /**
- * Init only for page layout analysis. Use only for calls to SetImage and
- * AnalysePage. Calls that attempt recognition will generate an error.
- */
- void InitForAnalysePage();
-
- /**
- * Read a "config" file containing a set of param, value pairs.
- * Searches the standard places: tessdata/configs, tessdata/tessconfigs
- * and also accepts a relative or absolute path name.
- * Note: only non-init params will be set (init params are set by Init()).
- */
- void ReadConfigFile(const char *filename);
- /** Same as above, but only set debug params from the given config file. */
- void ReadDebugConfigFile(const char *filename);
-
- /**
- * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
- * The mode is stored as an IntParam so it can also be modified by
- * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
- */
- void SetPageSegMode(PageSegMode mode);
-
- /** Return the current page segmentation mode. */
- PageSegMode GetPageSegMode() const;
-
- /**
- * Recognize a rectangle from an image and return the result as a string.
- * May be called many times for a single Init.
- * Currently has no error checking.
- * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
- * Palette color images will not work properly and must be converted to
- * 24 bit.
- * Binary images of 1 bit per pixel may also be given but they must be
- * byte packed with the MSB of the first byte being the first pixel, and a
- * 1 represents WHITE. For binary images set bytes_per_pixel=0.
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience interface.
- * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
- * and one or more of the Get*Text functions below.
- */
- char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
- int bytes_per_line, int left, int top, int width,
- int height);
-
- /**
- * Call between pages or documents etc to free up memory and forget
- * adaptive data.
- */
- void ClearAdaptiveClassifier();
-
- /**
- * @defgroup AdvancedAPI Advanced API
- * The following methods break TesseractRect into pieces, so you can
- * get hold of the thresholded image, get the text in different formats,
- * get bounding boxes, confidences etc.
- */
- /* @{ */
-
- /**
- * Provide an image for Tesseract to recognize. Format is as
- * TesseractRect above. Copies the image buffer and converts to Pix.
- * SetImage clears all recognition results, and sets the rectangle to the
- * full image, so it may be followed immediately by a GetUTF8Text, and it
- * will automatically perform recognition.
- */
- void SetImage(const unsigned char *imagedata, int width, int height,
- int bytes_per_pixel, int bytes_per_line);
-
- /**
- * Provide an image for Tesseract to recognize. As with SetImage above,
- * Tesseract takes its own copy of the image, so it need not persist until
- * after Recognize.
- * Pix vs raw, which to use?
- * Use Pix where possible. Tesseract uses Pix as its internal representation
- * and it is therefore more efficient to provide a Pix directly.
- */
- void SetImage(Pix *pix);
-
- /**
- * Set the resolution of the source image in pixels per inch so font size
- * information can be calculated in results. Call this after SetImage().
- */
- void SetSourceResolution(int ppi);
-
- /**
- * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
- * Each SetRectangle clears the recogntion results so multiple rectangles
- * can be recognized with the same image.
- */
- void SetRectangle(int left, int top, int width, int height);
-
- /**
- * Get a copy of the internal thresholded image from Tesseract.
- * Caller takes ownership of the Pix and must pixDestroy it.
- * May be called any time after SetImage, or after TesseractRect.
- */
- Pix *GetThresholdedImage();
-
- /**
- * Get the result of page layout analysis as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetRegions(Pixa **pixa);
-
- /**
- * Get the textlines as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If raw_image is true, then extract from the original image instead of the
- * thresholded image and pad by raw_padding pixels.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use. If paraids is not
- * nullptr, the paragraph-id of each line within its block is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- /*
- Helper method to extract from the thresholded image. (most common usage)
-*/
- Boxa *GetTextlines(Pixa **pixa, int **blockids) {
- return GetTextlines(false, 0, pixa, blockids, nullptr);
- }
-
- /**
- * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
- * pair, in reading order. Enables downstream handling of non-rectangular
- * regions.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetStrips(Pixa **pixa, int **blockids);
-
- /**
- * Get the words as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetWords(Pixa **pixa);
-
- /**
- * Gets the individual connected (text) components (created
- * after pages segmentation step, but before recognition)
- * as a leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * Note: the caller is responsible for calling boxaDestroy()
- * on the returned Boxa array and pixaDestroy() on cc array.
- */
- Boxa *GetConnectedComponents(Pixa **cc);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use.
- * If blockids is not nullptr, the paragraph-id of each component with its
- * block is also returned as an array of one element per component. delete []
- * after use. If raw_image is true, then portions of the original image are
- * extracted instead of the thresholded image and padded with raw_padding. If
- * text_only is true, then only text components are returned.
- */
- Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
- bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- // Helper function to get binary images with no padding (most common usage).
- Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
- Pixa **pixa, int **blockids) {
- return GetComponentImages(level, text_only, false, 0, pixa, blockids,
- nullptr);
- }
-
- /**
- * Returns the scale factor of the thresholded image that would be returned by
- * GetThresholdedImage() and the various GetX() methods that call
- * GetComponentImages().
- * Returns 0 if no thresholder has been set.
- */
- int GetThresholdedImageScaleFactor() const;
-
- /**
- * Runs page layout analysis in the mode set by SetPageSegMode.
- * May optionally be called prior to Recognize to get access to just
- * the page layout results. Returns an iterator to the results.
- * If merge_similar_words is true, words are combined where suitable for use
- * with a line recognizer. Use if you want to use AnalyseLayout to find the
- * textlines, and then want to process textline fragments with an external
- * line recognizer.
- * Returns nullptr on error or an empty page.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- PageIterator *AnalyseLayout();
- PageIterator *AnalyseLayout(bool merge_similar_words);
-
- /**
- * Recognize the image from SetAndThresholdImage, generating Tesseract
- * internal structures. Returns 0 on success.
- * Optional. The Get*Text functions below will call Recognize if needed.
- * After Recognize, the output is kept internally until the next SetImage.
- */
- int Recognize(ETEXT_DESC *monitor);
-
- /**
- * Methods to retrieve information after SetAndThresholdImage(),
- * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
- */
-
- /**
- * Turns images into symbolic text.
- *
- * filename can point to a single image, a multi-page TIFF,
- * or a plain text list of image filenames.
- *
- * retry_config is useful for debugging. If not nullptr, you can fall
- * back to an alternate configuration if a page fails for some
- * reason.
- *
- * timeout_millisec terminates processing if any single page
- * takes too long. Set to 0 for unlimited time.
- *
- * renderer is responible for creating the output. For example,
- * use the TessTextRenderer if you want plaintext output, or
- * the TessPDFRender to produce searchable PDF.
- *
- * If tessedit_page_number is non-negative, will only process that
- * single page. Works for multi-page tiff file, or filelist.
- *
- * Returns true if successful, false on error.
- */
- bool ProcessPages(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
- // Does the real work of ProcessPages.
- bool ProcessPagesInternal(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
-
- /**
- * Turn a single image into symbolic text.
- *
- * The pix is the image processed. filename and page_index are
- * metadata used by side-effect processes, such as reading a box
- * file or formatting as hOCR.
- *
- * See ProcessPages for descriptions of other parameters.
- */
- bool ProcessPage(Pix *pix, int page_index, const char *filename,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer);
-
- /**
- * Get a reading-order iterator to the results of LayoutAnalysis and/or
- * Recognize. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- ResultIterator *GetIterator();
-
- /**
- * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- MutableIterator *GetMutableIterator();
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- */
- char *GetUTF8Text();
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * monitor can be used to
- * cancel the recognition
- * receive progress callbacks
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(int page_number);
-
- /**
- * Make a TSV-formatted string from the internal data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetTSVText(int page_number);
-
- /**
- * Make a box file for LSTM training from the internal data structures.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetLSTMBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a box file used in training.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a WordStr box file used in training.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetWordStrBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UNLV format Latin-1 with specific reject and suspect codes.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetUNLVText();
-
- /**
- * Detect the orientation of the input image and apparent script (alphabet).
- * orient_deg is the detected clockwise rotation of the input image in degrees
- * (0, 90, 180, 270)
- * orient_conf is the confidence (15.0 is reasonably confident)
- * script_name is an ASCII string, the name of the script, e.g. "Latin"
- * script_conf is confidence level in the script
- * Returns true on success and writes values to each parameter as an output
- */
- bool DetectOrientationScript(int *orient_deg, float *orient_conf,
- const char **script_name, float *script_conf);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- * page_number is a 0-based page index that will appear in the osd file.
- */
- char *GetOsdText(int page_number);
-
- /** Returns the (average) confidence value between 0 and 100. */
- int MeanTextConf();
- /**
- * Returns all word confidences (between 0 and 100) in an array, terminated
- * by -1. The calling function must delete [] after use.
- * The number of confidences should correspond to the number of space-
- * delimited words in GetUTF8Text.
- */
- int *AllWordConfidences();
-
-#ifndef DISABLED_LEGACY_ENGINE
- /**
- * Applies the given word to the adaptive classifier if possible.
- * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
- * tell the boundaries of the graphemes.
- * Assumes that SetImage/SetRectangle have been used to set the image
- * to the given word. The mode arg should be PSM_SINGLE_WORD or
- * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
- * The currently set PageSegMode is preserved.
- * Returns false if adaption was not possible for some reason.
- */
- bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
-#endif // ndef DISABLED_LEGACY_ENGINE
-
- /**
- * Free up recognition results and any stored image data, without actually
- * freeing any recognition data that would be time-consuming to reload.
- * Afterwards, you must call SetImage or TesseractRect before doing
- * any Recognize or Get* operation.
- */
- void Clear();
-
- /**
- * Close down tesseract and free up all memory. End() is equivalent to
- * destructing and reconstructing your TessBaseAPI.
- * Once End() has been used, none of the other API functions may be used
- * other than Init and anything declared above it in the class definition.
- */
- void End();
-
- /**
- * Clear any library-level memory caches.
- * There are a variety of expensive-to-load constant data structures (mostly
- * language dictionaries) that are cached globally -- surviving the Init()
- * and End() of individual TessBaseAPI's. This function allows the clearing
- * of these caches.
- **/
- static void ClearPersistentCache();
-
- /**
- * Check whether a word is valid according to Tesseract's language model
- * @return 0 if the word is invalid, non-zero if valid.
- * @warning temporary! This function will be removed from here and placed
- * in a separate API at some future time.
- */
- int IsValidWord(const char *word) const;
- // Returns true if utf8_character is defined in the UniCharset.
- bool IsValidCharacter(const char *utf8_character) const;
-
- bool GetTextDirection(int *out_offset, float *out_slope);
-
- /** Sets Dict::letter_is_okay_ function to point to the given function. */
- void SetDictFunc(DictFunc f);
-
- /** Sets Dict::probability_in_context_ function to point to the given
- * function.
- */
- void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
-
- /**
- * Estimates the Orientation And Script of the image.
- * @return true if the image was processed successfully.
- */
- bool DetectOS(OSResults *);
-
- /**
- * Return text orientation of each block as determined by an earlier run
- * of layout analysis.
- */
- void GetBlockTextOrientations(int **block_orientation,
- bool **vertical_writing);
-
- /** This method returns the string form of the specified unichar. */
- const char *GetUnichar(int unichar_id) const;
-
- /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
- const Dawg *GetDawg(int i) const;
-
- /** Return the number of dawgs loaded into tesseract_ object. */
- int NumDawgs() const;
-
- Tesseract *tesseract() const {
- return tesseract_;
- }
-
- OcrEngineMode oem() const {
- return last_oem_requested_;
- }
-
- void set_min_orientation_margin(double margin);
- /* @} */
-
-protected:
- /** Common code for setting the image. Returns true if Init has been called.
- */
- bool InternalSetImage();
-
- /**
- * Run the thresholder to make the thresholded image. If pix is not nullptr,
- * the source is thresholded to pix instead of the internal IMAGE.
- */
- virtual bool Threshold(Pix **pix);
-
- /**
- * Find lines from the image making the BLOCK_LIST.
- * @return 0 on success.
- */
- int FindLines();
-
- /** Delete the pageres and block list ready for a new page. */
- void ClearResults();
-
- /**
- * Return an LTR Result Iterator -- used only for training, as we really want
- * to ignore all BiDi smarts at that point.
- * delete once you're done with it.
- */
- LTRResultIterator *GetLTRIterator();
-
- /**
- * Return the length of the output text string, as UTF8, assuming
- * one newline per line and one per block, with a terminator,
- * and assuming a single character reject marker for each rejected character.
- * Also return the number of recognized blobs in blob_count.
- */
- int TextLength(int *blob_count) const;
-
- //// paragraphs.cpp ////////////////////////////////////////////////////
- void DetectParagraphs(bool after_text_recognition);
-
- const PAGE_RES *GetPageRes() const {
- return page_res_;
- }
-
-protected:
- Tesseract *tesseract_; ///< The underlying data object.
- Tesseract *osd_tesseract_; ///< For orientation & script detection.
- EquationDetect *equ_detect_; ///< The equation detector.
- FileReader reader_; ///< Reads files from any filesystem.
- ImageThresholder *thresholder_; ///< Image thresholding module.
- std::vector *paragraph_models_;
- BLOCK_LIST *block_list_; ///< The page layout.
- PAGE_RES *page_res_; ///< The page-level data.
- std::string input_file_; ///< Name used by training code.
- std::string output_file_; ///< Name used by debug code.
- std::string datapath_; ///< Current location of tessdata.
- std::string language_; ///< Last initialized language.
- OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
- bool recognition_done_; ///< page_res_ contains recognition data.
-
- /**
- * @defgroup ThresholderParams Thresholder Parameters
- * Parameters saved from the Thresholder. Needed to rebuild coordinates.
- */
- /* @{ */
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
- int image_width_;
- int image_height_;
- /* @} */
-
-private:
- // A list of image filenames gets special consideration
- bool ProcessPagesFileList(FILE *fp, std::string *buf,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
- // TIFF supports multipage so gets special consideration.
- bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
- const char *filename, const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
-}; // class TessBaseAPI.
-
-/** Escape a char string - remove &<>"' with HTML codes. */
-std::string HOcrEscape(const char *text);
-
-} // namespace tesseract
-
-#endif // TESSERACT_API_BASEAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/capi.h
deleted file mode 100644
index 40f4856a..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/capi.h
+++ /dev/null
@@ -1,484 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: capi.h
-// Description: C-API TessBaseAPI
-//
-// (C) Copyright 2012, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef API_CAPI_H_
-#define API_CAPI_H_
-
-#include "export.h"
-
-#ifdef __cplusplus
-# include
-# include
-# include
-# include
-# include
-#endif
-
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef BOOL
-# define BOOL int
-# define TRUE 1
-# define FALSE 0
-#endif
-
-#ifdef __cplusplus
-typedef tesseract::TessResultRenderer TessResultRenderer;
-typedef tesseract::TessBaseAPI TessBaseAPI;
-typedef tesseract::PageIterator TessPageIterator;
-typedef tesseract::ResultIterator TessResultIterator;
-typedef tesseract::MutableIterator TessMutableIterator;
-typedef tesseract::ChoiceIterator TessChoiceIterator;
-typedef tesseract::OcrEngineMode TessOcrEngineMode;
-typedef tesseract::PageSegMode TessPageSegMode;
-typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
-typedef tesseract::Orientation TessOrientation;
-typedef tesseract::ParagraphJustification TessParagraphJustification;
-typedef tesseract::WritingDirection TessWritingDirection;
-typedef tesseract::TextlineOrder TessTextlineOrder;
-typedef tesseract::PolyBlockType TessPolyBlockType;
-typedef tesseract::ETEXT_DESC ETEXT_DESC;
-#else
-typedef struct TessResultRenderer TessResultRenderer;
-typedef struct TessBaseAPI TessBaseAPI;
-typedef struct TessPageIterator TessPageIterator;
-typedef struct TessResultIterator TessResultIterator;
-typedef struct TessMutableIterator TessMutableIterator;
-typedef struct TessChoiceIterator TessChoiceIterator;
-typedef enum TessOcrEngineMode {
- OEM_TESSERACT_ONLY,
- OEM_LSTM_ONLY,
- OEM_TESSERACT_LSTM_COMBINED,
- OEM_DEFAULT
-} TessOcrEngineMode;
-typedef enum TessPageSegMode {
- PSM_OSD_ONLY,
- PSM_AUTO_OSD,
- PSM_AUTO_ONLY,
- PSM_AUTO,
- PSM_SINGLE_COLUMN,
- PSM_SINGLE_BLOCK_VERT_TEXT,
- PSM_SINGLE_BLOCK,
- PSM_SINGLE_LINE,
- PSM_SINGLE_WORD,
- PSM_CIRCLE_WORD,
- PSM_SINGLE_CHAR,
- PSM_SPARSE_TEXT,
- PSM_SPARSE_TEXT_OSD,
- PSM_RAW_LINE,
- PSM_COUNT
-} TessPageSegMode;
-typedef enum TessPageIteratorLevel {
- RIL_BLOCK,
- RIL_PARA,
- RIL_TEXTLINE,
- RIL_WORD,
- RIL_SYMBOL
-} TessPageIteratorLevel;
-typedef enum TessPolyBlockType {
- PT_UNKNOWN,
- PT_FLOWING_TEXT,
- PT_HEADING_TEXT,
- PT_PULLOUT_TEXT,
- PT_EQUATION,
- PT_INLINE_EQUATION,
- PT_TABLE,
- PT_VERTICAL_TEXT,
- PT_CAPTION_TEXT,
- PT_FLOWING_IMAGE,
- PT_HEADING_IMAGE,
- PT_PULLOUT_IMAGE,
- PT_HORZ_LINE,
- PT_VERT_LINE,
- PT_NOISE,
- PT_COUNT
-} TessPolyBlockType;
-typedef enum TessOrientation {
- ORIENTATION_PAGE_UP,
- ORIENTATION_PAGE_RIGHT,
- ORIENTATION_PAGE_DOWN,
- ORIENTATION_PAGE_LEFT
-} TessOrientation;
-typedef enum TessParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT
-} TessParagraphJustification;
-typedef enum TessWritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT,
- WRITING_DIRECTION_RIGHT_TO_LEFT,
- WRITING_DIRECTION_TOP_TO_BOTTOM
-} TessWritingDirection;
-typedef enum TessTextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT,
- TEXTLINE_ORDER_RIGHT_TO_LEFT,
- TEXTLINE_ORDER_TOP_TO_BOTTOM
-} TessTextlineOrder;
-typedef struct ETEXT_DESC ETEXT_DESC;
-#endif
-
-typedef bool (*TessCancelFunc)(void *cancel_this, int words);
-typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,
- int bottom);
-
-struct Pix;
-struct Boxa;
-struct Pixa;
-
-/* General free functions */
-
-TESS_API const char *TessVersion();
-TESS_API void TessDeleteText(const char *text);
-TESS_API void TessDeleteTextArray(char **arr);
-TESS_API void TessDeleteIntArray(const int *arr);
-
-/* Renderer API */
-TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,
- BOOL font_info);
-TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,
- const char *datadir,
- BOOL textonly);
-TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(
- const char *outputbase);
-
-TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
-TESS_API void TessResultRendererInsert(TessResultRenderer *renderer,
- TessResultRenderer *next);
-TESS_API TessResultRenderer *TessResultRendererNext(
- TessResultRenderer *renderer);
-TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,
- const char *title);
-TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,
- TessBaseAPI *api);
-TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);
-
-TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);
-TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
-TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);
-
-/* Base API */
-
-TESS_API TessBaseAPI *TessBaseAPICreate();
-TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);
-
-TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);
-
-TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
-TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
-TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
-TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);
-
-TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-
-TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,
- const char *name, int *value);
-TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,
- const char *name, BOOL *value);
-TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,
- const char *name, double *value);
-TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,
- const char *name);
-
-TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);
-TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,
- const char *filename);
-
-TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem,
- char **configs, int configs_size);
-TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem);
-TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,
- const char *language);
-
-TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API const char *TessBaseAPIGetInitLanguagesAsString(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(
- const TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,
- const char *filename);
-TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,
- const char *filename);
-
-TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,
- TessPageSegMode mode);
-TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIRect(TessBaseAPI *handle,
- const unsigned char *imagedata,
- int bytes_per_pixel, int bytes_per_line,
- int left, int top, int width, int height);
-
-TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetImage(TessBaseAPI *handle,
- const unsigned char *imagedata, int width,
- int height, int bytes_per_pixel,
- int bytes_per_line);
-TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);
-
-TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);
-
-TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,
- int width, int height);
-
-TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
-TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,
- BOOL raw_image, int raw_padding,
- struct Pixa **pixa,
- int **blockids, int **paraids);
-TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,
- struct Pixa **pixa, int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,
- struct Pixa **cc);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
- TessPageIteratorLevel level,
- BOOL text_only,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages1(
- TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,
- BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,
- int **paraids);
-
-TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
- const TessBaseAPI *handle);
-
-TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);
-
-TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,
- int page_index, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-
-TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
-TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(
- TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);
-TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,
- int page_number);
-
-TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
-TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
-
-TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,
- TessPageSegMode mode,
- const char *wordstr);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
-TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
-TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,
- float *out_slope);
-
-TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);
-
-TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-// Call TessDeleteText(*best_script_name) to free memory allocated by this
-// function
-TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,
- int *orient_deg,
- float *orient_conf,
- const char **script_name,
- float *script_conf);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,
- double margin);
-
-TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);
-
-TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);
-
-TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,
- int **block_orientation,
- bool **vertical_writing);
-
-/* Page iterator */
-
-TESS_API void TessPageIteratorDelete(TessPageIterator *handle);
-
-TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);
-
-TESS_API void TessPageIteratorBegin(TessPageIterator *handle);
-
-TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- TessPageIteratorLevel element);
-
-TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int *left, int *top, int *right,
- int *bottom);
-
-TESS_API TessPolyBlockType
-TessPageIteratorBlockType(const TessPageIterator *handle);
-
-TESS_API struct Pix *TessPageIteratorGetBinaryImage(
- const TessPageIterator *handle, TessPageIteratorLevel level);
-
-TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int padding,
- struct Pix *original_image,
- int *left, int *top);
-
-TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,
- TessPageIteratorLevel level, int *x1,
- int *y1, int *x2, int *y2);
-
-TESS_API void TessPageIteratorOrientation(
- TessPageIterator *handle, TessOrientation *orientation,
- TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,
- float *deskew_angle);
-
-TESS_API void TessPageIteratorParagraphInfo(
- TessPageIterator *handle, TessParagraphJustification *justification,
- BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
-
-/* Result iterator */
-
-TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
-TESS_API TessResultIterator *TessResultIteratorCopy(
- const TessResultIterator *handle);
-TESS_API TessPageIterator *TessResultIteratorGetPageIterator(
- TessResultIterator *handle);
-TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
- const TessResultIterator *handle);
-TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(
- const TessResultIterator *handle);
-
-TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API const char *TessResultIteratorWordRecognitionLanguage(
- const TessResultIterator *handle);
-TESS_API const char *TessResultIteratorWordFontAttributes(
- const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic,
- BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,
- int *pointsize, int *font_id);
-
-TESS_API BOOL
-TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
-TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);
-
-TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);
-TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
-TESS_API const char *TessChoiceIteratorGetUTF8Text(
- const TessChoiceIterator *handle);
-TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);
-
-/* Progress monitor */
-
-TESS_API ETEXT_DESC *TessMonitorCreate();
-TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,
- TessCancelFunc cancelFunc);
-TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
-TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,
- TessProgressFunc progressFunc);
-TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // API_CAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/export.h
deleted file mode 100644
index d238b628..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/export.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: export.h
-// Description: Place holder
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_PLATFORM_H_
-#define TESSERACT_PLATFORM_H_
-
-#ifndef TESS_API
-# if defined(_WIN32) || defined(__CYGWIN__)
-# if defined(TESS_EXPORTS)
-# define TESS_API __declspec(dllexport)
-# elif defined(TESS_IMPORTS)
-# define TESS_API __declspec(dllimport)
-# else
-# define TESS_API
-# endif
-# else
-# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
-# define TESS_API __attribute__((visibility("default")))
-# else
-# define TESS_API
-# endif
-# endif
-#endif
-
-#endif // TESSERACT_PLATFORM_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ltrresultiterator.h
deleted file mode 100644
index 6ca0a98e..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ltrresultiterator.h
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: ltrresultiterator.h
-// Description: Iterator for tesseract results in strict left-to-right
-// order that avoids using tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API
-#include "pageiterator.h" // for PageIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-namespace tesseract {
-
-class BLOB_CHOICE_IT;
-class PAGE_RES;
-class WERD_RES;
-
-class Tesseract;
-
-// Class to iterate over tesseract results, providing access to all levels
-// of the page hierarchy, without including any tesseract headers or having
-// to handle any tesseract structures.
-// WARNING! This class points to data held within the TessBaseAPI class, and
-// therefore can only be used while the TessBaseAPI class still exists and
-// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
-// DetectOS, or anything else that changes the internal PAGE_RES.
-// See tesseract/publictypes.h for the definition of PageIteratorLevel.
-// See also base class PageIterator, which contains the bulk of the interface.
-// LTRResultIterator adds text-specific methods for access to OCR output.
-
-class TESS_API LTRResultIterator : public PageIterator {
- friend class ChoiceIterator;
-
-public:
- // page_res and tesseract come directly from the BaseAPI.
- // The rectangle parameters are copied indirectly from the Thresholder,
- // via the BaseAPI. They represent the coordinates of some rectangle in an
- // original image (in top-left-origin coordinates) and therefore the top-left
- // needs to be added to any output boxes in order to specify coordinates
- // in the original image. See TessBaseAPI::SetRectangle.
- // The scale and scaled_yres are in case the Thresholder scaled the image
- // rectangle prior to thresholding. Any coordinates in tesseract's image
- // must be divided by scale before adding (rect_left, rect_top).
- // The scaled_yres indicates the effective resolution of the binary image
- // that tesseract has been given by the Thresholder.
- // After the constructor, Begin has already been called.
- LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top,
- int rect_width, int rect_height);
-
- ~LTRResultIterator() override;
-
- // LTRResultIterators may be copied! This makes it possible to iterate over
- // all the objects at a lower level, while maintaining an iterator to
- // objects at a higher level. These constructors DO NOT CALL Begin, so
- // iterations will continue from the location of src.
- // TODO: For now the copy constructor and operator= only need the base class
- // versions, but if new data members are added, don't forget to add them!
-
- // ============= Moving around within the page ============.
-
- // See PageIterator.
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // object at the given level. Use delete [] to free after use.
- char *GetUTF8Text(PageIteratorLevel level) const;
-
- // Set the string inserted at the end of each text line. "\n" by default.
- void SetLineSeparator(const char *new_line);
-
- // Set the string inserted at the end of each paragraph. "\n" by default.
- void SetParagraphSeparator(const char *new_para);
-
- // Returns the mean confidence of the current object at the given level.
- // The number should be interpreted as a percent probability. (0.0f-100.0f)
- float Confidence(PageIteratorLevel level) const;
-
- // ============= Functions that refer to words only ============.
-
- // Returns the font attributes of the current word. If iterating at a higher
- // level object than words, eg textlines, then this will return the
- // attributes of the first word in that textline.
- // The actual return value is a string representing a font name. It points
- // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
- // the iterator itself, ie rendered invalid by various members of
- // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
- // Pointsize is returned in printers points (1/72 inch.)
- const char *WordFontAttributes(bool *is_bold, bool *is_italic,
- bool *is_underlined, bool *is_monospace,
- bool *is_serif, bool *is_smallcaps,
- int *pointsize, int *font_id) const;
-
- // Return the name of the language used to recognize this word.
- // On error, nullptr. Do not delete this pointer.
- const char *WordRecognitionLanguage() const;
-
- // Return the overall directionality of this word.
- StrongScriptDirection WordDirection() const;
-
- // Returns true if the current word was found in a dictionary.
- bool WordIsFromDictionary() const;
-
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // Returns true if the current word is numeric.
- bool WordIsNumeric() const;
-
- // Returns true if the word contains blamer information.
- bool HasBlamerInfo() const;
-
- // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
- // of the current word.
- const void *GetParamsTrainingBundle() const;
-
- // Returns a pointer to the string with blamer information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerDebug() const;
-
- // Returns a pointer to the string with misadaption information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerMisadaptionDebug() const;
-
- // Returns true if a truth string was recorded for the current word.
- bool HasTruthString() const;
-
- // Returns true if the given string is equivalent to the truth string for
- // the current word.
- bool EquivalentToTruth(const char *str) const;
-
- // Returns a null terminated UTF-8 encoded truth string for the current word.
- // Use delete [] to free after use.
- char *WordTruthUTF8Text() const;
-
- // Returns a null terminated UTF-8 encoded normalized OCR string for the
- // current word. Use delete [] to free after use.
- char *WordNormedUTF8Text() const;
-
- // Returns a pointer to serialized choice lattice.
- // Fills lattice_size with the number of bytes in lattice data.
- const char *WordLattice(int *lattice_size) const;
-
- // ============= Functions that refer to symbols only ============.
-
- // Returns true if the current symbol is a superscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSuperscript() const;
- // Returns true if the current symbol is a subscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSubscript() const;
- // Returns true if the current symbol is a dropcap.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsDropcap() const;
-
-protected:
- const char *line_separator_;
- const char *paragraph_separator_;
-};
-
-// Class to iterate over the classifier choices for a single RIL_SYMBOL.
-class TESS_API ChoiceIterator {
-public:
- // Construction is from a LTRResultIterator that points to the symbol of
- // interest. The ChoiceIterator allows a one-shot iteration over the
- // choices for this symbol and after that it is useless.
- explicit ChoiceIterator(const LTRResultIterator &result_it);
- ~ChoiceIterator();
-
- // Moves to the next choice for the symbol and returns false if there
- // are none left.
- bool Next();
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // choice.
- // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
- // internal structure and should NOT be delete[]ed to free after use.
- const char *GetUTF8Text() const;
-
- // Returns the confidence of the current choice depending on the used language
- // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
- // choices for one symbol should roughly add up to 1.0f.
- // If only traineddata of the legacy engine is used, the number should be
- // interpreted as a percent probability. (0.0f-100.0f) In this case
- // probabilities won't add up to 100. Each one stands on its own.
- float Confidence() const;
-
- // Returns a vector containing all timesteps, which belong to the currently
- // selected symbol. A timestep is a vector containing pairs of symbols and
- // floating point numbers. The number states the probability for the
- // corresponding symbol.
- std::vector>> *Timesteps() const;
-
-private:
- // clears the remaining spaces out of the results and adapt the probabilities
- void filterSpaces();
- // Pointer to the WERD_RES object owned by the API.
- WERD_RES *word_res_;
- // Iterator over the blob choices.
- BLOB_CHOICE_IT *choice_it_;
- std::vector> *LSTM_choices_ = nullptr;
- std::vector>::iterator LSTM_choice_it_;
-
- const int *tstep_index_;
- // regulates the rating granularity
- double rating_coefficient_;
- // leading blanks
- int blanks_before_word_;
- // true when there is lstm engine related trained data
- bool oemLSTM_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ocrclass.h
deleted file mode 100644
index a55e6528..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/ocrclass.h
+++ /dev/null
@@ -1,158 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-/**********************************************************************
- * File: ocrclass.h
- * Description: Class definitions and constants for the OCR API.
- * Author: Hewlett-Packard Co
- *
- * (C) Copyright 1996, Hewlett-Packard Co.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-/**********************************************************************
- * This file contains typedefs for all the structures used by
- * the HP OCR interface.
- * The structures are designed to allow them to be used with any
- * structure alignment up to 8.
- **********************************************************************/
-
-#ifndef CCUTIL_OCRCLASS_H_
-#define CCUTIL_OCRCLASS_H_
-
-#include
-#include
-
-namespace tesseract {
-
-/**********************************************************************
- * EANYCODE_CHAR
- * Description of a single character. The character code is defined by
- * the character set of the current font.
- * Output text is sent as an array of these structures.
- * Spaces and line endings in the output are represented in the
- * structures of the surrounding characters. They are not directly
- * represented as characters.
- * The first character in a word has a positive value of blanks.
- * Missing information should be set to the defaults in the comments.
- * If word bounds are known, but not character bounds, then the top and
- * bottom of each character should be those of the word. The left of the
- * first and right of the last char in each word should be set. All other
- * lefts and rights should be set to -1.
- * If set, the values of right and bottom are left+width and top+height.
- * Most of the members come directly from the parameters to ocr_append_char.
- * The formatting member uses the enhancement parameter and combines the
- * line direction stuff into the top 3 bits.
- * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
- * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
- * the coding is, only that it is backwards compatible with the previous
- * version.
- **********************************************************************/
-
-struct EANYCODE_CHAR { /*single character */
- // It should be noted that the format for char_code for version 2.0 and beyond
- // is UTF8 which means that ASCII characters will come out as one structure
- // but other characters will be returned in two or more instances of this
- // structure with a single byte of the UTF8 code in each, but each will have
- // the same bounding box. Programs which want to handle languagues with
- // different characters sets will need to handle extended characters
- // appropriately, but *all* code needs to be prepared to receive UTF8 coded
- // characters for characters such as bullet and fancy quotes.
- uint16_t char_code; /*character itself */
- int16_t left; /*of char (-1) */
- int16_t right; /*of char (-1) */
- int16_t top; /*of char (-1) */
- int16_t bottom; /*of char (-1) */
- int16_t font_index; /*what font (0) */
- uint8_t confidence; /*0=perfect, 100=reject (0/100) */
- uint8_t point_size; /*of char, 72=i inch, (10) */
- int8_t blanks; /*no of spaces before this char (1) */
- uint8_t formatting; /*char formatting (0) */
-};
-
-/**********************************************************************
- * ETEXT_DESC
- * Description of the output of the OCR engine.
- * This structure is used as both a progress monitor and the final
- * output header, since it needs to be a valid progress monitor while
- * the OCR engine is storing its output to shared memory.
- * During progress, all the buffer info is -1.
- * Progress starts at 0 and increases to 100 during OCR. No other constraint.
- * Additionally the progress callback contains the bounding box of the word that
- * is currently being processed.
- * Every progress callback, the OCR engine must set ocr_alive to 1.
- * The HP side will set ocr_alive to 0. Repeated failure to reset
- * to 1 indicates that the OCR engine is dead.
- * If the cancel function is not null then it is called with the number of
- * user words found. If it returns true then operation is cancelled.
- **********************************************************************/
-class ETEXT_DESC;
-
-using CANCEL_FUNC = bool (*)(void *, int);
-using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
-using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);
-
-class ETEXT_DESC { // output header
-public:
- int16_t count{0}; /// chars in this buffer(0)
- int16_t progress{0}; /// percent complete increasing (0-100)
- /** Progress monitor covers word recognition and it does not cover layout
- * analysis.
- * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */
- int8_t more_to_come{0}; /// true if not last
- volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0
- int8_t err_code{0}; /// for errcode use
- CANCEL_FUNC cancel{nullptr}; /// returns true to cancel
- PROGRESS_FUNC progress_callback{
- nullptr}; /// called whenever progress increases
- PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback
- void *cancel_this{nullptr}; /// this or other data for cancel
- std::chrono::steady_clock::time_point end_time;
- /// Time to stop. Expected to be set only
- /// by call to set_deadline_msecs().
- EANYCODE_CHAR text[1]{}; /// character data
-
- ETEXT_DESC() : progress_callback2(&default_progress_func) {
- end_time = std::chrono::time_point();
- }
-
- // Sets the end time to be deadline_msecs milliseconds from now.
- void set_deadline_msecs(int32_t deadline_msecs) {
- if (deadline_msecs > 0) {
- end_time = std::chrono::steady_clock::now() +
- std::chrono::milliseconds(deadline_msecs);
- }
- }
-
- // Returns false if we've not passed the end_time, or have not set a deadline.
- bool deadline_exceeded() const {
- if (end_time.time_since_epoch() ==
- std::chrono::steady_clock::duration::zero()) {
- return false;
- }
- auto now = std::chrono::steady_clock::now();
- return (now > end_time);
- }
-
-private:
- static bool default_progress_func(ETEXT_DESC *ths, int left, int right,
- int top, int bottom) {
- if (ths->progress_callback != nullptr) {
- return (*(ths->progress_callback))(ths->progress, left, right, top,
- bottom);
- }
- return true;
- }
-};
-
-} // namespace tesseract
-
-#endif // CCUTIL_OCRCLASS_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/osdetect.h
deleted file mode 100644
index 34bfb557..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/osdetect.h
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: osdetect.h
-// Description: Orientation and script detection.
-// Author: Samuel Charron
-// Ranjith Unnikrishnan
-//
-// (C) Copyright 2008, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_OSDETECT_H_
-#define TESSERACT_CCMAIN_OSDETECT_H_
-
-#include "export.h" // for TESS_API
-
-#include // for std::vector
-
-namespace tesseract {
-
-class BLOBNBOX;
-class BLOBNBOX_CLIST;
-class BLOB_CHOICE_LIST;
-class TO_BLOCK_LIST;
-class UNICHARSET;
-
-class Tesseract;
-
-// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur
-const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;
-
-struct OSBestResult {
- OSBestResult()
- : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}
- int orientation_id;
- int script_id;
- float sconfidence;
- float oconfidence;
-};
-
-struct OSResults {
- OSResults() : unicharset(nullptr) {
- for (int i = 0; i < 4; ++i) {
- for (int j = 0; j < kMaxNumberOfScripts; ++j) {
- scripts_na[i][j] = 0;
- }
- orientations[i] = 0;
- }
- }
- void update_best_orientation();
- // Set the estimate of the orientation to the given id.
- void set_best_orientation(int orientation_id);
- // Update/Compute the best estimate of the script assuming the given
- // orientation id.
- void update_best_script(int orientation_id);
- // Return the index of the script with the highest score for this orientation.
- TESS_API int get_best_script(int orientation_id) const;
- // Accumulate scores with given OSResults instance and update the best script.
- void accumulate(const OSResults &osr);
-
- // Print statistics.
- void print_scores(void) const;
- void print_scores(int orientation_id) const;
-
- // Array holding scores for each orientation id [0,3].
- // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
- // page respectively, where the values refer to the amount of clockwise
- // rotation to be applied to the page for the text to be upright and readable.
- float orientations[4];
- // Script confidence scores for each of 4 possible orientations.
- float scripts_na[4][kMaxNumberOfScripts];
-
- UNICHARSET *unicharset;
- OSBestResult best_result;
-};
-
-class OrientationDetector {
-public:
- OrientationDetector(const std::vector *allowed_scripts,
- OSResults *results);
- bool detect_blob(BLOB_CHOICE_LIST *scores);
- int get_orientation();
-
-private:
- OSResults *osr_;
- const std::vector *allowed_scripts_;
-};
-
-class ScriptDetector {
-public:
- ScriptDetector(const std::vector *allowed_scripts, OSResults *osr,
- tesseract::Tesseract *tess);
- void detect_blob(BLOB_CHOICE_LIST *scores);
- bool must_stop(int orientation) const;
-
-private:
- OSResults *osr_;
- static const char *korean_script_;
- static const char *japanese_script_;
- static const char *fraktur_script_;
- int korean_id_;
- int japanese_id_;
- int katakana_id_;
- int hiragana_id_;
- int han_id_;
- int hangul_id_;
- int latin_id_;
- int fraktur_id_;
- tesseract::Tesseract *tess_;
- const std::vector *allowed_scripts_;
-};
-
-int orientation_and_script_detection(const char *filename, OSResults *,
- tesseract::Tesseract *);
-
-int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,
- tesseract::Tesseract *tess);
-
-int os_detect_blobs(const std::vector *allowed_scripts,
- BLOBNBOX_CLIST *blob_list, OSResults *osr,
- tesseract::Tesseract *tess);
-
-bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,
- OSResults *, tesseract::Tesseract *tess);
-
-// Helper method to convert an orientation index to its value in degrees.
-// The value represents the amount of clockwise rotation in degrees that must be
-// applied for the text to be upright (readable).
-TESS_API int OrientationIdToValue(const int &id);
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCMAIN_OSDETECT_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/pageiterator.h
deleted file mode 100644
index 68739715..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/pageiterator.h
+++ /dev/null
@@ -1,364 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: pageiterator.h
-// Description: Iterator for tesseract page structure that avoids using
-// tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
-#define TESSERACT_CCMAIN_PAGEITERATOR_H_
-
-#include "export.h"
-#include "publictypes.h"
-
-struct Pix;
-struct Pta;
-
-namespace tesseract {
-
-struct BlamerBundle;
-class C_BLOB_IT;
-class PAGE_RES;
-class PAGE_RES_IT;
-class WERD;
-
-class Tesseract;
-
-/**
- * Class to iterate over tesseract page structure, providing access to all
- * levels of the page hierarchy, without including any tesseract headers or
- * having to handle any tesseract structures.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- * See tesseract/publictypes.h for the definition of PageIteratorLevel.
- * See also ResultIterator, derived from PageIterator, which adds in the
- * ability to access OCR output with text-specific methods.
- */
-
-class TESS_API PageIterator {
-public:
- /**
- * page_res and tesseract come directly from the BaseAPI.
- * The rectangle parameters are copied indirectly from the Thresholder,
- * via the BaseAPI. They represent the coordinates of some rectangle in an
- * original image (in top-left-origin coordinates) and therefore the top-left
- * needs to be added to any output boxes in order to specify coordinates
- * in the original image. See TessBaseAPI::SetRectangle.
- * The scale and scaled_yres are in case the Thresholder scaled the image
- * rectangle prior to thresholding. Any coordinates in tesseract's image
- * must be divided by scale before adding (rect_left, rect_top).
- * The scaled_yres indicates the effective resolution of the binary image
- * that tesseract has been given by the Thresholder.
- * After the constructor, Begin has already been called.
- */
- PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top, int rect_width,
- int rect_height);
- virtual ~PageIterator();
-
- /**
- * Page/ResultIterators may be copied! This makes it possible to iterate over
- * all the objects at a lower level, while maintaining an iterator to
- * objects at a higher level. These constructors DO NOT CALL Begin, so
- * iterations will continue from the location of src.
- */
- PageIterator(const PageIterator &src);
- const PageIterator &operator=(const PageIterator &src);
-
- /** Are we positioned at the same location as other? */
- bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
-
- // ============= Moving around within the page ============.
-
- /**
- * Moves the iterator to point to the start of the page to begin an
- * iteration.
- */
- virtual void Begin();
-
- /**
- * Moves the iterator to the beginning of the paragraph.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word on the first row of the paragraph.
- */
- virtual void RestartParagraph();
-
- /**
- * Return whether this iterator points anywhere in the first textline of a
- * paragraph.
- */
- bool IsWithinFirstTextlineOfParagraph() const;
-
- /**
- * Moves the iterator to the beginning of the text line.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word of the row.
- */
- virtual void RestartRow();
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy, and returns false if the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- virtual bool Next(PageIteratorLevel level);
-
- /**
- * Returns true if the iterator is at the start of an object at the given
- * level.
- *
- * For instance, suppose an iterator it is pointed to the first symbol of the
- * first word of the third line of the second paragraph of the first block in
- * a page, then:
- * it.IsAtBeginningOf(RIL_BLOCK) = false
- * it.IsAtBeginningOf(RIL_PARA) = false
- * it.IsAtBeginningOf(RIL_TEXTLINE) = true
- * it.IsAtBeginningOf(RIL_WORD) = true
- * it.IsAtBeginningOf(RIL_SYMBOL) = true
- */
- virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
-
- /**
- * Returns whether the iterator is positioned at the last element in a
- * given level. (e.g. the last word in a line, the last line in a block)
- *
- * Here's some two-paragraph example
- * text. It starts off innocuously
- * enough but quickly turns bizarre.
- * The author inserts a cornucopia
- * of words to guard against confused
- * references.
- *
- * Now take an iterator it pointed to the start of "bizarre."
- * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
- * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
- * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
- */
- virtual bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const;
-
- /**
- * Returns whether this iterator is positioned
- * before other: -1
- * equal to other: 0
- * after other: 1
- */
- int Cmp(const PageIterator &other) const;
-
- // ============= Accessing data ==============.
- // Coordinate system:
- // Integer coordinates are at the cracks between the pixels.
- // The top-left corner of the top-left pixel in the image is at (0,0).
- // The bottom-right corner of the bottom-right pixel in the image is at
- // (width, height).
- // Every bounding box goes from the top-left of the top-left contained
- // pixel to the bottom-right of the bottom-right contained pixel, so
- // the bounding box of the single top-left pixel in the image is:
- // (0,0)->(1,1).
- // If an image rectangle has been set in the API, then returned coordinates
- // relate to the original (full) image, rather than the rectangle.
-
- /**
- * Controls what to include in a bounding box. Bounding boxes of all levels
- * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
- * Between layout analysis and recognition, it isn't known where all
- * diacritics belong, so this control is used to include or exclude some
- * diacritics that are above or below the main body of the word. In most cases
- * where the placement is obvious, and after recognition, it doesn't make as
- * much difference, as the diacritics will already be included in the word.
- */
- void SetBoundingBoxComponents(bool include_upper_dots,
- bool include_lower_dots) {
- include_upper_dots_ = include_upper_dots;
- include_lower_dots_ = include_lower_dots;
- }
-
- /**
- * Returns the bounding rectangle of the current object at the given level.
- * See comment on coordinate system above.
- * Returns false if there is no such object at the current position.
- * The returned bounding box is guaranteed to match the size and position
- * of the image returned by GetBinaryImage, but may clip foreground pixels
- * from a grey image. The padding argument to GetImage can be used to expand
- * the image to include more foreground pixels. See GetImage below.
- */
- bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
- int *bottom) const;
- bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
- int *right, int *bottom) const;
- /**
- * Returns the bounding rectangle of the object in a coordinate system of the
- * working image rectangle having its origin at (rect_left_, rect_top_) with
- * respect to the original image and is scaled by a factor scale_.
- */
- bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
- int *right, int *bottom) const;
-
- /** Returns whether there is no object of a given level. */
- bool Empty(PageIteratorLevel level) const;
-
- /**
- * Returns the type of the current block.
- * See tesseract/publictypes.h for PolyBlockType.
- */
- PolyBlockType BlockType() const;
-
- /**
- * Returns the polygon outline of the current block. The returned Pta must
- * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
- * of the polygon, and the last edge is the line segment between the last
- * point and the first point. nullptr will be returned if the iterator is
- * at the end of the document or layout analysis was not used.
- */
- Pta *BlockPolygon() const;
-
- /**
- * Returns a binary image of the current object at the given level.
- * The position and size match the return from BoundingBoxInternal, and so
- * this could be upscaled with respect to the original input image.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetBinaryImage(PageIteratorLevel level) const;
-
- /**
- * Returns an image of the current object at the given level in greyscale
- * if available in the input. To guarantee a binary image use BinaryImage.
- * NOTE that in order to give the best possible image, the bounds are
- * expanded slightly over the binary connected component, by the supplied
- * padding, so the top-left position of the returned image is returned
- * in (left,top). These will most likely not match the coordinates
- * returned by BoundingBox.
- * If you do not supply an original image, you will get a binary one.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
- int *left, int *top) const;
-
- /**
- * Returns the baseline of the current object at the given level.
- * The baseline is the line that passes through (x1, y1) and (x2, y2).
- * WARNING: with vertical text, baselines may be vertical!
- * Returns false if there is no baseline at the current position.
- */
- bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
- int *y2) const;
-
- // Returns the attributes of the current row.
- void RowAttributes(float *row_height, float *descenders,
- float *ascenders) const;
-
- /**
- * Returns orientation for the block the iterator points to.
- * orientation, writing_direction, textline_order: see publictypes.h
- * deskew_angle: after rotating the block so the text orientation is
- * upright, how many radians does one have to rotate the
- * block anti-clockwise for it to be level?
- * -Pi/4 <= deskew_angle <= Pi/4
- */
- void Orientation(tesseract::Orientation *orientation,
- tesseract::WritingDirection *writing_direction,
- tesseract::TextlineOrder *textline_order,
- float *deskew_angle) const;
-
- /**
- * Returns information about the current paragraph, if available.
- *
- * justification -
- * LEFT if ragged right, or fully justified and script is left-to-right.
- * RIGHT if ragged left, or fully justified and script is right-to-left.
- * unknown if it looks like source code or we have very few lines.
- * is_list_item -
- * true if we believe this is a member of an ordered or unordered list.
- * is_crown -
- * true if the first line of the paragraph is aligned with the other
- * lines of the paragraph even though subsequent paragraphs have first
- * line indents. This typically indicates that this is the continuation
- * of a previous paragraph or that it is the very first paragraph in
- * the chapter.
- * first_line_indent -
- * For LEFT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the left edge of the
- * rest of the paragraph.
- * for RIGHT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the right edge of the
- * rest of the paragraph.
- * NOTE 1: This value may be negative.
- * NOTE 2: if *is_crown == true, the first line of this paragraph is
- * actually flush, and first_line_indent is set to the "common"
- * first_line_indent for subsequent paragraphs in this block
- * of text.
- */
- void ParagraphInfo(tesseract::ParagraphJustification *justification,
- bool *is_list_item, bool *is_crown,
- int *first_line_indent) const;
-
- // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
- // of the current word to the given pointer (takes ownership of the pointer)
- // and returns true.
- // Can only be used when iterating on the word level.
- bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
-
-protected:
- /**
- * Sets up the internal data for iterating the blobs of a new word, then
- * moves the iterator to the given offset.
- */
- void BeginWord(int offset);
-
- /** Pointer to the page_res owned by the API. */
- PAGE_RES *page_res_;
- /** Pointer to the Tesseract object owned by the API. */
- Tesseract *tesseract_;
- /**
- * The iterator to the page_res_. Owned by this ResultIterator.
- * A pointer just to avoid dragging in Tesseract includes.
- */
- PAGE_RES_IT *it_;
- /**
- * The current input WERD being iterated. If there is an output from OCR,
- * then word_ is nullptr. Owned by the API
- */
- WERD *word_;
- /** The length of the current word_. */
- int word_length_;
- /** The current blob index within the word. */
- int blob_index_;
- /**
- * Iterator to the blobs within the word. If nullptr, then we are iterating
- * OCR results in the box_word.
- * Owned by this ResultIterator.
- */
- C_BLOB_IT *cblob_it_;
- /** Control over what to include in bounding boxes. */
- bool include_upper_dots_;
- bool include_lower_dots_;
- /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
- int scale_;
- int scaled_yres_;
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/publictypes.h
deleted file mode 100644
index 0069cf28..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/publictypes.h
+++ /dev/null
@@ -1,281 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: publictypes.h
-// Description: Types used in both the API and internally
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-
-namespace tesseract {
-
-// This file contains types that are used both by the API and internally
-// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
-// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
-// Restated: It is OK for low-level Tesseract files to include publictypes.h,
-// but not for the low-level tesseract code to include top-level API code.
-// This file should not use other Tesseract types, as that would drag
-// their includes into the API-level.
-
-/** Number of printers' points in an inch. The unit of the pointsize return. */
-constexpr int kPointsPerInch = 72;
-/**
- * Minimum believable resolution. Used as a default if there is no other
- * information, as it is safer to under-estimate than over-estimate.
- */
-constexpr int kMinCredibleResolution = 70;
-/** Maximum believable resolution. */
-constexpr int kMaxCredibleResolution = 2400;
-/**
- * Ratio between median blob size and likely resolution. Used to estimate
- * resolution when none is provided. This is basically 1/usual text size in
- * inches. */
-constexpr int kResolutionEstimationFactor = 10;
-
-/**
- * Possible types for a POLY_BLOCK or ColPartition.
- * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions
- * below, as well as kPolyBlockNames in layout_test.cc.
- * Used extensively by ColPartition, and POLY_BLOCK.
- */
-enum PolyBlockType {
- PT_UNKNOWN, // Type is not yet known. Keep as the first element.
- PT_FLOWING_TEXT, // Text that lives inside a column.
- PT_HEADING_TEXT, // Text that spans more than one column.
- PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region.
- PT_EQUATION, // Partition belonging to an equation region.
- PT_INLINE_EQUATION, // Partition has inline equation.
- PT_TABLE, // Partition belonging to a table region.
- PT_VERTICAL_TEXT, // Text-line runs vertically.
- PT_CAPTION_TEXT, // Text that belongs to an image.
- PT_FLOWING_IMAGE, // Image that lives inside a column.
- PT_HEADING_IMAGE, // Image that spans more than one column.
- PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region.
- PT_HORZ_LINE, // Horizontal Line.
- PT_VERT_LINE, // Vertical Line.
- PT_NOISE, // Lies outside of any column.
- PT_COUNT
-};
-
-/** Returns true if PolyBlockType is of horizontal line type */
-inline bool PTIsLineType(PolyBlockType type) {
- return type == PT_HORZ_LINE || type == PT_VERT_LINE;
-}
-/** Returns true if PolyBlockType is of image type */
-inline bool PTIsImageType(PolyBlockType type) {
- return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE ||
- type == PT_PULLOUT_IMAGE;
-}
-/** Returns true if PolyBlockType is of text type */
-inline bool PTIsTextType(PolyBlockType type) {
- return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||
- type == PT_PULLOUT_TEXT || type == PT_TABLE ||
- type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
- type == PT_INLINE_EQUATION;
-}
-// Returns true if PolyBlockType is of pullout(inter-column) type
-inline bool PTIsPulloutType(PolyBlockType type) {
- return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT;
-}
-
-/**
- * +------------------+ Orientation Example:
- * | 1 Aaaa Aaaa Aaaa | ====================
- * | Aaa aa aaa aa | To left is a diagram of some (1) English and
- * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit.
- * | 2 |
- * | ####### c c C | Upright Latin characters are represented as A and a.
- * | ####### c c c | '<' represents a latin character rotated
- * | < ####### c c c | anti-clockwise 90 degrees.
- * | < ####### c c |
- * | < ####### . c | Upright Chinese characters are represented C and c.
- * | 3 ####### c |
- * +------------------+ NOTA BENE: enum values here should match goodoc.proto
-
- * If you orient your head so that "up" aligns with Orientation,
- * then the characters will appear "right side up" and readable.
- *
- * In the example above, both the English and Chinese paragraphs are oriented
- * so their "up" is the top of the page (page up). The photo credit is read
- * with one's head turned leftward ("up" is to page left).
- *
- * The values of this enum match the convention of Tesseract's osdetect.h
-*/
-enum Orientation {
- ORIENTATION_PAGE_UP = 0,
- ORIENTATION_PAGE_RIGHT = 1,
- ORIENTATION_PAGE_DOWN = 2,
- ORIENTATION_PAGE_LEFT = 3,
-};
-
-/**
- * The grapheme clusters within a line of text are laid out logically
- * in this direction, judged when looking at the text line rotated so that
- * its Orientation is "page up".
- *
- * For English text, the writing direction is left-to-right. For the
- * Chinese text in the above example, the writing direction is top-to-bottom.
- */
-enum WritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
- WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
- WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * The text lines are read in the given sequence.
- *
- * In English, the order is top-to-bottom.
- * In Chinese, vertical text lines are read right-to-left. Mongolian is
- * written in vertical columns top to bottom like Chinese, but the lines
- * order left-to right.
- *
- * Note that only some combinations make sense. For example,
- * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
- */
-enum TextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
- TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
- TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * Possible modes for page layout analysis. These *must* be kept in order
- * of decreasing amount of layout analysis to be done, except for OSD_ONLY,
- * so that the inequality test macros below work.
- */
-enum PageSegMode {
- PSM_OSD_ONLY = 0, ///< Orientation and script detection only.
- PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and
- ///< script detection. (OSD)
- PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR.
- PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD.
- PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes.
- PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of
- ///< vertically aligned text.
- PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.)
- PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line.
- PSM_SINGLE_WORD = 8, ///< Treat the image as a single word.
- PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle.
- PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character.
- PSM_SPARSE_TEXT =
- 11, ///< Find as much text as possible in no particular order.
- PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det.
- PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing
- ///< hacks that are Tesseract-specific.
-
- PSM_COUNT ///< Number of enum entries.
-};
-
-/**
- * Inline functions that act on a PageSegMode to determine whether components of
- * layout analysis are enabled.
- * *Depend critically on the order of elements of PageSegMode.*
- * NOTE that arg is an int for compatibility with INT_PARAM.
- */
-inline bool PSM_OSD_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
-}
-inline bool PSM_SPARSE(int pageseg_mode) {
- return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
-}
-inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
-}
-inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
- return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
- pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-
-/**
- * enum of the elements of the page hierarchy, used in ResultIterator
- * to provide functions that operate on each level without having to
- * have 5x as many functions.
- */
-enum PageIteratorLevel {
- RIL_BLOCK, // Block of text/image/separator line.
- RIL_PARA, // Paragraph within a block.
- RIL_TEXTLINE, // Line within a paragraph.
- RIL_WORD, // Word within a textline.
- RIL_SYMBOL // Symbol/character within a word.
-};
-
-/**
- * JUSTIFICATION_UNKNOWN
- * The alignment is not clearly one of the other options. This could happen
- * for example if there are only one or two lines of text or the text looks
- * like source code or poetry.
- *
- * NOTA BENE: Fully justified paragraphs (text aligned to both left and right
- * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
- * is written with a left-to-right script and with JUSTIFICATION_RIGHT if
- * their text is written in a right-to-left script.
- *
- * Interpretation for text read in vertical lines:
- * "Left" is wherever the starting reading position is.
- *
- * JUSTIFICATION_LEFT
- * Each line, except possibly the first, is flush to the same left tab stop.
- *
- * JUSTIFICATION_CENTER
- * The text lines of the paragraph are centered about a line going
- * down through their middle of the text lines.
- *
- * JUSTIFICATION_RIGHT
- * Each line, except possibly the first, is flush to the same right tab stop.
- */
-enum ParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT,
-};
-
-/**
- * When Tesseract/Cube is initialized we can choose to instantiate/load/run
- * only the Tesseract part, only the Cube part or both along with the combiner.
- * The preference of which engine to use is stored in tessedit_ocr_engine_mode.
- *
- * ATTENTION: When modifying this enum, please make sure to make the
- * appropriate changes to all the enums mirroring it (e.g. OCREngine in
- * cityblock/workflow/detection/detection_storage.proto). Such enums will
- * mention the connection to OcrEngineMode in the comments.
- */
-enum OcrEngineMode {
- OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated
- OEM_LSTM_ONLY, // Run just the LSTM line recognizer.
- OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback
- // to Tesseract when things get difficult.
- // deprecated
- OEM_DEFAULT, // Specify this mode when calling init_*(),
- // to indicate that any of the above modes
- // should be automatically inferred from the
- // variables in the language-specific config,
- // command-line configs, or if not specified
- // in any of the above should be set to the
- // default OEM_TESSERACT_ONLY.
- OEM_COUNT // Number of OEMs
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/renderer.h
deleted file mode 100644
index 6f405233..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/renderer.h
+++ /dev/null
@@ -1,311 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: renderer.h
-// Description: Rendering interface to inject into TessBaseAPI
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_RENDERER_H_
-#define TESSERACT_API_RENDERER_H_
-
-#include "export.h"
-
-// To avoid collision with other typenames include the ABSOLUTE MINIMUM
-// complexity of includes here. Use forward declarations wherever possible
-// and hide includes of complex types in baseapi.cpp.
-#include
-#include // for std::string
-#include // for std::vector
-
-struct Pix;
-
-namespace tesseract {
-
-class TessBaseAPI;
-
-/**
- * Interface for rendering tesseract results into a document, such as text,
- * HOCR or pdf. This class is abstract. Specific classes handle individual
- * formats. This interface is then used to inject the renderer class into
- * tesseract when processing images.
- *
- * For simplicity implementing this with tesseract version 3.01,
- * the renderer contains document state that is cleared from document
- * to document just as the TessBaseAPI is. This way the base API can just
- * delegate its rendering functionality to injected renderers, and the
- * renderers can manage the associated state needed for the specific formats
- * in addition to the heuristics for producing it.
- */
-class TESS_API TessResultRenderer {
-public:
- virtual ~TessResultRenderer();
-
- // Takes ownership of pointer so must be new'd instance.
- // Renderers aren't ordered, but appends the sequences of next parameter
- // and existing next(). The renderers should be unique across both lists.
- void insert(TessResultRenderer *next);
-
- // Returns the next renderer or nullptr.
- TessResultRenderer *next() {
- return next_;
- }
-
- /**
- * Starts a new document with the given title.
- * This clears the contents of the output data.
- * Title should use UTF-8 encoding.
- */
- bool BeginDocument(const char *title);
-
- /**
- * Adds the recognized text from the source image to the current document.
- * Invalid if BeginDocument not yet called.
- *
- * Note that this API is a bit weird but is designed to fit into the
- * current TessBaseAPI implementation where the api has lots of state
- * information that we might want to add in.
- */
- bool AddImage(TessBaseAPI *api);
-
- /**
- * Finishes the document and finalizes the output data
- * Invalid if BeginDocument not yet called.
- */
- bool EndDocument();
-
- const char *file_extension() const {
- return file_extension_;
- }
- const char *title() const {
- return title_.c_str();
- }
-
- // Is everything fine? Otherwise something went wrong.
- bool happy() const {
- return happy_;
- }
-
- /**
- * Returns the index of the last image given to AddImage
- * (i.e. images are incremented whether the image succeeded or not)
- *
- * This is always defined. It means either the number of the
- * current image, the last image ended, or in the completed document
- * depending on when in the document lifecycle you are looking at it.
- * Will return -1 if a document was never started.
- */
- int imagenum() const {
- return imagenum_;
- }
-
-protected:
- /**
- * Called by concrete classes.
- *
- * outputbase is the name of the output file excluding
- * extension. For example, "/path/to/chocolate-chip-cookie-recipe"
- *
- * extension indicates the file extension to be used for output
- * files. For example "pdf" will produce a .pdf file, and "hocr"
- * will produce .hocr files.
- */
- TessResultRenderer(const char *outputbase, const char *extension);
-
- // Hook for specialized handling in BeginDocument()
- virtual bool BeginDocumentHandler();
-
- // This must be overridden to render the OCR'd results
- virtual bool AddImageHandler(TessBaseAPI *api) = 0;
-
- // Hook for specialized handling in EndDocument()
- virtual bool EndDocumentHandler();
-
- // Renderers can call this to append '\0' terminated strings into
- // the output string returned by GetOutput.
- // This method will grow the output buffer if needed.
- void AppendString(const char *s);
-
- // Renderers can call this to append binary byte sequences into
- // the output string returned by GetOutput. Note that s is not necessarily
- // '\0' terminated (and can contain '\0' within it).
- // This method will grow the output buffer if needed.
- void AppendData(const char *s, int len);
-
-private:
- TessResultRenderer *next_; // Can link multiple renderers together
- FILE *fout_; // output file pointer
- const char *file_extension_; // standard extension for generated output
- std::string title_; // title of document being rendered
- int imagenum_; // index of last image added
- bool happy_; // I get grumpy when the disk fills up, etc.
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessTextRenderer : public TessResultRenderer {
-public:
- explicit TessTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into an hocr text string
- */
-class TESS_API TessHOcrRenderer : public TessResultRenderer {
-public:
- explicit TessHOcrRenderer(const char *outputbase, bool font_info);
- explicit TessHOcrRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into an alto text string
- */
-class TESS_API TessAltoRenderer : public TessResultRenderer {
-public:
- explicit TessAltoRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool begin_document;
-};
-
-/**
- * Renders Tesseract output into a TSV string
- */
-class TESS_API TessTsvRenderer : public TessResultRenderer {
-public:
- explicit TessTsvRenderer(const char *outputbase, bool font_info);
- explicit TessTsvRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into searchable PDF
- */
-class TESS_API TessPDFRenderer : public TessResultRenderer {
-public:
- // datadir is the location of the TESSDATA. We need it because
- // we load a custom PDF font from this location.
- TessPDFRenderer(const char *outputbase, const char *datadir,
- bool textonly = false);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- // We don't want to have every image in memory at once,
- // so we store some metadata as we go along producing
- // PDFs one page at a time. At the end, that metadata is
- // used to make everything that isn't easily handled in a
- // streaming fashion.
- long int obj_; // counter for PDF objects
- std::vector offsets_; // offset of every PDF object in bytes
- std::vector pages_; // object number for every /Page object
- std::string datadir_; // where to find the custom font
- bool textonly_; // skip images if set
- // Bookkeeping only. DIY = Do It Yourself.
- void AppendPDFObjectDIY(size_t objectsize);
- // Bookkeeping + emit data.
- void AppendPDFObject(const char *data);
- // Create the /Contents object for an entire page.
- char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);
- // Turn an image into a PDF object. Only transcode if we have to.
- static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,
- char **pdf_object, long int *pdf_object_size,
- int jpg_quality);
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessUnlvRenderer : public TessResultRenderer {
-public:
- explicit TessUnlvRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string for LSTMBox
- */
-class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
-public:
- explicit TessLSTMBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessBoxTextRenderer : public TessResultRenderer {
-public:
- explicit TessBoxTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string in WordStr format
- */
-class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
-public:
- explicit TessWordStrBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-/**
- * Renders tesseract output into an osd text string
- */
-class TESS_API TessOsdRenderer : public TessResultRenderer {
-public:
- explicit TessOsdRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#endif // ndef DISABLED_LEGACY_ENGINE
-
-} // namespace tesseract.
-
-#endif // TESSERACT_API_RENDERER_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/resultiterator.h
deleted file mode 100644
index 3e4d5807..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/resultiterator.h
+++ /dev/null
@@ -1,250 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: resultiterator.h
-// Description: Iterator for tesseract results that is capable of
-// iterating in proper reading order over Bi Directional
-// (e.g. mixed Hebrew and English) text.
-// Author: David Eger
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API, TESS_LOCAL
-#include "ltrresultiterator.h" // for LTRResultIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-#include // for std::pair
-#include // for std::vector
-
-namespace tesseract {
-
-class TESS_API ResultIterator : public LTRResultIterator {
-public:
- static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
-
- /**
- * ResultIterator is copy constructible!
- * The default copy constructor works just fine for us.
- */
- ~ResultIterator() override = default;
-
- // ============= Moving around within the page ============.
- /**
- * Moves the iterator to point to the start of the page to begin
- * an iteration.
- */
- void Begin() override;
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy in the appropriate reading order and returns false if
- * the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- bool Next(PageIteratorLevel level) override;
-
- /**
- * IsAtBeginningOf() returns whether we're at the logical beginning of the
- * given level. (as opposed to ResultIterator's left-to-right top-to-bottom
- * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
- * For a full description, see pageiterator.h
- */
- bool IsAtBeginningOf(PageIteratorLevel level) const override;
-
- /**
- * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
- * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
- * point at the last word in a paragraph. See PageIterator for full comment.
- */
- bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const override;
-
- // ============= Functions that refer to words only ============.
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // ============= Accessing data ==============.
-
- /**
- * Returns the null terminated UTF-8 encoded text string for the current
- * object at the given level. Use delete [] to free after use.
- */
- virtual char *GetUTF8Text(PageIteratorLevel level) const;
-
- /**
- * Returns the LSTM choices for every LSTM timestep for the current word.
- */
- virtual std::vector>>>
- *GetRawLSTMTimesteps() const;
- virtual std::vector>>
- *GetBestLSTMSymbolChoices() const;
-
- /**
- * Return whether the current paragraph's dominant reading direction
- * is left-to-right (as opposed to right-to-left).
- */
- bool ParagraphIsLtr() const;
-
- // ============= Exposed only for testing =============.
-
- /**
- * Yields the reading order as a sequence of indices and (optional)
- * meta-marks for a set of words (given left-to-right).
- * The meta marks are passed as negative values:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The next indexed word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- *
- * For example, suppose we have five words in a text line,
- * indexed [0,1,2,3,4] from the leftmost side of the text line.
- * The following are all believable reading_orders:
- *
- * Left-to-Right (in ltr paragraph):
- * { 0, 1, 2, 3, 4 }
- * Left-to-Right (in rtl paragraph):
- * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- * Right-to-Left (in rtl paragraph):
- * { 4, 3, 2, 1, 0 }
- * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
- * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
- */
- static void CalculateTextlineOrder(
- bool paragraph_is_ltr,
- const std::vector &word_dirs,
- std::vector *reading_order);
-
- static const int kMinorRunStart;
- static const int kMinorRunEnd;
- static const int kComplexWord;
-
-protected:
- /**
- * We presume the data associated with the given iterator will outlive us.
- * NB: This is private because it does something that is non-obvious:
- * it resets to the beginning of the paragraph instead of staying wherever
- * resit might have pointed.
- */
- explicit ResultIterator(const LTRResultIterator &resit);
-
-private:
- /**
- * Calculates the current paragraph's dominant writing direction.
- * Typically, members should use current_paragraph_ltr_ instead.
- */
- bool CurrentParagraphIsLtr() const;
-
- /**
- * Returns word indices as measured from resit->RestartRow() = index 0
- * for the reading order of words within a textline given an iterator
- * into the middle of the text line.
- * In addition to non-negative word indices, the following negative values
- * may be inserted:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The previous word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- std::vector *indices) const;
- /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- std::vector *ssd,
- std::vector *indices) const;
-
- /**
- * What is the index of the current word in a strict left-to-right reading
- * of the row?
- */
- int LTRWordIndex() const;
-
- /**
- * Given an iterator pointing at a word, returns the logical reading order
- * of blob indices for the word.
- */
- void CalculateBlobOrder(std::vector *blob_indices) const;
-
- /** Precondition: current_paragraph_is_ltr_ is set. */
- void MoveToLogicalStartOfTextline();
-
- /**
- * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
- * are set.
- */
- void MoveToLogicalStartOfWord();
-
- /** Are we pointing at the final (reading order) symbol of the word? */
- bool IsAtFinalSymbolOfWord() const;
-
- /** Are we pointing at the first (reading order) symbol of the word? */
- bool IsAtFirstSymbolOfWord() const;
-
- /**
- * Append any extra marks that should be appended to this word when printed.
- * Mostly, these are Unicode BiDi control characters.
- */
- void AppendSuffixMarks(std::string *text) const;
-
- /** Appends the current word in reading order to the given buffer.*/
- void AppendUTF8WordText(std::string *text) const;
-
- /**
- * Appends the text of the current text line, *assuming this iterator is
- * positioned at the beginning of the text line* This function
- * updates the iterator to point to the first position past the text line.
- * Each textline is terminated in a single newline character.
- * If the textline ends a paragraph, it gets a second terminal newline.
- */
- void IterateAndAppendUTF8TextlineText(std::string *text);
-
- /**
- * Appends the text of the current paragraph in reading order
- * to the given buffer.
- * Each textline is terminated in a single newline character, and the
- * paragraph gets an extra newline at the end.
- */
- void AppendUTF8ParagraphText(std::string *text) const;
-
- /** Returns whether the bidi_debug flag is set to at least min_level. */
- bool BidiDebug(int min_level) const;
-
- bool current_paragraph_is_ltr_;
-
- /**
- * Is the currently pointed-at character at the beginning of
- * a minor-direction run?
- */
- bool at_beginning_of_minor_run_;
-
- /** Is the currently pointed-at character in a minor-direction sequence? */
- bool in_minor_direction_;
-
- /**
- * Should detected inter-word spaces be preserved, or "compressed" to a single
- * space character (default behavior).
- */
- bool preserve_interword_spaces_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/unichar.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/unichar.h
deleted file mode 100644
index 015109d7..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/unichar.h
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: unichar.h
-// Description: Unicode character/ligature class.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCUTIL_UNICHAR_H_
-#define TESSERACT_CCUTIL_UNICHAR_H_
-
-#include "export.h"
-
-#include
-#include
-#include
-#include
-
-namespace tesseract {
-
-// Maximum number of characters that can be stored in a UNICHAR. Must be
-// at least 4. Must not exceed 31 without changing the coding of length.
-#define UNICHAR_LEN 30
-
-// A UNICHAR_ID is the unique id of a unichar.
-using UNICHAR_ID = int;
-
-// A variable to indicate an invalid or uninitialized unichar id.
-static const int INVALID_UNICHAR_ID = -1;
-// A special unichar that corresponds to INVALID_UNICHAR_ID.
-static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
-
-enum StrongScriptDirection {
- DIR_NEUTRAL = 0, // Text contains only neutral characters.
- DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
- DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
- DIR_MIX = 3, // Text contains a mixture of left-to-right
- // and right-to-left characters.
-};
-
-using char32 = signed int;
-
-// The UNICHAR class holds a single classification result. This may be
-// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
-// multiple Unicode characters representing the NFKC expansion of a ligature
-// such as fi, ffl etc. These are also stored as utf8.
-class TESS_API UNICHAR {
-public:
- UNICHAR() {
- memset(chars, 0, UNICHAR_LEN);
- }
-
- // Construct from a utf8 string. If len<0 then the string is null terminated.
- // If the string is too long to fit in the UNICHAR then it takes only what
- // will fit.
- UNICHAR(const char *utf8_str, int len);
-
- // Construct from a single UCS4 character.
- explicit UNICHAR(int unicode);
-
- // Default copy constructor and operator= are OK.
-
- // Get the first character as UCS-4.
- int first_uni() const;
-
- // Get the length of the UTF8 string.
- int utf8_len() const {
- int len = chars[UNICHAR_LEN - 1];
- return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
- }
-
- // Get a UTF8 string, but NOT nullptr terminated.
- const char *utf8() const {
- return chars;
- }
-
- // Get a terminated UTF8 string: Must delete[] it after use.
- char *utf8_str() const;
-
- // Get the number of bytes in the first character of the given utf8 string.
- static int utf8_step(const char *utf8_str);
-
- // A class to simplify iterating over and accessing elements of a UTF8
- // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
- // take ownership of the underlying byte array. It also does not permit
- // modification of the array (as the name suggests).
- //
- // Example:
- // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
- // it != UNICHAR::end(str, len);
- // ++it) {
- // printf("UCS-4 symbol code = %d\n", *it);
- // char buf[5];
- // int char_len = it.get_utf8(buf); buf[char_len] = '\0';
- // printf("Char = %s\n", buf);
- // }
- class TESS_API const_iterator {
- using CI = const_iterator;
-
- public:
- // Step to the next UTF8 character.
- // If the current position is at an illegal UTF8 character, then print an
- // error message and step by one byte. If the current position is at a
- // nullptr value, don't step past it.
- const_iterator &operator++();
-
- // Return the UCS-4 value at the current position.
- // If the current position is at an illegal UTF8 value, return a single
- // space character.
- int operator*() const;
-
- // Store the UTF-8 encoding of the current codepoint into buf, which must be
- // at least 4 bytes long. Return the number of bytes written.
- // If the current position is at an illegal UTF8 value, writes a single
- // space character and returns 1.
- // Note that this method does not null-terminate the buffer.
- int get_utf8(char *buf) const;
- // Returns the number of bytes of the current codepoint. Returns 1 if the
- // current position is at an illegal UTF8 value.
- int utf8_len() const;
- // Returns true if the UTF-8 encoding at the current position is legal.
- bool is_legal() const;
-
- // Return the pointer into the string at the current position.
- const char *utf8_data() const {
- return it_;
- }
-
- // Iterator equality operators.
- friend bool operator==(const CI &lhs, const CI &rhs) {
- return lhs.it_ == rhs.it_;
- }
- friend bool operator!=(const CI &lhs, const CI &rhs) {
- return !(lhs == rhs);
- }
-
- private:
- friend class UNICHAR;
- explicit const_iterator(const char *it) : it_(it) {}
-
- const char *it_; // Pointer into the string.
- };
-
- // Create a start/end iterator pointing to a string. Note that these methods
- // are static and do NOT create a copy or take ownership of the underlying
- // array.
- static const_iterator begin(const char *utf8_str, int byte_length);
- static const_iterator end(const char *utf8_str, int byte_length);
-
- // Converts a utf-8 string to a vector of unicodes.
- // Returns an empty vector if the input contains invalid UTF-8.
- static std::vector UTF8ToUTF32(const char *utf8_str);
- // Converts a vector of unicodes to a utf8 string.
- // Returns an empty string if the input contains an invalid unicode.
- static std::string UTF32ToUTF8(const std::vector &str32);
-
-private:
- // A UTF-8 representation of 1 or more Unicode characters.
- // The last element (chars[UNICHAR_LEN - 1]) is a length if
- // its value < UNICHAR_LEN, otherwise it is a genuine character.
- char chars[UNICHAR_LEN]{};
-};
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCUTIL_UNICHAR_H_
diff --git a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/version.h b/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/version.h
deleted file mode 100644
index 6bac5d66..00000000
--- a/third_party/ocr/tesseract-ocr/kylin/mips64/include/tesseract/version.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: version.h
-// Description: Version information
-//
-// (C) Copyright 2018, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_VERSION_H_
-#define TESSERACT_API_VERSION_H_
-
-// clang-format off
-
-#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@
-#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@
-#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@
-
-#define TESSERACT_VERSION \
- (TESSERACT_MAJOR_VERSION << 16 | \
- TESSERACT_MINOR_VERSION << 8 | \
- TESSERACT_MICRO_VERSION)
-
-#define TESSERACT_VERSION_STR "@PACKAGE_VERSION@"
-
-// clang-format on
-
-#endif // TESSERACT_API_VERSION_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/baseapi.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/baseapi.h
deleted file mode 100644
index 5e1e4830..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/baseapi.h
+++ /dev/null
@@ -1,812 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: baseapi.h
-// Description: Simple API for calling tesseract.
-// Author: Ray Smith
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_BASEAPI_H_
-#define TESSERACT_API_BASEAPI_H_
-
-#ifdef HAVE_CONFIG_H
-# include "config_auto.h" // DISABLED_LEGACY_ENGINE
-#endif
-
-#include "export.h"
-#include "pageiterator.h"
-#include "publictypes.h"
-#include "resultiterator.h"
-#include "unichar.h"
-
-#include "version.h"
-
-#include
-#include // for std::vector
-
-struct Pix;
-struct Pixa;
-struct Boxa;
-
-namespace tesseract {
-
-class PAGE_RES;
-class ParagraphModel;
-class BLOCK_LIST;
-class ETEXT_DESC;
-struct OSResults;
-class UNICHARSET;
-
-class Dawg;
-class Dict;
-class EquationDetect;
-class PageIterator;
-class ImageThresholder;
-class LTRResultIterator;
-class ResultIterator;
-class MutableIterator;
-class TessResultRenderer;
-class Tesseract;
-
-// Function to read a std::vector from a whole file.
-// Returns false on failure.
-using FileReader = bool (*)(const char *filename, std::vector *data);
-
-using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,
- bool) const;
-using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,
- int, const char *, int);
-
-/**
- * Base class for all tesseract APIs.
- * Specific classes can add ability to work on different inputs or produce
- * different outputs.
- * This class is mostly an interface layer on top of the Tesseract instance
- * class to hide the data types so that users of this class don't have to
- * include any other Tesseract headers.
- */
-class TESS_API TessBaseAPI {
-public:
- TessBaseAPI();
- virtual ~TessBaseAPI();
- // Copy constructor and assignment operator are currently unsupported.
- TessBaseAPI(TessBaseAPI const &) = delete;
- TessBaseAPI &operator=(TessBaseAPI const &) = delete;
-
- /**
- * Returns the version identifier as a static string. Do not delete.
- */
- static const char *Version();
-
- /**
- * If compiled with OpenCL AND an available OpenCL
- * device is deemed faster than serial code, then
- * "device" is populated with the cl_device_id
- * and returns sizeof(cl_device_id)
- * otherwise *device=nullptr and returns 0.
- */
- static size_t getOpenCLDevice(void **device);
-
- /**
- * Set the name of the input file. Needed for training and
- * reading a UNLV zone file, and for searchable PDF output.
- */
- void SetInputName(const char *name);
- /**
- * These functions are required for searchable PDF output.
- * We need our hands on the input file so that we can include
- * it in the PDF without transcoding. If that is not possible,
- * we need the original image. Finally, resolution metadata
- * is stored in the PDF so we need that as well.
- */
- const char *GetInputName();
- // Takes ownership of the input pix.
- void SetInputImage(Pix *pix);
- Pix *GetInputImage();
- int GetSourceYResolution();
- const char *GetDatapath();
-
- /** Set the name of the bonus output files. Needed only for debugging. */
- void SetOutputName(const char *name);
-
- /**
- * Set the value of an internal "parameter."
- * Supply the name of the parameter and the value as a string, just as
- * you would in a config file.
- * Returns false if the name lookup failed.
- * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
- * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
- * SetVariable may be used before Init, but settings will revert to
- * defaults on End().
- *
- * Note: Must be called after Init(). Only works for non-init variables
- * (init variables should be passed to Init()).
- */
- bool SetVariable(const char *name, const char *value);
- bool SetDebugVariable(const char *name, const char *value);
-
- /**
- * Returns true if the parameter was found among Tesseract parameters.
- * Fills in value with the value of the parameter.
- */
- bool GetIntVariable(const char *name, int *value) const;
- bool GetBoolVariable(const char *name, bool *value) const;
- bool GetDoubleVariable(const char *name, double *value) const;
-
- /**
- * Returns the pointer to the string that represents the value of the
- * parameter if it was found among Tesseract parameters.
- */
- const char *GetStringVariable(const char *name) const;
-
-#ifndef DISABLED_LEGACY_ENGINE
-
- /**
- * Print Tesseract fonts table to the given file.
- */
- void PrintFontsTable(FILE *fp) const;
-
-#endif
-
- /**
- * Print Tesseract parameters to the given file.
- */
- void PrintVariables(FILE *fp) const;
-
- /**
- * Get value of named variable as a string, if it exists.
- */
- bool GetVariableAsString(const char *name, std::string *val) const;
-
- /**
- * Instances are now mostly thread-safe and totally independent,
- * but some global parameters remain. Basically it is safe to use multiple
- * TessBaseAPIs in different threads in parallel, UNLESS:
- * you use SetVariable on some of the Params in classify and textord.
- * If you do, then the effect will be to change it for all your instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure.
- * NOTE that the only members that may be called before Init are those
- * listed above here in the class definition.
- *
- * The datapath must be the name of the tessdata directory.
- * The language is (usually) an ISO 639-3 string or nullptr will default to
- * eng. It is entirely safe (and eventually will be efficient too) to call
- * Init multiple times on the same instance to change language, or just
- * to reset the classifier.
- * The language may be a string of the form [~][+[~]]* indicating
- * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
- * English. Languages may specify internally that they want to be loaded
- * with one or more other languages, so the ~ sign is available to override
- * that. Eg if hin were set to load eng by default, then hin+~eng would force
- * loading only hin. The number of loaded languages is limited only by
- * memory, with the caveat that loading additional languages will impact
- * both speed and accuracy, as there is more work to do to decide on the
- * applicable language, and there is more chance of hallucinating incorrect
- * words.
- * WARNING: On changing languages, all Tesseract parameters are reset
- * back to their default values. (Which may vary between languages.)
- * If you have a rare need to set a Variable that controls
- * initialization for a second call to Init you should explicitly
- * call End() and then use SetVariable before Init. This is only a very
- * rare use case, since there are very few uses that require any parameters
- * to be set before Init.
- *
- * If set_only_non_debug_params is true, only params that do not contain
- * "debug" in the name will be set.
- */
- int Init(const char *datapath, const char *language, OcrEngineMode mode,
- char **configs, int configs_size,
- const std::vector *vars_vec,
- const std::vector *vars_values,
- bool set_only_non_debug_params);
- int Init(const char *datapath, const char *language, OcrEngineMode oem) {
- return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
- }
- int Init(const char *datapath, const char *language) {
- return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
- false);
- }
- // In-memory version reads the traineddata file directly from the given
- // data[data_size] array, and/or reads data via a FileReader.
- int Init(const char *data, int data_size, const char *language,
- OcrEngineMode mode, char **configs, int configs_size,
- const std::vector *vars_vec,
- const std::vector *vars_values,
- bool set_only_non_debug_params, FileReader reader);
-
- /**
- * Returns the languages string used in the last valid initialization.
- * If the last initialization specified "deu+hin" then that will be
- * returned. If hin loaded eng automatically as well, then that will
- * not be included in this list. To find the languages actually
- * loaded use GetLoadedLanguagesAsVector.
- * The returned string should NOT be deleted.
- */
- const char *GetInitLanguagesAsString() const;
-
- /**
- * Returns the loaded languages in the vector of std::string.
- * Includes all languages loaded by the last Init, including those loaded
- * as dependencies of other loaded languages.
- */
- void GetLoadedLanguagesAsVector(std::vector *langs) const;
-
- /**
- * Returns the available languages in the sorted vector of std::string.
- */
- void GetAvailableLanguagesAsVector(std::vector *langs) const;
-
- /**
- * Init only for page layout analysis. Use only for calls to SetImage and
- * AnalysePage. Calls that attempt recognition will generate an error.
- */
- void InitForAnalysePage();
-
- /**
- * Read a "config" file containing a set of param, value pairs.
- * Searches the standard places: tessdata/configs, tessdata/tessconfigs
- * and also accepts a relative or absolute path name.
- * Note: only non-init params will be set (init params are set by Init()).
- */
- void ReadConfigFile(const char *filename);
- /** Same as above, but only set debug params from the given config file. */
- void ReadDebugConfigFile(const char *filename);
-
- /**
- * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
- * The mode is stored as an IntParam so it can also be modified by
- * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
- */
- void SetPageSegMode(PageSegMode mode);
-
- /** Return the current page segmentation mode. */
- PageSegMode GetPageSegMode() const;
-
- /**
- * Recognize a rectangle from an image and return the result as a string.
- * May be called many times for a single Init.
- * Currently has no error checking.
- * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
- * Palette color images will not work properly and must be converted to
- * 24 bit.
- * Binary images of 1 bit per pixel may also be given but they must be
- * byte packed with the MSB of the first byte being the first pixel, and a
- * 1 represents WHITE. For binary images set bytes_per_pixel=0.
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience interface.
- * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
- * and one or more of the Get*Text functions below.
- */
- char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
- int bytes_per_line, int left, int top, int width,
- int height);
-
- /**
- * Call between pages or documents etc to free up memory and forget
- * adaptive data.
- */
- void ClearAdaptiveClassifier();
-
- /**
- * @defgroup AdvancedAPI Advanced API
- * The following methods break TesseractRect into pieces, so you can
- * get hold of the thresholded image, get the text in different formats,
- * get bounding boxes, confidences etc.
- */
- /* @{ */
-
- /**
- * Provide an image for Tesseract to recognize. Format is as
- * TesseractRect above. Copies the image buffer and converts to Pix.
- * SetImage clears all recognition results, and sets the rectangle to the
- * full image, so it may be followed immediately by a GetUTF8Text, and it
- * will automatically perform recognition.
- */
- void SetImage(const unsigned char *imagedata, int width, int height,
- int bytes_per_pixel, int bytes_per_line);
-
- /**
- * Provide an image for Tesseract to recognize. As with SetImage above,
- * Tesseract takes its own copy of the image, so it need not persist until
- * after Recognize.
- * Pix vs raw, which to use?
- * Use Pix where possible. Tesseract uses Pix as its internal representation
- * and it is therefore more efficient to provide a Pix directly.
- */
- void SetImage(Pix *pix);
-
- /**
- * Set the resolution of the source image in pixels per inch so font size
- * information can be calculated in results. Call this after SetImage().
- */
- void SetSourceResolution(int ppi);
-
- /**
- * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
- * Each SetRectangle clears the recogntion results so multiple rectangles
- * can be recognized with the same image.
- */
- void SetRectangle(int left, int top, int width, int height);
-
- /**
- * Get a copy of the internal thresholded image from Tesseract.
- * Caller takes ownership of the Pix and must pixDestroy it.
- * May be called any time after SetImage, or after TesseractRect.
- */
- Pix *GetThresholdedImage();
-
- /**
- * Get the result of page layout analysis as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetRegions(Pixa **pixa);
-
- /**
- * Get the textlines as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If raw_image is true, then extract from the original image instead of the
- * thresholded image and pad by raw_padding pixels.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use. If paraids is not
- * nullptr, the paragraph-id of each line within its block is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- /*
- Helper method to extract from the thresholded image. (most common usage)
-*/
- Boxa *GetTextlines(Pixa **pixa, int **blockids) {
- return GetTextlines(false, 0, pixa, blockids, nullptr);
- }
-
- /**
- * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
- * pair, in reading order. Enables downstream handling of non-rectangular
- * regions.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each line is also returned as
- * an array of one element per line. delete [] after use.
- */
- Boxa *GetStrips(Pixa **pixa, int **blockids);
-
- /**
- * Get the words as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa *GetWords(Pixa **pixa);
-
- /**
- * Gets the individual connected (text) components (created
- * after pages segmentation step, but before recognition)
- * as a leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * Note: the caller is responsible for calling boxaDestroy()
- * on the returned Boxa array and pixaDestroy() on cc array.
- */
- Boxa *GetConnectedComponents(Pixa **cc);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use.
- * If blockids is not nullptr, the paragraph-id of each component with its
- * block is also returned as an array of one element per component. delete []
- * after use. If raw_image is true, then portions of the original image are
- * extracted instead of the thresholded image and padded with raw_padding. If
- * text_only is true, then only text components are returned.
- */
- Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
- bool raw_image, int raw_padding, Pixa **pixa,
- int **blockids, int **paraids);
- // Helper function to get binary images with no padding (most common usage).
- Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
- Pixa **pixa, int **blockids) {
- return GetComponentImages(level, text_only, false, 0, pixa, blockids,
- nullptr);
- }
-
- /**
- * Returns the scale factor of the thresholded image that would be returned by
- * GetThresholdedImage() and the various GetX() methods that call
- * GetComponentImages().
- * Returns 0 if no thresholder has been set.
- */
- int GetThresholdedImageScaleFactor() const;
-
- /**
- * Runs page layout analysis in the mode set by SetPageSegMode.
- * May optionally be called prior to Recognize to get access to just
- * the page layout results. Returns an iterator to the results.
- * If merge_similar_words is true, words are combined where suitable for use
- * with a line recognizer. Use if you want to use AnalyseLayout to find the
- * textlines, and then want to process textline fragments with an external
- * line recognizer.
- * Returns nullptr on error or an empty page.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- PageIterator *AnalyseLayout();
- PageIterator *AnalyseLayout(bool merge_similar_words);
-
- /**
- * Recognize the image from SetAndThresholdImage, generating Tesseract
- * internal structures. Returns 0 on success.
- * Optional. The Get*Text functions below will call Recognize if needed.
- * After Recognize, the output is kept internally until the next SetImage.
- */
- int Recognize(ETEXT_DESC *monitor);
-
- /**
- * Methods to retrieve information after SetAndThresholdImage(),
- * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
- */
-
- /**
- * Turns images into symbolic text.
- *
- * filename can point to a single image, a multi-page TIFF,
- * or a plain text list of image filenames.
- *
- * retry_config is useful for debugging. If not nullptr, you can fall
- * back to an alternate configuration if a page fails for some
- * reason.
- *
- * timeout_millisec terminates processing if any single page
- * takes too long. Set to 0 for unlimited time.
- *
- * renderer is responible for creating the output. For example,
- * use the TessTextRenderer if you want plaintext output, or
- * the TessPDFRender to produce searchable PDF.
- *
- * If tessedit_page_number is non-negative, will only process that
- * single page. Works for multi-page tiff file, or filelist.
- *
- * Returns true if successful, false on error.
- */
- bool ProcessPages(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
- // Does the real work of ProcessPages.
- bool ProcessPagesInternal(const char *filename, const char *retry_config,
- int timeout_millisec, TessResultRenderer *renderer);
-
- /**
- * Turn a single image into symbolic text.
- *
- * The pix is the image processed. filename and page_index are
- * metadata used by side-effect processes, such as reading a box
- * file or formatting as hOCR.
- *
- * See ProcessPages for descriptions of other parameters.
- */
- bool ProcessPage(Pix *pix, int page_index, const char *filename,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer);
-
- /**
- * Get a reading-order iterator to the results of LayoutAnalysis and/or
- * Recognize. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- ResultIterator *GetIterator();
-
- /**
- * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- MutableIterator *GetMutableIterator();
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- */
- char *GetUTF8Text();
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * monitor can be used to
- * cancel the recognition
- * receive progress callbacks
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetHOCRText(int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(ETEXT_DESC *monitor, int page_number);
-
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char *GetAltoText(int page_number);
-
- /**
- * Make a TSV-formatted string from the internal data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetTSVText(int page_number);
-
- /**
- * Make a box file for LSTM training from the internal data structures.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetLSTMBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a box file used in training.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a WordStr box file used in training.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetWordStrBoxText(int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UNLV format Latin-1 with specific reject and suspect codes.
- * Returned string must be freed with the delete [] operator.
- */
- char *GetUNLVText();
-
- /**
- * Detect the orientation of the input image and apparent script (alphabet).
- * orient_deg is the detected clockwise rotation of the input image in degrees
- * (0, 90, 180, 270)
- * orient_conf is the confidence (15.0 is reasonably confident)
- * script_name is an ASCII string, the name of the script, e.g. "Latin"
- * script_conf is confidence level in the script
- * Returns true on success and writes values to each parameter as an output
- */
- bool DetectOrientationScript(int *orient_deg, float *orient_conf,
- const char **script_name, float *script_conf);
-
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- * page_number is a 0-based page index that will appear in the osd file.
- */
- char *GetOsdText(int page_number);
-
- /** Returns the (average) confidence value between 0 and 100. */
- int MeanTextConf();
- /**
- * Returns all word confidences (between 0 and 100) in an array, terminated
- * by -1. The calling function must delete [] after use.
- * The number of confidences should correspond to the number of space-
- * delimited words in GetUTF8Text.
- */
- int *AllWordConfidences();
-
-#ifndef DISABLED_LEGACY_ENGINE
- /**
- * Applies the given word to the adaptive classifier if possible.
- * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
- * tell the boundaries of the graphemes.
- * Assumes that SetImage/SetRectangle have been used to set the image
- * to the given word. The mode arg should be PSM_SINGLE_WORD or
- * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
- * The currently set PageSegMode is preserved.
- * Returns false if adaption was not possible for some reason.
- */
- bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
-#endif // ndef DISABLED_LEGACY_ENGINE
-
- /**
- * Free up recognition results and any stored image data, without actually
- * freeing any recognition data that would be time-consuming to reload.
- * Afterwards, you must call SetImage or TesseractRect before doing
- * any Recognize or Get* operation.
- */
- void Clear();
-
- /**
- * Close down tesseract and free up all memory. End() is equivalent to
- * destructing and reconstructing your TessBaseAPI.
- * Once End() has been used, none of the other API functions may be used
- * other than Init and anything declared above it in the class definition.
- */
- void End();
-
- /**
- * Clear any library-level memory caches.
- * There are a variety of expensive-to-load constant data structures (mostly
- * language dictionaries) that are cached globally -- surviving the Init()
- * and End() of individual TessBaseAPI's. This function allows the clearing
- * of these caches.
- **/
- static void ClearPersistentCache();
-
- /**
- * Check whether a word is valid according to Tesseract's language model
- * @return 0 if the word is invalid, non-zero if valid.
- * @warning temporary! This function will be removed from here and placed
- * in a separate API at some future time.
- */
- int IsValidWord(const char *word) const;
- // Returns true if utf8_character is defined in the UniCharset.
- bool IsValidCharacter(const char *utf8_character) const;
-
- bool GetTextDirection(int *out_offset, float *out_slope);
-
- /** Sets Dict::letter_is_okay_ function to point to the given function. */
- void SetDictFunc(DictFunc f);
-
- /** Sets Dict::probability_in_context_ function to point to the given
- * function.
- */
- void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
-
- /**
- * Estimates the Orientation And Script of the image.
- * @return true if the image was processed successfully.
- */
- bool DetectOS(OSResults *);
-
- /**
- * Return text orientation of each block as determined by an earlier run
- * of layout analysis.
- */
- void GetBlockTextOrientations(int **block_orientation,
- bool **vertical_writing);
-
- /** This method returns the string form of the specified unichar. */
- const char *GetUnichar(int unichar_id) const;
-
- /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
- const Dawg *GetDawg(int i) const;
-
- /** Return the number of dawgs loaded into tesseract_ object. */
- int NumDawgs() const;
-
- Tesseract *tesseract() const {
- return tesseract_;
- }
-
- OcrEngineMode oem() const {
- return last_oem_requested_;
- }
-
- void set_min_orientation_margin(double margin);
- /* @} */
-
-protected:
- /** Common code for setting the image. Returns true if Init has been called.
- */
- bool InternalSetImage();
-
- /**
- * Run the thresholder to make the thresholded image. If pix is not nullptr,
- * the source is thresholded to pix instead of the internal IMAGE.
- */
- virtual bool Threshold(Pix **pix);
-
- /**
- * Find lines from the image making the BLOCK_LIST.
- * @return 0 on success.
- */
- int FindLines();
-
- /** Delete the pageres and block list ready for a new page. */
- void ClearResults();
-
- /**
- * Return an LTR Result Iterator -- used only for training, as we really want
- * to ignore all BiDi smarts at that point.
- * delete once you're done with it.
- */
- LTRResultIterator *GetLTRIterator();
-
- /**
- * Return the length of the output text string, as UTF8, assuming
- * one newline per line and one per block, with a terminator,
- * and assuming a single character reject marker for each rejected character.
- * Also return the number of recognized blobs in blob_count.
- */
- int TextLength(int *blob_count) const;
-
- //// paragraphs.cpp ////////////////////////////////////////////////////
- void DetectParagraphs(bool after_text_recognition);
-
- const PAGE_RES *GetPageRes() const {
- return page_res_;
- }
-
-protected:
- Tesseract *tesseract_; ///< The underlying data object.
- Tesseract *osd_tesseract_; ///< For orientation & script detection.
- EquationDetect *equ_detect_; ///< The equation detector.
- FileReader reader_; ///< Reads files from any filesystem.
- ImageThresholder *thresholder_; ///< Image thresholding module.
- std::vector *paragraph_models_;
- BLOCK_LIST *block_list_; ///< The page layout.
- PAGE_RES *page_res_; ///< The page-level data.
- std::string input_file_; ///< Name used by training code.
- std::string output_file_; ///< Name used by debug code.
- std::string datapath_; ///< Current location of tessdata.
- std::string language_; ///< Last initialized language.
- OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
- bool recognition_done_; ///< page_res_ contains recognition data.
-
- /**
- * @defgroup ThresholderParams Thresholder Parameters
- * Parameters saved from the Thresholder. Needed to rebuild coordinates.
- */
- /* @{ */
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
- int image_width_;
- int image_height_;
- /* @} */
-
-private:
- // A list of image filenames gets special consideration
- bool ProcessPagesFileList(FILE *fp, std::string *buf,
- const char *retry_config, int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
- // TIFF supports multipage so gets special consideration.
- bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
- const char *filename, const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer,
- int tessedit_page_number);
-}; // class TessBaseAPI.
-
-/** Escape a char string - remove &<>"' with HTML codes. */
-std::string HOcrEscape(const char *text);
-
-} // namespace tesseract
-
-#endif // TESSERACT_API_BASEAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/capi.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/capi.h
deleted file mode 100644
index 40f4856a..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/capi.h
+++ /dev/null
@@ -1,484 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: capi.h
-// Description: C-API TessBaseAPI
-//
-// (C) Copyright 2012, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef API_CAPI_H_
-#define API_CAPI_H_
-
-#include "export.h"
-
-#ifdef __cplusplus
-# include
-# include
-# include
-# include
-# include
-#endif
-
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef BOOL
-# define BOOL int
-# define TRUE 1
-# define FALSE 0
-#endif
-
-#ifdef __cplusplus
-typedef tesseract::TessResultRenderer TessResultRenderer;
-typedef tesseract::TessBaseAPI TessBaseAPI;
-typedef tesseract::PageIterator TessPageIterator;
-typedef tesseract::ResultIterator TessResultIterator;
-typedef tesseract::MutableIterator TessMutableIterator;
-typedef tesseract::ChoiceIterator TessChoiceIterator;
-typedef tesseract::OcrEngineMode TessOcrEngineMode;
-typedef tesseract::PageSegMode TessPageSegMode;
-typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
-typedef tesseract::Orientation TessOrientation;
-typedef tesseract::ParagraphJustification TessParagraphJustification;
-typedef tesseract::WritingDirection TessWritingDirection;
-typedef tesseract::TextlineOrder TessTextlineOrder;
-typedef tesseract::PolyBlockType TessPolyBlockType;
-typedef tesseract::ETEXT_DESC ETEXT_DESC;
-#else
-typedef struct TessResultRenderer TessResultRenderer;
-typedef struct TessBaseAPI TessBaseAPI;
-typedef struct TessPageIterator TessPageIterator;
-typedef struct TessResultIterator TessResultIterator;
-typedef struct TessMutableIterator TessMutableIterator;
-typedef struct TessChoiceIterator TessChoiceIterator;
-typedef enum TessOcrEngineMode {
- OEM_TESSERACT_ONLY,
- OEM_LSTM_ONLY,
- OEM_TESSERACT_LSTM_COMBINED,
- OEM_DEFAULT
-} TessOcrEngineMode;
-typedef enum TessPageSegMode {
- PSM_OSD_ONLY,
- PSM_AUTO_OSD,
- PSM_AUTO_ONLY,
- PSM_AUTO,
- PSM_SINGLE_COLUMN,
- PSM_SINGLE_BLOCK_VERT_TEXT,
- PSM_SINGLE_BLOCK,
- PSM_SINGLE_LINE,
- PSM_SINGLE_WORD,
- PSM_CIRCLE_WORD,
- PSM_SINGLE_CHAR,
- PSM_SPARSE_TEXT,
- PSM_SPARSE_TEXT_OSD,
- PSM_RAW_LINE,
- PSM_COUNT
-} TessPageSegMode;
-typedef enum TessPageIteratorLevel {
- RIL_BLOCK,
- RIL_PARA,
- RIL_TEXTLINE,
- RIL_WORD,
- RIL_SYMBOL
-} TessPageIteratorLevel;
-typedef enum TessPolyBlockType {
- PT_UNKNOWN,
- PT_FLOWING_TEXT,
- PT_HEADING_TEXT,
- PT_PULLOUT_TEXT,
- PT_EQUATION,
- PT_INLINE_EQUATION,
- PT_TABLE,
- PT_VERTICAL_TEXT,
- PT_CAPTION_TEXT,
- PT_FLOWING_IMAGE,
- PT_HEADING_IMAGE,
- PT_PULLOUT_IMAGE,
- PT_HORZ_LINE,
- PT_VERT_LINE,
- PT_NOISE,
- PT_COUNT
-} TessPolyBlockType;
-typedef enum TessOrientation {
- ORIENTATION_PAGE_UP,
- ORIENTATION_PAGE_RIGHT,
- ORIENTATION_PAGE_DOWN,
- ORIENTATION_PAGE_LEFT
-} TessOrientation;
-typedef enum TessParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT
-} TessParagraphJustification;
-typedef enum TessWritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT,
- WRITING_DIRECTION_RIGHT_TO_LEFT,
- WRITING_DIRECTION_TOP_TO_BOTTOM
-} TessWritingDirection;
-typedef enum TessTextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT,
- TEXTLINE_ORDER_RIGHT_TO_LEFT,
- TEXTLINE_ORDER_TOP_TO_BOTTOM
-} TessTextlineOrder;
-typedef struct ETEXT_DESC ETEXT_DESC;
-#endif
-
-typedef bool (*TessCancelFunc)(void *cancel_this, int words);
-typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,
- int bottom);
-
-struct Pix;
-struct Boxa;
-struct Pixa;
-
-/* General free functions */
-
-TESS_API const char *TessVersion();
-TESS_API void TessDeleteText(const char *text);
-TESS_API void TessDeleteTextArray(char **arr);
-TESS_API void TessDeleteIntArray(const int *arr);
-
-/* Renderer API */
-TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,
- BOOL font_info);
-TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,
- const char *datadir,
- BOOL textonly);
-TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
-TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(
- const char *outputbase);
-
-TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
-TESS_API void TessResultRendererInsert(TessResultRenderer *renderer,
- TessResultRenderer *next);
-TESS_API TessResultRenderer *TessResultRendererNext(
- TessResultRenderer *renderer);
-TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,
- const char *title);
-TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,
- TessBaseAPI *api);
-TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);
-
-TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);
-TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
-TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);
-
-/* Base API */
-
-TESS_API TessBaseAPI *TessBaseAPICreate();
-TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);
-
-TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);
-
-TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
-TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
-TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
-TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);
-
-TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,
- const char *value);
-
-TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,
- const char *name, int *value);
-TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,
- const char *name, BOOL *value);
-TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,
- const char *name, double *value);
-TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,
- const char *name);
-
-TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);
-TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,
- const char *filename);
-
-TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem,
- char **configs, int configs_size);
-TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode oem);
-TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,
- const char *language);
-
-TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API int TessBaseAPIInit5(TessBaseAPI *handle, const char *data, int data_size,
- const char *language, TessOcrEngineMode mode,
- char **configs, int configs_size, char **vars_vec,
- char **vars_values, size_t vars_vec_size,
- BOOL set_only_non_debug_params);
-
-TESS_API const char *TessBaseAPIGetInitLanguagesAsString(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(
- const TessBaseAPI *handle);
-TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(
- const TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,
- const char *filename);
-TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,
- const char *filename);
-
-TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,
- TessPageSegMode mode);
-TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIRect(TessBaseAPI *handle,
- const unsigned char *imagedata,
- int bytes_per_pixel, int bytes_per_line,
- int left, int top, int width, int height);
-
-TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);
-
-TESS_API void TessBaseAPISetImage(TessBaseAPI *handle,
- const unsigned char *imagedata, int width,
- int height, int bytes_per_pixel,
- int bytes_per_line);
-TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);
-
-TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);
-
-TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,
- int width, int height);
-
-TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
-TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,
- BOOL raw_image, int raw_padding,
- struct Pixa **pixa,
- int **blockids, int **paraids);
-TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,
- struct Pixa **pixa, int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,
- struct Pixa **pixa);
-TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,
- struct Pixa **cc);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
- TessPageIteratorLevel level,
- BOOL text_only,
- struct Pixa **pixa,
- int **blockids);
-TESS_API struct Boxa *TessBaseAPIGetComponentImages1(
- TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,
- BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,
- int **paraids);
-
-TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
- const TessBaseAPI *handle);
-
-TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);
-
-TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,
- int page_index, const char *filename,
- const char *retry_config,
- int timeout_millisec,
- TessResultRenderer *renderer);
-
-TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
-TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(
- TessBaseAPI *handle);
-
-TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);
-TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
-
-TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
-TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,
- int page_number);
-
-TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
-TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
-
-TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,
- TessPageSegMode mode,
- const char *wordstr);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
-TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);
-
-TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
-TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,
- float *out_slope);
-
-TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);
-
-TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-// Call TessDeleteText(*best_script_name) to free memory allocated by this
-// function
-TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,
- int *orient_deg,
- float *orient_conf,
- const char **script_name,
- float *script_conf);
-#endif // #ifndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,
- double margin);
-
-TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);
-
-TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);
-
-TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,
- int **block_orientation,
- bool **vertical_writing);
-
-/* Page iterator */
-
-TESS_API void TessPageIteratorDelete(TessPageIterator *handle);
-
-TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);
-
-TESS_API void TessPageIteratorBegin(TessPageIterator *handle);
-
-TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
- TessPageIteratorLevel level);
-
-TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- TessPageIteratorLevel element);
-
-TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int *left, int *top, int *right,
- int *bottom);
-
-TESS_API TessPolyBlockType
-TessPageIteratorBlockType(const TessPageIterator *handle);
-
-TESS_API struct Pix *TessPageIteratorGetBinaryImage(
- const TessPageIterator *handle, TessPageIteratorLevel level);
-
-TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
- TessPageIteratorLevel level,
- int padding,
- struct Pix *original_image,
- int *left, int *top);
-
-TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,
- TessPageIteratorLevel level, int *x1,
- int *y1, int *x2, int *y2);
-
-TESS_API void TessPageIteratorOrientation(
- TessPageIterator *handle, TessOrientation *orientation,
- TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,
- float *deskew_angle);
-
-TESS_API void TessPageIteratorParagraphInfo(
- TessPageIterator *handle, TessParagraphJustification *justification,
- BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
-
-/* Result iterator */
-
-TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
-TESS_API TessResultIterator *TessResultIteratorCopy(
- const TessResultIterator *handle);
-TESS_API TessPageIterator *TessResultIteratorGetPageIterator(
- TessResultIterator *handle);
-TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
- const TessResultIterator *handle);
-TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(
- const TessResultIterator *handle);
-
-TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
- TessPageIteratorLevel level);
-TESS_API const char *TessResultIteratorWordRecognitionLanguage(
- const TessResultIterator *handle);
-TESS_API const char *TessResultIteratorWordFontAttributes(
- const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic,
- BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,
- int *pointsize, int *font_id);
-
-TESS_API BOOL
-TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
-TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);
-
-TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);
-TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
-TESS_API const char *TessChoiceIteratorGetUTF8Text(
- const TessChoiceIterator *handle);
-TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);
-
-/* Progress monitor */
-
-TESS_API ETEXT_DESC *TessMonitorCreate();
-TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,
- TessCancelFunc cancelFunc);
-TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
-TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,
- TessProgressFunc progressFunc);
-TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
-TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // API_CAPI_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/export.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/export.h
deleted file mode 100644
index d238b628..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/export.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: export.h
-// Description: Place holder
-//
-// (C) Copyright 2006, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_PLATFORM_H_
-#define TESSERACT_PLATFORM_H_
-
-#ifndef TESS_API
-# if defined(_WIN32) || defined(__CYGWIN__)
-# if defined(TESS_EXPORTS)
-# define TESS_API __declspec(dllexport)
-# elif defined(TESS_IMPORTS)
-# define TESS_API __declspec(dllimport)
-# else
-# define TESS_API
-# endif
-# else
-# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
-# define TESS_API __attribute__((visibility("default")))
-# else
-# define TESS_API
-# endif
-# endif
-#endif
-
-#endif // TESSERACT_PLATFORM_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ltrresultiterator.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ltrresultiterator.h
deleted file mode 100644
index 6ca0a98e..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ltrresultiterator.h
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: ltrresultiterator.h
-// Description: Iterator for tesseract results in strict left-to-right
-// order that avoids using tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API
-#include "pageiterator.h" // for PageIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-namespace tesseract {
-
-class BLOB_CHOICE_IT;
-class PAGE_RES;
-class WERD_RES;
-
-class Tesseract;
-
-// Class to iterate over tesseract results, providing access to all levels
-// of the page hierarchy, without including any tesseract headers or having
-// to handle any tesseract structures.
-// WARNING! This class points to data held within the TessBaseAPI class, and
-// therefore can only be used while the TessBaseAPI class still exists and
-// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
-// DetectOS, or anything else that changes the internal PAGE_RES.
-// See tesseract/publictypes.h for the definition of PageIteratorLevel.
-// See also base class PageIterator, which contains the bulk of the interface.
-// LTRResultIterator adds text-specific methods for access to OCR output.
-
-class TESS_API LTRResultIterator : public PageIterator {
- friend class ChoiceIterator;
-
-public:
- // page_res and tesseract come directly from the BaseAPI.
- // The rectangle parameters are copied indirectly from the Thresholder,
- // via the BaseAPI. They represent the coordinates of some rectangle in an
- // original image (in top-left-origin coordinates) and therefore the top-left
- // needs to be added to any output boxes in order to specify coordinates
- // in the original image. See TessBaseAPI::SetRectangle.
- // The scale and scaled_yres are in case the Thresholder scaled the image
- // rectangle prior to thresholding. Any coordinates in tesseract's image
- // must be divided by scale before adding (rect_left, rect_top).
- // The scaled_yres indicates the effective resolution of the binary image
- // that tesseract has been given by the Thresholder.
- // After the constructor, Begin has already been called.
- LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top,
- int rect_width, int rect_height);
-
- ~LTRResultIterator() override;
-
- // LTRResultIterators may be copied! This makes it possible to iterate over
- // all the objects at a lower level, while maintaining an iterator to
- // objects at a higher level. These constructors DO NOT CALL Begin, so
- // iterations will continue from the location of src.
- // TODO: For now the copy constructor and operator= only need the base class
- // versions, but if new data members are added, don't forget to add them!
-
- // ============= Moving around within the page ============.
-
- // See PageIterator.
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // object at the given level. Use delete [] to free after use.
- char *GetUTF8Text(PageIteratorLevel level) const;
-
- // Set the string inserted at the end of each text line. "\n" by default.
- void SetLineSeparator(const char *new_line);
-
- // Set the string inserted at the end of each paragraph. "\n" by default.
- void SetParagraphSeparator(const char *new_para);
-
- // Returns the mean confidence of the current object at the given level.
- // The number should be interpreted as a percent probability. (0.0f-100.0f)
- float Confidence(PageIteratorLevel level) const;
-
- // ============= Functions that refer to words only ============.
-
- // Returns the font attributes of the current word. If iterating at a higher
- // level object than words, eg textlines, then this will return the
- // attributes of the first word in that textline.
- // The actual return value is a string representing a font name. It points
- // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
- // the iterator itself, ie rendered invalid by various members of
- // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
- // Pointsize is returned in printers points (1/72 inch.)
- const char *WordFontAttributes(bool *is_bold, bool *is_italic,
- bool *is_underlined, bool *is_monospace,
- bool *is_serif, bool *is_smallcaps,
- int *pointsize, int *font_id) const;
-
- // Return the name of the language used to recognize this word.
- // On error, nullptr. Do not delete this pointer.
- const char *WordRecognitionLanguage() const;
-
- // Return the overall directionality of this word.
- StrongScriptDirection WordDirection() const;
-
- // Returns true if the current word was found in a dictionary.
- bool WordIsFromDictionary() const;
-
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // Returns true if the current word is numeric.
- bool WordIsNumeric() const;
-
- // Returns true if the word contains blamer information.
- bool HasBlamerInfo() const;
-
- // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
- // of the current word.
- const void *GetParamsTrainingBundle() const;
-
- // Returns a pointer to the string with blamer information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerDebug() const;
-
- // Returns a pointer to the string with misadaption information for this word.
- // Assumes that the word's blamer_bundle is not nullptr.
- const char *GetBlamerMisadaptionDebug() const;
-
- // Returns true if a truth string was recorded for the current word.
- bool HasTruthString() const;
-
- // Returns true if the given string is equivalent to the truth string for
- // the current word.
- bool EquivalentToTruth(const char *str) const;
-
- // Returns a null terminated UTF-8 encoded truth string for the current word.
- // Use delete [] to free after use.
- char *WordTruthUTF8Text() const;
-
- // Returns a null terminated UTF-8 encoded normalized OCR string for the
- // current word. Use delete [] to free after use.
- char *WordNormedUTF8Text() const;
-
- // Returns a pointer to serialized choice lattice.
- // Fills lattice_size with the number of bytes in lattice data.
- const char *WordLattice(int *lattice_size) const;
-
- // ============= Functions that refer to symbols only ============.
-
- // Returns true if the current symbol is a superscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSuperscript() const;
- // Returns true if the current symbol is a subscript.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsSubscript() const;
- // Returns true if the current symbol is a dropcap.
- // If iterating at a higher level object than symbols, eg words, then
- // this will return the attributes of the first symbol in that word.
- bool SymbolIsDropcap() const;
-
-protected:
- const char *line_separator_;
- const char *paragraph_separator_;
-};
-
-// Class to iterate over the classifier choices for a single RIL_SYMBOL.
-class TESS_API ChoiceIterator {
-public:
- // Construction is from a LTRResultIterator that points to the symbol of
- // interest. The ChoiceIterator allows a one-shot iteration over the
- // choices for this symbol and after that it is useless.
- explicit ChoiceIterator(const LTRResultIterator &result_it);
- ~ChoiceIterator();
-
- // Moves to the next choice for the symbol and returns false if there
- // are none left.
- bool Next();
-
- // ============= Accessing data ==============.
-
- // Returns the null terminated UTF-8 encoded text string for the current
- // choice.
- // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
- // internal structure and should NOT be delete[]ed to free after use.
- const char *GetUTF8Text() const;
-
- // Returns the confidence of the current choice depending on the used language
- // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
- // choices for one symbol should roughly add up to 1.0f.
- // If only traineddata of the legacy engine is used, the number should be
- // interpreted as a percent probability. (0.0f-100.0f) In this case
- // probabilities won't add up to 100. Each one stands on its own.
- float Confidence() const;
-
- // Returns a vector containing all timesteps, which belong to the currently
- // selected symbol. A timestep is a vector containing pairs of symbols and
- // floating point numbers. The number states the probability for the
- // corresponding symbol.
- std::vector>> *Timesteps() const;
-
-private:
- // clears the remaining spaces out of the results and adapt the probabilities
- void filterSpaces();
- // Pointer to the WERD_RES object owned by the API.
- WERD_RES *word_res_;
- // Iterator over the blob choices.
- BLOB_CHOICE_IT *choice_it_;
- std::vector> *LSTM_choices_ = nullptr;
- std::vector>::iterator LSTM_choice_it_;
-
- const int *tstep_index_;
- // regulates the rating granularity
- double rating_coefficient_;
- // leading blanks
- int blanks_before_word_;
- // true when there is lstm engine related trained data
- bool oemLSTM_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ocrclass.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ocrclass.h
deleted file mode 100644
index a55e6528..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/ocrclass.h
+++ /dev/null
@@ -1,158 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-/**********************************************************************
- * File: ocrclass.h
- * Description: Class definitions and constants for the OCR API.
- * Author: Hewlett-Packard Co
- *
- * (C) Copyright 1996, Hewlett-Packard Co.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-/**********************************************************************
- * This file contains typedefs for all the structures used by
- * the HP OCR interface.
- * The structures are designed to allow them to be used with any
- * structure alignment up to 8.
- **********************************************************************/
-
-#ifndef CCUTIL_OCRCLASS_H_
-#define CCUTIL_OCRCLASS_H_
-
-#include
-#include
-
-namespace tesseract {
-
-/**********************************************************************
- * EANYCODE_CHAR
- * Description of a single character. The character code is defined by
- * the character set of the current font.
- * Output text is sent as an array of these structures.
- * Spaces and line endings in the output are represented in the
- * structures of the surrounding characters. They are not directly
- * represented as characters.
- * The first character in a word has a positive value of blanks.
- * Missing information should be set to the defaults in the comments.
- * If word bounds are known, but not character bounds, then the top and
- * bottom of each character should be those of the word. The left of the
- * first and right of the last char in each word should be set. All other
- * lefts and rights should be set to -1.
- * If set, the values of right and bottom are left+width and top+height.
- * Most of the members come directly from the parameters to ocr_append_char.
- * The formatting member uses the enhancement parameter and combines the
- * line direction stuff into the top 3 bits.
- * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
- * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
- * the coding is, only that it is backwards compatible with the previous
- * version.
- **********************************************************************/
-
-struct EANYCODE_CHAR { /*single character */
- // It should be noted that the format for char_code for version 2.0 and beyond
- // is UTF8 which means that ASCII characters will come out as one structure
- // but other characters will be returned in two or more instances of this
- // structure with a single byte of the UTF8 code in each, but each will have
- // the same bounding box. Programs which want to handle languagues with
- // different characters sets will need to handle extended characters
- // appropriately, but *all* code needs to be prepared to receive UTF8 coded
- // characters for characters such as bullet and fancy quotes.
- uint16_t char_code; /*character itself */
- int16_t left; /*of char (-1) */
- int16_t right; /*of char (-1) */
- int16_t top; /*of char (-1) */
- int16_t bottom; /*of char (-1) */
- int16_t font_index; /*what font (0) */
- uint8_t confidence; /*0=perfect, 100=reject (0/100) */
- uint8_t point_size; /*of char, 72=i inch, (10) */
- int8_t blanks; /*no of spaces before this char (1) */
- uint8_t formatting; /*char formatting (0) */
-};
-
-/**********************************************************************
- * ETEXT_DESC
- * Description of the output of the OCR engine.
- * This structure is used as both a progress monitor and the final
- * output header, since it needs to be a valid progress monitor while
- * the OCR engine is storing its output to shared memory.
- * During progress, all the buffer info is -1.
- * Progress starts at 0 and increases to 100 during OCR. No other constraint.
- * Additionally the progress callback contains the bounding box of the word that
- * is currently being processed.
- * Every progress callback, the OCR engine must set ocr_alive to 1.
- * The HP side will set ocr_alive to 0. Repeated failure to reset
- * to 1 indicates that the OCR engine is dead.
- * If the cancel function is not null then it is called with the number of
- * user words found. If it returns true then operation is cancelled.
- **********************************************************************/
-class ETEXT_DESC;
-
-using CANCEL_FUNC = bool (*)(void *, int);
-using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
-using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);
-
-class ETEXT_DESC { // output header
-public:
- int16_t count{0}; /// chars in this buffer(0)
- int16_t progress{0}; /// percent complete increasing (0-100)
- /** Progress monitor covers word recognition and it does not cover layout
- * analysis.
- * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */
- int8_t more_to_come{0}; /// true if not last
- volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0
- int8_t err_code{0}; /// for errcode use
- CANCEL_FUNC cancel{nullptr}; /// returns true to cancel
- PROGRESS_FUNC progress_callback{
- nullptr}; /// called whenever progress increases
- PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback
- void *cancel_this{nullptr}; /// this or other data for cancel
- std::chrono::steady_clock::time_point end_time;
- /// Time to stop. Expected to be set only
- /// by call to set_deadline_msecs().
- EANYCODE_CHAR text[1]{}; /// character data
-
- ETEXT_DESC() : progress_callback2(&default_progress_func) {
- end_time = std::chrono::time_point();
- }
-
- // Sets the end time to be deadline_msecs milliseconds from now.
- void set_deadline_msecs(int32_t deadline_msecs) {
- if (deadline_msecs > 0) {
- end_time = std::chrono::steady_clock::now() +
- std::chrono::milliseconds(deadline_msecs);
- }
- }
-
- // Returns false if we've not passed the end_time, or have not set a deadline.
- bool deadline_exceeded() const {
- if (end_time.time_since_epoch() ==
- std::chrono::steady_clock::duration::zero()) {
- return false;
- }
- auto now = std::chrono::steady_clock::now();
- return (now > end_time);
- }
-
-private:
- static bool default_progress_func(ETEXT_DESC *ths, int left, int right,
- int top, int bottom) {
- if (ths->progress_callback != nullptr) {
- return (*(ths->progress_callback))(ths->progress, left, right, top,
- bottom);
- }
- return true;
- }
-};
-
-} // namespace tesseract
-
-#endif // CCUTIL_OCRCLASS_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/osdetect.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/osdetect.h
deleted file mode 100644
index 34bfb557..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/osdetect.h
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: osdetect.h
-// Description: Orientation and script detection.
-// Author: Samuel Charron
-// Ranjith Unnikrishnan
-//
-// (C) Copyright 2008, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_OSDETECT_H_
-#define TESSERACT_CCMAIN_OSDETECT_H_
-
-#include "export.h" // for TESS_API
-
-#include // for std::vector
-
-namespace tesseract {
-
-class BLOBNBOX;
-class BLOBNBOX_CLIST;
-class BLOB_CHOICE_LIST;
-class TO_BLOCK_LIST;
-class UNICHARSET;
-
-class Tesseract;
-
-// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur
-const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;
-
-struct OSBestResult {
- OSBestResult()
- : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}
- int orientation_id;
- int script_id;
- float sconfidence;
- float oconfidence;
-};
-
-struct OSResults {
- OSResults() : unicharset(nullptr) {
- for (int i = 0; i < 4; ++i) {
- for (int j = 0; j < kMaxNumberOfScripts; ++j) {
- scripts_na[i][j] = 0;
- }
- orientations[i] = 0;
- }
- }
- void update_best_orientation();
- // Set the estimate of the orientation to the given id.
- void set_best_orientation(int orientation_id);
- // Update/Compute the best estimate of the script assuming the given
- // orientation id.
- void update_best_script(int orientation_id);
- // Return the index of the script with the highest score for this orientation.
- TESS_API int get_best_script(int orientation_id) const;
- // Accumulate scores with given OSResults instance and update the best script.
- void accumulate(const OSResults &osr);
-
- // Print statistics.
- void print_scores(void) const;
- void print_scores(int orientation_id) const;
-
- // Array holding scores for each orientation id [0,3].
- // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
- // page respectively, where the values refer to the amount of clockwise
- // rotation to be applied to the page for the text to be upright and readable.
- float orientations[4];
- // Script confidence scores for each of 4 possible orientations.
- float scripts_na[4][kMaxNumberOfScripts];
-
- UNICHARSET *unicharset;
- OSBestResult best_result;
-};
-
-class OrientationDetector {
-public:
- OrientationDetector(const std::vector *allowed_scripts,
- OSResults *results);
- bool detect_blob(BLOB_CHOICE_LIST *scores);
- int get_orientation();
-
-private:
- OSResults *osr_;
- const std::vector *allowed_scripts_;
-};
-
-class ScriptDetector {
-public:
- ScriptDetector(const std::vector *allowed_scripts, OSResults *osr,
- tesseract::Tesseract *tess);
- void detect_blob(BLOB_CHOICE_LIST *scores);
- bool must_stop(int orientation) const;
-
-private:
- OSResults *osr_;
- static const char *korean_script_;
- static const char *japanese_script_;
- static const char *fraktur_script_;
- int korean_id_;
- int japanese_id_;
- int katakana_id_;
- int hiragana_id_;
- int han_id_;
- int hangul_id_;
- int latin_id_;
- int fraktur_id_;
- tesseract::Tesseract *tess_;
- const std::vector *allowed_scripts_;
-};
-
-int orientation_and_script_detection(const char *filename, OSResults *,
- tesseract::Tesseract *);
-
-int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,
- tesseract::Tesseract *tess);
-
-int os_detect_blobs(const std::vector *allowed_scripts,
- BLOBNBOX_CLIST *blob_list, OSResults *osr,
- tesseract::Tesseract *tess);
-
-bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,
- OSResults *, tesseract::Tesseract *tess);
-
-// Helper method to convert an orientation index to its value in degrees.
-// The value represents the amount of clockwise rotation in degrees that must be
-// applied for the text to be upright (readable).
-TESS_API int OrientationIdToValue(const int &id);
-
-} // namespace tesseract
-
-#endif // TESSERACT_CCMAIN_OSDETECT_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/pageiterator.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/pageiterator.h
deleted file mode 100644
index 68739715..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/pageiterator.h
+++ /dev/null
@@ -1,364 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: pageiterator.h
-// Description: Iterator for tesseract page structure that avoids using
-// tesseract internal data structures.
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
-#define TESSERACT_CCMAIN_PAGEITERATOR_H_
-
-#include "export.h"
-#include "publictypes.h"
-
-struct Pix;
-struct Pta;
-
-namespace tesseract {
-
-struct BlamerBundle;
-class C_BLOB_IT;
-class PAGE_RES;
-class PAGE_RES_IT;
-class WERD;
-
-class Tesseract;
-
-/**
- * Class to iterate over tesseract page structure, providing access to all
- * levels of the page hierarchy, without including any tesseract headers or
- * having to handle any tesseract structures.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
- * DetectOS, or anything else that changes the internal PAGE_RES.
- * See tesseract/publictypes.h for the definition of PageIteratorLevel.
- * See also ResultIterator, derived from PageIterator, which adds in the
- * ability to access OCR output with text-specific methods.
- */
-
-class TESS_API PageIterator {
-public:
- /**
- * page_res and tesseract come directly from the BaseAPI.
- * The rectangle parameters are copied indirectly from the Thresholder,
- * via the BaseAPI. They represent the coordinates of some rectangle in an
- * original image (in top-left-origin coordinates) and therefore the top-left
- * needs to be added to any output boxes in order to specify coordinates
- * in the original image. See TessBaseAPI::SetRectangle.
- * The scale and scaled_yres are in case the Thresholder scaled the image
- * rectangle prior to thresholding. Any coordinates in tesseract's image
- * must be divided by scale before adding (rect_left, rect_top).
- * The scaled_yres indicates the effective resolution of the binary image
- * that tesseract has been given by the Thresholder.
- * After the constructor, Begin has already been called.
- */
- PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
- int scaled_yres, int rect_left, int rect_top, int rect_width,
- int rect_height);
- virtual ~PageIterator();
-
- /**
- * Page/ResultIterators may be copied! This makes it possible to iterate over
- * all the objects at a lower level, while maintaining an iterator to
- * objects at a higher level. These constructors DO NOT CALL Begin, so
- * iterations will continue from the location of src.
- */
- PageIterator(const PageIterator &src);
- const PageIterator &operator=(const PageIterator &src);
-
- /** Are we positioned at the same location as other? */
- bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
-
- // ============= Moving around within the page ============.
-
- /**
- * Moves the iterator to point to the start of the page to begin an
- * iteration.
- */
- virtual void Begin();
-
- /**
- * Moves the iterator to the beginning of the paragraph.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word on the first row of the paragraph.
- */
- virtual void RestartParagraph();
-
- /**
- * Return whether this iterator points anywhere in the first textline of a
- * paragraph.
- */
- bool IsWithinFirstTextlineOfParagraph() const;
-
- /**
- * Moves the iterator to the beginning of the text line.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word of the row.
- */
- virtual void RestartRow();
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy, and returns false if the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- virtual bool Next(PageIteratorLevel level);
-
- /**
- * Returns true if the iterator is at the start of an object at the given
- * level.
- *
- * For instance, suppose an iterator it is pointed to the first symbol of the
- * first word of the third line of the second paragraph of the first block in
- * a page, then:
- * it.IsAtBeginningOf(RIL_BLOCK) = false
- * it.IsAtBeginningOf(RIL_PARA) = false
- * it.IsAtBeginningOf(RIL_TEXTLINE) = true
- * it.IsAtBeginningOf(RIL_WORD) = true
- * it.IsAtBeginningOf(RIL_SYMBOL) = true
- */
- virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
-
- /**
- * Returns whether the iterator is positioned at the last element in a
- * given level. (e.g. the last word in a line, the last line in a block)
- *
- * Here's some two-paragraph example
- * text. It starts off innocuously
- * enough but quickly turns bizarre.
- * The author inserts a cornucopia
- * of words to guard against confused
- * references.
- *
- * Now take an iterator it pointed to the start of "bizarre."
- * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
- * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
- * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
- */
- virtual bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const;
-
- /**
- * Returns whether this iterator is positioned
- * before other: -1
- * equal to other: 0
- * after other: 1
- */
- int Cmp(const PageIterator &other) const;
-
- // ============= Accessing data ==============.
- // Coordinate system:
- // Integer coordinates are at the cracks between the pixels.
- // The top-left corner of the top-left pixel in the image is at (0,0).
- // The bottom-right corner of the bottom-right pixel in the image is at
- // (width, height).
- // Every bounding box goes from the top-left of the top-left contained
- // pixel to the bottom-right of the bottom-right contained pixel, so
- // the bounding box of the single top-left pixel in the image is:
- // (0,0)->(1,1).
- // If an image rectangle has been set in the API, then returned coordinates
- // relate to the original (full) image, rather than the rectangle.
-
- /**
- * Controls what to include in a bounding box. Bounding boxes of all levels
- * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
- * Between layout analysis and recognition, it isn't known where all
- * diacritics belong, so this control is used to include or exclude some
- * diacritics that are above or below the main body of the word. In most cases
- * where the placement is obvious, and after recognition, it doesn't make as
- * much difference, as the diacritics will already be included in the word.
- */
- void SetBoundingBoxComponents(bool include_upper_dots,
- bool include_lower_dots) {
- include_upper_dots_ = include_upper_dots;
- include_lower_dots_ = include_lower_dots;
- }
-
- /**
- * Returns the bounding rectangle of the current object at the given level.
- * See comment on coordinate system above.
- * Returns false if there is no such object at the current position.
- * The returned bounding box is guaranteed to match the size and position
- * of the image returned by GetBinaryImage, but may clip foreground pixels
- * from a grey image. The padding argument to GetImage can be used to expand
- * the image to include more foreground pixels. See GetImage below.
- */
- bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
- int *bottom) const;
- bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
- int *right, int *bottom) const;
- /**
- * Returns the bounding rectangle of the object in a coordinate system of the
- * working image rectangle having its origin at (rect_left_, rect_top_) with
- * respect to the original image and is scaled by a factor scale_.
- */
- bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
- int *right, int *bottom) const;
-
- /** Returns whether there is no object of a given level. */
- bool Empty(PageIteratorLevel level) const;
-
- /**
- * Returns the type of the current block.
- * See tesseract/publictypes.h for PolyBlockType.
- */
- PolyBlockType BlockType() const;
-
- /**
- * Returns the polygon outline of the current block. The returned Pta must
- * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
- * of the polygon, and the last edge is the line segment between the last
- * point and the first point. nullptr will be returned if the iterator is
- * at the end of the document or layout analysis was not used.
- */
- Pta *BlockPolygon() const;
-
- /**
- * Returns a binary image of the current object at the given level.
- * The position and size match the return from BoundingBoxInternal, and so
- * this could be upscaled with respect to the original input image.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetBinaryImage(PageIteratorLevel level) const;
-
- /**
- * Returns an image of the current object at the given level in greyscale
- * if available in the input. To guarantee a binary image use BinaryImage.
- * NOTE that in order to give the best possible image, the bounds are
- * expanded slightly over the binary connected component, by the supplied
- * padding, so the top-left position of the returned image is returned
- * in (left,top). These will most likely not match the coordinates
- * returned by BoundingBox.
- * If you do not supply an original image, you will get a binary one.
- * Use pixDestroy to delete the image after use.
- */
- Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
- int *left, int *top) const;
-
- /**
- * Returns the baseline of the current object at the given level.
- * The baseline is the line that passes through (x1, y1) and (x2, y2).
- * WARNING: with vertical text, baselines may be vertical!
- * Returns false if there is no baseline at the current position.
- */
- bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
- int *y2) const;
-
- // Returns the attributes of the current row.
- void RowAttributes(float *row_height, float *descenders,
- float *ascenders) const;
-
- /**
- * Returns orientation for the block the iterator points to.
- * orientation, writing_direction, textline_order: see publictypes.h
- * deskew_angle: after rotating the block so the text orientation is
- * upright, how many radians does one have to rotate the
- * block anti-clockwise for it to be level?
- * -Pi/4 <= deskew_angle <= Pi/4
- */
- void Orientation(tesseract::Orientation *orientation,
- tesseract::WritingDirection *writing_direction,
- tesseract::TextlineOrder *textline_order,
- float *deskew_angle) const;
-
- /**
- * Returns information about the current paragraph, if available.
- *
- * justification -
- * LEFT if ragged right, or fully justified and script is left-to-right.
- * RIGHT if ragged left, or fully justified and script is right-to-left.
- * unknown if it looks like source code or we have very few lines.
- * is_list_item -
- * true if we believe this is a member of an ordered or unordered list.
- * is_crown -
- * true if the first line of the paragraph is aligned with the other
- * lines of the paragraph even though subsequent paragraphs have first
- * line indents. This typically indicates that this is the continuation
- * of a previous paragraph or that it is the very first paragraph in
- * the chapter.
- * first_line_indent -
- * For LEFT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the left edge of the
- * rest of the paragraph.
- * for RIGHT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the right edge of the
- * rest of the paragraph.
- * NOTE 1: This value may be negative.
- * NOTE 2: if *is_crown == true, the first line of this paragraph is
- * actually flush, and first_line_indent is set to the "common"
- * first_line_indent for subsequent paragraphs in this block
- * of text.
- */
- void ParagraphInfo(tesseract::ParagraphJustification *justification,
- bool *is_list_item, bool *is_crown,
- int *first_line_indent) const;
-
- // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
- // of the current word to the given pointer (takes ownership of the pointer)
- // and returns true.
- // Can only be used when iterating on the word level.
- bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
-
-protected:
- /**
- * Sets up the internal data for iterating the blobs of a new word, then
- * moves the iterator to the given offset.
- */
- void BeginWord(int offset);
-
- /** Pointer to the page_res owned by the API. */
- PAGE_RES *page_res_;
- /** Pointer to the Tesseract object owned by the API. */
- Tesseract *tesseract_;
- /**
- * The iterator to the page_res_. Owned by this ResultIterator.
- * A pointer just to avoid dragging in Tesseract includes.
- */
- PAGE_RES_IT *it_;
- /**
- * The current input WERD being iterated. If there is an output from OCR,
- * then word_ is nullptr. Owned by the API
- */
- WERD *word_;
- /** The length of the current word_. */
- int word_length_;
- /** The current blob index within the word. */
- int blob_index_;
- /**
- * Iterator to the blobs within the word. If nullptr, then we are iterating
- * OCR results in the box_word.
- * Owned by this ResultIterator.
- */
- C_BLOB_IT *cblob_it_;
- /** Control over what to include in bounding boxes. */
- bool include_upper_dots_;
- bool include_lower_dots_;
- /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
- int scale_;
- int scaled_yres_;
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/publictypes.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/publictypes.h
deleted file mode 100644
index 0069cf28..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/publictypes.h
+++ /dev/null
@@ -1,281 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: publictypes.h
-// Description: Types used in both the API and internally
-// Author: Ray Smith
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_
-
-namespace tesseract {
-
-// This file contains types that are used both by the API and internally
-// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
-// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
-// Restated: It is OK for low-level Tesseract files to include publictypes.h,
-// but not for the low-level tesseract code to include top-level API code.
-// This file should not use other Tesseract types, as that would drag
-// their includes into the API-level.
-
-/** Number of printers' points in an inch. The unit of the pointsize return. */
-constexpr int kPointsPerInch = 72;
-/**
- * Minimum believable resolution. Used as a default if there is no other
- * information, as it is safer to under-estimate than over-estimate.
- */
-constexpr int kMinCredibleResolution = 70;
-/** Maximum believable resolution. */
-constexpr int kMaxCredibleResolution = 2400;
-/**
- * Ratio between median blob size and likely resolution. Used to estimate
- * resolution when none is provided. This is basically 1/usual text size in
- * inches. */
-constexpr int kResolutionEstimationFactor = 10;
-
-/**
- * Possible types for a POLY_BLOCK or ColPartition.
- * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions
- * below, as well as kPolyBlockNames in layout_test.cc.
- * Used extensively by ColPartition, and POLY_BLOCK.
- */
-enum PolyBlockType {
- PT_UNKNOWN, // Type is not yet known. Keep as the first element.
- PT_FLOWING_TEXT, // Text that lives inside a column.
- PT_HEADING_TEXT, // Text that spans more than one column.
- PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region.
- PT_EQUATION, // Partition belonging to an equation region.
- PT_INLINE_EQUATION, // Partition has inline equation.
- PT_TABLE, // Partition belonging to a table region.
- PT_VERTICAL_TEXT, // Text-line runs vertically.
- PT_CAPTION_TEXT, // Text that belongs to an image.
- PT_FLOWING_IMAGE, // Image that lives inside a column.
- PT_HEADING_IMAGE, // Image that spans more than one column.
- PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region.
- PT_HORZ_LINE, // Horizontal Line.
- PT_VERT_LINE, // Vertical Line.
- PT_NOISE, // Lies outside of any column.
- PT_COUNT
-};
-
-/** Returns true if PolyBlockType is of horizontal line type */
-inline bool PTIsLineType(PolyBlockType type) {
- return type == PT_HORZ_LINE || type == PT_VERT_LINE;
-}
-/** Returns true if PolyBlockType is of image type */
-inline bool PTIsImageType(PolyBlockType type) {
- return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE ||
- type == PT_PULLOUT_IMAGE;
-}
-/** Returns true if PolyBlockType is of text type */
-inline bool PTIsTextType(PolyBlockType type) {
- return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||
- type == PT_PULLOUT_TEXT || type == PT_TABLE ||
- type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
- type == PT_INLINE_EQUATION;
-}
-// Returns true if PolyBlockType is of pullout(inter-column) type
-inline bool PTIsPulloutType(PolyBlockType type) {
- return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT;
-}
-
-/**
- * +------------------+ Orientation Example:
- * | 1 Aaaa Aaaa Aaaa | ====================
- * | Aaa aa aaa aa | To left is a diagram of some (1) English and
- * | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit.
- * | 2 |
- * | ####### c c C | Upright Latin characters are represented as A and a.
- * | ####### c c c | '<' represents a latin character rotated
- * | < ####### c c c | anti-clockwise 90 degrees.
- * | < ####### c c |
- * | < ####### . c | Upright Chinese characters are represented C and c.
- * | 3 ####### c |
- * +------------------+ NOTA BENE: enum values here should match goodoc.proto
-
- * If you orient your head so that "up" aligns with Orientation,
- * then the characters will appear "right side up" and readable.
- *
- * In the example above, both the English and Chinese paragraphs are oriented
- * so their "up" is the top of the page (page up). The photo credit is read
- * with one's head turned leftward ("up" is to page left).
- *
- * The values of this enum match the convention of Tesseract's osdetect.h
-*/
-enum Orientation {
- ORIENTATION_PAGE_UP = 0,
- ORIENTATION_PAGE_RIGHT = 1,
- ORIENTATION_PAGE_DOWN = 2,
- ORIENTATION_PAGE_LEFT = 3,
-};
-
-/**
- * The grapheme clusters within a line of text are laid out logically
- * in this direction, judged when looking at the text line rotated so that
- * its Orientation is "page up".
- *
- * For English text, the writing direction is left-to-right. For the
- * Chinese text in the above example, the writing direction is top-to-bottom.
- */
-enum WritingDirection {
- WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
- WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
- WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * The text lines are read in the given sequence.
- *
- * In English, the order is top-to-bottom.
- * In Chinese, vertical text lines are read right-to-left. Mongolian is
- * written in vertical columns top to bottom like Chinese, but the lines
- * order left-to right.
- *
- * Note that only some combinations make sense. For example,
- * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
- */
-enum TextlineOrder {
- TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
- TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
- TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
-};
-
-/**
- * Possible modes for page layout analysis. These *must* be kept in order
- * of decreasing amount of layout analysis to be done, except for OSD_ONLY,
- * so that the inequality test macros below work.
- */
-enum PageSegMode {
- PSM_OSD_ONLY = 0, ///< Orientation and script detection only.
- PSM_AUTO_OSD = 1, ///< Automatic page segmentation with orientation and
- ///< script detection. (OSD)
- PSM_AUTO_ONLY = 2, ///< Automatic page segmentation, but no OSD, or OCR.
- PSM_AUTO = 3, ///< Fully automatic page segmentation, but no OSD.
- PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes.
- PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of
- ///< vertically aligned text.
- PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.)
- PSM_SINGLE_LINE = 7, ///< Treat the image as a single text line.
- PSM_SINGLE_WORD = 8, ///< Treat the image as a single word.
- PSM_CIRCLE_WORD = 9, ///< Treat the image as a single word in a circle.
- PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character.
- PSM_SPARSE_TEXT =
- 11, ///< Find as much text as possible in no particular order.
- PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det.
- PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing
- ///< hacks that are Tesseract-specific.
-
- PSM_COUNT ///< Number of enum entries.
-};
-
-/**
- * Inline functions that act on a PageSegMode to determine whether components of
- * layout analysis are enabled.
- * *Depend critically on the order of elements of PageSegMode.*
- * NOTE that arg is an int for compatibility with INT_PARAM.
- */
-inline bool PSM_OSD_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {
- return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
-}
-inline bool PSM_SPARSE(int pageseg_mode) {
- return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
-}
-inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
- return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
-}
-inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
- return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
- pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
-}
-
-/**
- * enum of the elements of the page hierarchy, used in ResultIterator
- * to provide functions that operate on each level without having to
- * have 5x as many functions.
- */
-enum PageIteratorLevel {
- RIL_BLOCK, // Block of text/image/separator line.
- RIL_PARA, // Paragraph within a block.
- RIL_TEXTLINE, // Line within a paragraph.
- RIL_WORD, // Word within a textline.
- RIL_SYMBOL // Symbol/character within a word.
-};
-
-/**
- * JUSTIFICATION_UNKNOWN
- * The alignment is not clearly one of the other options. This could happen
- * for example if there are only one or two lines of text or the text looks
- * like source code or poetry.
- *
- * NOTA BENE: Fully justified paragraphs (text aligned to both left and right
- * margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
- * is written with a left-to-right script and with JUSTIFICATION_RIGHT if
- * their text is written in a right-to-left script.
- *
- * Interpretation for text read in vertical lines:
- * "Left" is wherever the starting reading position is.
- *
- * JUSTIFICATION_LEFT
- * Each line, except possibly the first, is flush to the same left tab stop.
- *
- * JUSTIFICATION_CENTER
- * The text lines of the paragraph are centered about a line going
- * down through their middle of the text lines.
- *
- * JUSTIFICATION_RIGHT
- * Each line, except possibly the first, is flush to the same right tab stop.
- */
-enum ParagraphJustification {
- JUSTIFICATION_UNKNOWN,
- JUSTIFICATION_LEFT,
- JUSTIFICATION_CENTER,
- JUSTIFICATION_RIGHT,
-};
-
-/**
- * When Tesseract/Cube is initialized we can choose to instantiate/load/run
- * only the Tesseract part, only the Cube part or both along with the combiner.
- * The preference of which engine to use is stored in tessedit_ocr_engine_mode.
- *
- * ATTENTION: When modifying this enum, please make sure to make the
- * appropriate changes to all the enums mirroring it (e.g. OCREngine in
- * cityblock/workflow/detection/detection_storage.proto). Such enums will
- * mention the connection to OcrEngineMode in the comments.
- */
-enum OcrEngineMode {
- OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated
- OEM_LSTM_ONLY, // Run just the LSTM line recognizer.
- OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback
- // to Tesseract when things get difficult.
- // deprecated
- OEM_DEFAULT, // Specify this mode when calling init_*(),
- // to indicate that any of the above modes
- // should be automatically inferred from the
- // variables in the language-specific config,
- // command-line configs, or if not specified
- // in any of the above should be set to the
- // default OEM_TESSERACT_ONLY.
- OEM_COUNT // Number of OEMs
-};
-
-} // namespace tesseract.
-
-#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/renderer.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/renderer.h
deleted file mode 100644
index 6f405233..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/renderer.h
+++ /dev/null
@@ -1,311 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: renderer.h
-// Description: Rendering interface to inject into TessBaseAPI
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_API_RENDERER_H_
-#define TESSERACT_API_RENDERER_H_
-
-#include "export.h"
-
-// To avoid collision with other typenames include the ABSOLUTE MINIMUM
-// complexity of includes here. Use forward declarations wherever possible
-// and hide includes of complex types in baseapi.cpp.
-#include
-#include // for std::string
-#include // for std::vector
-
-struct Pix;
-
-namespace tesseract {
-
-class TessBaseAPI;
-
-/**
- * Interface for rendering tesseract results into a document, such as text,
- * HOCR or pdf. This class is abstract. Specific classes handle individual
- * formats. This interface is then used to inject the renderer class into
- * tesseract when processing images.
- *
- * For simplicity implementing this with tesseract version 3.01,
- * the renderer contains document state that is cleared from document
- * to document just as the TessBaseAPI is. This way the base API can just
- * delegate its rendering functionality to injected renderers, and the
- * renderers can manage the associated state needed for the specific formats
- * in addition to the heuristics for producing it.
- */
-class TESS_API TessResultRenderer {
-public:
- virtual ~TessResultRenderer();
-
- // Takes ownership of pointer so must be new'd instance.
- // Renderers aren't ordered, but appends the sequences of next parameter
- // and existing next(). The renderers should be unique across both lists.
- void insert(TessResultRenderer *next);
-
- // Returns the next renderer or nullptr.
- TessResultRenderer *next() {
- return next_;
- }
-
- /**
- * Starts a new document with the given title.
- * This clears the contents of the output data.
- * Title should use UTF-8 encoding.
- */
- bool BeginDocument(const char *title);
-
- /**
- * Adds the recognized text from the source image to the current document.
- * Invalid if BeginDocument not yet called.
- *
- * Note that this API is a bit weird but is designed to fit into the
- * current TessBaseAPI implementation where the api has lots of state
- * information that we might want to add in.
- */
- bool AddImage(TessBaseAPI *api);
-
- /**
- * Finishes the document and finalizes the output data
- * Invalid if BeginDocument not yet called.
- */
- bool EndDocument();
-
- const char *file_extension() const {
- return file_extension_;
- }
- const char *title() const {
- return title_.c_str();
- }
-
- // Is everything fine? Otherwise something went wrong.
- bool happy() const {
- return happy_;
- }
-
- /**
- * Returns the index of the last image given to AddImage
- * (i.e. images are incremented whether the image succeeded or not)
- *
- * This is always defined. It means either the number of the
- * current image, the last image ended, or in the completed document
- * depending on when in the document lifecycle you are looking at it.
- * Will return -1 if a document was never started.
- */
- int imagenum() const {
- return imagenum_;
- }
-
-protected:
- /**
- * Called by concrete classes.
- *
- * outputbase is the name of the output file excluding
- * extension. For example, "/path/to/chocolate-chip-cookie-recipe"
- *
- * extension indicates the file extension to be used for output
- * files. For example "pdf" will produce a .pdf file, and "hocr"
- * will produce .hocr files.
- */
- TessResultRenderer(const char *outputbase, const char *extension);
-
- // Hook for specialized handling in BeginDocument()
- virtual bool BeginDocumentHandler();
-
- // This must be overridden to render the OCR'd results
- virtual bool AddImageHandler(TessBaseAPI *api) = 0;
-
- // Hook for specialized handling in EndDocument()
- virtual bool EndDocumentHandler();
-
- // Renderers can call this to append '\0' terminated strings into
- // the output string returned by GetOutput.
- // This method will grow the output buffer if needed.
- void AppendString(const char *s);
-
- // Renderers can call this to append binary byte sequences into
- // the output string returned by GetOutput. Note that s is not necessarily
- // '\0' terminated (and can contain '\0' within it).
- // This method will grow the output buffer if needed.
- void AppendData(const char *s, int len);
-
-private:
- TessResultRenderer *next_; // Can link multiple renderers together
- FILE *fout_; // output file pointer
- const char *file_extension_; // standard extension for generated output
- std::string title_; // title of document being rendered
- int imagenum_; // index of last image added
- bool happy_; // I get grumpy when the disk fills up, etc.
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessTextRenderer : public TessResultRenderer {
-public:
- explicit TessTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into an hocr text string
- */
-class TESS_API TessHOcrRenderer : public TessResultRenderer {
-public:
- explicit TessHOcrRenderer(const char *outputbase, bool font_info);
- explicit TessHOcrRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into an alto text string
- */
-class TESS_API TessAltoRenderer : public TessResultRenderer {
-public:
- explicit TessAltoRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool begin_document;
-};
-
-/**
- * Renders Tesseract output into a TSV string
- */
-class TESS_API TessTsvRenderer : public TessResultRenderer {
-public:
- explicit TessTsvRenderer(const char *outputbase, bool font_info);
- explicit TessTsvRenderer(const char *outputbase);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- bool font_info_; // whether to print font information
-};
-
-/**
- * Renders tesseract output into searchable PDF
- */
-class TESS_API TessPDFRenderer : public TessResultRenderer {
-public:
- // datadir is the location of the TESSDATA. We need it because
- // we load a custom PDF font from this location.
- TessPDFRenderer(const char *outputbase, const char *datadir,
- bool textonly = false);
-
-protected:
- bool BeginDocumentHandler() override;
- bool AddImageHandler(TessBaseAPI *api) override;
- bool EndDocumentHandler() override;
-
-private:
- // We don't want to have every image in memory at once,
- // so we store some metadata as we go along producing
- // PDFs one page at a time. At the end, that metadata is
- // used to make everything that isn't easily handled in a
- // streaming fashion.
- long int obj_; // counter for PDF objects
- std::vector offsets_; // offset of every PDF object in bytes
- std::vector pages_; // object number for every /Page object
- std::string datadir_; // where to find the custom font
- bool textonly_; // skip images if set
- // Bookkeeping only. DIY = Do It Yourself.
- void AppendPDFObjectDIY(size_t objectsize);
- // Bookkeeping + emit data.
- void AppendPDFObject(const char *data);
- // Create the /Contents object for an entire page.
- char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);
- // Turn an image into a PDF object. Only transcode if we have to.
- static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,
- char **pdf_object, long int *pdf_object_size,
- int jpg_quality);
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessUnlvRenderer : public TessResultRenderer {
-public:
- explicit TessUnlvRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string for LSTMBox
- */
-class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
-public:
- explicit TessLSTMBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string
- */
-class TESS_API TessBoxTextRenderer : public TessResultRenderer {
-public:
- explicit TessBoxTextRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-/**
- * Renders tesseract output into a plain UTF-8 text string in WordStr format
- */
-class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
-public:
- explicit TessWordStrBoxRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-/**
- * Renders tesseract output into an osd text string
- */
-class TESS_API TessOsdRenderer : public TessResultRenderer {
-public:
- explicit TessOsdRenderer(const char *outputbase);
-
-protected:
- bool AddImageHandler(TessBaseAPI *api) override;
-};
-
-#endif // ndef DISABLED_LEGACY_ENGINE
-
-} // namespace tesseract.
-
-#endif // TESSERACT_API_RENDERER_H_
diff --git a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/resultiterator.h b/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/resultiterator.h
deleted file mode 100644
index 3e4d5807..00000000
--- a/third_party/ocr/tesseract-ocr/uos/aarch64/include/tesseract/resultiterator.h
+++ /dev/null
@@ -1,250 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// File: resultiterator.h
-// Description: Iterator for tesseract results that is capable of
-// iterating in proper reading order over Bi Directional
-// (e.g. mixed Hebrew and English) text.
-// Author: David Eger
-//
-// (C) Copyright 2011, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
-
-#include "export.h" // for TESS_API, TESS_LOCAL
-#include "ltrresultiterator.h" // for LTRResultIterator
-#include "publictypes.h" // for PageIteratorLevel
-#include "unichar.h" // for StrongScriptDirection
-
-#include // for std::pair
-#include // for std::vector
-
-namespace tesseract {
-
-class TESS_API ResultIterator : public LTRResultIterator {
-public:
- static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
-
- /**
- * ResultIterator is copy constructible!
- * The default copy constructor works just fine for us.
- */
- ~ResultIterator() override = default;
-
- // ============= Moving around within the page ============.
- /**
- * Moves the iterator to point to the start of the page to begin
- * an iteration.
- */
- void Begin() override;
-
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy in the appropriate reading order and returns false if
- * the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- bool Next(PageIteratorLevel level) override;
-
- /**
- * IsAtBeginningOf() returns whether we're at the logical beginning of the
- * given level. (as opposed to ResultIterator's left-to-right top-to-bottom
- * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
- * For a full description, see pageiterator.h
- */
- bool IsAtBeginningOf(PageIteratorLevel level) const override;
-
- /**
- * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
- * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
- * point at the last word in a paragraph. See PageIterator for full comment.
- */
- bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const override;
-
- // ============= Functions that refer to words only ============.
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
-
- // ============= Accessing data ==============.
-
- /**
- * Returns the null terminated UTF-8 encoded text string for the current
- * object at the given level. Use delete [] to free after use.
- */
- virtual char *GetUTF8Text(PageIteratorLevel level) const;
-
- /**
- * Returns the LSTM choices for every LSTM timestep for the current word.
- */
- virtual std::vector>>>
- *GetRawLSTMTimesteps() const;
- virtual std::vector>>
- *GetBestLSTMSymbolChoices() const;
-
- /**
- * Return whether the current paragraph's dominant reading direction
- * is left-to-right (as opposed to right-to-left).
- */
- bool ParagraphIsLtr() const;
-
- // ============= Exposed only for testing =============.
-
- /**
- * Yields the reading order as a sequence of indices and (optional)
- * meta-marks for a set of words (given left-to-right).
- * The meta marks are passed as negative values:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The next indexed word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- *
- * For example, suppose we have five words in a text line,
- * indexed [0,1,2,3,4] from the leftmost side of the text line.
- * The following are all believable reading_orders:
- *
- * Left-to-Right (in ltr paragraph):
- * { 0, 1, 2, 3, 4 }
- * Left-to-Right (in rtl paragraph):
- * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- * Right-to-Left (in rtl paragraph):
- * { 4, 3, 2, 1, 0 }
- * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
- * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
- */
- static void CalculateTextlineOrder(
- bool paragraph_is_ltr,
- const std::vector &word_dirs,
- std::vector