// twain3.0/huagao/hg_ocr.cpp
// Repository-viewer metadata: 512 lines, 16 KiB, C++; history entry dated 2021-11-20 06:24:33 +00:00.
#include "hg_ocr.h"
#include "allheaders.h"
#include "baseapi.h"
#include "basedir.h"
#include "osdetect.h"
#include "renderer.h"
#include "strngs.h"
#include "tprintf.h"
#include "resultiterator.h"
#include <math.h>
// Raw byte blob named after the PDF "CIDToGIDMap" font entry.
// NOTE(review): the leading bytes 120,156 (0x78 0x9C) match a zlib stream
// header, so this is presumably a deflate-compressed CID-to-GID map that is
// written verbatim into a PDF object - confirm against the code that emits it.
// It is not referenced in the portion of the file visible here.
static unsigned char string_CIDTOGIDMAP[] = {
120,156,236,194,1,9,0,0,0,2,160,250,127,186,33,137,
166,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,128,123,3,0,0,255,255,236,194,1,13,0,0,
0,194,32,223,191,180,69,24,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,0,0,0,255,
255,236,194,1,13,0,0,0,194,32,223,191,180,69,24,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,235,0,0,0,255,255,237,194,1,13,0,0,0,194,32,
223,191,180,69,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,0,255,0,16};
// Raw bytes of an embedded font file.
// The data starts with 0,1,0,0 and contains the ASCII table tags "OS/2",
// "cmap", "glyf", "head", "hhea", "hmtx", "loca", "maxp", "name" and "post",
// i.e. it has the layout of a small TrueType font.
// NOTE(review): presumably this is the glyph-less font used for the invisible
// text layer of generated PDFs - confirm against the code that embeds it.
// It is not referenced in the portion of the file visible here.
static unsigned char string_TTF[] = {
0,1,0,0,0,10,0,128,0,3,0,32,79,83,47,50,86,221,200,148,
0,0,1,40,0,0,0,96,99,109,97,112,0,18,0,78,0,0,1,144,0,0,
0,44,103,108,121,102,0,0,0,0,0,0,1,196,0,0,0,1,104,101,
97,100,2,80,182,226,0,0,0,172,0,0,0,54,104,104,101,97,0,
3,0,2,0,0,0,228,0,0,0,36,104,109,116,120,0,0,0,0,0,0,1,
136,0,0,0,8,108,111,99,97,0,0,0,0,0,0,1,188,0,0,0,6,109,
97,120,112,0,3,0,1,0,0,1,8,0,0,0,32,110,97,109,101,165,
232,245,73,0,0,1,200,0,0,0,80,112,111,115,116,0,1,0,1,0,
0,2,24,0,0,0,32,0,1,0,0,0,1,0,0,167,55,179,76,95,15,60,245,
4,7,1,0,0,0,0,0,207,154,252,110,0,0,0,0,207,154,252,110,0,
0,128,0,0,0,0,1,0,0,0,16,0,2,0,0,0,0,0,0,0,1,0,0,0,1,255,
255,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,3,0,0,1,144,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,5,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,71,79,79,71,0,64,255,255,0,0,0,1,255,255,0,0,0,1,
0,1,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,2,0,1,0,0,0,0,0,20,0,3,0,0,0,0,0,32,0,6,0,12,0,0,0,0,
0,1,0,0,0,6,0,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,42,0,0,0,3,0,0,0,5,0,22,0,11,0,1,0,0,0,0,0,5,0,11,0,
0,0,3,0,1,4,9,0,5,0,22,0,11,86,101,114,115,105,111,110,32,
49,46,48,0,86,0,101,0,114,0,115,0,105,0,111,0,110,0,32,0,49,
0,46,0,48,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0};
// Text that closes a stream object ("endstream" followed by "endobj").
// Not referenced in the portion of the file visible here.
static const char* endstream = "endstream\nendobj\n";

// Forward declarations of the free helper functions defined further down.

// Copies a raw image buffer into a newly created Leptonica Pix.
Pix* createPix(const unsigned char * imgData, int width, int height, int bytes_per_pixel, int bytes_per_line);
// Appends a PDF renderer to `renderers` when the "tessedit_create_pdf" variable is set on `api`.
void PreloadRenderers(tesseract::TessBaseAPI* api,
tesseract::PointerVector<tesseract::TessResultRenderer>* renderers, const char* outputbase);
// Builds the PDF page content (image placement plus invisible text) as a new[]-allocated C string.
char* GetPDFTextObjectss(tesseract::TessBaseAPI* api, double width, double height, int& len);
HG_OCR::HG_OCR()
: api(new tesseract::TessBaseAPI())
{
}
HG_OCR::HG_OCR(PSM_TYPE type)
: api(new tesseract::TessBaseAPI())
{
init_orientation("./tessdata/osd.traineddata");
}
// Destroys the Tesseract engine object created by the constructors.
HG_OCR::~HG_OCR()
{
	tesseract::TessBaseAPI* const engine = reinterpret_cast<tesseract::TessBaseAPI*>(api);
	// Deleting a null pointer is a no-op, so no explicit null test is needed.
	delete engine;
}
// Sets the engine up for orientation detection with the OSD model at the
// fixed relative path below. The `type` argument is accepted but not consulted.
void HG_OCR::init(HG_OCR::PSM_TYPE type)
{
	const char* const osdModelPath = "./tessdata/osd.traineddata";
	init_orientation(osdModelPath);
}
// Initialises the engine from `filename` for the requested mode.
// Only the Orientation mode is handled; any other `type` leaves the engine untouched.
void HG_OCR::init(const char * filename, PSM_TYPE type)
{
	if (type == Orientation)
	{
		init_orientation(filename);
	}
}
// Convenience overload: runs orientation detection on the given image buffer
// and returns only the orientation value.
//
// imgData/width/height/channels/step are forwarded unchanged to the detailed
// overload. The locals are zero-initialised so that a failed detection (for
// example a missing engine) yields 0 instead of an indeterminate value.
int HG_OCR::getOrientation(unsigned char *imgData, int width, int height, int channels, int step)
{
	int orientation = 0;
	int direction = 0;
	int lineOrder = 0;
	float deskewAngle = 0.0f;
	getOrientation(imgData, width, height, channels, step,
		orientation, direction, lineOrder, deskewAngle);
	return orientation;
}
// Runs layout analysis on the given image buffer and reports the orientation.
//
// imgData/width/height/channels/step describe the raw image and are handed to
// TessBaseAPI::SetImage as-is.
// On success `orientation` receives the value returned by AnalyseLayout1().
// `direction`, `lineOrder` and `deskewAngle` are only written by the disabled
// (#if 0) stock-API path below; in the active path they are left untouched.
//
// Returns false when there is no engine or no image data, true otherwise.
bool HG_OCR::getOrientation(unsigned char* imgData, int width, int height, int channels, int step,
	int& orientation, int& direction, int& lineOrder, float& deskewAngle)
{
	if (api == nullptr || imgData == nullptr) return false;
	tesseract::TessBaseAPI* ptr = reinterpret_cast<tesseract::TessBaseAPI*>(api);
	ptr->SetImage(imgData, width, height, channels, step);
#if 0
	tesseract::PageIterator* it = ptr->AnalyseLayout();
	if (it != nullptr)
	{
		it->Orientation(reinterpret_cast<tesseract::Orientation*>(&orientation),
			reinterpret_cast<tesseract::WritingDirection*>(&direction),
			reinterpret_cast<tesseract::TextlineOrder*>(&lineOrder),
			&deskewAngle);
		delete it;
		return true;
	}
	else
		return false;
#endif
	orientation = ptr->AnalyseLayout1();
	// The original fell off the end of a non-void function here (undefined behaviour).
	return true;
}
void HG_OCR::init_orientation(const char *filename)
{
auto ret= reinterpret_cast<tesseract::TessBaseAPI*>(api)->Init(filename, "osd");
reinterpret_cast<tesseract::TessBaseAPI*>(api)->SetPageSegMode(tesseract::PSM_AUTO_OSD);
}
// Appends a PDF renderer to `renderers` when the "tessedit_create_pdf"
// variable is set on `api`.
//
// api        - engine whose variables and data path are queried.
// renderers  - receives a newly allocated TessPDFRenderer (pushed by pointer).
// outputbase - output base name handed to the renderer.
//
// Both flags start as false so that a variable GetBoolVariable does not
// recognise (and therefore does not write) reads as "off" rather than as an
// uninitialised value.
void PreloadRenderers(tesseract::TessBaseAPI* api,
	tesseract::PointerVector<tesseract::TessResultRenderer>* renderers,
	const char* outputbase)
{
	bool create_pdf = false;
	api->GetBoolVariable("tessedit_create_pdf", &create_pdf);
	if (create_pdf)
	{
		bool textonly = false;
		api->GetBoolVariable("textonly_pdf", &textonly);
		printf("GetDatapath%s\n", api->GetDatapath());
		renderers->push_back(new tesseract::TessPDFRenderer(outputbase, api->GetDatapath(), textonly));
	}
}
// Copies a raw image buffer into a newly created Leptonica Pix.
//
// imagedata       - first byte of the top image row.
// width, height   - image size in pixels.
// bytes_per_pixel - 0 is treated as 1 bit per pixel, 1 as 8-bit grey,
//                   3 as 24-bit colour (stored in a 32-bit Pix), 4 as 32-bit.
//                   Other values produce a Pix whose pixel data is not filled in.
// bytes_per_line  - distance in bytes between consecutive rows of `imagedata`.
//
// Returns the new Pix, or nullptr if Leptonica could not allocate it.
// The resolution is set to 200 in both directions.
Pix* createPix(const unsigned char * imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
{
	int bpp = bytes_per_pixel * 8;
	if (bpp == 0) bpp = 1;
	Pix* pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
	// pixCreate fails on invalid dimensions or allocation failure; bail out
	// instead of dereferencing a null Pix below.
	if (pix == nullptr)
		return nullptr;
	l_uint32* data = pixGetData(pix);
	int wpl = pixGetWpl(pix);
	switch (bpp) {
	case 1:
		// A set source bit clears the destination bit and vice versa,
		// i.e. the polarity is inverted while copying.
		for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line)
		{
			for (int x = 0; x < width; ++x) {
				if (imagedata[x / 8] & (0x80 >> (x % 8)))
					CLEAR_DATA_BIT(data, x);
				else
					SET_DATA_BIT(data, x);
			}
		}
		break;
	case 8:
		// Greyscale just copies the bytes in the right order.
		for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line)
			for (int x = 0; x < width; ++x)
				SET_DATA_BYTE(data, x, imagedata[x]);
		break;
	case 24:
		// Put the colors in the correct places in the line buffer.
		// `data` advances one 32-bit word per pixel across the whole image.
		for (int y = 0; y < height; ++y, imagedata += bytes_per_line)
		{
			for (int x = 0; x < width; ++x, ++data) {
				SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
				SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
				SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
			}
		}
		break;
	case 32:
		// Maintain byte order consistency across different endianness.
		for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl)
			for (int x = 0; x < width; ++x)
				data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |
				(imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];
		break;
	default:
		break;
	}
	pix->informat = bytes_per_pixel == 1 ? 1 : 2;
	if (bytes_per_pixel == 1)
	{
		// Attach a 256-entry colormap in which every byte of entry i equals i.
		PIXCMAP* colormap = pixcmapCreate(8);
		if (colormap != nullptr)
		{
			LEPT_FREE(colormap->array);
			colormap->array = (l_uint8 *)LEPT_CALLOC(256, sizeof(RGBA_QUAD));
			colormap->n = 256;
			colormap->nalloc = 256;
			colormap->depth = 8;
			l_uint8* ptr = reinterpret_cast<l_uint8*>(colormap->array);
			if (ptr != nullptr)
			{
				for (int i = 0; i < 256; i++)
					ptr[i * 4 + 0] = ptr[i * 4 + 1] = ptr[i * 4 + 2] = ptr[i * 4 + 3] = i;
			}
			pixSetColormap(pix, colormap);
		}
	}
	pixSetXRes(pix, 200);
	pixSetYRes(pix, 200);
	//FILE* file = fopenWriteStream("aaa.bmp", "w");
	//pixWriteStreamBmp(file, pix);
	//fclose(file);
	return pix;
}
// Rounds `x` to three decimal places.
// A result of zero is always returned as positive zero, so a negative zero
// is never handed on to the caller.
double prec(double x) {
	const double scale = 1000.0;
	const double rounded = round(x * scale) / scale;
	return (rounded == 0) ? 0.0 : rounded;
}
// Copies a baseline (x1,y1)-(x2,y2) to the output points and flattens it to a
// horizontal line when it is nearly horizontal.
//
// ppi              - image resolution in pixels per inch, used to convert
//                    pixel distances to points (1/72 inch).
// x1,y1,x2,y2      - baseline end points in pixels.
// line_x1..line_y2 - receive the (possibly flattened) baseline.
//
// The baseline is flattened to y = (y1 + y2) / 2 when its rise is below 2
// points and its run is above 2 points. The conversion to points is done in
// floating point; the original integer division truncated both values before
// the comparison. A non-positive ppi leaves the baseline unclipped instead of
// dividing by zero.
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
	int *line_x1, int *line_y1,
	int *line_x2, int *line_y2)
{
	*line_x1 = x1;
	*line_y1 = y1;
	*line_x2 = x2;
	*line_y2 = y2;
	if (ppi <= 0)
		return;
	const double rise = abs(y2 - y1) * 72.0 / ppi;
	const double run = abs(x2 - x1) * 72.0 / ppi;
	if (rise < 2.0 && 2.0 < run)
		*line_y1 = *line_y2 = (y1 + y2) / 2;
}
// Returns the squared Euclidean distance between (x1,y1) and (x2,y2).
// The arithmetic is done in 64 bits so that large coordinate differences do
// not overflow `int`; the result is then converted to the declared `long`
// return type.
long dist2(int x1, int y1, int x2, int y2)
{
	const long long dx = static_cast<long long>(x2) - x1;
	const long long dy = static_cast<long long>(y2) - y1;
	return static_cast<long>(dx * dx + dy * dy);
}
// Computes where a word starts on its text line and how long it is, in points.
//
// writing_direction - for right-to-left words the two word end points are
//                     swapped first, so the "start" is the right-hand end.
// ppi               - image resolution used to convert pixels to points.
// height            - page height; the y result is measured from it
//                     (y0 = height - y_in_points).
// word_*            - word baseline end points in pixels.
// line_*            - text line baseline end points in pixels.
// x0, y0            - receive the word start projected onto the line baseline.
// length            - receives the word baseline length in points.
void GetWordBaseline(int writing_direction, int ppi, int height,
	int word_x1, int word_y1, int word_x2, int word_y2,
	int line_x1, int line_y1, int line_x2, int line_y2,
	double *x0, double *y0, double *length)
{
	const bool right_to_left =
		(writing_direction == tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT);
	if (right_to_left)
	{
		Swap(&word_x1, &word_x2);
		Swap(&word_y1, &word_y2);
	}

	// Start from the degenerate case (zero-length line): use its first point.
	double proj_x = line_x1;
	double proj_y = line_y1;
	const double line_len_sq = dist2(line_x1, line_y1, line_x2, line_y2);
	if (line_len_sq != 0)
	{
		// Orthogonal projection of the word start onto the line baseline.
		const double t = ((word_x1 - line_x2) * (line_x2 - line_x1) +
			(word_y1 - line_y2) * (line_y2 - line_y1)) / line_len_sq;
		proj_x = line_x2 + t * (line_x2 - line_x1);
		proj_y = line_y2 + t * (line_y2 - line_y1);
	}

	const double word_len_px = sqrt(static_cast<double>(dist2(word_x1, word_y1,
		word_x2, word_y2)));

	*x0 = proj_x * 72 / ppi;
	*y0 = height - (proj_y * 72.0 / ppi);
	*length = word_len_px * 72.0 / ppi;
}
// Computes the 2x2 rotation part (a b; c d) of the text matrix for a line
// whose baseline runs from (line_x1,line_y1) to (line_x2,line_y2).
// For right-to-left text the first row (a, b) is negated; every other
// writing direction uses the plain rotation.
void AffineMatrix(int writing_direction,
	int line_x1, int line_y1, int line_x2, int line_y2,
	double *a, double *b, double *c, double *d)
{
	const double theta = atan2(static_cast<double>(line_y1 - line_y2),
		static_cast<double>(line_x2 - line_x1));
	const double cos_theta = cos(theta);
	const double sin_theta = sin(theta);

	// TODO(jbreiden) Consider using the vertical PDF writing mode for
	// top-to-bottom text; it currently gets the same matrix as left-to-right.
	const bool mirror_first_row =
		(writing_direction == tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT);

	*a = mirror_first_row ? -cos_theta : cos_theta;
	*b = mirror_first_row ? -sin_theta : sin_theta;
	*c = -sin_theta;
	*d = cos_theta;
}
// Writes the UTF-16BE encoding of `code` into `utf16` as upper-case hex text.
//
// code  - Unicode code point.
// utf16 - output buffer of at least 20 bytes; receives 4 hex digits for code
//         points below 0x10000, otherwise 8 hex digits (high surrogate then
//         low surrogate), NUL-terminated.
//
// Returns false (after logging through tprintf) and leaves `utf16` untouched
// when `code` is negative, lies in the surrogate range 0xD800-0xDFFF or
// exceeds 0x10FFFF. Negative values used to slip through and were formatted
// as eight hex digits of the two's-complement bit pattern.
bool CodepointToUtf16be(int code, char utf16[20])
{
	const bool is_surrogate = (code > 0xD7FF && code < 0xE000);
	if (code < 0 || is_surrogate || code > 0x10FFFF) {
		tprintf("Dropping invalid codepoint %d\n", code);
		return false;
	}
	if (code < 0x10000) {
		snprintf(utf16, 20, "%04X", code);
	}
	else {
		int a = code - 0x010000;
		int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
		int low_surrogate = (0x03FF & a) + 0xDC00;
		snprintf(utf16, 20, "%04X%04X", high_surrogate, low_surrogate);
	}
	return true;
}
// Builds the PDF page content for the current recognition result of `api`:
// first the operators that draw the page image (/Im1) scaled to width x height,
// then one invisible-ink text object (BT ... ET) per recognised block.
//
// api    - engine holding the recognition result; its Y resolution is used to
//          convert pixels to points.
// width  - page width used to scale the image.
// height - page height used to scale the image and to flip the y axis.
// len    - receives the length of the returned string (terminator excluded).
//
// Returns a NUL-terminated buffer allocated with new[]; the caller owns it
// and must release it with delete[].
//
// If the engine has no result iterator (GetIterator() returns null) only the
// image-drawing part is produced; previously this case dereferenced a null
// pointer.
char* GetPDFTextObjectss(tesseract::TessBaseAPI* api, double width, double height, int& len)
{
	STRING pdf_str("");
	double ppi = api->GetSourceYResolution();

	// These initial conditions are all arbitrary and will be overwritten
	double old_x = 0.0, old_y = 0.0;
	int old_fontsize = 0;
	tesseract::WritingDirection old_writing_direction = tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT;
	bool new_block = true;
	int fontsize = 0;
	double a = 1;
	double b = 0;
	double c = 0;
	double d = 1;

	// TODO(jbreiden) This marries the text and image together.
	// Slightly cleaner from an abstraction standpoint if this were to
	// live inside a separate text object.
	pdf_str += "q ";
	pdf_str.add_str_double("", prec(width));
	pdf_str += " 0 0 ";
	pdf_str.add_str_double("", prec(height));
	pdf_str += " 0 0 cm";
	pdf_str += " /Im1 Do";
	pdf_str += " Q\n";

	int line_x1 = 0;
	int line_y1 = 0;
	int line_x2 = 0;
	int line_y2 = 0;

	tesseract::ResultIterator *res_it = api->GetIterator();
	while (res_it != nullptr && !res_it->Empty(tesseract::RIL_BLOCK))
	{
		if (res_it->IsAtBeginningOf(tesseract::RIL_BLOCK))
		{
			pdf_str += "BT\n3 Tr";     // Begin text object, use invisible ink
			old_fontsize = 0;          // Every block will declare its fontsize
			new_block = true;          // Every block will declare its affine matrix
		}

		if (res_it->IsAtBeginningOf(tesseract::RIL_TEXTLINE))
		{
			int x1, y1, x2, y2;
			res_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2);
			ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
		}

		if (res_it->Empty(tesseract::RIL_WORD))
		{
			res_it->Next(tesseract::RIL_WORD);
			continue;
		}

		// Writing direction changes at a per-word granularity
		tesseract::WritingDirection writing_direction;
		{
			tesseract::Orientation orientation;
			tesseract::TextlineOrder textline_order;
			float deskew_angle;
			res_it->Orientation(&orientation, &writing_direction,
				&textline_order, &deskew_angle);
			if (writing_direction != tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM)
			{
				switch (res_it->WordDirection()) {
				case DIR_LEFT_TO_RIGHT:
					writing_direction = tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT;
					break;
				case DIR_RIGHT_TO_LEFT:
					writing_direction = tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT;
					break;
				default:
					// Neutral or mixed words inherit the previous word's direction.
					writing_direction = old_writing_direction;
				}
			}
		}

		// Where is word origin and how long is it?
		double x, y, word_length;
		{
			int word_x1, word_y1, word_x2, word_y2;
			res_it->Baseline(tesseract::RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
			GetWordBaseline(writing_direction, ppi, height,
				word_x1, word_y1, word_x2, word_y2,
				line_x1, line_y1, line_x2, line_y2,
				&x, &y, &word_length);
		}

		if (writing_direction != old_writing_direction || new_block)
		{
			AffineMatrix(writing_direction,
				line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
			pdf_str.add_str_double(" ", prec(a));  // . This affine matrix
			pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
			pdf_str.add_str_double(" ", prec(c));  // . system for all
			pdf_str.add_str_double(" ", prec(d));  // . text that follows.
			pdf_str.add_str_double(" ", prec(x));  // .
			pdf_str.add_str_double(" ", prec(y));  // .
			pdf_str += (" Tm ");                   // Place cursor absolutely
			new_block = false;
		}
		else
		{
			// Express the move from the previous word in the rotated text space.
			double dx = x - old_x;
			double dy = y - old_y;
			pdf_str.add_str_double(" ", prec(dx * a + dy * b));
			pdf_str.add_str_double(" ", prec(dx * c + dy * d));
			pdf_str += (" Td ");                   // Relative moveto
		}
		old_x = x;
		old_y = y;
		old_writing_direction = writing_direction;

		// Adjust font size on a per word granularity. Pay attention to
		// fontsize, old_fontsize, and pdf_str. We've found that for
		// Arabic, Tesseract will happily return a fontsize of zero,
		// so we make up a default number to protect ourselves.
		{
			bool bold, italic, underlined, monospace, serif, smallcaps;
			int font_id;
			res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
				&serif, &smallcaps, &fontsize, &font_id);
			const int kDefaultFontsize = 8;
			if (fontsize <= 0)
				fontsize = kDefaultFontsize;
			if (fontsize != old_fontsize)
			{
				char textfont[20];
				snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
				pdf_str += textfont;
				old_fontsize = fontsize;
			}
		}

		// Query these before the symbol loop below moves the iterator past the word.
		bool last_word_in_line = res_it->IsAtFinalElement(tesseract::RIL_TEXTLINE, tesseract::RIL_WORD);
		bool last_word_in_block = res_it->IsAtFinalElement(tesseract::RIL_BLOCK, tesseract::RIL_WORD);

		// Collect the word as hex-encoded UTF-16BE, one symbol at a time.
		STRING pdf_word("");
		int pdf_word_len = 0;
		do {
			const char *grapheme = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
			if (grapheme && grapheme[0] != '\0') {
				GenericVector<int> unicodes;
				UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
				char utf16[20];
				for (int i = 0; i < unicodes.length(); i++)
				{
					int code = unicodes[i];
					if (CodepointToUtf16be(code, utf16))
					{
						pdf_word += utf16;
						pdf_word_len++;
					}
				}
			}
			delete[] grapheme;
			res_it->Next(tesseract::RIL_SYMBOL);
		} while (!res_it->Empty(tesseract::RIL_BLOCK) && !res_it->IsAtBeginningOf(tesseract::RIL_WORD));

		if (word_length > 0 && pdf_word_len > 0 && fontsize > 0)
		{
			double h_stretch = 2 * prec(100.0 * word_length / (fontsize * pdf_word_len));
			pdf_str.add_str_double("", h_stretch);
			pdf_str += " Tz";          // horizontal stretch
			pdf_str += " [ <";
			pdf_str += pdf_word;       // UTF-16BE representation
			pdf_str += "> ] TJ";       // show the text
		}
		if (last_word_in_line) {
			pdf_str += " \n";
		}
		if (last_word_in_block) {
			pdf_str += "ET\n";         // end the text object
		}
	}

	// Hand the result back as a plain new[]-allocated C string.
	char *ret = new char[pdf_str.length() + 1];
	strcpy(ret, pdf_str.string());
	delete res_it;
	len = pdf_str.length();
	return ret;
}