#include "hg_ocr.h"

#include "allheaders.h"
#include "baseapi.h"
#include "basedir.h"
#include "osdetect.h"
#include "renderer.h"
#include "strngs.h"
#include "tprintf.h"
#include "resultiterator.h"

// Standard headers for the math, stdio, stdlib and string routines used below.
// NOTE(review): the original include list named one further system header whose
// name is not legible in the reviewed text -- confirm nothing else is required.
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Binary blob named after the PDF CIDToGIDMap object.
// NOTE(review): not referenced anywhere in the visible code -- confirm usage.
static unsigned char string_CIDTOGIDMAP[] = {
    120,156,236,194,1,9,0,0,0,2,160,250,127,186,33,137,
    166,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,128,123,3,0,0,255,255,236,194,1,13,0,0,
    0,194,32,223,191,180,69,24,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,0,0,0,255,
    255,236,194,1,13,0,0,0,194,32,223,191,180,69,24,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,235,0,0,0,255,255,237,194,1,13,0,0,0,194,32,
    223,191,180,69,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,0,255,0,16};

// Binary blob named after an embedded TrueType font.
// NOTE(review): not referenced anywhere in the visible code -- confirm usage.
static unsigned char string_TTF[] = {
    0,1,0,0,0,10,0,128,0,3,0,32,79,83,47,50,86,221,200,148,
    0,0,1,40,0,0,0,96,99,109,97,112,0,18,0,78,0,0,1,144,0,0,
    0,44,103,108,121,102,0,0,0,0,0,0,1,196,0,0,0,1,104,101,
    97,100,2,80,182,226,0,0,0,172,0,0,0,54,104,104,101,97,0,
    3,0,2,0,0,0,228,0,0,0,36,104,109,116,120,0,0,0,0,0,0,1,
    136,0,0,0,8,108,111,99,97,0,0,0,0,0,0,1,188,0,0,0,6,109,
    97,120,112,0,3,0,1,0,0,1,8,0,0,0,32,110,97,109,101,165,
    232,245,73,0,0,1,200,0,0,0,80,112,111,115,116,0,1,0,1,0,
    0,2,24,0,0,0,32,0,1,0,0,0,1,0,0,167,55,179,76,95,15,60,245,
    4,7,1,0,0,0,0,0,207,154,252,110,0,0,0,0,207,154,252,110,0,
    0,128,0,0,0,0,1,0,0,0,16,0,2,0,0,0,0,0,0,0,1,0,0,0,1,255,
    255,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    2,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,3,0,0,1,144,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,5,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,71,79,79,71,0,64,255,255,0,0,0,1,255,255,0,0,0,1,
    0,1,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
    0,0,0,2,0,1,0,0,0,0,0,20,0,3,0,0,0,0,0,32,0,6,0,12,0,0,0,0,
    0,1,0,0,0,6,0,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,3,0,42,0,0,0,3,0,0,0,5,0,22,0,11,0,1,0,0,0,0,0,5,0,11,0,
    0,0,3,0,1,4,9,0,5,0,22,0,11,86,101,114,115,105,111,110,32,
    49,46,48,0,86,0,101,0,114,0,115,0,105,0,111,0,110,0,32,0,49,
    0,46,0,48,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0};

// Trailer text of a PDF stream object.
// NOTE(review): not referenced anywhere in the visible code -- confirm usage.
static const char* endstream = "endstream\nendobj\n";

Pix* createPix(const unsigned char* imgData, int width, int height,
               int bytes_per_pixel, int bytes_per_line);
void PreloadRenderers(tesseract::TessBaseAPI* api,
                      tesseract::PointerVector<tesseract::TessResultRenderer>* renderers,
                      const char* outputbase);
char* GetPDFTextObjectss(tesseract::TessBaseAPI* api, double width, double height, int& len);

/// Creates the wrapper with a fresh, uninitialised Tesseract engine.
/// Call one of the init() overloads before using it.
HG_OCR::HG_OCR()
    : api(new tesseract::TessBaseAPI())
{
}

/// Creates the wrapper and immediately loads the default OSD model.
/// @param type Currently ignored: orientation detection is always set up.
HG_OCR::HG_OCR(PSM_TYPE type)
    : api(new tesseract::TessBaseAPI())
{
    init_orientation("./tessdata/osd.traineddata");
}

/// Destroys the owned Tesseract engine.
HG_OCR::~HG_OCR()
{
    // Deleting a null pointer is a no-op, so no explicit check is needed.
    delete reinterpret_cast<tesseract::TessBaseAPI*>(api);
}

/// Initialises the engine with the default OSD model path.
/// @param type Currently ignored: orientation detection is always set up.
void HG_OCR::init(HG_OCR::PSM_TYPE type)
{
    init_orientation("./tessdata/osd.traineddata");
}

/// Initialises the engine from @p filename for the requested mode.
/// Only the Orientation mode has an implementation; any other value is a no-op.
void HG_OCR::init(const char* filename, PSM_TYPE type)
{
    switch (type) {
    case Orientation:
        init_orientation(filename);
        break;
    default:
        break;
    }
}

/// Convenience overload returning only the detected orientation.
/// @return The value written by the full overload, or 0 when that overload
///         fails (the locals are zero-initialised so the result is never
///         indeterminate).
int HG_OCR::getOrientation(unsigned char* imgData, int width, int height, int channels, int step)
{
    int orientation = 0;
    int direction = 0;
    int lineOrder = 0;
    float deskewAngle = 0.0f;
    getOrientation(imgData, width, height, channels, step,
                   orientation, direction, lineOrder, deskewAngle);
    return orientation;
}

/// Hands the image to Tesseract and runs layout analysis.
///
/// @param imgData   Raw pixel buffer; must not be null.
/// @param channels  Forwarded to SetImage as the bytes-per-pixel argument.
/// @param step      Forwarded to SetImage as the bytes-per-line argument.
/// @param[out] orientation  Receives the result of AnalyseLayout1().
/// @param direction, lineOrder, deskewAngle  Left untouched by the active
///        code path (they are only written by the disabled block below).
/// @return false when no engine or no image is available, true otherwise.
bool HG_OCR::getOrientation(unsigned char* imgData, int width, int height, int channels, int step,
                            int& orientation, int& direction, int& lineOrder, float& deskewAngle)
{
    if (api == nullptr || imgData == nullptr)
        return false;

    tesseract::TessBaseAPI* ptr = reinterpret_cast<tesseract::TessBaseAPI*>(api);
    ptr->SetImage(imgData, width, height, channels, step);
#if 0
    // Disabled alternative based on the stock PageIterator API.
    tesseract::PageIterator* it = ptr->AnalyseLayout();
    if (it != nullptr)
    {
        it->Orientation(reinterpret_cast<tesseract::Orientation*>(&orientation),
                        reinterpret_cast<tesseract::WritingDirection*>(&direction),
                        reinterpret_cast<tesseract::TextlineOrder*>(&lineOrder),
                        &deskewAngle);
        delete it;
        return true;
    }
    else
        return false;
#endif
    orientation = ptr->AnalyseLayout1();
    // The original fell off the end of this non-void function (undefined behaviour).
    return true;
}

/// Loads the "osd" language from @p filename and selects PSM_AUTO_OSD.
/// A failing Init() is reported through tprintf; the page segmentation mode
/// is still set afterwards, as before.
void HG_OCR::init_orientation(const char* filename)
{
    tesseract::TessBaseAPI* engine = reinterpret_cast<tesseract::TessBaseAPI*>(api);
    if (engine == nullptr)
        return;

    if (engine->Init(filename, "osd") != 0)
        tprintf("HG_OCR: Tesseract Init failed for %s\n", filename != nullptr ? filename : "(null)");
    engine->SetPageSegMode(tesseract::PSM_AUTO_OSD);
}

/// Appends a PDF renderer to @p renderers when the engine variable
/// "tessedit_create_pdf" is enabled.
void PreloadRenderers(tesseract::TessBaseAPI* api,
                      tesseract::PointerVector<tesseract::TessResultRenderer>* renderers,
                      const char* outputbase)
{
    // Default to false so a failed variable lookup cannot leave these indeterminate.
    bool create_pdf = false;
    api->GetBoolVariable("tessedit_create_pdf", &create_pdf);
    if (!create_pdf)
        return;

    bool textonly = false;
    api->GetBoolVariable("textonly_pdf", &textonly);
    printf("GetDatapath%s\n", api->GetDatapath());
    renderers->push_back(new tesseract::TessPDFRenderer(outputbase, api->GetDatapath(), textonly));
}

/// Builds a Leptonica Pix from a raw pixel buffer.
///
/// @param imagedata        First byte of the top row of the source image.
/// @param bytes_per_pixel  0 for 1-bit packed data, otherwise 1, 3 or 4;
///                         other values yield a Pix whose pixels are not filled.
/// @param bytes_per_line   Stride of the source buffer in bytes.
/// @return A new Pix (24-bit input is stored as 32 bpp), or nullptr when
///         pixCreate fails. Resolution is stamped as 200x200.
Pix* createPix(const unsigned char* imagedata, int width, int height,
               int bytes_per_pixel, int bytes_per_line)
{
    int bpp = bytes_per_pixel * 8;
    if (bpp == 0)
        bpp = 1;

    Pix* pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
    if (pix == nullptr)
        return nullptr;

    l_uint32* data = pixGetData(pix);
    const int wpl = pixGetWpl(pix);
    switch (bpp) {
    case 1:
        // A set source bit clears the destination bit and vice versa.
        for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
            for (int x = 0; x < width; ++x) {
                if (imagedata[x / 8] & (0x80 >> (x % 8)))
                    CLEAR_DATA_BIT(data, x);
                else
                    SET_DATA_BIT(data, x);
            }
        }
        break;

    case 8:
        // Greyscale just copies the bytes in the right order.
        for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
            for (int x = 0; x < width; ++x)
                SET_DATA_BYTE(data, x, imagedata[x]);
        }
        break;

    case 24:
        // Put the colors in the correct places in the line buffer; each
        // destination pixel occupies one 32-bit word.
        for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
            for (int x = 0; x < width; ++x) {
                l_uint32* pixel = data + x;
                SET_DATA_BYTE(pixel, COLOR_RED, imagedata[3 * x]);
                SET_DATA_BYTE(pixel, COLOR_GREEN, imagedata[3 * x + 1]);
                SET_DATA_BYTE(pixel, COLOR_BLUE, imagedata[3 * x + 2]);
            }
        }
        break;

    case 32:
        // Maintain byte order consistency across different endianness.
        // Widen to l_uint32 before shifting: shifting a promoted int by 24
        // overflows for bytes >= 128.
        for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
            for (int x = 0; x < width; ++x) {
                data[x] = (static_cast<l_uint32>(imagedata[x * 4]) << 24) |
                          (static_cast<l_uint32>(imagedata[x * 4 + 1]) << 16) |
                          (static_cast<l_uint32>(imagedata[x * 4 + 2]) << 8) |
                          static_cast<l_uint32>(imagedata[x * 4 + 3]);
            }
        }
        break;

    default:
        break;
    }

    // Same values as the original literals 1 and 2.
    pixSetInputFormat(pix, bytes_per_pixel == 1 ? IFF_BMP : IFF_JFIF_JPEG);

    if (bytes_per_pixel == 1) {
        // Attach a 256-entry colormap whose entry i has all four bytes set to i.
        PIXCMAP* colormap = pixcmapCreate(8);
        if (colormap != nullptr) {
            LEPT_FREE(colormap->array);
            colormap->array = static_cast<l_uint8*>(LEPT_CALLOC(256, sizeof(RGBA_QUAD)));
            colormap->n = 256;
            colormap->nalloc = 256;
            colormap->depth = 8;
            l_uint8* entries = reinterpret_cast<l_uint8*>(colormap->array);
            if (entries != nullptr) {
                for (int i = 0; i < 256; i++) {
                    const l_uint8 level = static_cast<l_uint8>(i);
                    entries[i * 4 + 0] = level;
                    entries[i * 4 + 1] = level;
                    entries[i * 4 + 2] = level;
                    entries[i * 4 + 3] = level;
                }
            }
            pixSetColormap(pix, colormap);
        }
    }

    pixSetXRes(pix, 200);
    pixSetYRes(pix, 200);
    // Debug aid kept from the original:
    // FILE* file = fopenWriteStream("aaa.bmp", "w");
    // pixWriteStreamBmp(file, pix);
    // fclose(file);
    return pix;
}

/// Rounds @p x to three decimal places and normalises a zero result
/// (including negative zero) to positive zero.
double prec(double x)
{
    const double kPrecision = 1000.0;
    const double rounded = std::round(x * kPrecision) / kPrecision;
    if (rounded == 0.0)
        return 0;
    return rounded;
}

/// Copies the baseline (x1,y1)-(x2,y2) to the line_* outputs and flattens it
/// to its mean y when it rises less than 2 points over a run of more than
/// 2 points (72 points per inch, @p ppi pixels per inch).
/// A non-positive @p ppi leaves the baseline unclipped instead of dividing by zero.
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
                  int* line_x1, int* line_y1, int* line_x2, int* line_y2)
{
    *line_x1 = x1;
    *line_y1 = y1;
    *line_x2 = x2;
    *line_y2 = y2;
    if (ppi <= 0)
        return;

    // Integer arithmetic is kept on purpose so the clipping decision matches
    // the original behaviour exactly.
    const double rise = std::abs(y2 - y1) * 72 / ppi;
    const double run = std::abs(x2 - x1) * 72 / ppi;
    if (rise < 2.0 && 2.0 < run)
        *line_y1 = *line_y2 = (y1 + y2) / 2;
}

/// Squared Euclidean distance between two integer points.
/// Operands are widened before multiplying so the squares are not computed in int.
long dist2(int x1, int y1, int x2, int y2)
{
    const long dx = static_cast<long>(x2) - x1;
    const long dy = static_cast<long>(y2) - y1;
    return dx * dx + dy * dy;
}

/// Projects the start of a word baseline onto its text-line baseline and
/// converts the result to PDF units.
///
/// @param height  Page height in points; y is flipped against it.
/// @param[out] x0, y0   Projected word origin in points.
/// @param[out] length   Word baseline length in points.
void GetWordBaseline(int writing_direction, int ppi, int height,
                     int word_x1, int word_y1, int word_x2, int word_y2,
                     int line_x1, int line_y1, int line_x2, int line_y2,
                     double* x0, double* y0, double* length)
{
    if (writing_direction == tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT) {
        Swap(&word_x1, &word_x2);
        Swap(&word_y1, &word_y2);
    }

    double x;
    double y;
    const int px = word_x1;
    const int py = word_y1;
    const double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
    if (l2 == 0) {
        // Degenerate line: use its single point.
        x = line_x1;
        y = line_y1;
    } else {
        const double t = ((px - line_x2) * (line_x2 - line_x1) +
                          (py - line_y2) * (line_y2 - line_y1)) / l2;
        x = line_x2 + t * (line_x2 - line_x1);
        y = line_y2 + t * (line_y2 - line_y1);
    }

    double word_length = std::sqrt(static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2)));
    word_length = word_length * 72.0 / ppi;
    x = x * 72 / ppi;
    y = height - (y * 72.0 / ppi);

    *x0 = x;
    *y0 = y;
    *length = word_length;
}

/// Computes the rotation part (a b c d) of the PDF text matrix for a line
/// running from (line_x1,line_y1) to (line_x2,line_y2). For right-to-left
/// text, a and b are negated.
void AffineMatrix(int writing_direction,
                  int line_x1, int line_y1, int line_x2, int line_y2,
                  double* a, double* b, double* c, double* d)
{
    const double theta = std::atan2(static_cast<double>(line_y1 - line_y2),
                                    static_cast<double>(line_x2 - line_x1));
    *a = std::cos(theta);
    *b = std::sin(theta);
    *c = -std::sin(theta);
    *d = std::cos(theta);
    switch (writing_direction) {
    case tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT:
        *a = -*a;
        *b = -*b;
        break;
    case tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM:
        // TODO(jbreiden) Consider using the vertical PDF writing mode.
        break;
    default:
        break;
    }
}

/// Writes the UTF-16BE encoding of @p code into @p utf16 as upper-case hex
/// digits (4 digits, or 8 for a surrogate pair).
/// @return false (after logging) for surrogates, negative values and values
///         above 0x10FFFF; true otherwise.
bool CodepointToUtf16be(int code, char utf16[20])
{
    if (code < 0 || (code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
        tprintf("Dropping invalid codepoint %d\n", code);
        return false;
    }
    if (code < 0x10000) {
        snprintf(utf16, 20, "%04X", code);
    } else {
        const int a = code - 0x010000;
        const int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
        const int low_surrogate = (0x03FF & a) + 0xDC00;
        snprintf(utf16, 20, "%04X%04X", high_surrogate, low_surrogate);
    }
    return true;
}

/// Builds the PDF content stream that draws image /Im1 scaled to the page and
/// overlays the recognised words as invisible text.
///
/// @param api     Engine to read results from; must not be null.
/// @param width   Page width in points.
/// @param height  Page height in points.
/// @param[out] len  Length of the returned string, excluding the terminator.
/// @return A NUL-terminated buffer allocated with new[] (release with delete[]).
///         When the engine has no result iterator, only the image-drawing
///         preamble is returned.
char* GetPDFTextObjectss(tesseract::TessBaseAPI* api, double width, double height, int& len)
{
    STRING pdf_str("");
    const int ppi = api->GetSourceYResolution();

    // These initial conditions are all arbitrary and will be overwritten
    double old_x = 0.0;
    double old_y = 0.0;
    int old_fontsize = 0;
    tesseract::WritingDirection old_writing_direction =
        tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT;
    bool new_block = true;
    int fontsize = 0;
    double a = 1;
    double b = 0;
    double c = 0;
    double d = 1;

    // TODO(jbreiden) This marries the text and image together.
    // Slightly cleaner from an abstraction standpoint if this were to
    // live inside a separate text object.
    pdf_str += "q ";
    pdf_str.add_str_double("", prec(width));
    pdf_str += " 0 0 ";
    pdf_str.add_str_double("", prec(height));
    pdf_str += " 0 0 cm";
    pdf_str += " /Im1 Do";
    pdf_str += " Q\n";

    int line_x1 = 0;
    int line_y1 = 0;
    int line_x2 = 0;
    int line_y2 = 0;

    // GetIterator() may return null; the original dereferenced it unconditionally.
    tesseract::ResultIterator* res_it = api->GetIterator();
    while (res_it != nullptr && !res_it->Empty(tesseract::RIL_BLOCK)) {
        if (res_it->IsAtBeginningOf(tesseract::RIL_BLOCK)) {
            pdf_str += "BT\n3 Tr";  // Begin text object, use invisible ink
            old_fontsize = 0;       // Every block will declare its fontsize
            new_block = true;       // Every block will declare its affine matrix
        }

        if (res_it->IsAtBeginningOf(tesseract::RIL_TEXTLINE)) {
            int x1, y1, x2, y2;
            res_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2);
            ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
        }

        if (res_it->Empty(tesseract::RIL_WORD)) {
            res_it->Next(tesseract::RIL_WORD);
            continue;
        }

        // Writing direction changes at a per-word granularity
        tesseract::WritingDirection writing_direction;
        {
            tesseract::Orientation orientation;
            tesseract::TextlineOrder textline_order;
            float deskew_angle;
            res_it->Orientation(&orientation, &writing_direction,
                                &textline_order, &deskew_angle);
            if (writing_direction != tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM) {
                switch (res_it->WordDirection()) {
                case DIR_LEFT_TO_RIGHT:
                    writing_direction = tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT;
                    break;
                case DIR_RIGHT_TO_LEFT:
                    writing_direction = tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT;
                    break;
                default:
                    writing_direction = old_writing_direction;
                }
            }
        }

        // Where is word origin and how long is it?
        double x, y, word_length;
        {
            int word_x1, word_y1, word_x2, word_y2;
            res_it->Baseline(tesseract::RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
            GetWordBaseline(writing_direction, ppi, height,
                            word_x1, word_y1, word_x2, word_y2,
                            line_x1, line_y1, line_x2, line_y2,
                            &x, &y, &word_length);
        }

        if (writing_direction != old_writing_direction || new_block) {
            AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2,
                         &a, &b, &c, &d);
            pdf_str.add_str_double(" ", prec(a));  // . This affine matrix
            pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
            pdf_str.add_str_double(" ", prec(c));  // . system for all
            pdf_str.add_str_double(" ", prec(d));  // . text that follows.
            pdf_str.add_str_double(" ", prec(x));  // .
            pdf_str.add_str_double(" ", prec(y));  // .
            pdf_str += (" Tm ");                   // Place cursor absolutely
            new_block = false;
        } else {
            const double dx = x - old_x;
            const double dy = y - old_y;
            pdf_str.add_str_double(" ", prec(dx * a + dy * b));
            pdf_str.add_str_double(" ", prec(dx * c + dy * d));
            pdf_str += (" Td ");                   // Relative moveto
        }
        old_x = x;
        old_y = y;
        old_writing_direction = writing_direction;

        // Adjust font size on a per word granularity. Pay attention to
        // fontsize, old_fontsize, and pdf_str. We've found that for
        // Arabic, Tesseract will happily return a fontsize of zero,
        // so we make up a default number to protect ourselves.
        {
            bool bold, italic, underlined, monospace, serif, smallcaps;
            int font_id;
            res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
                                       &serif, &smallcaps, &fontsize, &font_id);
            const int kDefaultFontsize = 8;
            if (fontsize <= 0)
                fontsize = kDefaultFontsize;
            if (fontsize != old_fontsize) {
                char textfont[20];
                snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
                pdf_str += textfont;
                old_fontsize = fontsize;
            }
        }

        // Capture these before the symbol loop below advances the iterator.
        const bool last_word_in_line =
            res_it->IsAtFinalElement(tesseract::RIL_TEXTLINE, tesseract::RIL_WORD);
        const bool last_word_in_block =
            res_it->IsAtFinalElement(tesseract::RIL_BLOCK, tesseract::RIL_WORD);

        // Collect the word symbol by symbol as hex-encoded UTF-16BE.
        STRING pdf_word("");
        int pdf_word_len = 0;
        do {
            const char* grapheme = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
            if (grapheme && grapheme[0] != '\0') {
                GenericVector<int> unicodes;
                UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
                char utf16[20];
                for (int i = 0; i < unicodes.length(); i++) {
                    const int code = unicodes[i];
                    if (CodepointToUtf16be(code, utf16)) {
                        pdf_word += utf16;
                        pdf_word_len++;
                    }
                }
            }
            delete[] grapheme;
            res_it->Next(tesseract::RIL_SYMBOL);
        } while (!res_it->Empty(tesseract::RIL_BLOCK) &&
                 !res_it->IsAtBeginningOf(tesseract::RIL_WORD));

        if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
            const double kCharWidth = 2;
            const double h_stretch =
                kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
            pdf_str.add_str_double("", h_stretch);
            pdf_str += " Tz";          // horizontal stretch
            pdf_str += " [ <";
            pdf_str += pdf_word;       // UTF-16BE representation
            pdf_str += "> ] TJ";       // show the text
        }
        if (last_word_in_line) {
            pdf_str += " \n";
        }
        if (last_word_in_block) {
            pdf_str += "ET\n";         // end the text object
        }
    }
    delete res_it;

    // Hand the text back as a caller-owned C string.
    const int length = pdf_str.length();
    char* ret = new char[length + 1];
    std::strcpy(ret, pdf_str.string());
    len = length;
    return ret;
}