513 lines
16 KiB
C++
513 lines
16 KiB
C++
|
#ifdef WIN32
|
|||
|
#include "hg_ocr.h"
|
|||
|
#include "allheaders.h"
|
|||
|
#include "baseapi.h"
|
|||
|
#include "basedir.h"
|
|||
|
#include "osdetect.h"
|
|||
|
#include "renderer.h"
|
|||
|
#include "strngs.h"
|
|||
|
#include "tprintf.h"
|
|||
|
#include "resultiterator.h"
|
|||
|
#include <math.h>
|
|||
|
|
|||
|
// Raw bytes of a CID-to-GID map blob. The leading 120,156 (0x78 0x9C) is a
// zlib stream header, so the payload is presumably deflate-compressed;
// NOTE(review): not referenced elsewhere in this file - confirm the consumer.
static unsigned char string_CIDTOGIDMAP[] = {
    120,156,236,194,1,9,0,0,0,2,160,250,127,186,33,137,
    166,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,128,123,3,0,0,255,255,236,194,1,13,0,0,
    0,194,32,223,191,180,69,24,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,0,0,0,255,
    255,236,194,1,13,0,0,0,194,32,223,191,180,69,24,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,235,0,0,0,255,255,237,194,1,13,0,0,0,194,32,
    223,191,180,69,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,0,255,0,16
};
|
|||
|
|
|||
|
// Raw bytes of a small embedded TrueType font. The header reads as sfnt
// version 0x00010000 with 10 tables whose tags are, in order:
// OS/2, cmap, glyf, head, hhea, hmtx, loca, maxp, name, post.
// NOTE(review): not referenced elsewhere in this file - confirm the consumer.
static unsigned char string_TTF[] = {
    0,1,0,0,0,10,0,128,0,3,0,32,79,83,47,50,86,221,200,148,
    0,0,1,40,0,0,0,96,99,109,97,112,0,18,0,78,0,0,1,144,0,0,
    0,44,103,108,121,102,0,0,0,0,0,0,1,196,0,0,0,1,104,101,
    97,100,2,80,182,226,0,0,0,172,0,0,0,54,104,104,101,97,0,
    3,0,2,0,0,0,228,0,0,0,36,104,109,116,120,0,0,0,0,0,0,1,
    136,0,0,0,8,108,111,99,97,0,0,0,0,0,0,1,188,0,0,0,6,109,
    97,120,112,0,3,0,1,0,0,1,8,0,0,0,32,110,97,109,101,165,
    232,245,73,0,0,1,200,0,0,0,80,112,111,115,116,0,1,0,1,0,
    0,2,24,0,0,0,32,0,1,0,0,0,1,0,0,167,55,179,76,95,15,60,245,
    4,7,1,0,0,0,0,0,207,154,252,110,0,0,0,0,207,154,252,110,0,
    0,128,0,0,0,0,1,0,0,0,16,0,2,0,0,0,0,0,0,0,1,0,0,0,1,255,
    255,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    2,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,3,0,0,1,144,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,5,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,71,79,79,71,0,64,255,255,0,0,0,1,255,255,0,0,0,1,
    0,1,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
    0,0,0,2,0,1,0,0,0,0,0,20,0,3,0,0,0,0,0,32,0,6,0,12,0,0,0,0,
    0,1,0,0,0,6,0,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,3,0,42,0,0,0,3,0,0,0,5,0,22,0,11,0,1,0,0,0,0,0,5,0,11,0,
    0,0,3,0,1,4,9,0,5,0,22,0,11,86,101,114,115,105,111,110,32,
    49,46,48,0,86,0,101,0,114,0,115,0,105,0,111,0,110,0,32,0,49,
    0,46,0,48,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0
};
|
|||
|
|
|||
|
static const char* endstream = "endstream\nendobj\n";
|
|||
|
|
|||
|
Pix* createPix(const unsigned char * imgData, int width, int height, int bytes_per_pixel, int bytes_per_line);
|
|||
|
|
|||
|
void PreloadRenderers(tesseract::TessBaseAPI* api,
|
|||
|
tesseract::PointerVector<tesseract::TessResultRenderer>* renderers, const char* outputbase);
|
|||
|
|
|||
|
char* GetPDFTextObjectss(tesseract::TessBaseAPI* api, double width, double height, int& len);
|
|||
|
|
|||
|
/// Default constructor: allocates the Tesseract engine object only.
/// No trained data is loaded here.
HG_OCR::HG_OCR() : api(new tesseract::TessBaseAPI()) {}
|
|||
|
|
|||
|
/// Allocates the Tesseract engine and immediately loads the orientation
/// (OSD) data from "./tessdata/osd.traineddata".
/// @param type  Currently ignored: the orientation setup is always performed.
HG_OCR::HG_OCR(PSM_TYPE type) : api(new tesseract::TessBaseAPI())
{
    init_orientation("./tessdata/osd.traineddata");
}
|
|||
|
|
|||
|
/// Releases the Tesseract engine object owned by this instance.
HG_OCR::~HG_OCR()
{
    // `delete` on a null pointer is a no-op, so no explicit null test is required.
    delete reinterpret_cast<tesseract::TessBaseAPI*>(api);
}
|
|||
|
|
|||
|
/// Initializes the engine from the default data file
/// "./tessdata/osd.traineddata".
/// @param type  Currently ignored: the orientation setup is always performed.
void HG_OCR::init(HG_OCR::PSM_TYPE type)
{
    const char* const default_osd_data = "./tessdata/osd.traineddata";
    init_orientation(default_osd_data);
}
|
|||
|
|
|||
|
void HG_OCR::init(const char * filename, PSM_TYPE type)
|
|||
|
{
|
|||
|
switch (type)
|
|||
|
{
|
|||
|
case Orientation:
|
|||
|
init_orientation(filename);
|
|||
|
break;
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
int HG_OCR::getOrientation(unsigned char *imgData, int width, int height, int channels, int step)
|
|||
|
{
|
|||
|
int orientation, direction, lineOrder;
|
|||
|
float deskewAngle;
|
|||
|
getOrientation(imgData, width, height, channels, step,
|
|||
|
orientation, direction, lineOrder, deskewAngle);
|
|||
|
|
|||
|
return orientation;
|
|||
|
}
|
|||
|
|
|||
|
bool HG_OCR::getOrientation(unsigned char* imgData, int width, int height, int channels, int step,
|
|||
|
int& orientation, int& direction, int& lineOrder, float& deskewAngle)
|
|||
|
{
|
|||
|
if (api == nullptr) return false;
|
|||
|
|
|||
|
tesseract::TessBaseAPI* ptr = reinterpret_cast<tesseract::TessBaseAPI*>(api);
|
|||
|
ptr->SetImage(imgData, width, height, channels, step);
|
|||
|
|
|||
|
#if 0
|
|||
|
tesseract::PageIterator* it = ptr->AnalyseLayout();
|
|||
|
|
|||
|
if (it != nullptr)
|
|||
|
{
|
|||
|
it->Orientation(reinterpret_cast<tesseract::Orientation*>(&orientation),
|
|||
|
reinterpret_cast<tesseract::WritingDirection*>(&direction),
|
|||
|
reinterpret_cast<tesseract::TextlineOrder*>(&lineOrder),
|
|||
|
&deskewAngle);
|
|||
|
delete it;
|
|||
|
return true;
|
|||
|
}
|
|||
|
else
|
|||
|
return false;
|
|||
|
#endif
|
|||
|
orientation = ptr->AnalyseLayout1();
|
|||
|
}
|
|||
|
|
|||
|
void HG_OCR::init_orientation(const char *filename)
|
|||
|
{
|
|||
|
auto ret= reinterpret_cast<tesseract::TessBaseAPI*>(api)->Init(filename, "osd");
|
|||
|
reinterpret_cast<tesseract::TessBaseAPI*>(api)->SetPageSegMode(tesseract::PSM_AUTO_OSD);
|
|||
|
}
|
|||
|
|
|||
|
/// Appends a PDF renderer to `renderers` when the engine variable
/// "tessedit_create_pdf" is enabled.
///
/// @param api         Engine whose variables and data path are queried.
/// @param renderers   Vector that receives the newly allocated renderer.
/// @param outputbase  Output base name passed to the renderer.
void PreloadRenderers(tesseract::TessBaseAPI* api,
                      tesseract::PointerVector<tesseract::TessResultRenderer>* renderers,
                      const char* outputbase)
{
    // Default to false: GetBoolVariable leaves the out-value untouched when
    // the variable is unknown, which previously meant reading garbage.
    bool create_pdf = false;
    api->GetBoolVariable("tessedit_create_pdf", &create_pdf);
    if (!create_pdf)
        return;

    bool textonly = false;
    api->GetBoolVariable("textonly_pdf", &textonly);

    printf("GetDatapath%s\n", api->GetDatapath());
    renderers->push_back(new tesseract::TessPDFRenderer(outputbase, api->GetDatapath(), textonly));
}
|
|||
|
|
|||
|
/// Builds a Leptonica Pix from a raw pixel buffer.
///
/// @param imagedata        First byte of the source image.
/// @param width            Image width in pixels.
/// @param height           Image height in pixels.
/// @param bytes_per_pixel  0 is treated as 1 bit per pixel; 1, 3 and 4 are
///                         copied as 8, 24 and 32 bpp. Other values create a
///                         Pix of that depth but copy no pixel data.
/// @param bytes_per_line   Distance in bytes between consecutive source rows.
/// @return A newly created Pix with its resolution set to 200x200, or nullptr
///         when `imagedata` is null or the Pix cannot be created.
Pix* createPix(const unsigned char * imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
{
    if (imagedata == nullptr)
        return nullptr;

    int bpp = bytes_per_pixel * 8;
    if (bpp == 0) bpp = 1;

    // 24 bpp sources are stored in a 32 bpp Pix.
    Pix* pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
    if (pix == nullptr)
        return nullptr;

    l_uint32* data = pixGetData(pix);
    const int wpl = pixGetWpl(pix);

    switch (bpp) {
    case 1:
        // A set source bit clears the destination bit and vice versa.
        for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line)
        {
            for (int x = 0; x < width; ++x) {
                if (imagedata[x / 8] & (0x80 >> (x % 8)))
                    CLEAR_DATA_BIT(data, x);
                else
                    SET_DATA_BIT(data, x);
            }
        }
        break;

    case 8:
        // Greyscale just copies the bytes in the right order.
        for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line)
            for (int x = 0; x < width; ++x)
                SET_DATA_BYTE(data, x, imagedata[x]);
        break;

    case 24:
        // Put the colors in the correct places in the line buffer. `data`
        // advances one 32-bit word per pixel.
        for (int y = 0; y < height; ++y, imagedata += bytes_per_line)
        {
            for (int x = 0; x < width; ++x, ++data) {
                SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
                SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
                SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
            }
        }
        break;

    case 32:
        // Maintain byte order consistency across different endianness.
        // Widen to unsigned before shifting: a promoted `int` shifted left by
        // 24 overflows for byte values >= 128.
        for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl)
            for (int x = 0; x < width; ++x)
                data[x] = (static_cast<l_uint32>(imagedata[x * 4]) << 24) |
                          (static_cast<l_uint32>(imagedata[x * 4 + 1]) << 16) |
                          (static_cast<l_uint32>(imagedata[x * 4 + 2]) << 8) |
                          static_cast<l_uint32>(imagedata[x * 4 + 3]);
        break;

    default:
        break;
    }

    pix->informat = bytes_per_pixel == 1 ? 1 : 2;

    if (bytes_per_pixel == 1)
    {
        // Attach a 256-entry colormap in which all four bytes of entry i are i.
        PIXCMAP* colormap = pixcmapCreate(8);
        if (colormap != nullptr)
        {
            LEPT_FREE(colormap->array);
            colormap->array = static_cast<l_uint8*>(LEPT_CALLOC(256, sizeof(RGBA_QUAD)));
            colormap->n = 256;
            colormap->nalloc = 256;
            colormap->depth = 8;

            l_uint8* ptr = reinterpret_cast<l_uint8*>(colormap->array);
            if (ptr != nullptr)
            {
                for (int i = 0; i < 256; i++)
                    ptr[i * 4 + 0] = ptr[i * 4 + 1] = ptr[i * 4 + 2] = ptr[i * 4 + 3] = i;
            }
            pixSetColormap(pix, colormap);
        }
    }

    pixSetXRes(pix, 200);
    pixSetYRes(pix, 200);

    // Debug aid: dump the converted image to disk.
    //FILE* file = fopenWriteStream("aaa.bmp", "w");
    //pixWriteStreamBmp(file, pix);
    //fclose(file);
    return pix;
}
|
|||
|
|
|||
|
// Rounds `x` to three decimal places and normalizes a negative-zero result to
// plain zero.
double prec(double x) {
    const double kPrecision = 1000.0;
    const double rounded = round(x * kPrecision) / kPrecision;
    // The comparison holds for both +0.0 and -0.0; returning a literal zero
    // drops the sign.
    return (rounded == 0) ? 0.0 : rounded;
}
|
|||
|
|
|||
|
// Copies the baseline (x1,y1)-(x2,y2) to the output points and flattens it to
// a horizontal line when it is nearly horizontal: its rise is under 2 units
// and its run over 2 units, where a unit is pixels * 72 / ppi.
//
// The comparison is done as `pixels * 72` against `2 * ppi` so that
//  - no integer division truncates the distance (previously a run had to
//    reach 3 units before it counted as "> 2"), and
//  - ppi == 0 cannot divide by zero; the baseline is then left untouched.
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
                  int *line_x1, int *line_y1,
                  int *line_x2, int *line_y2)
{
    *line_x1 = x1;
    *line_y1 = y1;
    *line_x2 = x2;
    *line_y2 = y2;

    // Work in 64 bits so the subtraction and the scaling by 72 cannot overflow.
    long long rise = static_cast<long long>(y2) - y1;
    if (rise < 0) rise = -rise;
    rise *= 72;

    long long run = static_cast<long long>(x2) - x1;
    if (run < 0) run = -run;
    run *= 72;

    const long long two_units = 2LL * ppi;
    if (rise < two_units && two_units < run)
        *line_y1 = *line_y2 = (y1 + y2) / 2;
}
|
|||
|
|
|||
|
// Squared Euclidean distance between (x1, y1) and (x2, y2).
long dist2(int x1, int y1, int x2, int y2)
{
    const int dx = x2 - x1;
    const int dy = y2 - y1;
    return dx * dx + dy * dy;
}
|
|||
|
|
|||
|
void GetWordBaseline(int writing_direction, int ppi, int height,
|
|||
|
int word_x1, int word_y1, int word_x2, int word_y2,
|
|||
|
int line_x1, int line_y1, int line_x2, int line_y2,
|
|||
|
double *x0, double *y0, double *length)
|
|||
|
{
|
|||
|
if (writing_direction == tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT)
|
|||
|
{
|
|||
|
Swap(&word_x1, &word_x2);
|
|||
|
Swap(&word_y1, &word_y2);
|
|||
|
}
|
|||
|
double word_length;
|
|||
|
double x, y;
|
|||
|
{
|
|||
|
int px = word_x1;
|
|||
|
int py = word_y1;
|
|||
|
double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
|
|||
|
if (l2 == 0) {
|
|||
|
x = line_x1;
|
|||
|
y = line_y1;
|
|||
|
}
|
|||
|
else {
|
|||
|
double t = ((px - line_x2) * (line_x2 - line_x1) +
|
|||
|
(py - line_y2) * (line_y2 - line_y1)) / l2;
|
|||
|
x = line_x2 + t * (line_x2 - line_x1);
|
|||
|
y = line_y2 + t * (line_y2 - line_y1);
|
|||
|
}
|
|||
|
word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
|
|||
|
word_x2, word_y2)));
|
|||
|
word_length = word_length * 72.0 / ppi;
|
|||
|
x = x * 72 / ppi;
|
|||
|
y = height - (y * 72.0 / ppi);
|
|||
|
}
|
|||
|
*x0 = x;
|
|||
|
*y0 = y;
|
|||
|
*length = word_length;
|
|||
|
}
|
|||
|
|
|||
|
void AffineMatrix(int writing_direction,
|
|||
|
int line_x1, int line_y1, int line_x2, int line_y2,
|
|||
|
double *a, double *b, double *c, double *d)
|
|||
|
{
|
|||
|
double theta = atan2(static_cast<double>(line_y1 - line_y2),
|
|||
|
static_cast<double>(line_x2 - line_x1));
|
|||
|
*a = cos(theta);
|
|||
|
*b = sin(theta);
|
|||
|
*c = -sin(theta);
|
|||
|
*d = cos(theta);
|
|||
|
switch (writing_direction) {
|
|||
|
case tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT:
|
|||
|
*a = -*a;
|
|||
|
*b = -*b;
|
|||
|
break;
|
|||
|
case tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM:
|
|||
|
// TODO(jbreiden) Consider using the vertical PDF writing mode.
|
|||
|
break;
|
|||
|
default:
|
|||
|
break;
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
bool CodepointToUtf16be(int code, char utf16[20])
|
|||
|
{
|
|||
|
if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
|
|||
|
tprintf("Dropping invalid codepoint %d\n", code);
|
|||
|
return false;
|
|||
|
}
|
|||
|
if (code < 0x10000) {
|
|||
|
snprintf(utf16, 20, "%04X", code);
|
|||
|
}
|
|||
|
else {
|
|||
|
int a = code - 0x010000;
|
|||
|
int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
|
|||
|
int low_surrogate = (0x03FF & a) + 0xDC00;
|
|||
|
snprintf(utf16, 20, "%04X%04X", high_surrogate, low_surrogate);
|
|||
|
}
|
|||
|
return true;
|
|||
|
}
|
|||
|
|
|||
|
// Builds the PDF content stream for the current page: first the operators that
// draw image /Im1 scaled to width x height, then one BT...ET text object per
// recognized block, written with text rendering mode 3 ("3 Tr").
//
// api     Engine holding the recognition results.
// width   Page width used for the image matrix.
// height  Page height used for the image matrix and to flip y coordinates.
// len     Receives the length of the returned string (terminator excluded).
//
// Returns a NUL-terminated buffer allocated with new[]; the caller owns it and
// must release it with delete[]. When the engine has no result iterator, only
// the image operators are emitted instead of dereferencing a null pointer.
char* GetPDFTextObjectss(tesseract::TessBaseAPI* api, double width, double height, int& len)
{
    STRING pdf_str("");
    // Every consumer below takes the resolution as an int.
    const int ppi = api->GetSourceYResolution();

    // These initial conditions are all arbitrary and will be overwritten
    double old_x = 0.0, old_y = 0.0;
    int old_fontsize = 0;
    tesseract::WritingDirection old_writing_direction = tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT;
    bool new_block = true;
    int fontsize = 0;
    double a = 1;
    double b = 0;
    double c = 0;
    double d = 1;

    // TODO(jbreiden) This marries the text and image together.
    // Slightly cleaner from an abstraction standpoint if this were to
    // live inside a separate text object.
    pdf_str += "q ";
    pdf_str.add_str_double("", prec(width));
    pdf_str += " 0 0 ";
    pdf_str.add_str_double("", prec(height));
    pdf_str += " 0 0 cm";
    pdf_str += " /Im1 Do";
    pdf_str += " Q\n";

    int line_x1 = 0;
    int line_y1 = 0;
    int line_x2 = 0;
    int line_y2 = 0;

    // GetIterator() may return null (e.g. nothing was recognized); in that
    // case the loop is skipped entirely.
    tesseract::ResultIterator *res_it = api->GetIterator();
    while (res_it != nullptr && !res_it->Empty(tesseract::RIL_BLOCK))
    {
        if (res_it->IsAtBeginningOf(tesseract::RIL_BLOCK))
        {
            pdf_str += "BT\n3 Tr";     // Begin text object, use invisible ink
            old_fontsize = 0;          // Every block will declare its fontsize
            new_block = true;          // Every block will declare its affine matrix
        }

        if (res_it->IsAtBeginningOf(tesseract::RIL_TEXTLINE))
        {
            int x1, y1, x2, y2;
            res_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2);
            ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
        }

        if (res_it->Empty(tesseract::RIL_WORD))
        {
            res_it->Next(tesseract::RIL_WORD);
            continue;
        }

        // Writing direction changes at a per-word granularity
        tesseract::WritingDirection writing_direction;
        {
            tesseract::Orientation orientation;
            tesseract::TextlineOrder textline_order;
            float deskew_angle;
            res_it->Orientation(&orientation, &writing_direction,
                                &textline_order, &deskew_angle);
            if (writing_direction != tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM)
            {
                switch (res_it->WordDirection()) {
                case DIR_LEFT_TO_RIGHT:
                    writing_direction = tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT;
                    break;
                case DIR_RIGHT_TO_LEFT:
                    writing_direction = tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT;
                    break;
                default:
                    // Neutral/mixed words inherit the previous word's direction.
                    writing_direction = old_writing_direction;
                }
            }
        }

        // Where is word origin and how long is it?
        double x, y, word_length;
        {
            int word_x1, word_y1, word_x2, word_y2;
            res_it->Baseline(tesseract::RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
            // GetWordBaseline takes the page height as an int.
            GetWordBaseline(writing_direction, ppi, static_cast<int>(height),
                            word_x1, word_y1, word_x2, word_y2,
                            line_x1, line_y1, line_x2, line_y2,
                            &x, &y, &word_length);
        }

        if (writing_direction != old_writing_direction || new_block)
        {
            AffineMatrix(writing_direction,
                         line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
            pdf_str.add_str_double(" ", prec(a));  // . This affine matrix
            pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
            pdf_str.add_str_double(" ", prec(c));  // . system for all
            pdf_str.add_str_double(" ", prec(d));  // . text that follows.
            pdf_str.add_str_double(" ", prec(x));  // .
            pdf_str.add_str_double(" ", prec(y));  // .
            pdf_str += (" Tm ");                   // Place cursor absolutely
            new_block = false;
        }
        else
        {
            double dx = x - old_x;
            double dy = y - old_y;
            pdf_str.add_str_double(" ", prec(dx * a + dy * b));
            pdf_str.add_str_double(" ", prec(dx * c + dy * d));
            pdf_str += (" Td ");                   // Relative moveto
        }
        old_x = x;
        old_y = y;
        old_writing_direction = writing_direction;

        // Adjust font size on a per word granularity. Pay attention to
        // fontsize, old_fontsize, and pdf_str. We've found that for
        // Arabic, Tesseract will happily return a fontsize of zero,
        // so we make up a default number to protect ourselves.
        {
            bool bold, italic, underlined, monospace, serif, smallcaps;
            int font_id;
            res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
                                       &serif, &smallcaps, &fontsize, &font_id);
            const int kDefaultFontsize = 8;
            if (fontsize <= 0)
                fontsize = kDefaultFontsize;
            if (fontsize != old_fontsize)
            {
                char textfont[20];
                snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
                pdf_str += textfont;
                old_fontsize = fontsize;
            }
        }

        // Query these before the symbol loop below advances the iterator.
        bool last_word_in_line = res_it->IsAtFinalElement(tesseract::RIL_TEXTLINE, tesseract::RIL_WORD);
        bool last_word_in_block = res_it->IsAtFinalElement(tesseract::RIL_BLOCK, tesseract::RIL_WORD);

        // Collect the word as hex-encoded UTF-16BE, one symbol at a time.
        STRING pdf_word("");
        int pdf_word_len = 0;
        do {
            const char *grapheme = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
            if (grapheme && grapheme[0] != '\0') {
                GenericVector<int> unicodes;
                UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
                char utf16[20];
                for (int i = 0; i < unicodes.length(); i++)
                {
                    int code = unicodes[i];
                    if (CodepointToUtf16be(code, utf16))
                    {
                        pdf_word += utf16;
                        pdf_word_len++;
                    }
                }
            }
            delete[] grapheme;
            res_it->Next(tesseract::RIL_SYMBOL);
        }
        while (!res_it->Empty(tesseract::RIL_BLOCK) && !res_it->IsAtBeginningOf(tesseract::RIL_WORD));

        if (word_length > 0 && pdf_word_len > 0 && fontsize > 0)
        {
            double h_stretch = 2 * prec(100.0 * word_length / (fontsize * pdf_word_len));
            pdf_str.add_str_double("", h_stretch);
            pdf_str += " Tz";          // horizontal stretch
            pdf_str += " [ <";
            pdf_str += pdf_word;       // UTF-16BE representation
            pdf_str += "> ] TJ";       // show the text
        }
        if (last_word_in_line) {
            pdf_str += " \n";
        }
        if (last_word_in_block) {
            pdf_str += "ET\n";         // end the text object
        }
    }
    delete res_it;  // deleting a null iterator is a no-op

    // Hand the result back as a caller-owned, NUL-terminated buffer.
    len = pdf_str.length();
    char *ret = new char[len + 1];
    strcpy(ret, pdf_str.string());
    return ret;
}
|
|||
|
#endif
|