twain3.0/3rdparty/hgOCR/include/ccmain/output.cpp

/******************************************************************
 * File:        output.cpp  (Formerly output.c)
 * Description: Output pass
 * Author:          Phil Cheatle
 * Created:         Thu Aug  4 10:56:08 BST 1994
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#ifdef _MSC_VER
#pragma warning(disable:4244)  // Conversion warnings
#endif

#include <string.h>
#include <ctype.h>
#ifdef __UNIX__
#include          <assert.h>
#include          <unistd.h>
#include          <errno.h>
#endif
#include "helpers.h"
#include "tessvars.h"
#include "control.h"
#include "reject.h"
#include "docqual.h"
#include "output.h"
#include "globals.h"
#include "tesseractclass.h"

#define EPAPER_EXT      ".ep"
#define PAGE_YSIZE      3508
#define CTRL_INSET      '\024'   //dc4=text inset
#define CTRL_FONT       '\016'   //so=font change
#define CTRL_DEFAULT      '\017' //si=default font
#define CTRL_SHIFT      '\022'   //dc2=x shift
#define CTRL_TAB        '\011'   //tab
#define CTRL_NEWLINE      '\012' //newline
#define CTRL_HARDLINE   '\015'   //cr

 /**********************************************************************
  * pixels_to_pts
  *
  * Convert an integer number of pixels to the nearest integer
  * number of points.
  **********************************************************************/

inT32 pixels_to_pts(               //convert coords
	inT32 pixels,
	inT32 pix_res  //resolution
) {
	float pts;                     //converted value

	pts = pixels * 72.0 / pix_res;
	return (inT32)(pts + 0.5);    //round it
}

namespace tesseract {
	void Tesseract::output_pass(  //Tess output pass //send to api
		PAGE_RES_IT &page_res_it,
		const TBOX *target_word_box) {
		BLOCK_RES *block_of_last_word;
		BOOL8 force_eol;               //During output
		BLOCK *nextblock;              //block of next word
		WERD *nextword;                //next word

		page_res_it.restart_page();
		block_of_last_word = NULL;
		while (page_res_it.word() != NULL) {
			check_debug_pt(page_res_it.word(), 120);

			if (target_word_box) {
				TBOX current_word_box = page_res_it.word()->word->bounding_box();
				FCOORD center_pt(
					(current_word_box.right() + current_word_box.left()) / 2,
					(current_word_box.bottom() + current_word_box.top()) / 2);
				if (!target_word_box->contains(center_pt)) {
					page_res_it.forward();
					continue;
				}
			}
			if (tessedit_write_block_separators &&
				block_of_last_word != page_res_it.block()) {
				block_of_last_word = page_res_it.block();
			}

			force_eol = (tessedit_write_block_separators &&
				(page_res_it.block() != page_res_it.next_block())) ||
				(page_res_it.next_word() == NULL);

			if (page_res_it.next_word() != NULL)
				nextword = page_res_it.next_word()->word;
			else
				nextword = NULL;
			if (page_res_it.next_block() != NULL)
				nextblock = page_res_it.next_block()->block;
			else
				nextblock = NULL;
			//regardless of tilde crunching
			write_results(page_res_it,
				determine_newline_type(page_res_it.word()->word,
					page_res_it.block()->block,
					nextword, nextblock), force_eol);
			page_res_it.forward();
		}
	}


	/*************************************************************************
	 * write_results()
	 *
	 * All recognition and rejection has now been done. Generate the following:
	 *   .txt file     - giving the final best choices with NO highlighting
	 *   .raw file     - giving the tesseract top choice output for each word
	 *   .map file     - showing how the .txt file has been rejected in the .ep file
	 *   epchoice list - a list of one element per word, containing the text for the
	 *                   epaper. Reject strings are inserted.
	 *   inset list    - a list of bounding boxes of reject insets - indexed by the
	 *                   reject strings in the epchoice text.
	 *************************************************************************/
	void Tesseract::write_results(PAGE_RES_IT &page_res_it,
		char newline_type,  // type of newline
		BOOL8 force_eol) {  // override tilde crunch?
		WERD_RES *word = page_res_it.word();
		const UNICHARSET &uchset = *word->uch_set;
		int i;
		BOOL8 need_reject = FALSE;
		UNICHAR_ID space = uchset.unichar_to_id(" ");

		if ((word->unlv_crunch_mode != CR_NONE ||
			word->best_choice->length() == 0) &&
			!tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
			if ((word->unlv_crunch_mode != CR_DELETE) &&
				(!stats_.tilde_crunch_written ||
				((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
					(word->word->space() > 0) &&
					!word->word->flag(W_FUZZY_NON) &&
					!word->word->flag(W_FUZZY_SP)))) {
				if (!word->word->flag(W_BOL) &&
					(word->word->space() > 0) &&
					!word->word->flag(W_FUZZY_NON) &&
					!word->word->flag(W_FUZZY_SP)) {
					stats_.last_char_was_tilde = false;
				}
				need_reject = TRUE;
			}
			if ((need_reject && !stats_.last_char_was_tilde) ||
				(force_eol && stats_.write_results_empty_block)) {
				/* Write a reject char - mark as rejected unless zero_rejection mode */
				stats_.last_char_was_tilde = TRUE;
				stats_.tilde_crunch_written = true;
				stats_.last_char_was_newline = false;
				stats_.write_results_empty_block = false;
			}

			if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
				stats_.tilde_crunch_written = false;
				stats_.last_char_was_newline = true;
				stats_.last_char_was_tilde = false;
			}

			if (force_eol)
				stats_.write_results_empty_block = true;
			return;
		}

		/* NORMAL PROCESSING of non tilde crunched words */

		stats_.tilde_crunch_written = false;
		if (newline_type)
			stats_.last_char_was_newline = true;
		else
			stats_.last_char_was_newline = false;
		stats_.write_results_empty_block = force_eol;  // about to write a real word

		if (unlv_tilde_crunching &&
			stats_.last_char_was_tilde &&
			(word->word->space() == 0) &&
			!(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
			(word->best_choice->unichar_id(0) == space)) {
			/* Prevent adjacent tilde across words - we know that adjacent tildes within
			   words have been removed */
			word->MergeAdjacentBlobs(0);
		}
		if (newline_type ||
			(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes))
			stats_.last_char_was_tilde = false;
		else {
			if (word->reject_map.length() > 0) {
				if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
					stats_.last_char_was_tilde = true;
				else
					stats_.last_char_was_tilde = false;
			}
			else if (word->word->space() > 0)
				stats_.last_char_was_tilde = false;
			/* else it is unchanged as there are no output chars */
		}

		ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

		set_unlv_suspects(word);
		check_debug_pt(word, 120);
		if (tessedit_rejection_debug) {
			tprintf("Dict word: \"%s\": %d\n",
				word->best_choice->debug_string().string(),
				dict_word(*(word->best_choice)));
		}
		if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
			if (tessedit_zero_rejection) {
				/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
				for (i = 0; i < word->best_choice->length(); ++i) {
					if (word->reject_map[i].rejected())
						word->reject_map[i].setrej_minimal_rej_accept();
				}
			}
			if (tessedit_minimal_rejection) {
				/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
				for (i = 0; i < word->best_choice->length(); ++i) {
					if ((word->best_choice->unichar_id(i) != space) &&
						word->reject_map[i].rejected())
						word->reject_map[i].setrej_minimal_rej_accept();
				}
			}
		}
	}
}  // namespace tesseract

/**********************************************************************
 * determine_newline_type
 *
 * Find whether we have a wrapping or hard newline.
 * Return FALSE if not at end of line.
 **********************************************************************/

char determine_newline_type(                   //test line ends
	WERD *word,        //word to do
	BLOCK *block,      //current block
	WERD *next_word,   //next word
	BLOCK *next_block  //block of next word
) {
	inT16 end_gap;                 //to right edge
	inT16 width;                   //of next word
	TBOX word_box;                  //bounding
	TBOX next_box;                  //next word
	TBOX block_box;                 //block bounding

	if (!word->flag(W_EOL))
		return FALSE;                //not end of line
	if (next_word == NULL || next_block == NULL || block != next_block)
		return CTRL_NEWLINE;
	if (next_word->space() > 0)
		return CTRL_HARDLINE;        //it is tabbed
	word_box = word->bounding_box();
	next_box = next_word->bounding_box();
	block_box = block->bounding_box();
	//gap to eol
	end_gap = block_box.right() - word_box.right();
	end_gap -= (inT32)block->space();
	width = next_box.right() - next_box.left();
	//      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
	//              block_box.right(),word_box.right(),end_gap,
	//              next_box.right(),next_box.left(),width,
	//              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
	return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
}

/*************************************************************************
 * get_rep_char()
 * Return the first accepted character from the repetition string. This is the
 * character which is repeated - as determined earlier by fix_rep_char()
 *************************************************************************/
namespace tesseract {
	UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) {  // what char is repeated?
		int i;
		for (i = 0; ((i < word->reject_map.length()) &&
			(word->reject_map[i].rejected())); ++i);

		if (i < word->reject_map.length()) {
			return word->best_choice->unichar_id(i);
		}
		else {
			return word->uch_set->unichar_to_id(unrecognised_char.string());
		}
	}

	/*************************************************************************
	 * SUSPECT LEVELS
	 *
	 * 0 - don't reject ANYTHING
	 * 1,2 - partial rejection
	 * 3 - BEST
	 *
	 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
	 * tessedit_minimal_rejection.
	 *************************************************************************/
	void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
		int len = word_res->reject_map.length();
		const WERD_CHOICE &word = *(word_res->best_choice);
		const UNICHARSET &uchset = *word.unicharset();
		int i;
		float rating_per_ch;

		if (suspect_level == 0) {
			for (i = 0; i < len; i++) {
				if (word_res->reject_map[i].rejected())
					word_res->reject_map[i].setrej_minimal_rej_accept();
			}
			return;
		}

		if (suspect_level >= 3)
			return;                      //Use defaults

		  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

		if (safe_dict_word(word_res) &&
			(count_alphas(word) > suspect_short_words)) {
			/* Unreject alphas in dictionary words */
			for (i = 0; i < len; ++i) {
				if (word_res->reject_map[i].rejected() &&
					uchset.get_isalpha(word.unichar_id(i)))
					word_res->reject_map[i].setrej_minimal_rej_accept();
			}
		}

		rating_per_ch = word.rating() / word_res->reject_map.length();

		if (rating_per_ch >= suspect_rating_per_ch)
			return;  // Don't touch bad ratings

		if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
			/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
			for (i = 0; i < len; ++i) {
				if (word_res->reject_map[i].rejected() &&
					(!uchset.eq(word.unichar_id(i), " ")))
					word_res->reject_map[i].setrej_minimal_rej_accept();
			}
		}

		for (i = 0; i < len; i++) {
			if (word_res->reject_map[i].rejected()) {
				if (word_res->reject_map[i].flag(R_DOC_REJ))
					word_res->reject_map[i].setrej_minimal_rej_accept();
				if (word_res->reject_map[i].flag(R_BLOCK_REJ))
					word_res->reject_map[i].setrej_minimal_rej_accept();
				if (word_res->reject_map[i].flag(R_ROW_REJ))
					word_res->reject_map[i].setrej_minimal_rej_accept();
			}
		}

		if (suspect_level == 2)
			return;

		if (!suspect_constrain_1Il ||
			(word_res->reject_map.length() <= suspect_short_words)) {
			for (i = 0; i < len; i++) {
				if (word_res->reject_map[i].rejected()) {
					if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
						word_res->reject_map[i].flag(R_POSTNN_1IL)))
						word_res->reject_map[i].setrej_minimal_rej_accept();

					if (!suspect_constrain_1Il &&
						word_res->reject_map[i].flag(R_MM_REJECT))
						word_res->reject_map[i].setrej_minimal_rej_accept();
				}
			}
		}

		if (acceptable_word_string(*word_res->uch_set,
			word.unichar_string().string(),
			word.unichar_lengths().string()) !=
			AC_UNACCEPTABLE ||
			acceptable_number_string(word.unichar_string().string(),
				word.unichar_lengths().string())) {
			if (word_res->reject_map.length() > suspect_short_words) {
				for (i = 0; i < len; i++) {
					if (word_res->reject_map[i].rejected() &&
						(!word_res->reject_map[i].perm_rejected() ||
							word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
							word_res->reject_map[i].flag(R_POSTNN_1IL) ||
							word_res->reject_map[i].flag(R_MM_REJECT))) {
						word_res->reject_map[i].setrej_minimal_rej_accept();
					}
				}
			}
		}
	}

	inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
		int count = 0;
		for (int i = 0; i < word.length(); ++i) {
			if (word.unicharset()->get_isalpha(word.unichar_id(i)))
				count++;
		}
		return count;
	}


	inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
		int count = 0;
		for (int i = 0; i < word.length(); ++i) {
			if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
				word.unicharset()->get_isdigit(word.unichar_id(i)))
				count++;
		}
		return count;
	}


	BOOL8 Tesseract::acceptable_number_string(const char *s,
		const char *lengths) {
		BOOL8 prev_digit = FALSE;

		if (*lengths == 1 && *s == '(')
			s++;

		if (*lengths == 1 &&
			((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
			s++;

		for (; *s != '\0'; s += *(lengths++)) {
			if (unicharset.get_isdigit(s, *lengths))
				prev_digit = TRUE;
			else if (prev_digit &&
				(*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
				prev_digit = FALSE;
			else if (prev_digit && *lengths == 1 &&
				(*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
				return TRUE;
			else if (prev_digit &&
				*lengths == 1 && (*s == '%') &&
				(*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
				(*(s + *lengths + *(lengths + 1)) == '\0'))
				return TRUE;
			else
				return FALSE;
		}
		return TRUE;
	}
}  // namespace tesseract