diff --git a/third_party/ocr/tesseract-ocr/src/pagesegmain.cpp b/third_party/ocr/tesseract-ocr/src/pagesegmain.cpp index d8497897..52dbf71b 100644 --- a/third_party/ocr/tesseract-ocr/src/pagesegmain.cpp +++ b/third_party/ocr/tesseract-ocr/src/pagesegmain.cpp @@ -97,62 +97,64 @@ static Pix* RemoveEnclosingCircle(Pix* pixs) { * On return the blocks list owns all the constructed page layout. */ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, - Tesseract* osd_tess, OSResults* osr) { - ASSERT_HOST(pix_binary_ != nullptr); - int width = pixGetWidth(pix_binary_); - int height = pixGetHeight(pix_binary_); - // Get page segmentation mode. - auto pageseg_mode = static_cast( - static_cast(tessedit_pageseg_mode)); - // If a UNLV zone file can be found, use that instead of segmentation. - if (!PSM_COL_FIND_ENABLED(pageseg_mode) && - input_file != nullptr && input_file->length() > 0) { - STRING name = *input_file; - const char* lastdot = strrchr(name.string(), '.'); - if (lastdot != nullptr) - name[lastdot - name.string()] = '\0'; - read_unlv_file(name, width, height, blocks); - } - if (blocks->empty()) { - // No UNLV file present. Work according to the PageSegMode. - // First make a single block covering the whole image. - BLOCK_IT block_it(blocks); - auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height); - block->set_right_to_left(right_to_left()); - block_it.add_to_end(block); - } else { - // UNLV file present. Use PSM_SINGLE_BLOCK. - pageseg_mode = PSM_SINGLE_BLOCK; - } - // The diacritic_blobs holds noise blobs that may be diacritics. They - // are separated out on areas of the image that seem noisy and short-circuit - // the layout process, going straight from the initial partition creation - // right through to after word segmentation, where they are added to the - // rej_cblobs list of the most appropriate word. From there classification - // will determine whether they are used. - BLOBNBOX_LIST diacritic_blobs; - int auto_page_seg_ret_val = 0; - TO_BLOCK_LIST to_blocks; - if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) || - PSM_SPARSE(pageseg_mode)) { - auto_page_seg_ret_val = AutoPageSeg( - pageseg_mode, blocks, &to_blocks, - enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr); - if (pageseg_mode == PSM_OSD_ONLY) - return auto_page_seg_ret_val; - // To create blobs from the image region bounds uncomment this line: - // to_blocks.clear(); // Uncomment to go back to the old mode. - } else { - deskew_ = FCOORD(1.0f, 0.0f); - reskew_ = FCOORD(1.0f, 0.0f); - if (pageseg_mode == PSM_CIRCLE_WORD) { - Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_); - if (pixcleaned != nullptr) { - pixDestroy(&pix_binary_); - pix_binary_ = pixcleaned; + Tesseract* osd_tess, OSResults* osr) +{ + ASSERT_HOST(pix_binary_ != nullptr); + int width = pixGetWidth(pix_binary_); + int height = pixGetHeight(pix_binary_); + // Get page segmentation mode. + auto pageseg_mode = static_cast(static_cast(tessedit_pageseg_mode)); + // If a UNLV zone file can be found, use that instead of segmentation. + if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file->length() > 0) + { + STRING name = *input_file; + const char* lastdot = strrchr(name.string(), '.'); + if (lastdot != nullptr) + name[lastdot - name.string()] = '\0'; + read_unlv_file(name, width, height, blocks); + } + + if (blocks->empty()) + { + // No UNLV file present. Work according to the PageSegMode. + // First make a single block covering the whole image. + BLOCK_IT block_it(blocks); + auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height); + block->set_right_to_left(right_to_left()); + block_it.add_to_end(block); + } + else + { + // UNLV file present. Use PSM_SINGLE_BLOCK. + pageseg_mode = PSM_SINGLE_BLOCK; + } + // The diacritic_blobs holds noise blobs that may be diacritics. They + // are separated out on areas of the image that seem noisy and short-circuit + // the layout process, going straight from the initial partition creation + // right through to after word segmentation, where they are added to the + // rej_cblobs list of the most appropriate word. From there classification + // will determine whether they are used. + BLOBNBOX_LIST diacritic_blobs; + int auto_page_seg_ret_val = 0; + TO_BLOCK_LIST to_blocks; + if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) + { + auto_page_seg_ret_val = AutoPageSeg(pageseg_mode, blocks, &to_blocks, enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr); + if (pageseg_mode == PSM_OSD_ONLY) + return auto_page_seg_ret_val; + // To create blobs from the image region bounds uncomment this line: + to_blocks.clear(); // Uncomment to go back to the old mode. + } else { + deskew_ = FCOORD(1.0f, 0.0f); + reskew_ = FCOORD(1.0f, 0.0f); + if (pageseg_mode == PSM_CIRCLE_WORD) { + Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_); + if (pixcleaned != nullptr) { + pixDestroy(&pix_binary_); + pix_binary_ = pixcleaned; + } } } - } if (auto_page_seg_ret_val < 0) { return -1; @@ -213,6 +215,8 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, #if 1 pixDestroy(&photomask_pix); pixDestroy(&musicmask_pix); + delete finder; + blocks->clear(); return 0; #else int result = 0; diff --git a/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41.lib b/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41.lib index 9043064c..71278fba 100644 Binary files a/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41.lib and b/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41.lib differ diff --git a/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41d.lib b/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41d.lib index 074cafb9..3fcc2ebb 100644 Binary files a/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41d.lib and b/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41d.lib differ diff --git a/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41.lib b/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41.lib index c164ad69..be7cccce 100644 Binary files a/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41.lib and b/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41.lib differ diff --git a/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41d.lib b/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41d.lib index 0e10eaa2..7a1ff194 100644 Binary files a/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41d.lib and b/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41d.lib differ