解决tesseract-ocr自动文本方向识别时内存泄漏的问题

2023-07-12 17:30:46 +08:00 · 2023-07-12 17:30:46 +08:00 · 97e5ff8c17
parent a4e92b8d59
commit 97e5ff8c17
5 changed files with 58 additions and 54 deletions
--- a/third_party/ocr/tesseract-ocr/src/pagesegmain.cpp
+++ b/third_party/ocr/tesseract-ocr/src/pagesegmain.cpp
@@ -97,62 +97,64 @@ static Pix* RemoveEnclosingCircle(Pix* pixs) {
 * On return the blocks list owns all the constructed page layout.
 */
 int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
-                           Tesseract* osd_tess, OSResults* osr) {
+                           Tesseract* osd_tess, OSResults* osr) 
-  ASSERT_HOST(pix_binary_ != nullptr);
+{
-  int width = pixGetWidth(pix_binary_);
+    ASSERT_HOST(pix_binary_ != nullptr);
-  int height = pixGetHeight(pix_binary_);
+    int width = pixGetWidth(pix_binary_);
-  // Get page segmentation mode.
+    int height = pixGetHeight(pix_binary_);
-  auto pageseg_mode = static_cast<PageSegMode>(
+    // Get page segmentation mode.
-      static_cast<int>(tessedit_pageseg_mode));
+    auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
-  // If a UNLV zone file can be found, use that instead of segmentation.
+    // If a UNLV zone file can be found, use that instead of segmentation.
-  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
+    if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file->length() > 0) 
-      input_file != nullptr && input_file->length() > 0) {
+    {
-    STRING name = *input_file;
+        STRING name = *input_file;
-    const char* lastdot = strrchr(name.string(), '.');
+        const char* lastdot = strrchr(name.string(), '.');
-    if (lastdot != nullptr)
+        if (lastdot != nullptr)
-      name[lastdot - name.string()] = '\0';
+            name[lastdot - name.string()] = '\0';
-    read_unlv_file(name, width, height, blocks);
+        read_unlv_file(name, width, height, blocks);
-  }
+    }
-  if (blocks->empty()) {
+
-    // No UNLV file present. Work according to the PageSegMode.
+    if (blocks->empty()) 
-    // First make a single block covering the whole image.
+    {
-    BLOCK_IT block_it(blocks);
+        // No UNLV file present. Work according to the PageSegMode.
-    auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
+        // First make a single block covering the whole image.
-    block->set_right_to_left(right_to_left());
+        BLOCK_IT block_it(blocks);
-    block_it.add_to_end(block);
+        auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
-  } else {
+        block->set_right_to_left(right_to_left());
-    // UNLV file present. Use PSM_SINGLE_BLOCK.
+        block_it.add_to_end(block);
-    pageseg_mode = PSM_SINGLE_BLOCK;
+    } 
-  }
+    else 
-  // The diacritic_blobs holds noise blobs that may be diacritics. They
+    {
-  // are separated out on areas of the image that seem noisy and short-circuit
+        // UNLV file present. Use PSM_SINGLE_BLOCK.
-  // the layout process, going straight from the initial partition creation
+        pageseg_mode = PSM_SINGLE_BLOCK;
-  // right through to after word segmentation, where they are added to the
+    }
-  // rej_cblobs list of the most appropriate word. From there classification
+    // The diacritic_blobs holds noise blobs that may be diacritics. They
-  // will determine whether they are used.
+    // are separated out on areas of the image that seem noisy and short-circuit
-  BLOBNBOX_LIST diacritic_blobs;
+    // the layout process, going straight from the initial partition creation
-  int auto_page_seg_ret_val = 0;
+    // right through to after word segmentation, where they are added to the
-  TO_BLOCK_LIST to_blocks;
+    // rej_cblobs list of the most appropriate word. From there classification
-  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
+    // will determine whether they are used.
-      PSM_SPARSE(pageseg_mode)) {
+    BLOBNBOX_LIST diacritic_blobs;
-    auto_page_seg_ret_val = AutoPageSeg(
+    int auto_page_seg_ret_val = 0;
-        pageseg_mode, blocks, &to_blocks,
+    TO_BLOCK_LIST to_blocks;
-        enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
+    if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) 
-    if (pageseg_mode == PSM_OSD_ONLY)
+    {
-      return auto_page_seg_ret_val;
+        auto_page_seg_ret_val = AutoPageSeg(pageseg_mode, blocks, &to_blocks, enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
-    // To create blobs from the image region bounds uncomment this line:
+      if (pageseg_mode == PSM_OSD_ONLY)
-    //  to_blocks.clear();  // Uncomment to go back to the old mode.
+        return auto_page_seg_ret_val;
-  } else {
+      // To create blobs from the image region bounds uncomment this line:
-    deskew_ = FCOORD(1.0f, 0.0f);
+        to_blocks.clear();  // Uncomment to go back to the old mode.
-    reskew_ = FCOORD(1.0f, 0.0f);
+    } else {
-    if (pageseg_mode == PSM_CIRCLE_WORD) {
+      deskew_ = FCOORD(1.0f, 0.0f);
-      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
+      reskew_ = FCOORD(1.0f, 0.0f);
-      if (pixcleaned != nullptr) {
+      if (pageseg_mode == PSM_CIRCLE_WORD) {
-        pixDestroy(&pix_binary_);
+        Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
-        pix_binary_ = pixcleaned;
+        if (pixcleaned != nullptr) {
          pixDestroy(&pix_binary_);
          pix_binary_ = pixcleaned;
        }
      }
    }
  }
  if (auto_page_seg_ret_val < 0) {
    return -1;
@@ -213,6 +215,8 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
 #if 1
  pixDestroy(&photomask_pix);
  pixDestroy(&musicmask_pix);
  delete finder;
  blocks->clear();
  return 0;
  #else
  int result = 0;
--- a/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41.lib
+++ b/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41.lib
--- a/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41d.lib
+++ b/third_party/ocr/tesseract-ocr/windows/lib/x64/tesseract41d.lib
--- a/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41.lib
+++ b/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41.lib
--- a/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41d.lib
+++ b/third_party/ocr/tesseract-ocr/windows/lib/x86/tesseract41d.lib