Skip to content

Commit

Permalink
Modernize code for renderers and remove filename conversion for Windows (#4330)
Browse files Browse the repository at this point in the history

Commit db52047 added the filename conversion for the hOCR renderer,
but it was removed later for TSV in commit 6700edd.

Tesseract does not use a filename conversion anywhere else, so remove it
for the other renderers, too.

Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil authored Oct 23, 2024
1 parent 3020c14 commit 638868e
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 77 deletions.
32 changes: 7 additions & 25 deletions src/api/altorenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@

#include "errcode.h" // for ASSERT_HOST
#include "helpers.h" // for copy_string
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include "tprintf.h" // for tprintf

#include <tesseract/baseapi.h>
Expand Down Expand Up @@ -145,20 +142,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}

#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
int utf8_len =
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif

std::stringstream alto_str;
// Use "C" locale (needed for int values larger than 999).
alto_str.imbue(std::locale::classic());
Expand All @@ -169,7 +152,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
<< " WIDTH=\"" << rect_width_ << "\""
<< " HEIGHT=\"" << rect_height_ << "\">\n";

ResultIterator *res_it = GetIterator();
std::unique_ptr<ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
Expand All @@ -186,7 +169,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
// Handle all kinds of images.
// TODO: optionally add TYPE, for example TYPE="photo".
alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "</Illustration>\n";
res_it->Next(RIL_BLOCK);
continue;
Expand All @@ -195,7 +178,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
case PT_VERT_LINE:
// Handle horizontal and vertical lines.
alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "</GraphicalElement >\n";
res_it->Next(RIL_BLOCK);
continue;
Expand All @@ -208,24 +191,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {

if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "\n";
}

if (res_it->IsAtBeginningOf(RIL_PARA)) {
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
AddBoxToAlto(res_it, RIL_PARA, alto_str);
AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
alto_str << "\n";
}

if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
alto_str << "\n";
}

alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
AddBoxToAlto(res_it, RIL_WORD, alto_str);
AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
alto_str << " CONTENT=\"";

bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
Expand Down Expand Up @@ -272,7 +255,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
alto_str << "\t\t\t</PrintSpace>\n"
<< "\t\t</Page>\n";

delete res_it;
return copy_string(alto_str.str());
}

Expand Down
20 changes: 0 additions & 20 deletions src/api/hocrrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@
#include <locale> // for std::locale::classic
#include <memory> // for std::unique_ptr
#include <sstream> // for std::stringstream
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include <tesseract/renderer.h>
#include "helpers.h" // for copy_string
#include "tesseractclass.h" // for Tesseract
Expand Down Expand Up @@ -151,23 +148,6 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}

#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len =
MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
str16_len);
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif

std::stringstream hocr_str;
// Use "C" locale (needed for double values x_size and x_descenders).
hocr_str.imbue(std::locale::classic());
Expand Down
43 changes: 11 additions & 32 deletions src/api/pagerenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Description: PAGE XML rendering interface
// Author: Jan Kamlah

// (C) Copyright 2021
// (C) Copyright 2024
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
Expand All @@ -15,9 +15,6 @@

#include "errcode.h" // for ASSERT_HOST
#include "helpers.h" // for copy_string
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include "tprintf.h" // for tprintf

#include <tesseract/baseapi.h>
Expand Down Expand Up @@ -717,23 +714,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}

#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len =
MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
str16_len);
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif

// Used variables

std::stringstream reading_order_str;
Expand Down Expand Up @@ -788,7 +768,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
<< "\t\t\t<OrderedGroup id=\"ro" << ro_id
<< "\" caption=\"Regions reading order\">\n";

ResultIterator *res_it = GetIterator();
std::unique_ptr<ResultIterator> res_it(GetIterator());

float block_conf = 0;
float line_conf = 0;
Expand All @@ -808,7 +788,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
// Handle all kinds of images.
page_str << "\t\t<GraphicRegion id=\"r" << rcnt++ << "\">\n";
page_str << "\t\t\t";
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
page_str << "\t\t</GraphicRegion>\n";
res_it->Next(RIL_BLOCK);
continue;
Expand All @@ -818,7 +798,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
// Handle horizontal and vertical lines.
page_str << "\t\t<SeparatorRegion id=\"r" << rcnt++ << "\">\n";
page_str << "\t\t\t";
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
page_str << "\t\t</SeparatorRegion>\n";
res_it->Next(RIL_BLOCK);
continue;
Expand Down Expand Up @@ -849,7 +829,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
if ((!POLYGONFLAG || (orientation_block != ORIENTATION_PAGE_UP &&
orientation_block != ORIENTATION_PAGE_DOWN)) &&
LEVELFLAG == 0) {
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
}
}

Expand Down Expand Up @@ -892,9 +872,9 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
line_str << "custom=\"" << "readingOrder {index:" << lcnt << ";}\">\n";
// If level is linebased, get the line polygon and baseline
if (LEVELFLAG == 0 && (!POLYGONFLAG || skewed_flag)) {
AddPointToWordPolygon(res_it, RIL_TEXTLINE, line_top_ltr_pts,
AddPointToWordPolygon(res_it.get(), RIL_TEXTLINE, line_top_ltr_pts,
line_bottom_ltr_pts, writing_direction);
AddBaselineToPTA(res_it, RIL_TEXTLINE, line_baseline_pts);
AddBaselineToPTA(res_it.get(), RIL_TEXTLINE, line_baseline_pts);
if (ttb_flag) {
line_baseline_pts = TransposePolygonline(line_baseline_pts);
}
Expand All @@ -914,18 +894,18 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
<< WritingDirectionToStr(writing_direction) << "\" "
<< "custom=\"" << "readingOrder {index:" << wcnt << ";}\">\n";
if ((!POLYGONFLAG || skewed_flag) || ttb_flag) {
AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts,
AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
writing_direction);
}
}

if (POLYGONFLAG && !skewed_flag && ttb_flag && LEVELFLAG == 0) {
AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts,
AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
writing_direction);
}

// Get the word baseline information
AddBaselineToPTA(res_it, RIL_WORD, word_baseline_pts);
AddBaselineToPTA(res_it.get(), RIL_WORD, word_baseline_pts);

// Get the word text content and polygon
do {
Expand All @@ -934,7 +914,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
if (grapheme && grapheme[0] != 0) {
word_content << HOcrEscape(grapheme.get()).c_str();
if (POLYGONFLAG && !skewed_flag && !ttb_flag) {
AddPointToWordPolygon(res_it, RIL_SYMBOL, word_top_pts,
AddPointToWordPolygon(res_it.get(), RIL_SYMBOL, word_top_pts,
word_bottom_pts, writing_direction);
}
}
Expand Down Expand Up @@ -1146,7 +1126,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
const std::string &text = reading_order_str.str();
reading_order_str.str("");

delete res_it;
return copy_string(text);
}

Expand Down

0 comments on commit 638868e

Please sign in to comment.