From 93bf8b841d3e64d8ca108833203012ee3992b7e5 Mon Sep 17 00:00:00 2001 From: Zhou Zhiqiang Date: Fri, 1 Sep 2023 01:26:30 +0800 Subject: [PATCH 1/2] feat: structured ocr parse Signed-off-by: Zhou Zhiqiang --- Cargo.toml | 1 + Makefile | 4 + src/{ocr.rs => ocr/mod.rs} | 12 +- src/ocr/model.rs | 280 +++++++++++++++ static/tesseract.out.hocr | 717 +++++++++++++++++++++++++++++++++++++ 5 files changed, 1013 insertions(+), 1 deletion(-) rename src/{ocr.rs => ocr/mod.rs} (80%) create mode 100644 src/ocr/model.rs create mode 100644 static/tesseract.out.hocr diff --git a/Cargo.toml b/Cargo.toml index 6da2681..91e64c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,3 +33,4 @@ rust-embed = { version = "6.8.1", features = ["axum-ex"] } mime_guess = "2" imageproc = "0.23" colorsys = "0.6" +xml = "0.8" diff --git a/Makefile b/Makefile index 1d715ef..a6cbc34 100644 --- a/Makefile +++ b/Makefile @@ -11,3 +11,7 @@ frontend-embbed: webui-export .PHONY: webui-export webui-export: cd webui && pnpm install && cp next.config.js.export next.config.js && npx next build && cp next.config.js.dev next.config.js + +.PHONY: clean +clean: + cargo clean diff --git a/src/ocr.rs b/src/ocr/mod.rs similarity index 80% rename from src/ocr.rs rename to src/ocr/mod.rs index 7044c88..baf92f8 100644 --- a/src/ocr.rs +++ b/src/ocr/mod.rs @@ -1,6 +1,10 @@ use anyhow::Ok; use async_trait::async_trait; +/// Structured recognized data, basically refers from hOCR spec. +/// https://kba.github.io/hocr-spec/1.2 +mod model; + #[derive(Debug, Clone)] pub struct RecognizeItem { pub text: String, @@ -61,8 +65,14 @@ impl TesseractOCR { #[async_trait] impl CharacterRecognizer for TesseractOCR { async fn recognize(&self, image: &image::DynamicImage) -> anyhow::Result> { - let default_args = rusty_tesseract::Args::default(); + let mut default_args = rusty_tesseract::Args::default(); + default_args + .config_variables + .insert("tessedit_create_hocr".into(), "1".into()); let ri = rusty_tesseract::Image::from_dynamic_image(image)?; + let output_string = rusty_tesseract::image_to_string(&ri, &default_args)?; + // print output_string + println!("{}", output_string); let output = rusty_tesseract::image_to_data(&ri, &default_args)?; let result: Vec = output .data diff --git a/src/ocr/model.rs b/src/ocr/model.rs new file mode 100644 index 0000000..22bbc1c --- /dev/null +++ b/src/ocr/model.rs @@ -0,0 +1,280 @@ +use std::thread::current; + +use super::MarkupBox; + +#[derive(Debug)] +pub struct BoundingBox { + pub left: u32, + pub top: u32, + pub right: u32, + pub bottom: u32, +} + +impl BoundingBox { + pub fn new(left: u32, top: u32, right: u32, bottom: u32) -> Self { + Self { + left, + top, + right, + bottom, + } + } + + pub fn new_i32(left: i32, top: i32, right: i32, bottom: i32) -> Self { + Self { + left: left as u32, + top: top as u32, + right: right as u32, + bottom: bottom as u32, + } + } +} + +impl Into for BoundingBox { + fn into(self) -> MarkupBox { + MarkupBox::new( + self.left, + self.top, + self.right - self.left, + self.bottom - self.top, + ) + } +} + +#[derive(Debug)] +pub struct Paragraph { + pub id: String, + pub bounding_box: BoundingBox, + pub language: String, + pub lines: Vec, +} + +impl Paragraph { + pub fn new(id: String, bounding_box: BoundingBox, language: String, lines: Vec) -> Self { + Self { + id, + bounding_box, + language, + lines, + } + } + + pub fn text(&self) -> String { + self.lines + .iter() + .map(|l| l.text()) + .collect::>() + .join("\n") + } +} + +#[derive(Debug)] +pub struct Line { + pub id: String, + pub bounding_box: BoundingBox, + pub words: Vec, +} +impl Line { + pub fn new(id: String, bounding_box: BoundingBox, words: Vec) -> Self { + Self { + id, + bounding_box, + words, + } + } + + pub fn text(&self) -> String { + self.words + .iter() + .map(|w| w.text()) + .collect::>() + .join(" ") + } +} + +#[derive(Debug)] +pub struct Word { + pub id: String, + pub bounding_box: BoundingBox, + pub content: String, +} + +impl Word { + pub fn new(id: String, bounding_box: BoundingBox, content: String) -> Self { + Self { + id, + bounding_box, + content, + } + } + + pub fn text(&self) -> String { + self.content.clone() + } +} + +use xml::reader::{EventReader, XmlEvent}; +pub fn parse_hocr_xml(hocr: &str) -> Vec { + let mut result = Vec::new(); + let reader = EventReader::new(hocr.as_bytes()); + + let mut current_par: Option = None; + let mut current_line: Option = None; + let mut current_word: Option = None; + + for event in reader { + match event { + Ok(XmlEvent::StartElement { + name, + attributes, + namespace, + }) => { + let par = attributes + .iter() + .find(|item| item.name.local_name == "class" && item.value.contains("ocr_par")); + if par.is_some() { + let id = attributes + .iter() + .find(|item| item.name.local_name == "id") + .unwrap() + .value + .clone(); + let title = attributes + .iter() + .find(|item| item.name.local_name == "title") + .unwrap() + .value + .clone(); + let bbox = title + .split(";") + .take(1) + .collect::() + .split_whitespace() + .skip(1) + .map(|item| item.parse::().unwrap()) + .collect::>(); + let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); + let lang = attributes + .iter() + .find(|item| item.name.local_name == "lang") + .unwrap() + .value + .clone(); + current_par = Some(Paragraph::new(id, bounding_box, lang, Vec::new())); + } + + let line = attributes.iter().find(|item| { + item.name.local_name == "class" && item.value.contains("ocr_line") + }); + if line.is_some() { + let id = attributes + .iter() + .find(|item| item.name.local_name == "id") + .unwrap() + .value + .clone(); + let title = attributes + .iter() + .find(|item| item.name.local_name == "title") + .unwrap() + .value + .clone(); + let bbox = title + .split(";") + .take(1) + .collect::() + .split_whitespace() + .skip(1) + .map(|item| item.parse::().unwrap()) + .collect::>(); + let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); + let words = Vec::new(); + current_line = Some(Line::new(id, bounding_box, words)) + } + let word = attributes.iter().find(|item| { + item.name.local_name == "class" && item.value.contains("ocrx_word") + }); + if word.is_some() { + let id = attributes + .iter() + .find(|item| item.name.local_name == "id") + .unwrap() + .value + .clone(); + let title = attributes + .iter() + .find(|item| item.name.local_name == "title") + .unwrap() + .value + .clone(); + let bbox = title + .split(";") + .take(1) + .collect::() + .split_whitespace() + .skip(1) + .map(|item| item.parse::().unwrap()) + .collect::>(); + let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); + let content = String::new(); + current_word = Some(Word::new(id, bounding_box, content)) + } + } + Ok(XmlEvent::Characters(content)) => { + if let Some(word) = current_word.as_mut() { + word.content = content; + } + } + Ok(XmlEvent::EndElement { name }) => { + if name.local_name == "p" { + if let Some(par) = current_par { + result.push(par); + current_par = None; + } + } + if name.local_name == "span" { + if let Some(word) = current_word { + // closing word + if let Some(line) = current_line.as_mut() { + line.words.push(word); + } + current_word = None; + } else if let Some(line) = current_line { + // closing line + if let Some(par) = current_par.as_mut() { + par.lines.push(line); + } + current_line = None; + } + } + } + Ok(_) => {} + Err(e) => { + panic!("Error: {}", e) + } + } + } + return result; +} + +#[cfg(test)] +mod tests { + use super::parse_hocr_xml; + use std::fs::File; + use std::io::{BufReader, Read}; + + #[test] + fn test_parse_xml() -> anyhow::Result<()> { + let file = File::open("static/tesseract.out.hocr")?; + let mut file = BufReader::new(file); + let mut str = String::new(); + file.read_to_string(&mut str); + let result = parse_hocr_xml(&str); + + for p in result { + let text = p.text(); + println!("Paragraph: {:}", text); + } + Ok(()) + } +} diff --git a/static/tesseract.out.hocr b/static/tesseract.out.hocr new file mode 100644 index 0000000..d9327d5 --- /dev/null +++ b/static/tesseract.out.hocr @@ -0,0 +1,717 @@ + + + + + + + + + + +
+
+

+ + ) + +

+
+
+

+ + File + Edit + Selection + View + Go + Run + Terminal + Help + +

+
+
+

+ + © + +

+
+
+

+ + ® + 0 + ® + »p + @ + B + &V + +

+
+
+

+ + (@ + @ + @ + i, + +

+
+
+

+ + al + +

+
+
+

+ + EXPLORER + +

+
+
+

+ + / + OPEN + EDITORS + + + X + ® + screenshots.rs + examples + 2,M + + + v + SCREENSHOTS-RS + + + > + .devcontainer + + + v + examples + ° + + + ‘@ + screenshots.rs + 2,M + + + v + src + + + > + linux + + + ® + darwin.rs + + + ® + image.rs + + + ® + lib.rs + + + ® + win32.rs + + + Vv + target + + + > + debug + + + {} + .rustc_info.json + + + & + 66-2.png + + + 66.png + + + 67-2.png + + + 67.png + + + CACHEDIR.TAG + + + & + capture_display_with_point.png + +

+
+
+

+ + [T + A + 2 + 2 + +

+
+
+

+ + = + _editorconfig + + + .gitignore + +

+ +

+ + ® + build.rs + +

+ +

+ + £ + Cargo.lock + +

+ +

+ + £ + Cargo.toml + +

+ +

+ + f + LICENSE + +

+ +

+ + ® + README.md + +

+ +

+ + % + rustfmt.toml + +

+
+
+

+ + > + OUTLINE + + + > + TIMELINE + +

+
+
+

+ + {S} + > + REMIX + + + > + RUST + DEPENDENCIES + +

+
+
+

+ + screenshots.rs + - + screenshots-rs + - + Visual + Studio + Code + +

+
+
+

+ + ® + screenshots.rs + 2,M + X + + + examples + > + ® + screenshots.rs + > + @ + main + + + You, + 26 + minutes + ago + | + 2 + authors + (nashaofu + and + others) + + + 1 + use + screenshots::Screen; + + + 2 + use + std::{fs, + time::Instant}; + +

+
+
+
+
+

+ + » + Run + | + Debug + + + 4 + fn-main() + { + + + 5 + let + start: + Instant + = + Instant::now(); + + + 6 + let + screens: + Vec<Screen> + = + Screen::all().unwrap(); + + + 7 + + + 8 + for + screen: + Screen + in + screens + { + + + 9 + [println!("capturer-{screen:?}"); + ] + + + 10 + let + mut + image: + Image + = + screen.capture().unwrap(); + + + 1 + let + mut + buffer: + Vec<u8> + = + image.to_png().unwrap(); + + + 12 + fs::write(path: + format!("target/{}.png", + screen.display_info.id), + contents: + buffer)|.unwrap(); + + + ‘3§ + //-image + -=-screen.capture_area(300, + 306, + 380, + - + 380) + .unwrap(); + + + 14 + //-buffer-=-image.to_png() + .unwrap(); + + + ’5% + /] + fs::write(format!("target/{}-2.png", + screen.display_info.id), + -buffer).unwrap(); + + + —6 + : + } + + + 17 + + + 187 + | + // + let-screen-=-Screen::from_point(188,-168).unwrap(); + + + ‘9% + //-println!("capturer-{screen:?}"); + + + 20 + + + 21§ + //-1let + image + = + screen.capture_area(360, + 360, + 3068, + -368) + .unwrap(); + + + 22 + | + //-let-buffer + =-image.to_png().unwrap(); + + + 23% + /] + fs::write("target/capture_display_with_point.png", + -buffer).unwrap(); + + + 24 + + + 25 + println! + ("iz1THERY: + {:7}", + start.elapsed()); + + + 26} + +

+
+
+

+ + PROBLEMS + (2 + OUTPUT + GITLENS + DEBUG + CONSOLE + COMMENTS + TERMINAL + +

+
+
+
+

+ + warning: + variable + does + not + need + to + be + mutable + + + --> + examples/screenshots.rs:10:9 + +

+
+
+

+ + 10 + | + let + mut + image + = + screen.capture().unwrap(); + +

+
+
+

+ + | + Annnn + +

+
+
+

+ + | + + + | + help: + remove + this + ‘mut® + + + | + +

+
+
+

+ + note: + ‘#[warn(unused_mut)]* + on + by + default + +

+
+
+

+ + warning: + ‘screenshots® + (example + "screenshots") + generated + 3 + warnings + (run + ‘cargo + fix + --example + "screenshots" + to + apply + 3 + suggestions) + +

+
+
+

+ + Finished + dev + [unoptimized + + + debuginfo] + target(s) + in + 8.85s + + + Running + “target/debug/examples/screenshots® + +

+
+
+

+ + capturer + Screen + { + display_info: + DisplayInfo + { + id: + 66, + x: + @8, + y: + 8, + width: + 2568, + height: + 1448, + rotation: + 0.8, + scale_factor: + 1.5, + is_primary: + true + } + } + +

+
+
+

+ + EITHERY: + 494.776138nms + +

+
+
+

+ + capturer + Screen + { + display_info: + DisplayInfo + { + id: + 67, + x: + 2568, + y: + 8, + width: + 2568, + height: + 1448, + rotation: + 0.8, + scale_factor: + 1.5, + is_primary: + false + } + } + +

+
+
+

+ + JE{THERY: + 981.720953ms + + + JEITHERY: + 982.192745ms + + + F + Terminal + will + be + reused + by + tasks, + press + any + key + to + close + it. + +

+
+
+

+ + B + +

+
+
+
+

+ + 90O + ®d-- + +

+
+
+
+

+ + runscreenshots-Task + A + +v + [@ + @ + -+ + ~ + X + +

+
+
+ + From f06ab88e42791495d7c9ec3c9ca8a12ea2b21490 Mon Sep 17 00:00:00 2001 From: Zhou Zhiqiang Date: Fri, 1 Sep 2023 11:57:57 +0800 Subject: [PATCH 2/2] feat: hocr structured ocr result, migrate db schema Signed-off-by: Zhou Zhiqiang --- Cargo.toml | 2 +- src/analysis.rs | 12 +-- src/http/service.rs | 2 +- src/ocr/hocr_parse.rs | 168 ++++++++++++++++++++++++++++++++++++ src/ocr/mod.rs | 22 +++-- src/ocr/model.rs | 168 ------------------------------------ src/repository/in_memory.rs | 2 +- src/repository/mod.rs | 18 ++-- src/repository/sqlite.rs | 66 +++++++------- 9 files changed, 237 insertions(+), 223 deletions(-) create mode 100644 src/ocr/hocr_parse.rs diff --git a/Cargo.toml b/Cargo.toml index 91e64c5..7e835a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ in-memory = [] [dependencies] chrono = "0.4" screenshots = "0.6.0" -rusty-tesseract = { version = "*", git = "https://github.com/STRRL/rusty-tesseract", rev = "84418ef" } +rusty-tesseract = { version = "1.1.7" } image = "0.24.6" anyhow = { version = "1.0", features = ["backtrace"] } async-trait = "0.1.71" diff --git a/src/analysis.rs b/src/analysis.rs index 224625e..6da7af9 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use crate::{ image_archive::{ImageArchiver}, ocr::{CharacterRecognizer, RecognizeItem}, - repository::{EntityImage, EntityText, Repository}, + repository::{EntityImage, EntityWord, Repository}, screenshot::Screenshot, }; @@ -36,16 +36,16 @@ impl Analysis { let entity_image = self.repo.save_image(&entity_image).await?; let ocr_result: Vec = self.ocr.recognize(&screenshot.image).await?; - let entity_texts: Vec = ocr_result + let entity_texts: Vec = ocr_result .iter() .filter(|it| it.level == 5) - .filter_map(|it: &RecognizeItem| -> Option { it.try_into().ok() }) + .filter_map(|it: &RecognizeItem| -> Option { it.try_into().ok() }) .map(|mut it| { it.image_id = entity_image.id; it }) .collect(); - self.repo.save_texts(&entity_texts).await?; + self.repo.save_words(&entity_texts).await?; Ok(()) } @@ -64,11 +64,11 @@ impl Analysis { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SearchResult { pub image_id: u32, - pub texts: Vec, + pub texts: Vec, } impl SearchResult { - pub fn new(image_id: u32, texts: Vec) -> Self { + pub fn new(image_id: u32, texts: Vec) -> Self { Self { image_id, texts } } } diff --git a/src/http/service.rs b/src/http/service.rs index 92afa87..4c0decf 100644 --- a/src/http/service.rs +++ b/src/http/service.rs @@ -50,7 +50,7 @@ impl Service { let mut markups = Vec::new(); for text_id in text_ids { - let entity_text = self.repo.get_text_by_id(*text_id).await?; + let entity_text = self.repo.get_word_by_id(*text_id).await?; let markup_box = MarkupBox::new( entity_text.left, entity_text.top, diff --git a/src/ocr/hocr_parse.rs b/src/ocr/hocr_parse.rs new file mode 100644 index 0000000..2392d62 --- /dev/null +++ b/src/ocr/hocr_parse.rs @@ -0,0 +1,168 @@ + +use xml::reader::{EventReader, XmlEvent}; +use super::model::{Paragraph, Line, Word, BoundingBox}; + +pub fn parse_hocr_xml(hocr: &str) -> Vec { + let mut result = Vec::new(); + let reader = EventReader::new(hocr.as_bytes()); + + let mut current_par: Option = None; + let mut current_line: Option = None; + let mut current_word: Option = None; + + for event in reader { + match event { + Ok(XmlEvent::StartElement { + name, + attributes, + namespace, + }) => { + let par = attributes + .iter() + .find(|item| item.name.local_name == "class" && item.value.contains("ocr_par")); + if par.is_some() { + let id = attributes + .iter() + .find(|item| item.name.local_name == "id") + .unwrap() + .value + .clone(); + let title = attributes + .iter() + .find(|item| item.name.local_name == "title") + .unwrap() + .value + .clone(); + let bbox = title + .split(";") + .take(1) + .collect::() + .split_whitespace() + .skip(1) + .map(|item| item.parse::().unwrap()) + .collect::>(); + let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); + let lang = attributes + .iter() + .find(|item| item.name.local_name == "lang") + .unwrap() + .value + .clone(); + current_par = Some(Paragraph::new(id, bounding_box, lang, Vec::new())); + } + + let line = attributes.iter().find(|item| { + item.name.local_name == "class" && item.value.contains("ocr_line") + }); + if line.is_some() { + let id = attributes + .iter() + .find(|item| item.name.local_name == "id") + .unwrap() + .value + .clone(); + let title = attributes + .iter() + .find(|item| item.name.local_name == "title") + .unwrap() + .value + .clone(); + let bbox = title + .split(";") + .take(1) + .collect::() + .split_whitespace() + .skip(1) + .map(|item| item.parse::().unwrap()) + .collect::>(); + let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); + let words = Vec::new(); + current_line = Some(Line::new(id, bounding_box, words)) + } + let word = attributes.iter().find(|item| { + item.name.local_name == "class" && item.value.contains("ocrx_word") + }); + if word.is_some() { + let id = attributes + .iter() + .find(|item| item.name.local_name == "id") + .unwrap() + .value + .clone(); + let title = attributes + .iter() + .find(|item| item.name.local_name == "title") + .unwrap() + .value + .clone(); + let bbox = title + .split(";") + .take(1) + .collect::() + .split_whitespace() + .skip(1) + .map(|item| item.parse::().unwrap()) + .collect::>(); + let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); + let content = String::new(); + current_word = Some(Word::new(id, bounding_box, content)) + } + } + Ok(XmlEvent::Characters(content)) => { + if let Some(word) = current_word.as_mut() { + word.content = content; + } + } + Ok(XmlEvent::EndElement { name }) => { + if name.local_name == "p" { + if let Some(par) = current_par { + result.push(par); + current_par = None; + } + } + if name.local_name == "span" { + if let Some(word) = current_word { + // closing word + if let Some(line) = current_line.as_mut() { + line.words.push(word); + } + current_word = None; + } else if let Some(line) = current_line { + // closing line + if let Some(par) = current_par.as_mut() { + par.lines.push(line); + } + current_line = None; + } + } + } + Ok(_) => {} + Err(e) => { + panic!("Error: {}", e) + } + } + } + return result; +} + +#[cfg(test)] +mod tests { + use super::parse_hocr_xml; + use std::fs::File; + use std::io::{BufReader, Read}; + + #[test] + fn test_parse_xml() -> anyhow::Result<()> { + let file = File::open("static/tesseract.out.hocr")?; + let mut file = BufReader::new(file); + let mut str = String::new(); + file.read_to_string(&mut str); + let result = parse_hocr_xml(&str); + + for p in result { + let text = p.text(); + println!("Paragraph: {:}", text); + } + Ok(()) + } +} diff --git a/src/ocr/mod.rs b/src/ocr/mod.rs index baf92f8..5e7c3cd 100644 --- a/src/ocr/mod.rs +++ b/src/ocr/mod.rs @@ -1,6 +1,9 @@ use anyhow::Ok; use async_trait::async_trait; +use self::{hocr_parse::parse_hocr_xml, model::Paragraph}; + +mod hocr_parse; /// Structured recognized data, basically refers from hOCR spec. /// https://kba.github.io/hocr-spec/1.2 mod model; @@ -51,7 +54,9 @@ impl MarkupBox { #[async_trait] pub trait CharacterRecognizer { + #[deprecated] async fn recognize(&self, image: &image::DynamicImage) -> anyhow::Result>; + async fn recognize_hocr(&self, image: &image::DynamicImage) -> anyhow::Result>; } pub struct TesseractOCR {} @@ -66,13 +71,7 @@ impl TesseractOCR { impl CharacterRecognizer for TesseractOCR { async fn recognize(&self, image: &image::DynamicImage) -> anyhow::Result> { let mut default_args = rusty_tesseract::Args::default(); - default_args - .config_variables - .insert("tessedit_create_hocr".into(), "1".into()); let ri = rusty_tesseract::Image::from_dynamic_image(image)?; - let output_string = rusty_tesseract::image_to_string(&ri, &default_args)?; - // print output_string - println!("{}", output_string); let output = rusty_tesseract::image_to_data(&ri, &default_args)?; let result: Vec = output .data @@ -85,4 +84,15 @@ impl CharacterRecognizer for TesseractOCR { .collect(); Ok(result) } + + async fn recognize_hocr(&self, image: &image::DynamicImage) -> anyhow::Result> { + let mut default_args = rusty_tesseract::Args::default(); + default_args + .config_variables + .insert("tessedit_create_hocr".into(), "1".into()); + let ri = rusty_tesseract::Image::from_dynamic_image(image)?; + let output_hocr = rusty_tesseract::image_to_string(&ri, &default_args)?; + let result = parse_hocr_xml(&output_hocr); + Ok(result) + } } diff --git a/src/ocr/model.rs b/src/ocr/model.rs index 22bbc1c..97ea5b8 100644 --- a/src/ocr/model.rs +++ b/src/ocr/model.rs @@ -1,5 +1,3 @@ -use std::thread::current; - use super::MarkupBox; #[derive(Debug)] @@ -112,169 +110,3 @@ impl Word { self.content.clone() } } - -use xml::reader::{EventReader, XmlEvent}; -pub fn parse_hocr_xml(hocr: &str) -> Vec { - let mut result = Vec::new(); - let reader = EventReader::new(hocr.as_bytes()); - - let mut current_par: Option = None; - let mut current_line: Option = None; - let mut current_word: Option = None; - - for event in reader { - match event { - Ok(XmlEvent::StartElement { - name, - attributes, - namespace, - }) => { - let par = attributes - .iter() - .find(|item| item.name.local_name == "class" && item.value.contains("ocr_par")); - if par.is_some() { - let id = attributes - .iter() - .find(|item| item.name.local_name == "id") - .unwrap() - .value - .clone(); - let title = attributes - .iter() - .find(|item| item.name.local_name == "title") - .unwrap() - .value - .clone(); - let bbox = title - .split(";") - .take(1) - .collect::() - .split_whitespace() - .skip(1) - .map(|item| item.parse::().unwrap()) - .collect::>(); - let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); - let lang = attributes - .iter() - .find(|item| item.name.local_name == "lang") - .unwrap() - .value - .clone(); - current_par = Some(Paragraph::new(id, bounding_box, lang, Vec::new())); - } - - let line = attributes.iter().find(|item| { - item.name.local_name == "class" && item.value.contains("ocr_line") - }); - if line.is_some() { - let id = attributes - .iter() - .find(|item| item.name.local_name == "id") - .unwrap() - .value - .clone(); - let title = attributes - .iter() - .find(|item| item.name.local_name == "title") - .unwrap() - .value - .clone(); - let bbox = title - .split(";") - .take(1) - .collect::() - .split_whitespace() - .skip(1) - .map(|item| item.parse::().unwrap()) - .collect::>(); - let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); - let words = Vec::new(); - current_line = Some(Line::new(id, bounding_box, words)) - } - let word = attributes.iter().find(|item| { - item.name.local_name == "class" && item.value.contains("ocrx_word") - }); - if word.is_some() { - let id = attributes - .iter() - .find(|item| item.name.local_name == "id") - .unwrap() - .value - .clone(); - let title = attributes - .iter() - .find(|item| item.name.local_name == "title") - .unwrap() - .value - .clone(); - let bbox = title - .split(";") - .take(1) - .collect::() - .split_whitespace() - .skip(1) - .map(|item| item.parse::().unwrap()) - .collect::>(); - let bounding_box = BoundingBox::new_i32(bbox[0], bbox[1], bbox[2], bbox[3]); - let content = String::new(); - current_word = Some(Word::new(id, bounding_box, content)) - } - } - Ok(XmlEvent::Characters(content)) => { - if let Some(word) = current_word.as_mut() { - word.content = content; - } - } - Ok(XmlEvent::EndElement { name }) => { - if name.local_name == "p" { - if let Some(par) = current_par { - result.push(par); - current_par = None; - } - } - if name.local_name == "span" { - if let Some(word) = current_word { - // closing word - if let Some(line) = current_line.as_mut() { - line.words.push(word); - } - current_word = None; - } else if let Some(line) = current_line { - // closing line - if let Some(par) = current_par.as_mut() { - par.lines.push(line); - } - current_line = None; - } - } - } - Ok(_) => {} - Err(e) => { - panic!("Error: {}", e) - } - } - } - return result; -} - -#[cfg(test)] -mod tests { - use super::parse_hocr_xml; - use std::fs::File; - use std::io::{BufReader, Read}; - - #[test] - fn test_parse_xml() -> anyhow::Result<()> { - let file = File::open("static/tesseract.out.hocr")?; - let mut file = BufReader::new(file); - let mut str = String::new(); - file.read_to_string(&mut str); - let result = parse_hocr_xml(&str); - - for p in result { - let text = p.text(); - println!("Paragraph: {:}", text); - } - Ok(()) - } -} diff --git a/src/repository/in_memory.rs b/src/repository/in_memory.rs index 9470da7..867e950 100644 --- a/src/repository/in_memory.rs +++ b/src/repository/in_memory.rs @@ -1,5 +1,5 @@ #[cfg(feature = "in-memory")] -use {async_trait::async_trait, tokio::sync::Mutex, super::{EntityImage, EntityText, Repository}}; +use {async_trait::async_trait, tokio::sync::Mutex, super::{EntityImage, EntityWord, Repository}}; #[cfg(feature = "in-memory")] pub struct InMemoryRepository { diff --git a/src/repository/mod.rs b/src/repository/mod.rs index 94518f2..71d3d6d 100644 --- a/src/repository/mod.rs +++ b/src/repository/mod.rs @@ -28,17 +28,17 @@ impl EntityImage { } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct EntityText { +pub struct EntityWord { pub id: u32, pub image_id: u32, - pub text: String, + pub content: String, pub left: u32, pub top: u32, pub width: u32, pub height: u32, } -impl EntityText { +impl EntityWord { pub fn new( id: u32, image_id: u32, @@ -51,7 +51,7 @@ impl EntityText { Self { id, image_id, - text, + content: text, left, top, width, @@ -60,7 +60,7 @@ impl EntityText { } } -impl TryFrom<&crate::ocr::RecognizeItem> for EntityText { +impl TryFrom<&crate::ocr::RecognizeItem> for EntityWord { type Error = anyhow::Error; fn try_from(value: &crate::ocr::RecognizeItem) -> anyhow::Result { @@ -81,8 +81,8 @@ impl TryFrom<&crate::ocr::RecognizeItem> for EntityText { pub trait Repository { async fn save_image(&self, entity: &EntityImage) -> anyhow::Result; async fn get_image_by_id(&self, id: u32) -> anyhow::Result; - async fn save_text(&self, entity: &EntityText) -> anyhow::Result; - async fn save_texts(&self, entities: &[EntityText]) -> anyhow::Result>; - async fn get_text_by_id(&self, id: u32) -> anyhow::Result; - async fn full_text_search(&self, text: &str) -> anyhow::Result>; + async fn save_word(&self, entity: &EntityWord) -> anyhow::Result; + async fn save_words(&self, entities: &[EntityWord]) -> anyhow::Result>; + async fn get_word_by_id(&self, id: u32) -> anyhow::Result; + async fn full_text_search(&self, text: &str) -> anyhow::Result>; } diff --git a/src/repository/sqlite.rs b/src/repository/sqlite.rs index fcdad15..063b471 100644 --- a/src/repository/sqlite.rs +++ b/src/repository/sqlite.rs @@ -1,4 +1,4 @@ -use super::{EntityImage, EntityText, Repository}; +use super::{EntityImage, EntityWord, Repository}; use anyhow::Result; use async_trait::async_trait; use futures::TryStreamExt; @@ -26,10 +26,10 @@ impl SqliteRepository { .await?; sqlx::query( - "CREATE TABLE IF NOT EXISTS texts ( + "CREATE TABLE IF NOT EXISTS word ( id INTEGER PRIMARY KEY, image_id INTEGER NOT NULL, - text TEXT NOT NULL, + content TEXT NOT NULL, left INTEGER NOT NULL, top INTEGER NOT NULL, width INTEGER NOT NULL, @@ -69,8 +69,10 @@ impl Repository for SqliteRepository { } async fn get_image_by_id(&self, id: u32) -> Result { - let query = - sqlx::query("SELECT archive_type, archive_info, captured_at_epoch FROM images WHERE id = ?").bind(id); + let query = sqlx::query( + "SELECT archive_type, archive_info, captured_at_epoch FROM images WHERE id = ?", + ) + .bind(id); let row = query.fetch_one(&self.pool).await?; let archive_type: String = row.get(0); let archive_info: String = row.get(1); @@ -83,13 +85,13 @@ impl Repository for SqliteRepository { }) } - async fn save_text(&self, entity: &EntityText) -> Result { + async fn save_word(&self, entity: &EntityWord) -> Result { let query = sqlx::query( - "INSERT INTO texts (image_id, text, left, top, width, height) VALUES (?, ?, ?, ?, ?, ?)", + "INSERT INTO word (image_id, content, left, top, width, height) VALUES (?, ?, ?, ?, ?, ?)", ); let query_result = query .bind(entity.image_id) - .bind(&entity.text) + .bind(&entity.content) .bind(entity.left) .bind(entity.top) .bind(entity.width) @@ -100,14 +102,14 @@ impl Repository for SqliteRepository { // insert into table text_fts let query = sqlx::query("INSERT INTO text_fts (text, text_id) VALUES (?, ?)"); query - .bind(&entity.text) + .bind(&entity.content) .bind(id) .execute(&self.pool) .await?; - Ok(EntityText { + Ok(EntityWord { id, image_id: entity.image_id, - text: entity.text.clone(), + content: entity.content.clone(), left: entity.left, top: entity.top, width: entity.width, @@ -115,14 +117,15 @@ impl Repository for SqliteRepository { }) } - async fn save_texts(&self, entities: &[EntityText]) -> Result> { - let mut builder = - sqlx::QueryBuilder::new("INSERT INTO texts (image_id, text, left, top, width, height)"); - builder.push_values(entities, |mut b, it| { + async fn save_words(&self, words: &[EntityWord]) -> Result> { + let mut builder = sqlx::QueryBuilder::new( + "INSERT INTO word (image_id, content, left, top, width, height)", + ); + builder.push_values(words, |mut b, it| { b.push(it.image_id) // TODO: sqlx just concat the SQL string without quoting, so we have to do it manually. // TODO: and it's not safe at all. - .push(format!("'{}'", it.text.clone().replace('\'', "''"))) + .push(format!("'{}'", it.content.clone().replace('\'', "''"))) .push(it.left) .push(it.top) .push(it.width) @@ -135,15 +138,15 @@ impl Repository for SqliteRepository { let id_start = 1 + last_insert_rowid as u32 - rows_affected as u32; - let result = entities + let result = words .iter() .enumerate() - .map(|(i, it)| EntityText { + .map(|(i, it)| EntityWord { id: id_start + i as u32, image_id: it.image_id, // TODO: sqlx just concat the SQL string without quoting, so we have to do it manually. // TODO: and it's not safe at all. - text: (format!("'{}'", it.text.clone().replace('\'', "''"))), + content: (format!("'{}'", it.content.clone().replace('\'', "''"))), left: it.left, top: it.top, width: it.width, @@ -153,8 +156,8 @@ impl Repository for SqliteRepository { let mut builder = sqlx::QueryBuilder::new("INSERT INTO text_fts (text, text_id)"); - builder.push_values(&result, |mut b, it: &EntityText| { - b.push(it.text.clone()).push(it.id); + builder.push_values(&result, |mut b, it: &EntityWord| { + b.push(it.content.clone()).push(it.id); }); let query = builder.build(); query.execute(&self.pool).await?; @@ -162,21 +165,22 @@ impl Repository for SqliteRepository { Ok(result) } - async fn get_text_by_id(&self, id: u32) -> Result { - let query = - sqlx::query("SELECT image_id, text, left, top, width, height FROM texts WHERE id = ?") - .bind(id); + async fn get_word_by_id(&self, id: u32) -> Result { + let query = sqlx::query( + "SELECT image_id, content, left, top, width, height FROM word WHERE id = ?", + ) + .bind(id); let row = query.fetch_one(&self.pool).await?; let image_id: u32 = row.get(0); - let text: String = row.get(1); + let content: String = row.get(1); let left: u32 = row.get(2); let top: u32 = row.get(3); let width: u32 = row.get(4); let height: u32 = row.get(5); - Ok(EntityText { + Ok(EntityWord { id, image_id, - text, + content: content, left, top, width, @@ -184,13 +188,13 @@ impl Repository for SqliteRepository { }) } - async fn full_text_search(&self, text: &str) -> Result> { - let query = sqlx::query("SELECT text_id FROM text_fts WHERE text_fts MATCH ?1").bind(text); + async fn full_text_search(&self, text: &str) -> Result> { + let query = sqlx::query("SELECT text_id FROM text_fts WHERE text_fts MATCH ?").bind(text); let mut rows = query.fetch(&self.pool); let mut result = vec![]; while let Some(row) = rows.try_next().await? { let text_id: u32 = row.get(0); - let entity = self.get_text_by_id(text_id).await?; + let entity = self.get_word_by_id(text_id).await?; result.push(entity); } Ok(result)