From 142b4d1de1deb35ef323c706bbcda402039ed6d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=AE=E7=94=9F?= Date: Sun, 16 Jun 2024 22:00:19 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E4=BC=98=E5=8C=96setImageOCRText=E6=8E=A5?= =?UTF-8?q?=E5=8F=A3=E8=B0=83=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/src/menus/protyle.ts | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/app/src/menus/protyle.ts b/app/src/menus/protyle.ts index 1e59bb1192d..cc174de01b9 100644 --- a/app/src/menus/protyle.ts +++ b/app/src/menus/protyle.ts @@ -1019,7 +1019,9 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme fetchPost("/api/asset/getImageOCRText", { path: imgElement.getAttribute("src") }, (response) => { - element.querySelector("textarea").value = response.data.text; + const textarea =element.querySelector("textarea") + textarea.value = response.data.text; + textarea.dataset.ocrText = response.data.text; }); } }, { @@ -1031,11 +1033,6 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme fetchPost("/api/asset/ocr", { path: imgElement.getAttribute("src"), force: true - }, (response) => { - fetchPost("/api/asset/setImageOCRText", { - path: imgElement.getAttribute("src"), - text: response.data.text - }); }); } }], @@ -1119,6 +1116,13 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme const textElements = window.siyuan.menus.menu.element.querySelectorAll("textarea"); textElements[0].focus(); window.siyuan.menus.menu.removeCB = () => { + const ocrElement = window.siyuan.menus.menu.element.querySelector('[data-type="ocr"]') as HTMLTextAreaElement; + if (ocrElement && ocrElement.dataset.ocrText !== ocrElement.value) { + fetchPost("/api/asset/setImageOCRText", { + path: imgElement.getAttribute("src"), + text: ocrElement.value + }); + } imgElement.setAttribute("alt", textElements[2].value.replace(/\n|\r\n|\r|\u2028|\u2029/g, "")); nodeElement.setAttribute("updated", dayjs().format("YYYYMMDDHHmmss")); updateTransaction(protyle, id, nodeElement.outerHTML, html); From 6a5490e27279b72742144136feafeb13b4f98338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=AE=E7=94=9F?= Date: Sun, 16 Jun 2024 22:34:56 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E6=89=A9=E5=B1=95=20ocr=20=E6=8E=A5?= =?UTF-8?q?=E5=8F=A3=EF=BC=8C=E6=B7=BB=E5=8A=A0=20ocrJSON=20=E8=BF=94?= =?UTF-8?q?=E5=9B=9E=E5=AD=97=E6=AE=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/api/asset.go | 8 ++--- kernel/model/ocr.go | 2 +- kernel/sql/block.go | 2 +- kernel/util/ocr.go | 73 +++++++++++++++++++++++++++++++-------------- 4 files changed, 55 insertions(+), 30 deletions(-) diff --git a/kernel/api/asset.go b/kernel/api/asset.go index 2b3ad722328..08b11f5e143 100644 --- a/kernel/api/asset.go +++ b/kernel/api/asset.go @@ -137,13 +137,11 @@ func ocr(c *gin.Context) { } path := arg["path"].(string) - force := false - if forceArg := arg["force"]; nil != forceArg { - force = forceArg.(bool) - } + ocrJSON := util.OcrAsset(path) ret.Data = map[string]interface{}{ - "text": util.OcrAsset(path, force), + "text": util.GetOcrJsonText(ocrJSON), + "ocrJSON": ocrJSON, } } diff --git a/kernel/model/ocr.go b/kernel/model/ocr.go index 61e9c92edaa..638c5ca6b2c 100644 --- a/kernel/model/ocr.go +++ b/kernel/model/ocr.go @@ -33,7 +33,7 @@ func autoOCRAssets() { assets := getUnOCRAssetsAbsPaths() if 0 < len(assets) { for i, assetAbsPath := range assets { - text := util.Tesseract(assetAbsPath) + text := util.GetOcrJsonText(util.Tesseract(assetAbsPath)) p := strings.TrimPrefix(assetAbsPath, assetsPath) p = "assets" + filepath.ToSlash(p) util.SetAssetText(p, text) diff --git a/kernel/sql/block.go b/kernel/sql/block.go index f6efa01529a..9266d82f073 100644 --- a/kernel/sql/block.go +++ b/kernel/sql/block.go @@ -198,7 +198,7 @@ func nodeStaticContent(node *ast.Node, excludeTypes []string, includeTextMarkATi var linkDestStr, ocrText string if nil != linkDest { linkDestStr = linkDest.TokensStr() - ocrText = util.OcrAsset(linkDestStr, false) + ocrText = util.GetAssetText(linkDestStr) } linkText := n.ChildByType(ast.NodeLinkText) diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go index 3f2e8c3b6bc..b049da647d8 100644 --- a/kernel/util/ocr.go +++ b/kernel/util/ocr.go @@ -149,22 +149,16 @@ func ExistsAssetText(asset string) (ret bool) { return } -func OcrAsset(asset string, force bool) (ret string) { - if !force { - assetsTextsLock.Lock() - ret = assetsTexts[asset] - assetsTextsLock.Unlock() - return - } - +func OcrAsset(asset string) (ret []map[string]interface{}) { assetsPath := GetDataAssetsAbsPath() assetAbsPath := strings.TrimPrefix(asset, "assets") assetAbsPath = filepath.Join(assetsPath, assetAbsPath) ret = Tesseract(assetAbsPath) assetsTextsLock.Lock() - assetsTexts[asset] = ret + ocrText := GetOcrJsonText(ret) + assetsTexts[asset] = ocrText assetsTextsLock.Unlock() - if "" != ret { + if "" != ocrText { assetsTextsChanged.Store(true) } return @@ -184,9 +178,9 @@ func IsTesseractExtractable(p string) bool { // tesseractOCRLock 用于 Tesseract OCR 加锁串行执行提升稳定性 https://github.com/siyuan-note/siyuan/issues/7265 var tesseractOCRLock = sync.Mutex{} -func Tesseract(imgAbsPath string) string { +func Tesseract(imgAbsPath string) (ret []map[string]interface{}) { if ContainerStd != Container || !TesseractEnabled { - return "" + return } defer logging.Recover() @@ -194,16 +188,16 @@ func Tesseract(imgAbsPath string) string { defer tesseractOCRLock.Unlock() if !IsTesseractExtractable(imgAbsPath) { - return "" + return } info, err := os.Stat(imgAbsPath) if nil != err { - return "" + return } if TesseractMaxSize < uint64(info.Size()) { - return "" + return } defer logging.Recover() @@ -211,25 +205,58 @@ func Tesseract(imgAbsPath string) string { ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second) defer cancel() - cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+")) + cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"), "tsv") gulu.CmdAttr(cmd) output, err := cmd.CombinedOutput() if ctx.Err() == context.DeadlineExceeded { logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size()) - return "" + return } if nil != err { logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err) - return "" + return + } + + tsv := string(output) + + // 按行分割 TSV 数据 + lines := strings.Split(tsv, "\r\n") + + // 解析 TSV 数据 跳过标题行,从第二行开始处理 + for _, line := range lines[1:] { + if line == "" { + continue // 跳过空行 + } + // 分割每列数据 + fields := strings.Split(line, "\t") + // 将字段名和字段值映射到一个 map 中 + dataMap := make(map[string]interface{}) + for i, header := range strings.Split(lines[0], "\t") { + dataMap[header] = fields[i] + } + ret = append(ret, dataMap) } - ret := string(output) - ret = gulu.Str.RemoveInvisible(ret) - ret = RemoveRedundantSpace(ret) - msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(ret)) + tsv = gulu.Str.RemoveInvisible(tsv) + tsv = RemoveRedundantSpace(tsv) + msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(GetOcrJsonText(ret))) PushStatusBar(msg) - return ret + return +} + +// 提取并连接所有 text 字段的函数 +func GetOcrJsonText(jsonData []map[string]interface{}) (textString string) { + for _, dataMap := range jsonData { + // 检查 text 字段是否存在 + if text, ok := dataMap["text"]; ok { + // 确保 text 是字符串类型 + if textStr, ok := text.(string); ok { + textString += textStr + } + } + } + return textString } var tesseractInited = atomic.Bool{} From 3d8057d5e50f70b0833eba98ce9455de45fd421c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=AE=E7=94=9F?= Date: Sun, 16 Jun 2024 22:38:32 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E8=BF=87=E6=BB=A4=E4=B8=8D=E5=8F=AF?= =?UTF-8?q?=E8=A7=81=E5=AD=97=E7=AC=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/util/ocr.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go index b049da647d8..6ca06f12527 100644 --- a/kernel/util/ocr.go +++ b/kernel/util/ocr.go @@ -246,17 +246,19 @@ func Tesseract(imgAbsPath string) (ret []map[string]interface{}) { } // 提取并连接所有 text 字段的函数 -func GetOcrJsonText(jsonData []map[string]interface{}) (textString string) { +func GetOcrJsonText(jsonData []map[string]interface{}) (ret string) { for _, dataMap := range jsonData { // 检查 text 字段是否存在 if text, ok := dataMap["text"]; ok { // 确保 text 是字符串类型 if textStr, ok := text.(string); ok { - textString += textStr + ret += textStr } } } - return textString + ret = gulu.Str.RemoveInvisible(ret) + ret = RemoveRedundantSpace(ret) + return ret } var tesseractInited = atomic.Bool{} From bfcd622521a074255276a452db8d08af29708efd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=AE=E7=94=9F?= Date: Sun, 16 Jun 2024 22:43:03 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=E8=BF=94=E5=9B=9E=E7=9A=84ocr=E6=96=87?= =?UTF-8?q?=E6=9C=AC=E6=B7=BB=E5=8A=A0=E7=A9=BA=E6=A0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/util/ocr.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go index 6ca06f12527..2edb9bb3f4f 100644 --- a/kernel/util/ocr.go +++ b/kernel/util/ocr.go @@ -252,7 +252,7 @@ func GetOcrJsonText(jsonData []map[string]interface{}) (ret string) { if text, ok := dataMap["text"]; ok { // 确保 text 是字符串类型 if textStr, ok := text.(string); ok { - ret += textStr + ret += " " + textStr } } }