Skip to content

Commit

Permalink
Kernel API OCR returns text coordinate information (#11738)
Browse files Browse the repository at this point in the history
* 优化setImageOCRText接口调用

* 扩展 ocr 接口,添加 ocrJSON 返回字段

* 过滤不可见字符

* 返回的ocr文本添加空格
  • Loading branch information
2234839 authored Jun 16, 2024
1 parent 974f1c1 commit c0bd645
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 34 deletions.
16 changes: 10 additions & 6 deletions app/src/menus/protyle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1019,7 +1019,9 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
fetchPost("/api/asset/getImageOCRText", {
path: imgElement.getAttribute("src")
}, (response) => {
element.querySelector("textarea").value = response.data.text;
const textarea =element.querySelector("textarea")
textarea.value = response.data.text;
textarea.dataset.ocrText = response.data.text;
});
}
}, {
Expand All @@ -1031,11 +1033,6 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
fetchPost("/api/asset/ocr", {
path: imgElement.getAttribute("src"),
force: true
}, (response) => {
fetchPost("/api/asset/setImageOCRText", {
path: imgElement.getAttribute("src"),
text: response.data.text
});
});
}
}],
Expand Down Expand Up @@ -1119,6 +1116,13 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
const textElements = window.siyuan.menus.menu.element.querySelectorAll("textarea");
textElements[0].focus();
window.siyuan.menus.menu.removeCB = () => {
const ocrElement = window.siyuan.menus.menu.element.querySelector('[data-type="ocr"]') as HTMLTextAreaElement;
if (ocrElement && ocrElement.dataset.ocrText !== ocrElement.value) {
fetchPost("/api/asset/setImageOCRText", {
path: imgElement.getAttribute("src"),
text: ocrElement.value
});
}
imgElement.setAttribute("alt", textElements[2].value.replace(/\n|\r\n|\r|\u2028|\u2029/g, ""));
nodeElement.setAttribute("updated", dayjs().format("YYYYMMDDHHmmss"));
updateTransaction(protyle, id, nodeElement.outerHTML, html);
Expand Down
8 changes: 3 additions & 5 deletions kernel/api/asset.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,11 @@ func ocr(c *gin.Context) {
}

path := arg["path"].(string)
force := false
if forceArg := arg["force"]; nil != forceArg {
force = forceArg.(bool)
}

ocrJSON := util.OcrAsset(path)
ret.Data = map[string]interface{}{
"text": util.OcrAsset(path, force),
"text": util.GetOcrJsonText(ocrJSON),
"ocrJSON": ocrJSON,
}
}

Expand Down
2 changes: 1 addition & 1 deletion kernel/model/ocr.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func autoOCRAssets() {
assets := getUnOCRAssetsAbsPaths()
if 0 < len(assets) {
for i, assetAbsPath := range assets {
text := util.Tesseract(assetAbsPath)
text := util.GetOcrJsonText(util.Tesseract(assetAbsPath))
p := strings.TrimPrefix(assetAbsPath, assetsPath)
p = "assets" + filepath.ToSlash(p)
util.SetAssetText(p, text)
Expand Down
2 changes: 1 addition & 1 deletion kernel/sql/block.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ func nodeStaticContent(node *ast.Node, excludeTypes []string, includeTextMarkATi
var linkDestStr, ocrText string
if nil != linkDest {
linkDestStr = linkDest.TokensStr()
ocrText = util.OcrAsset(linkDestStr, false)
ocrText = util.GetAssetText(linkDestStr)
}

linkText := n.ChildByType(ast.NodeLinkText)
Expand Down
71 changes: 50 additions & 21 deletions kernel/util/ocr.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,22 +149,16 @@ func ExistsAssetText(asset string) (ret bool) {
return
}

func OcrAsset(asset string, force bool) (ret string) {
if !force {
assetsTextsLock.Lock()
ret = assetsTexts[asset]
assetsTextsLock.Unlock()
return
}

func OcrAsset(asset string) (ret []map[string]interface{}) {
assetsPath := GetDataAssetsAbsPath()
assetAbsPath := strings.TrimPrefix(asset, "assets")
assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
ret = Tesseract(assetAbsPath)
assetsTextsLock.Lock()
assetsTexts[asset] = ret
ocrText := GetOcrJsonText(ret)
assetsTexts[asset] = ocrText
assetsTextsLock.Unlock()
if "" != ret {
if "" != ocrText {
assetsTextsChanged.Store(true)
}
return
Expand All @@ -184,51 +178,86 @@ func IsTesseractExtractable(p string) bool {
// tesseractOCRLock serializes Tesseract OCR runs to improve stability https://github.com/siyuan-note/siyuan/issues/7265
var tesseractOCRLock = sync.Mutex{}

func Tesseract(imgAbsPath string) string {
func Tesseract(imgAbsPath string) (ret []map[string]interface{}) {
if ContainerStd != Container || !TesseractEnabled {
return ""
return
}

defer logging.Recover()
tesseractOCRLock.Lock()
defer tesseractOCRLock.Unlock()

if !IsTesseractExtractable(imgAbsPath) {
return ""
return
}

info, err := os.Stat(imgAbsPath)
if nil != err {
return ""
return
}

if TesseractMaxSize < uint64(info.Size()) {
return ""
return
}

defer logging.Recover()

ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
defer cancel()

cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"))
cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"), "tsv")
gulu.CmdAttr(cmd)
output, err := cmd.CombinedOutput()
if ctx.Err() == context.DeadlineExceeded {
logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
return ""
return
}

if nil != err {
logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
return ""
return
}

ret := string(output)
tsv := string(output)

// 按行分割 TSV 数据
lines := strings.Split(tsv, "\r\n")

// 解析 TSV 数据 跳过标题行,从第二行开始处理
for _, line := range lines[1:] {
if line == "" {
continue // 跳过空行
}
// 分割每列数据
fields := strings.Split(line, "\t")
// 将字段名和字段值映射到一个 map 中
dataMap := make(map[string]interface{})
for i, header := range strings.Split(lines[0], "\t") {
dataMap[header] = fields[i]
}
ret = append(ret, dataMap)
}

tsv = gulu.Str.RemoveInvisible(tsv)
tsv = RemoveRedundantSpace(tsv)
msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(GetOcrJsonText(ret)))
PushStatusBar(msg)
return
}

// GetOcrJsonText extracts the "text" value of every row in jsonData and joins
// them into a single string, each value preceded by a space. Rows without a
// "text" key, or whose "text" value is not a string, are ignored. The joined
// string is then passed through gulu.Str.RemoveInvisible and
// RemoveRedundantSpace before being returned.
func GetOcrJsonText(jsonData []map[string]interface{}) (ret string) {
	var builder strings.Builder
	for _, dataMap := range jsonData {
		text, ok := dataMap["text"]
		if !ok {
			continue
		}

		textStr, ok := text.(string)
		if !ok {
			continue
		}

		builder.WriteString(" ")
		builder.WriteString(textStr)
	}

	ret = gulu.Str.RemoveInvisible(builder.String())
	ret = RemoveRedundantSpace(ret)
	return ret
}

Expand Down

0 comments on commit c0bd645

Please sign in to comment.