Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kernel API OCR returns text coordinate information #11738

Merged
merged 4 commits into from
Jun 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions app/src/menus/protyle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1019,7 +1019,9 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
fetchPost("/api/asset/getImageOCRText", {
path: imgElement.getAttribute("src")
}, (response) => {
element.querySelector("textarea").value = response.data.text;
const textarea =element.querySelector("textarea")
textarea.value = response.data.text;
textarea.dataset.ocrText = response.data.text;
});
}
}, {
Expand All @@ -1031,11 +1033,6 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
fetchPost("/api/asset/ocr", {
path: imgElement.getAttribute("src"),
force: true
}, (response) => {
fetchPost("/api/asset/setImageOCRText", {
path: imgElement.getAttribute("src"),
text: response.data.text
});
});
}
}],
Expand Down Expand Up @@ -1119,6 +1116,13 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme
const textElements = window.siyuan.menus.menu.element.querySelectorAll("textarea");
textElements[0].focus();
window.siyuan.menus.menu.removeCB = () => {
const ocrElement = window.siyuan.menus.menu.element.querySelector('[data-type="ocr"]') as HTMLTextAreaElement;
if (ocrElement && ocrElement.dataset.ocrText !== ocrElement.value) {
fetchPost("/api/asset/setImageOCRText", {
path: imgElement.getAttribute("src"),
text: ocrElement.value
});
}
imgElement.setAttribute("alt", textElements[2].value.replace(/\n|\r\n|\r|\u2028|\u2029/g, ""));
nodeElement.setAttribute("updated", dayjs().format("YYYYMMDDHHmmss"));
updateTransaction(protyle, id, nodeElement.outerHTML, html);
Expand Down
8 changes: 3 additions & 5 deletions kernel/api/asset.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,11 @@ func ocr(c *gin.Context) {
}

path := arg["path"].(string)
force := false
if forceArg := arg["force"]; nil != forceArg {
force = forceArg.(bool)
}

ocrJSON := util.OcrAsset(path)
ret.Data = map[string]interface{}{
"text": util.OcrAsset(path, force),
"text": util.GetOcrJsonText(ocrJSON),
"ocrJSON": ocrJSON,
}
}

Expand Down
2 changes: 1 addition & 1 deletion kernel/model/ocr.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func autoOCRAssets() {
assets := getUnOCRAssetsAbsPaths()
if 0 < len(assets) {
for i, assetAbsPath := range assets {
text := util.Tesseract(assetAbsPath)
text := util.GetOcrJsonText(util.Tesseract(assetAbsPath))
p := strings.TrimPrefix(assetAbsPath, assetsPath)
p = "assets" + filepath.ToSlash(p)
util.SetAssetText(p, text)
Expand Down
2 changes: 1 addition & 1 deletion kernel/sql/block.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ func nodeStaticContent(node *ast.Node, excludeTypes []string, includeTextMarkATi
var linkDestStr, ocrText string
if nil != linkDest {
linkDestStr = linkDest.TokensStr()
ocrText = util.OcrAsset(linkDestStr, false)
ocrText = util.GetAssetText(linkDestStr)
}

linkText := n.ChildByType(ast.NodeLinkText)
Expand Down
71 changes: 50 additions & 21 deletions kernel/util/ocr.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,22 +149,16 @@ func ExistsAssetText(asset string) (ret bool) {
return
}

func OcrAsset(asset string, force bool) (ret string) {
if !force {
assetsTextsLock.Lock()
ret = assetsTexts[asset]
assetsTextsLock.Unlock()
return
}

func OcrAsset(asset string) (ret []map[string]interface{}) {
assetsPath := GetDataAssetsAbsPath()
assetAbsPath := strings.TrimPrefix(asset, "assets")
assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
ret = Tesseract(assetAbsPath)
assetsTextsLock.Lock()
assetsTexts[asset] = ret
ocrText := GetOcrJsonText(ret)
assetsTexts[asset] = ocrText
assetsTextsLock.Unlock()
if "" != ret {
if "" != ocrText {
assetsTextsChanged.Store(true)
}
return
Expand All @@ -184,51 +178,86 @@ func IsTesseractExtractable(p string) bool {
// tesseractOCRLock serializes Tesseract OCR runs behind a lock to improve stability https://github.com/siyuan-note/siyuan/issues/7265
var tesseractOCRLock = sync.Mutex{}

func Tesseract(imgAbsPath string) string {
func Tesseract(imgAbsPath string) (ret []map[string]interface{}) {
if ContainerStd != Container || !TesseractEnabled {
return ""
return
}

defer logging.Recover()
tesseractOCRLock.Lock()
defer tesseractOCRLock.Unlock()

if !IsTesseractExtractable(imgAbsPath) {
return ""
return
}

info, err := os.Stat(imgAbsPath)
if nil != err {
return ""
return
}

if TesseractMaxSize < uint64(info.Size()) {
return ""
return
}

defer logging.Recover()

ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
defer cancel()

cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"))
cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"), "tsv")
gulu.CmdAttr(cmd)
output, err := cmd.CombinedOutput()
if ctx.Err() == context.DeadlineExceeded {
logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
return ""
return
}

if nil != err {
logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
return ""
return
}

ret := string(output)
tsv := string(output)

// 按行分割 TSV 数据
lines := strings.Split(tsv, "\r\n")

// 解析 TSV 数据 跳过标题行,从第二行开始处理
for _, line := range lines[1:] {
if line == "" {
continue // 跳过空行
}
// 分割每列数据
fields := strings.Split(line, "\t")
// 将字段名和字段值映射到一个 map 中
dataMap := make(map[string]interface{})
for i, header := range strings.Split(lines[0], "\t") {
dataMap[header] = fields[i]
}
ret = append(ret, dataMap)
}

tsv = gulu.Str.RemoveInvisible(tsv)
tsv = RemoveRedundantSpace(tsv)
msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(GetOcrJsonText(ret)))
PushStatusBar(msg)
return
}

// GetOcrJsonText extracts and joins the text fields of all entries.
func GetOcrJsonText(jsonData []map[string]interface{}) (ret string) {
for _, dataMap := range jsonData {
// 检查 text 字段是否存在
if text, ok := dataMap["text"]; ok {
// 确保 text 是字符串类型
if textStr, ok := text.(string); ok {
ret += " " + textStr
}
}
}
ret = gulu.Str.RemoveInvisible(ret)
ret = RemoveRedundantSpace(ret)
msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(ret))
PushStatusBar(msg)
return ret
}

Expand Down
Loading