Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/keyword_suite/video_text_test.robot
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ Test Keyword Find Text
${length}= Get Length ${matched_text}
Should Be True ${length} > 1
Should Be Equal As Strings ${matched_text[0]['text']} AB123cd
Should Be Equal As Numbers ${matched_text[0]['confidence']} 100.0
Should Be Equal As Numbers ${matched_text[0]['similarity']} 100.0

${matched_text}= Find Text regex:${REGEX} region=${REGEX_REGION}
${count}= Set Variable 0
Expand Down
64 changes: 46 additions & 18 deletions yarf/rf_libraries/libraries/ocr/rapidocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import rapidfuzz
from PIL import Image
from rapidocr import RapidOCR
from robot.api import logger

from yarf.rf_libraries.libraries.geometry.quad import Quad
from yarf.vendor.RPA.core.geometry import Region
Expand All @@ -24,7 +25,7 @@ class OCRResult:
Attributes:
position: Quadrilateral region of the match.
text: Text found in the match.
confidence: Confidence of the match
confidence: Estimated probability that the recognized text is correct.
"""

position: Quad
Expand All @@ -44,12 +45,18 @@ class RapidOCRReader:
This class is a singleton to avoid loading the model multiple times.

Attributes:
DEFAULT_CONFIDENCE: Default confidence for text detection.
DEFAULT_COINCIDENCE: Default coincidence for text similarities.
DEFAULT_SIMILARITY_THRESHOLD: Minimum similarity percentage (0-100) for
text matching. If the similarity between the found text and the target
text is below this threshold, the match is discarded.
DEFAULT_CONFIDENCE_THRESHOLD: Minimum confidence percentage (0-100) for
text matching. If the confidence of the found text is below this
threshold, the match is discarded.
SIMILARITY_LOG_THRESHOLD: Minimum similarity to log rejected matches.
"""

DEFAULT_CONFIDENCE: float = 0.7
DEFAULT_COINCIDENCE: float = 80.0
DEFAULT_SIMILARITY_THRESHOLD: float = 80.0
DEFAULT_CONFIDENCE_THRESHOLD: float = 70.0
SIMILARITY_LOG_THRESHOLD: float = 80.0

def __new__(cls) -> "RapidOCRReader":
if not hasattr(cls, "instance"):
Expand Down Expand Up @@ -82,8 +89,8 @@ def find(
self,
image: Image.Image | Path,
text: str,
confidence: float = DEFAULT_CONFIDENCE,
coincidence: float = DEFAULT_COINCIDENCE,
similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
region: Region | None = None,
partial: bool = True,
) -> list[dict]:
Expand All @@ -94,8 +101,12 @@ def find(
Args:
image: Path to image or Image object.
text: Text to find in image.
confidence: Minimum confidence for text detection.
coincidence: Minimum coincidence for text similarities.
similarity_threshold: Minimum similarity percentage (0-100) for
text matching. If the similarity between the found text and the
target text is below this threshold, the match is discarded.
confidence_threshold: Minimum confidence percentage (0-100) for
text matching. If the confidence of the found text is below this
threshold, the match is discarded.
region: Limit the region of the screen where to look.
partial: Use partial matching.

Expand Down Expand Up @@ -123,9 +134,12 @@ def find(
ocr_output.boxes, ocr_output.txts, ocr_output.scores
)
]
# Multiply the item confidence with 100 to convert it to percentage
for item in result:
item.confidence *= 100

matches = self.get_matches(
result, text, confidence, coincidence, partial
result, text, similarity_threshold, confidence_threshold, partial
)

if region is not None:
Expand All @@ -138,8 +152,8 @@ def get_matches(
self,
result: list[OCRResult],
match_text: str,
confidence: float,
coincidence: float,
similarity_threshold: float,
confidence_threshold: float,
partial: bool,
) -> list[dict]:
"""
Expand All @@ -148,8 +162,12 @@ def get_matches(
Args:
result: List with the OCR results.
match_text: Text to match.
confidence: Minimum confidence for text detection.
coincidence: Minimum coincidence for text similarities.
similarity_threshold: Minimum similarity percentage (0-100) for
text matching. If the similarity between the found text and the
target text is below this threshold, the match is discarded.
confidence_threshold: Minimum confidence percentage (0-100) for
text matching. If the confidence of the found text is below this
threshold, the match is discarded.
partial: Use partial matching.

Returns:
Expand Down Expand Up @@ -187,17 +205,27 @@ def directional_ratio(q: str, text: str) -> float:

matches = []
for item in result:
ratio = (
similarity = (
directional_ratio(match_text, item.text)
if partial
else rapidfuzz.fuzz.ratio(item.text, match_text)
)
if ratio >= coincidence and item.confidence >= confidence:
if (
similarity >= similarity_threshold
and item.confidence >= confidence_threshold
):
matches.append(
{
"text": item.text,
"region": item.position.to_region(),
"confidence": ratio, # Using the ratio like tesseract
"similarity": similarity,
"confidence": item.confidence,
}
)
return sorted(matches, key=lambda x: x["confidence"], reverse=True)
elif similarity >= self.SIMILARITY_LOG_THRESHOLD:
logger.debug(
f"Rejected match for text '{match_text}' "
f"with similarity {similarity} "
f"and confidence {item.confidence}: '{item.text}'"
)
Comment on lines +225 to +230
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you have a similarity threshold, why not also have a confidence threshold?
Also, why is the similarity log threshold the same as the similarity threshold?
I think it would be more useful to log the cases in which both confidence and similarity are close to the threshold values:

Suggested change
elif similarity >= self.SIMILARITY_LOG_THRESHOLD:
logger.debug(
f"Rejected match for text '{match_text}' "
f"with similarity {similarity} "
f"and confidence {item.confidence}: '{item.text}'"
)
elif (
similarity >= self.SIMILARITY_LOG_THRESHOLD
and item.confidence >= self.CONFIDENCE_LOG_THRESHOLD
):
logger.debug(
f"Rejected match for text '{match_text}' "
f"with similarity {similarity} "
f"and confidence {item.confidence}: '{item.text}'"
)

I'd suggest:

  • SIMILARITY_LOG_THRESHOLD = 70.0
  • CONFIDENCE_LOG_THRESHOLD = 60.0

Another option would be having just a "LOG_THRESHOLD" and compare:

            elif (
                similarity >= similarity_threshold - self.LOG_THRESHOLD
                and item.confidence >= confidence_threshold - self.LOG_THRESHOLD
            ):

Copy link
Contributor Author

@adombeck adombeck Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you have a similarity threshold, why not also have a confidence threshold?

I don't think that would be useful. The confidence score measures how confident the OCR engine is that the string it returned is the actual string that's on the image. Let's say we want to find the string "foo" in an image, but the image only contains "bar". The OCR engine returns {"text": "bar", "confidence": "99"}, i.e. it's confident that the image really contains "bar". We calculate the similarity to "foo" which results in a similarity score of 10. Logging the message Rejected match for text 'foo' with similarity 10 and confidence 99: 'bar' is not very helpful. Does that make sense?

Also, why is the similarity log threshold the same as the similarity threshold?

Because I only found it useful when setting a higher threshold (which is made possible by #100). The log message is only useful to spot false negatives (i.e. a match was rejected even though it should have been accepted). We found the default threshold to be so low that it frequently causes false positive matches, so I doubt that there will be a lot of false negatives with an even lower similarity score.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that would be useful

But in that case you won't log anything. Both have to be close to the threshold to log anything.
Imagine your text is blurry, but the OCR still manages to get your word right. You try to look for foo, and you get: {"text": "foo", "confidence": "65"}. The OCR was close to getting it, but the confidence was just below the threshold. I think that information could be useful.

If the OCR returns {"text": "bar", "confidence": "99"}, similarity will be 0 and it won't log anything

Because I only found it useful

I don't see the point in having it exactly the same. If you are looking for "fooo" and OCR gets "foo0", you may not want to match it, because the similarity is 75, but it's close enough to your threshold that you may want it to be logged.
If you have the exact same value, you are basically filtering for similarity to see when the confidence is too low but the similarity is high.

I think having a log threshold right below your real threshold will be useful to log only the cases that are close, so you can adjust your thresholds accordingly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But in that case you won't log anything. Both have to be close to the threshold to log anything. Imagine your text is blurry, but the OCR still manages to get your word right. You try to look for foo, and you get: {"text": "foo", "confidence": "65"}. The OCR was close to getting it, but the confidence was just below the threshold. I think that information could be useful.

You already get that message with my proposal though, because the similarity between "foo" and "foo" is 100, so it's above the similarity log threshold. In the diff you suggested, you just add an additional condition for logging the rejected matches, i.e. that the confidence is also high. That means when the text is blurry but the OCR engine still recognizes the text we're looking for, e.g. {"text": "foo", "confidence": "50"}, it will reject the match and won't even log about it.

I don't see the point in having it exactly the same. If you are looking for "fooo" and OCR gets "foo0", you may not want to match it, because the similarity is 75, but it's close enough to your threshold that you may want it to be logged. If you have the exact same value, you are basically filtering for similarity to see when the confidence is too low but the similarity is high.

That is a case we want to log IMO. In the case from above, when the engine is unsure that it recognized the exact text and returns {"text": "foo", "confidence": "50"}, we do want to log the rejected match even if the confidence score is very low.

I think having a log threshold right below your real threshold will be useful to log only the cases that are close, so you can adjust your thresholds accordingly.

Agreed, that's why in our tests we are using a similarity threshold of 92 and a log threshold of 80. My point is just that the default similarity threshold is already too low and causes too many false positives. I don't think we can safely change that, because it would break existing tests. So the proposed log threshold of 80 would mainly be useful when you set a higher similarity threshold (except for the case discussed above, where the match was rejected because of low confidence, in that case the log threshold of 80 would also be useful without setting a higher similarity threshold). Anyway, if we also make the log threshold configurable, which we should probably do anyway, I'm also fine with using a lower default there.

Copy link
Contributor

@fernando79513 fernando79513 Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You already get that message with my proposal though.

You are 100% right there. Sorry for my confusion. The condition I suggested was even more restrictive.

If you think we don't need a lower bound to log the confidence results, I think that's okay. The lower bound for outputting a result (text_score) is already set at 0.5 in the rapidocr config.yaml, so I don't think it's going to introduce too much noise.

I didn't get at first why we would want to have a logging mechanism that just skips logging when using default values (for the similarity threshold). If you are only planning on increasing it, it makes sense because you will log the results you would get with the default values.
Nevertheless, if you were going to reduce the similarity threshold, you wouldn't log anything; that's why I liked the "LOG_THRESHOLD" approach.

Maybe the threshold is already too permissive, and we don't have to bother about it...

I leave it up to you if you want to have an absolute threshold for logs or a relative one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are 100% right there. Sorry for my confusion.

No worries, I also found it challenging to understand the actual effect of the different approaches in practice and actually tried out the approaches to see the actual results.

If you are only planning on increasing it, it makes sense because you will log the results you would get with the default values.

Exactly, that was the idea.

Maybe the threshold is already too permissive, and we don't have to bother about it...

That's the case in my experience

return sorted(matches, key=lambda x: x["similarity"], reverse=True)
77 changes: 47 additions & 30 deletions yarf/rf_libraries/libraries/ocr/tests/test_rapidocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def mock_to_image():
@pytest.fixture(autouse=True)
def mock_reader():
with patch("yarf.rf_libraries.libraries.ocr.rapidocr.RapidOCR") as p:
p.SIMILARITY_LOG_THRESHOLD = RapidOCRReader.SIMILARITY_LOG_THRESHOLD
yield p


Expand All @@ -41,7 +42,7 @@ def test_read(self, mock_reader):
mock_reader.reader.return_value = MockRapidOCROutput(
boxes=np.array([[[0, 0], [0, 0], [0, 0], [0, 0]]]),
txts=("Hello", "World"),
scores=(0.9, 0.8),
scores=(90, 80),
)
result = RapidOCRReader.read(mock_reader, None)

Expand All @@ -55,18 +56,24 @@ def test_find(self, mock_reader):
mock_reader.reader.return_value = MockRapidOCROutput(
boxes=np.array([[[0, 0], [0, 0], [0, 0], [0, 0]]]),
txts=("Hello",),
scores=(0.9,),
scores=(90,),
)
mock_reader.get_matches.return_value = [
{"text": "Hello", "region": Region(0, 0, 1, 1), "confidence": 100}
{
"text": "Hello",
"region": Region(0, 0, 1, 1),
"confidence": 90,
"similarity": 100,
}
]
result = RapidOCRReader.find(mock_reader, None, "Hello")

assert result == [
{
"text": "Hello",
"region": Region(0, 0, 1, 1),
"confidence": 100,
"confidence": 90,
"similarity": 100,
}
]

Expand All @@ -87,10 +94,15 @@ def test_find_in_region(self, mock_to_image, mock_reader):
mock_reader.reader.return_value = MockRapidOCROutput(
boxes=np.array([[[0, 0], [1, 0], [1, 1], [0, 1]]]),
txts=("Hello World",),
scores=(0.9,),
scores=(90,),
)
mock_reader.get_matches.return_value = [
{"text": "Hello", "region": Region(0, 0, 1, 1), "confidence": 100}
{
"text": "Hello",
"region": Region(0, 0, 1, 1),
"confidence": 90,
"similarity": 100,
}
]
result = RapidOCRReader.find(
mock_reader, None, "Hello", region=Region(0, 0, 1, 1)
Expand All @@ -100,50 +112,53 @@ def test_find_in_region(self, mock_to_image, mock_reader):
{
"text": "Hello",
"region": Region(0, 0, 1, 1),
"confidence": 100,
"confidence": 90,
"similarity": 100,
}
]

def test_get_matches(self, mock_reader):
items = [
OCRResult(
Quad([[0, 0], [1, 0], [1, 1], [0, 1]]), "Hello World", 0.9
Quad([[0, 0], [1, 0], [1, 1], [0, 1]]), "Hello World", 90
),
]
result = RapidOCRReader.get_matches(
mock_reader, items, "Hello World", 0.8, 80, False
mock_reader, items, "Hello World", 80, 80, False
)

assert result == [
{
"text": "Hello World",
"region": Region(0, 0, 1, 1),
"confidence": 100,
"confidence": 90,
"similarity": 100,
}
]

def test_get_matches_partial(self, mock_reader):
items = [
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Hello World", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Hello World", 90),
]
result = RapidOCRReader.get_matches(
mock_reader, items, "Hello", 0.8, 80, True
mock_reader, items, "Hello", 80, 80, True
)

assert result == [
{
"text": "Hello World",
"region": Region(0, 0, 1, 1),
"confidence": 100,
"confidence": 90,
"similarity": 100,
}
]

def test_get_matches_no_matches(self, mock_reader):
items = [
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Hello World", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Hello World", 90),
]
result = RapidOCRReader.get_matches(
mock_reader, items, "Hello", 0.8, 90, False
mock_reader, items, "Hello", 80, 90, False
)

assert result == []
Expand All @@ -158,18 +173,19 @@ def test_get_matches_no_matches(self, mock_reader):
def test_substring_match(self, mock_reader, input_text, result_text):
"Substrings match 100% to longer results"
items = [
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Trash", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to ...", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Trash", 90),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 90),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to ...", 90),
]
result = RapidOCRReader.get_matches(
mock_reader, items, input_text, 0.8, 80, True
mock_reader, items, input_text, 80, 80, True
)
for text in result_text:
assert {
"text": text,
"region": Region(0, 0, 1, 1),
"confidence": 100,
"confidence": 90,
"similarity": 100,
} in result

@pytest.mark.parametrize(
Expand All @@ -184,37 +200,38 @@ def test_asimetric_match(self, mock_reader, input_text, result_text):
- "Move to Trash" does not match "Trash" .
"""
items = [
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Trash", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to ...", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Trash", 90),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 90),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to ...", 90),
]
result = RapidOCRReader.get_matches(
mock_reader, items, input_text, 0.8, 90, True
mock_reader, items, input_text, 90, 80, True
)
assert len(result) == 1
assert result == [
{
"text": result_text,
"region": Region(0, 0, 1, 1),
"confidence": 100,
"similarity": 100,
"confidence": 90,
}
]

def test_asimetric_long_match(self, mock_reader):
items = [
OCRResult(
[[0, 0], [1, 0], [1, 1], [0, 1]], "Trash a set of files", 0.9
[[0, 0], [1, 0], [1, 1], [0, 1]], "Trash a set of files", 90
),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "!", 0.9),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 90),
OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "!", 90),
OCRResult(
[[0, 0], [1, 0], [1, 1], [0, 1]],
"Move to Downloads",
0.9,
90,
),
]
result = RapidOCRReader.get_matches(
mock_reader, items, "Move to Trash!", 0.8, 80, True
mock_reader, items, "Move to Trash!", 80, 80, True
)

assert result[0]["text"] == "Move to Trash"
Expand Down
Loading