diff --git a/tests/keyword_suite/video_text_test.robot b/tests/keyword_suite/video_text_test.robot index a17b96f2..0f8141ab 100644 --- a/tests/keyword_suite/video_text_test.robot +++ b/tests/keyword_suite/video_text_test.robot @@ -78,7 +78,7 @@ Test Keyword Find Text ${length}= Get Length ${matched_text} Should Be True ${length} > 1 Should Be Equal As Strings ${matched_text[0]['text']} AB123cd - Should Be Equal As Numbers ${matched_text[0]['confidence']} 100.0 + Should Be Equal As Numbers ${matched_text[0]['similarity']} 100.0 ${matched_text}= Find Text regex:${REGEX} region=${REGEX_REGION} ${count}= Set Variable 0 diff --git a/yarf/rf_libraries/libraries/ocr/rapidocr.py b/yarf/rf_libraries/libraries/ocr/rapidocr.py index d12b2437..d1499d75 100644 --- a/yarf/rf_libraries/libraries/ocr/rapidocr.py +++ b/yarf/rf_libraries/libraries/ocr/rapidocr.py @@ -10,6 +10,7 @@ import rapidfuzz from PIL import Image from rapidocr import RapidOCR +from robot.api import logger from yarf.rf_libraries.libraries.geometry.quad import Quad from yarf.vendor.RPA.core.geometry import Region @@ -24,7 +25,7 @@ class OCRResult: Attributes: position: Quadrilateral region of the match. text: Text found in the match. - confidence: Confidence of the match + confidence: Estimated probability that the recognized text is correct. """ position: Quad @@ -44,12 +45,18 @@ class RapidOCRReader: This class is a singleton to avoid loading the model multiple times. Attributes: - DEFAULT_CONFIDENCE: Default confidence for text detection. - DEFAULT_COINCIDENCE: Default coincidence for text similarities. + DEFAULT_SIMILARITY_THRESHOLD: Minimum similarity percentage (0-100) for + text matching. If the similarity between the found text and the target + text is below this threshold, the match is discarded. + DEFAULT_CONFIDENCE_THRESHOLD: Minimum confidence percentage (0-100) for + text matching. If the confidence of the found text is below this + threshold, the match is discarded. 
+ SIMILARITY_LOG_THRESHOLD: Minimum similarity to log rejected matches. """ - DEFAULT_CONFIDENCE: float = 0.7 - DEFAULT_COINCIDENCE: float = 80.0 + DEFAULT_SIMILARITY_THRESHOLD: float = 80.0 + DEFAULT_CONFIDENCE_THRESHOLD: float = 70.0 + SIMILARITY_LOG_THRESHOLD: float = 80.0 def __new__(cls) -> "RapidOCRReader": if not hasattr(cls, "instance"): @@ -82,8 +89,8 @@ def find( self, image: Image.Image | Path, text: str, - confidence: float = DEFAULT_CONFIDENCE, - coincidence: float = DEFAULT_COINCIDENCE, + similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD, + confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD, region: Region | None = None, partial: bool = True, ) -> list[dict]: @@ -94,8 +101,12 @@ def find( Args: image: Path to image or Image object. text: Text to find in image. - confidence: Minimum confidence for text detection. - coincidence: Minimum coincidence for text similarities. + similarity_threshold: Minimum similarity percentage (0-100) for + text matching. If the similarity between the found text and the + target text is below this threshold, the match is discarded. + confidence_threshold: Minimum confidence percentage (0-100) for + text matching. If the confidence of the found text is below this + threshold, the match is discarded. region: Limit the region of the screen where to look. partial: Use partial matching. 
@@ -123,9 +134,12 @@ def find( ocr_output.boxes, ocr_output.txts, ocr_output.scores ) ] + # Multiply the item confidence with 100 to convert it to percentage + for item in result: + item.confidence *= 100 matches = self.get_matches( - result, text, confidence, coincidence, partial + result, text, similarity_threshold, confidence_threshold, partial ) if region is not None: @@ -138,8 +152,8 @@ def get_matches( self, result: list[OCRResult], match_text: str, - confidence: float, - coincidence: float, + similarity_threshold: float, + confidence_threshold: float, partial: bool, ) -> list[dict]: """ @@ -148,8 +162,12 @@ def get_matches( Args: result: List with the OCR results. match_text: Text to match. - confidence: Minimum confidence for text detection. - coincidence: Minimum coincidence for text similarities. + similarity_threshold: Minimum similarity percentage (0-100) for + text matching. If the similarity between the found text and the + target text is below this threshold, the match is discarded. + confidence_threshold: Minimum confidence percentage (0-100) for + text matching. If the confidence of the found text is below this + threshold, the match is discarded. partial: Use partial matching. 
Returns: @@ -187,17 +205,27 @@ def directional_ratio(q: str, text: str) -> float: matches = [] for item in result: - ratio = ( + similarity = ( directional_ratio(match_text, item.text) if partial else rapidfuzz.fuzz.ratio(item.text, match_text) ) - if ratio >= coincidence and item.confidence >= confidence: + if ( + similarity >= similarity_threshold + and item.confidence >= confidence_threshold + ): matches.append( { "text": item.text, "region": item.position.to_region(), - "confidence": ratio, # Using the ratio like tesseract + "similarity": similarity, + "confidence": item.confidence, } ) - return sorted(matches, key=lambda x: x["confidence"], reverse=True) + elif similarity >= self.SIMILARITY_LOG_THRESHOLD: + logger.debug( + f"Rejected match for text '{match_text}' " + f"with similarity {similarity} " + f"and confidence {item.confidence}: '{item.text}'" + ) + return sorted(matches, key=lambda x: x["similarity"], reverse=True) diff --git a/yarf/rf_libraries/libraries/ocr/tests/test_rapidocr.py b/yarf/rf_libraries/libraries/ocr/tests/test_rapidocr.py index 2eafc1ba..754116f4 100644 --- a/yarf/rf_libraries/libraries/ocr/tests/test_rapidocr.py +++ b/yarf/rf_libraries/libraries/ocr/tests/test_rapidocr.py @@ -29,6 +29,7 @@ def mock_to_image(): @pytest.fixture(autouse=True) def mock_reader(): with patch("yarf.rf_libraries.libraries.ocr.rapidocr.RapidOCR") as p: + p.SIMILARITY_LOG_THRESHOLD = RapidOCRReader.SIMILARITY_LOG_THRESHOLD yield p @@ -41,7 +42,7 @@ def test_read(self, mock_reader): mock_reader.reader.return_value = MockRapidOCROutput( boxes=np.array([[[0, 0], [0, 0], [0, 0], [0, 0]]]), txts=("Hello", "World"), - scores=(0.9, 0.8), + scores=(90, 80), ) result = RapidOCRReader.read(mock_reader, None) @@ -55,10 +56,15 @@ def test_find(self, mock_reader): mock_reader.reader.return_value = MockRapidOCROutput( boxes=np.array([[[0, 0], [0, 0], [0, 0], [0, 0]]]), txts=("Hello",), - scores=(0.9,), + scores=(90,), ) mock_reader.get_matches.return_value = [ - {"text": 
"Hello", "region": Region(0, 0, 1, 1), "confidence": 100} + { + "text": "Hello", + "region": Region(0, 0, 1, 1), + "confidence": 90, + "similarity": 100, + } ] result = RapidOCRReader.find(mock_reader, None, "Hello") @@ -66,7 +72,8 @@ def test_find(self, mock_reader): { "text": "Hello", "region": Region(0, 0, 1, 1), - "confidence": 100, + "confidence": 90, + "similarity": 100, } ] @@ -87,10 +94,15 @@ def test_find_in_region(self, mock_to_image, mock_reader): mock_reader.reader.return_value = MockRapidOCROutput( boxes=np.array([[[0, 0], [1, 0], [1, 1], [0, 1]]]), txts=("Hello World",), - scores=(0.9,), + scores=(90,), ) mock_reader.get_matches.return_value = [ - {"text": "Hello", "region": Region(0, 0, 1, 1), "confidence": 100} + { + "text": "Hello", + "region": Region(0, 0, 1, 1), + "confidence": 90, + "similarity": 100, + } ] result = RapidOCRReader.find( mock_reader, None, "Hello", region=Region(0, 0, 1, 1) @@ -100,50 +112,53 @@ def test_find_in_region(self, mock_to_image, mock_reader): { "text": "Hello", "region": Region(0, 0, 1, 1), - "confidence": 100, + "confidence": 90, + "similarity": 100, } ] def test_get_matches(self, mock_reader): items = [ OCRResult( - Quad([[0, 0], [1, 0], [1, 1], [0, 1]]), "Hello World", 0.9 + Quad([[0, 0], [1, 0], [1, 1], [0, 1]]), "Hello World", 90 ), ] result = RapidOCRReader.get_matches( - mock_reader, items, "Hello World", 0.8, 80, False + mock_reader, items, "Hello World", 80, 80, False ) assert result == [ { "text": "Hello World", "region": Region(0, 0, 1, 1), - "confidence": 100, + "confidence": 90, + "similarity": 100, } ] def test_get_matches_partial(self, mock_reader): items = [ - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Hello World", 0.9), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Hello World", 90), ] result = RapidOCRReader.get_matches( - mock_reader, items, "Hello", 0.8, 80, True + mock_reader, items, "Hello", 80, 80, True ) assert result == [ { "text": "Hello World", "region": Region(0, 0, 1, 1), - 
"confidence": 100, + "confidence": 90, + "similarity": 100, } ] def test_get_matches_no_matches(self, mock_reader): items = [ - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Hello World", 0.9), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Hello World", 90), ] result = RapidOCRReader.get_matches( - mock_reader, items, "Hello", 0.8, 90, False + mock_reader, items, "Hello", 80, 90, False ) assert result == [] @@ -158,18 +173,19 @@ def test_get_matches_no_matches(self, mock_reader): def test_substring_match(self, mock_reader, input_text, result_text): "Substrings match 100% to longer results" items = [ - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Trash", 0.9), - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 0.9), - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to ...", 0.9), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Trash", 90), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 90), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to ...", 90), ] result = RapidOCRReader.get_matches( - mock_reader, items, input_text, 0.8, 80, True + mock_reader, items, input_text, 80, 80, True ) for text in result_text: assert { "text": text, "region": Region(0, 0, 1, 1), - "confidence": 100, + "confidence": 90, + "similarity": 100, } in result @pytest.mark.parametrize( @@ -184,37 +200,38 @@ def test_asimetric_match(self, mock_reader, input_text, result_text): - "Move to Trash" does not match "Trash" . 
""" items = [ - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Trash", 0.9), - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 0.9), - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to ...", 0.9), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Trash", 90), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 90), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to ...", 90), ] result = RapidOCRReader.get_matches( - mock_reader, items, input_text, 0.8, 90, True + mock_reader, items, input_text, 90, 80, True ) assert len(result) == 1 assert result == [ { "text": result_text, "region": Region(0, 0, 1, 1), - "confidence": 100, + "similarity": 100, + "confidence": 90, } ] def test_asimetric_long_match(self, mock_reader): items = [ OCRResult( - [[0, 0], [1, 0], [1, 1], [0, 1]], "Trash a set of files", 0.9 + [[0, 0], [1, 0], [1, 1], [0, 1]], "Trash a set of files", 90 ), - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 0.9), - OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "!", 0.9), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Trash", 90), + OCRResult([[0, 0], [1, 0], [1, 1], [0, 1]], "!", 90), OCRResult( [[0, 0], [1, 0], [1, 1], [0, 1]], "Move to Downloads", - 0.9, + 90, ), ] result = RapidOCRReader.get_matches( - mock_reader, items, "Move to Trash!", 0.8, 80, True + mock_reader, items, "Move to Trash!", 80, 80, True ) assert result[0]["text"] == "Move to Trash" diff --git a/yarf/rf_libraries/libraries/tests/test_video_input_base.py b/yarf/rf_libraries/libraries/tests/test_video_input_base.py index e54e45c5..24510edc 100644 --- a/yarf/rf_libraries/libraries/tests/test_video_input_base.py +++ b/yarf/rf_libraries/libraries/tests/test_video_input_base.py @@ -434,14 +434,41 @@ async def test_read_text_image(self, stub_videoinput): await stub_videoinput.read_text(image) stub_videoinput.ocr.read.assert_called_once_with(image) + @pytest.mark.parametrize( + "log_level", + [ + "INFO", + "DEBUG", + ], + ) @pytest.mark.asyncio - 
async def test_find_text(self, stub_videoinput): + async def test_find_text(self, stub_videoinput, log_level: str): """ Test if the function grabs a new screenshot and finds the text position. """ - stub_videoinput.ocr.find = Mock() - await stub_videoinput.find_text("text") + stub_videoinput.ocr.find = Mock( + return_value=[ + { + "text": "Hello", + "region": Region(0, 0, 1, 1), + "confidence": 90, + "similarity": 100, + } + ], + ) + + with ( + patch.dict(os.environ, {"YARF_LOG_LEVEL": log_level}), + patch( + "yarf.rf_libraries.libraries.video_input_base.log_image" + ) as mock_log_image, + ): + await stub_videoinput.find_text("text") + if log_level == "DEBUG": + mock_log_image.assert_called_once() + else: + mock_log_image.assert_not_called() stub_videoinput.ocr.find.assert_called_once_with( stub_videoinput.grab_screenshot.return_value, "text", region=None @@ -453,7 +480,16 @@ async def test_find_text_in_region(self, stub_videoinput): Test if the function grabs a new screenshot and finds the text position. """ - stub_videoinput.ocr.find = Mock() + stub_videoinput.ocr.find = Mock( + return_value=[ + { + "text": "Hello", + "region": Region(0, 0, 1, 1), + "confidence": 90, + "similarity": 100, + } + ], + ) region = { "left": 0, "top": 0, @@ -475,7 +511,16 @@ async def test_find_text_in_image(self, stub_videoinput): Test if the function finds the text position in an image. """ image = Mock() - stub_videoinput.ocr.find = Mock() + stub_videoinput.ocr.find = Mock( + return_value=[ + { + "text": "Hello", + "region": Region(0, 0, 1, 1), + "confidence": 90, + "similarity": 100, + } + ], + ) await stub_videoinput.find_text("text", image=image) stub_videoinput.ocr.find.assert_called_once_with( @@ -488,10 +533,14 @@ async def test_find_text_with_regex(self, stub_videoinput): Test if the function finds the text position with a regex. 
""" stub_videoinput.ocr.find = Mock( - side_effect=[ - [sentinel.region1, sentinel.region2, sentinel.region3], - [sentinel.region4], - ] + return_value=[ + { + "text": "Hello", + "region": Region(0, 0, 1, 1), + "confidence": 90, + "similarity": 100, + } + ], ) stub_videoinput.ocr.read = Mock( return_value=""" @@ -522,7 +571,16 @@ async def test_match_text_in_region(self, stub_videoinput): """ Test if the function finds the text in a region. """ - stub_videoinput.ocr.find = Mock() + stub_videoinput.ocr.find = Mock( + return_value=[ + { + "text": "Hello", + "region": Region(0, 0, 1, 1), + "confidence": 90, + "similarity": 100, + } + ], + ) await stub_videoinput.find_text("text", region=Region(0, 0, 1, 1)) stub_videoinput.ocr.find.assert_called_once_with( @@ -658,15 +716,8 @@ async def test_match_text_with_regex(self, stub_videoinput): sentinel.image, ) - @pytest.mark.parametrize( - "log_level", - [ - "INFO", - "DEBUG", - ], - ) @pytest.mark.asyncio - async def test_get_text_position(self, stub_videoinput, log_level: str): + async def test_get_text_position(self, stub_videoinput): """ Test the function returns the center of the best match. 
""" @@ -680,18 +731,8 @@ async def test_get_text_position(self, stub_videoinput, log_level: str): image, ) - with ( - patch.dict(os.environ, {"YARF_LOG_LEVEL": log_level}), - patch( - "yarf.rf_libraries.libraries.video_input_base.log_image" - ) as mock_log_image, - ): - result = await stub_videoinput.get_text_position("text") - assert result == (2, 2) - if log_level == "DEBUG": - mock_log_image.assert_called_once() - else: - mock_log_image.assert_not_called() + result = await stub_videoinput.get_text_position("text") + assert result == (2, 2) @pytest.mark.asyncio async def test_get_text_position_in_region(self, stub_videoinput): diff --git a/yarf/rf_libraries/libraries/video_input_base.py b/yarf/rf_libraries/libraries/video_input_base.py index fe544aaf..997418d9 100644 --- a/yarf/rf_libraries/libraries/video_input_base.py +++ b/yarf/rf_libraries/libraries/video_input_base.py @@ -294,6 +294,19 @@ async def find_text( else: matched_text_regions = self.ocr.find(image, text, region=region) # type: ignore[arg-type] + # Log the image which we found the text on for debugging false positives + if os.getenv("YARF_LOG_LEVEL") == "DEBUG": + for match in matched_text_regions: + similarity = f"{match['similarity']:.2f}" + confidence = f"{match['confidence']:.2f}" + matched_image = self._draw_region_on_image( + image, match["region"] + ) + log_image( + matched_image, + f"Found text matching '{text}' with similarity {similarity}, confidence {confidence}: '{match['text']}'", + ) + return matched_text_regions @keyword @@ -353,7 +366,6 @@ async def match_text( if text_matches: return text_matches, cropped_image - log_image(cropped_image, "The image used for ocr was:") read_text = await self.read_text(cropped_image) raise ValueError( f"Timed out looking for '{text}' after '{timeout}' seconds. 
" @@ -386,11 +398,6 @@ async def get_text_position( # Get the best match match = text_matches[0] - # Draw the region on the image for debugging - if os.getenv("YARF_LOG_LEVEL") == "DEBUG": - matched_image = self._draw_region_on_image(image, match["region"]) - log_image(matched_image, "Matched text region:") - # Get the center of the region center = match["region"].center logger.info(f"\nThe center of the best match is: {center}") diff --git a/yarf/vendor/RPA/recognition/ocr.py b/yarf/vendor/RPA/recognition/ocr.py index b235ca2c..c548ca93 100644 --- a/yarf/vendor/RPA/recognition/ocr.py +++ b/yarf/vendor/RPA/recognition/ocr.py @@ -38,7 +38,7 @@ "see library documentation for installation instructions" ) -DEFAULT_CONFIDENCE = 80.0 +DEFAULT_SIMILARITY_THRESHOLD = 80.0 def read( @@ -67,7 +67,7 @@ def read( def find( image: Union[Image.Image, Path], text: str, - confidence: float = DEFAULT_CONFIDENCE, + similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD, region: Optional[Region] = None, language: Optional[str] = None, configuration: Optional[str] = None @@ -77,14 +77,16 @@ def find( :param image: Path to image or Image object :param text: Text to find in image - :param confidence: Minimum confidence for text similaritys + :param similarity_threshold: Minimum similarity percentage (0-100) for text + matching. If the similarity between the found text and the target text is + below this threshold, the match is discarded. The default value is 80.0. :param region: Limit the region of the screen where to look for the text :param language: 3-character ISO 639-2 language code of the text. This is passed directly to the pytesseract lib in the lang parameter. 
See https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html#using-one-language """ # noqa: E501 image = to_image(image) - confidence = clamp(1, float(confidence), 100) + similarity_threshold = clamp(1, float(similarity_threshold), 100) text = str(text).strip() if not text: @@ -107,7 +109,7 @@ def find( raise EnvironmentError(INSTALL_PROMPT) from err lines = _dict_lines(data) - matches = _match_lines(lines, text, confidence) + matches = _match_lines(lines, text, similarity_threshold) if region is not None: for match in matches: @@ -132,8 +134,7 @@ def _dict_lines(data: Dict) -> List: word["left"], word["top"], word["width"], word["height"] ) - # NOTE: Currently ignoring confidence in tesseract results - lines[key].append({"text": word["text"], "region": region}) + lines[key].append({"text": word["text"], "region": region, "confidence": word["conf"]}) assert len(lines[key]) == word["word_num"] return list(lines.values()) @@ -144,9 +145,9 @@ def _iter_rows(data: Dict) -> Generator: return (dict(zip(data.keys(), values)) for values in zip(*data.values())) -def _match_lines(lines: List[Dict], text: str, confidence: float) -> List[Dict]: +def _match_lines(lines: List[Dict], text: str, similarity_threshold: float) -> List[Dict]: """Find best matches between lines of text and target text, - and return resulting bounding boxes and confidences. + and return resulting bounding boxes and similarities. A line of N words will be matched to the given text in all 1 to N length sections, in every sequential position. 
@@ -161,21 +162,26 @@ def _match_lines(lines: List[Dict], text: str, confidence: float) -> List[Dict]: regions = [word["region"] for word in words] sentence = " ".join(word["text"] for word in words) - ratio = SequenceMatcher(None, sentence, text).ratio() * 100.0 + similarity = SequenceMatcher(None, sentence, text).ratio() * 100.0 - if ratio < confidence: + if similarity < similarity_threshold: continue - if match and match["confidence"] >= ratio: + if match and match["similarity"] >= similarity: + # We already have a better match continue + # Use the lowest confidence among the words in the match + confidence = min(word["confidence"] for word in words if word["confidence"] != -1) + match = { "text": sentence, "region": Region.merge(regions), - "confidence": ratio, + "similarity": similarity, + "confidence": confidence, } if match: matches.append(match) - return sorted(matches, key=lambda match: match["confidence"], reverse=True) + return sorted(matches, key=lambda match: match["similarity"], reverse=True)