
Commit

Merge pull request #715 from roboflow/features/tests_for_google_vision_ocr

Add tests for Google Vision OCR
PawelPeczek-Roboflow authored Oct 3, 2024
2 parents 9b65794 + ad8ae8c commit 74b52a9
Showing 11 changed files with 300 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/hosted_inference_e2e_test_production.yml
@@ -28,4 +28,4 @@ jobs:
python -m pip install -r requirements/requirements.test.unit.txt -r requirements/requirements.test.integration.txt -r requirements/requirements.sdk.http.txt
- name: 📝 E2E test of HOSTED INFERENCE at 🚨 PRODUCTION 🚨 🔥🔥🔥🔥
run:
-          SKIP_WARMUP=${{ github.event.inputs.skip_warmup }} HOSTED_PLATFORM_TESTS_API_KEY=${{ secrets.LOAD_TEST_PRODUCTION_API_KEY }} HOSTED_PLATFORM_TESTS_PROJECT=roboflow-platform OPENAI_KEY=${{ secrets.OPEN_AI_API_KEY }} GOOGLE_API_KEY=${{ secrets.GEMINI_API_KEY }} ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }} pytest tests/inference/hosted_platform_tests/
+          SKIP_WARMUP=${{ github.event.inputs.skip_warmup }} HOSTED_PLATFORM_TESTS_API_KEY=${{ secrets.LOAD_TEST_PRODUCTION_API_KEY }} HOSTED_PLATFORM_TESTS_PROJECT=roboflow-platform OPENAI_KEY=${{ secrets.OPEN_AI_API_KEY }} GOOGLE_API_KEY=${{ secrets.GEMINI_API_KEY }} ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_VISION_API_KEY=${{ secrets.GOOGLE_VISION_API_KEY }} pytest tests/inference/hosted_platform_tests/
2 changes: 1 addition & 1 deletion .github/workflows/hosted_inference_e2e_test_staging.yml
@@ -28,4 +28,4 @@ jobs:
python -m pip install -r requirements/requirements.test.unit.txt -r requirements/requirements.test.integration.txt -r requirements/requirements.sdk.http.txt
- name: 📝 E2E test of HOSTED INFERENCE at 😎 STAGING 😎 🔥🔥🔥🔥
run:
-          SKIP_WARMUP=${{ github.event.inputs.skip_warmup }} HOSTED_PLATFORM_TESTS_API_KEY=${{ secrets.LOAD_TEST_STAGING_API_KEY }} HOSTED_PLATFORM_TESTS_PROJECT=roboflow-staging OPENAI_KEY=${{ secrets.OPEN_AI_API_KEY }} GOOGLE_API_KEY=${{ secrets.GEMINI_API_KEY }} ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }} pytest tests/inference/hosted_platform_tests/
+          SKIP_WARMUP=${{ github.event.inputs.skip_warmup }} HOSTED_PLATFORM_TESTS_API_KEY=${{ secrets.LOAD_TEST_STAGING_API_KEY }} HOSTED_PLATFORM_TESTS_PROJECT=roboflow-staging OPENAI_KEY=${{ secrets.OPEN_AI_API_KEY }} GOOGLE_API_KEY=${{ secrets.GEMINI_API_KEY }} ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_VISION_API_KEY=${{ secrets.GOOGLE_VISION_API_KEY }} pytest tests/inference/hosted_platform_tests/
@@ -66,11 +66,11 @@ class BlockManifest(WorkflowBlockManifest):
json_schema_extra={
"values_metadata": {
"text_detection": {
"name": "Text Detection",
"name": "Any Scene Text Detection",
"description": "Detects and extracts text from any image, including photographs that contain blocks of text.",
},
"ocr_text_detection": {
"name": "OCR Text Detection",
"name": "Document Text Detection",
"description": "Optimized for dense text documents, such as scanned pages or photographs of printed text.",
},
},
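For context on the renamed ocr_type options above: Google's Cloud Vision API exposes two OCR feature types, TEXT_DETECTION (sparse text in arbitrary scenes) and DOCUMENT_TEXT_DETECTION (dense printed documents). Below is a minimal sketch of the raw images:annotate call, assuming the block maps its ocr_type values onto those feature types; the mapping and the helper function are illustrative, not taken from the block's implementation.

import base64

import requests

# Assumed mapping from the manifest's ocr_type values to Vision feature types.
FEATURE_BY_OCR_TYPE = {
    "text_detection": "TEXT_DETECTION",  # "Any Scene Text Detection"
    "ocr_text_detection": "DOCUMENT_TEXT_DETECTION",  # "Document Text Detection"
}


def run_google_vision_ocr(image_path: str, api_key: str, ocr_type: str = "text_detection") -> dict:
    # Vision accepts inline images as base64-encoded bytes.
    with open(image_path, "rb") as f:
        content = base64.b64encode(f.read()).decode("utf-8")
    payload = {
        "requests": [
            {
                "image": {"content": content},
                "features": [{"type": FEATURE_BY_OCR_TYPE[ocr_type]}],
            }
        ]
    }
    response = requests.post(
        "https://vision.googleapis.com/v1/images:annotate",
        params={"key": api_key},  # API-key auth, matching the workflow's api_key input
        json=payload,
        timeout=60,
    )
    response.raise_for_status()
    return response.json()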
1 change: 1 addition & 0 deletions tests/inference/hosted_platform_tests/conftest.py
@@ -89,6 +89,7 @@ class PlatformEnvironment(Enum):
OPENAI_KEY = os.getenv("OPENAI_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+GOOGLE_VISION_API_KEY = os.getenv("GOOGLE_VISION_API_KEY")


@pytest.fixture(scope="session")
@@ -0,0 +1,89 @@
import numpy as np
import pytest

from inference_sdk import InferenceHTTPClient
from tests.inference.hosted_platform_tests.conftest import (
GOOGLE_VISION_API_KEY,
ROBOFLOW_API_KEY,
)

GOOGLE_VISION_OCR_WORKFLOW = {
"version": "1.0",
"inputs": [
{"type": "WorkflowImage", "name": "image"},
{"type": "WorkflowParameter", "name": "api_key"},
],
"steps": [
{
"type": "roboflow_core/google_vision_ocr@v1",
"name": "google_vision_ocr",
"image": "$inputs.image",
"ocr_type": "text_detection",
"api_key": "$inputs.api_key",
},
{
"type": "roboflow_core/bounding_box_visualization@v1",
"name": "bounding_box_visualization",
"predictions": "$steps.google_vision_ocr.predictions",
"image": "$inputs.image",
},
{
"type": "roboflow_core/label_visualization@v1",
"name": "label_visualization",
"predictions": "$steps.google_vision_ocr.predictions",
"image": "$steps.bounding_box_visualization.image",
},
],
"outputs": [
{
"type": "JsonField",
"name": "extracted_text",
"selector": "$steps.google_vision_ocr.text",
},
{
"type": "JsonField",
"name": "text_detections",
"selector": "$steps.google_vision_ocr.predictions",
},
{
"type": "JsonField",
"name": "text_visualised",
"selector": "$steps.label_visualization.image",
},
],
}


@pytest.mark.skipif(GOOGLE_VISION_API_KEY is None, reason="No Google Vision API key provided")
@pytest.mark.flaky(retries=4, delay=1)
def test_workflow_with_google_api_ocr(
object_detection_service_url: str,
license_plate_image: str,
) -> None:
client = InferenceHTTPClient(
api_url=object_detection_service_url,
api_key=ROBOFLOW_API_KEY,
)

# when
result = client.run_workflow(
specification=GOOGLE_VISION_OCR_WORKFLOW,
images={
"image": license_plate_image,
},
parameters={
"api_key": GOOGLE_VISION_API_KEY,
},
)

# then
assert len(result) == 1, "Single image given, expected single output"
assert set(result[0].keys()) == {
"extracted_text",
"text_visualised",
"text_detections",
}, "Expected all outputs to be delivered"
assert len(result[0]["extracted_text"]) > 0, "Expected text to be extracted"
assert (
len(result[0]["text_detections"]) == 4
), "Expected 4 text regions to be detected"
@@ -81,8 +81,8 @@


@add_to_workflows_gallery(
category="Workflows with multiple models",
use_case_title="Workflow detection models and OCR",
category="Workflows for OCR",
use_case_title="Workflow with DocTR model",
use_case_description="""
    This example showcases a fairly sophisticated workflow usage scenario that assumes the following:
@@ -0,0 +1,155 @@
import os

import numpy as np
import pytest

from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
from inference.core.managers.base import ModelManager
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
from inference.core.workflows.execution_engine.core import ExecutionEngine
from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
add_to_workflows_gallery,
)

GOOGLE_VISION_API_KEY = os.getenv("WORKFLOWS_TEST_GOOGLE_VISION_API_KEY")

GOOGLE_VISION_OCR_WORKFLOW = {
"version": "1.0",
"inputs": [
{"type": "WorkflowImage", "name": "image"},
{"type": "WorkflowParameter", "name": "api_key"},
],
"steps": [
{
"type": "roboflow_core/google_vision_ocr@v1",
"name": "google_vision_ocr",
"image": "$inputs.image",
"ocr_type": "text_detection",
"api_key": "$inputs.api_key",
},
{
"type": "roboflow_core/bounding_box_visualization@v1",
"name": "bounding_box_visualization",
"predictions": "$steps.google_vision_ocr.predictions",
"image": "$inputs.image",
},
{
"type": "roboflow_core/label_visualization@v1",
"name": "label_visualization",
"predictions": "$steps.google_vision_ocr.predictions",
"image": "$steps.bounding_box_visualization.image",
},
],
"outputs": [
{
"type": "JsonField",
"name": "extracted_text",
"selector": "$steps.google_vision_ocr.text",
},
{
"type": "JsonField",
"name": "text_detections",
"selector": "$steps.google_vision_ocr.predictions",
},
{
"type": "JsonField",
"name": "text_visualised",
"selector": "$steps.label_visualization.image",
},
],
}


@add_to_workflows_gallery(
category="Workflows for OCR",
use_case_title="Google Vision OCR",
use_case_description="""
    In this example, Google Vision OCR is used to extract text from the input image.
    Additionally, the example presents how to combine the structured output of the
    Google API with visualisation blocks.
""",
workflow_definition=GOOGLE_VISION_OCR_WORKFLOW,
workflow_name_in_app="google-vision-ocr",
)
@pytest.mark.skipif(
condition=GOOGLE_VISION_API_KEY is None, reason="Google API key not provided"
)
def test_workflow_with_google_ocr_when_text_should_be_detected(
model_manager: ModelManager,
license_plate_image: np.ndarray,
) -> None:
# given
workflow_init_parameters = {
"workflows_core.model_manager": model_manager,
"workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
}
execution_engine = ExecutionEngine.init(
workflow_definition=GOOGLE_VISION_OCR_WORKFLOW,
init_parameters=workflow_init_parameters,
max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
)

# when
result = execution_engine.run(
runtime_parameters={
"image": [license_plate_image],
"api_key": GOOGLE_VISION_API_KEY,
}
)

# then
assert len(result) == 1, "Single image given, expected single output"
assert set(result[0].keys()) == {
"extracted_text",
"text_visualised",
"text_detections",
}, "Expected all outputs to be delivered"
assert (
result[0]["extracted_text"] == "2398027\n2398023\nKn\n239+8072"
), "Extracted text should match reference"
assert not np.allclose(
license_plate_image, result[0]["text_visualised"].numpy_image
), "Expected that visualisation will change the output image"
assert (
len(result[0]["text_detections"]) == 4
), "Expected 4 text regions to be detected"


@pytest.mark.skipif(
condition=GOOGLE_VISION_API_KEY is None, reason="Google API key not provided"
)
def test_workflow_with_google_ocr_when_no_text_should_be_detected(
model_manager: ModelManager,
dogs_image: np.ndarray,
) -> None:
# given
workflow_init_parameters = {
"workflows_core.model_manager": model_manager,
"workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
}
execution_engine = ExecutionEngine.init(
workflow_definition=GOOGLE_VISION_OCR_WORKFLOW,
init_parameters=workflow_init_parameters,
max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
)

# when
result = execution_engine.run(
runtime_parameters={
"image": [dogs_image],
"api_key": GOOGLE_VISION_API_KEY,
}
)

# then
assert len(result) == 1, "Single image given, expected single output"
assert set(result[0].keys()) == {
"extracted_text",
"text_visualised",
"text_detections",
}, "Expected all outputs to be delivered"
assert result[0]["extracted_text"] == ""
assert np.allclose(
dogs_image, result[0]["text_visualised"].numpy_image
), "Expected that visualisation will not change the output image"
assert len(result[0]["text_detections"]) == 0, "Expected 0 text regions detected"
@@ -3,7 +3,9 @@
from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
from inference.core.managers.base import ModelManager
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
-from inference.core.workflows.core_steps.transformations.stitch_images.v1 import OUTPUT_KEY
+from inference.core.workflows.core_steps.transformations.stitch_images.v1 import (
+    OUTPUT_KEY,
+)
from inference.core.workflows.execution_engine.core import ExecutionEngine
from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
add_to_workflows_gallery,
@@ -14,7 +16,10 @@
"inputs": [
{"type": "InferenceImage", "name": "image1"},
{"type": "InferenceImage", "name": "image2"},
{"type": "InferenceParameter", "name": "count_of_best_matches_per_query_descriptor"},
{
"type": "InferenceParameter",
"name": "count_of_best_matches_per_query_descriptor",
},
{"type": "InferenceParameter", "name": "max_allowed_reprojection_error"},
],
"steps": [
@@ -77,6 +82,8 @@ def test_workflow_with_classical_pattern_matching(
assert set(result[0].keys()) == {
"stitched_image",
}, "Expected all declared outputs to be delivered"
-    assert (
-        result[0]["stitched_image"].numpy_image.shape == (2918, 2034, 3)
+    assert result[0]["stitched_image"].numpy_image.shape == (
+        2918,
+        2034,
+        3,
    ), "Expected result image shape must match (2918, 2034, 3)"