Skip to content

Commit

Permalink
Merge pull request #718 from roboflow/fix/issue_with_crop_plus_openai…
Browse files Browse the repository at this point in the history
…_block

Fix the problem with VLMs on batch inference
  • Loading branch information
PawelPeczek-Roboflow authored Oct 4, 2024
2 parents 74b52a9 + 8714e1d commit bce522e
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -306,13 +306,13 @@ def run_claude_prompting(
base64_image = base64.b64encode(
encode_image_to_jpeg_bytes(loaded_image)
).decode("ascii")
prompt = PROMPT_BUILDERS[task_type](
generated_prompt = PROMPT_BUILDERS[task_type](
base64_image=base64_image,
prompt=prompt,
output_structure=output_structure,
classes=classes,
)
prompts.append(prompt)
prompts.append(generated_prompt)
return execute_claude_requests(
api_key=api_key,
prompts=prompts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,15 +303,15 @@ def run_gemini_prompting(
base64_image = base64.b64encode(
encode_image_to_jpeg_bytes(loaded_image)
).decode("ascii")
prompt = PROMPT_BUILDERS[task_type](
generated_prompt = PROMPT_BUILDERS[task_type](
base64_image=base64_image,
prompt=prompt,
output_structure=output_structure,
classes=classes,
temperature=temperature,
max_tokens=max_tokens,
)
gemini_prompts.append(prompt)
gemini_prompts.append(generated_prompt)
return execute_gemini_requests(
google_api_key=google_api_key,
gemini_prompts=gemini_prompts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -301,14 +301,14 @@ def run_gpt_4v_llm_prompting(
base64_image = base64.b64encode(
encode_image_to_jpeg_bytes(loaded_image)
).decode("ascii")
prompt = PROMPT_BUILDERS[task_type](
generated_prompt = PROMPT_BUILDERS[task_type](
base64_image=base64_image,
prompt=prompt,
output_structure=output_structure,
classes=classes,
gpt_image_detail=gpt_image_detail,
)
gpt4_prompts.append(prompt)
gpt4_prompts.append(generated_prompt)
return execute_gpt_4v_requests(
openai_api_key=openai_api_key,
gpt4_prompts=gpt4_prompts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
def test_workflow_with_unconstrained_prompt(
model_manager: ModelManager,
dogs_image: np.ndarray,
license_plate_image: np.ndarray,
) -> None:
# given
workflow_init_parameters = {
Expand All @@ -74,18 +75,22 @@ def test_workflow_with_unconstrained_prompt(
# when
result = execution_engine.run(
runtime_parameters={
"image": [dogs_image],
"image": [dogs_image, license_plate_image],
"api_key": ANTHROPIC_API_KEY,
"prompt": "What is the topic of the image?",
}
)

# then
assert len(result) == 1, "Single image given, expected single output"
assert len(result) == 2, "Two images given, expected two outputs"
assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
assert set(result[1].keys()) == {"result"}, "Expected all outputs to be delivered"
assert (
isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
), "Expected non-empty string generated"
assert (
isinstance(result[1]["result"], str) and len(result[1]["result"]) > 0
), "Expected non-empty string generated"


OCR_WORKFLOW = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
def test_workflow_with_unconstrained_prompt(
model_manager: ModelManager,
dogs_image: np.ndarray,
license_plate_image: np.ndarray,
) -> None:
# given
workflow_init_parameters = {
Expand All @@ -74,18 +75,22 @@ def test_workflow_with_unconstrained_prompt(
# when
result = execution_engine.run(
runtime_parameters={
"image": [dogs_image],
"image": [dogs_image, license_plate_image],
"api_key": GOOGLE_API_KEY,
"prompt": "What is the topic of the image?",
}
)

# then
assert len(result) == 1, "Single image given, expected single output"
assert len(result) == 2, "Two images given, expected two outputs"
assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
assert set(result[1].keys()) == {"result"}, "Expected all outputs to be delivered"
assert (
isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
), "Expected non-empty string generated"
assert (
isinstance(result[1]["result"], str) and len(result[1]["result"]) > 0
), "Expected non-empty string generated"


OCR_WORKFLOW = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
def test_workflow_with_unconstrained_prompt(
model_manager: ModelManager,
dogs_image: np.ndarray,
license_plate_image: np.ndarray,
) -> None:
# given
workflow_init_parameters = {
Expand All @@ -75,18 +76,22 @@ def test_workflow_with_unconstrained_prompt(
# when
result = execution_engine.run(
runtime_parameters={
"image": [dogs_image],
"image": [dogs_image, license_plate_image],
"api_key": OPEN_AI_API_KEY,
"prompt": "What is the topic of the image?",
}
)

# then
assert len(result) == 1, "Single image given, expected single output"
assert len(result) == 2, "Two images given, expected two outputs"
assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
assert set(result[1].keys()) == {"result"}, "Expected all outputs to be delivered"
assert (
isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
), "Expected non-empty string generated"
assert (
isinstance(result[1]["result"], str) and len(result[1]["result"]) > 0
), "Expected non-empty string generated"


OCR_WORKFLOW = {
Expand Down

0 comments on commit bce522e

Please sign in to comment.