diff --git a/development/docs/build_block_docs.py b/development/docs/build_block_docs.py
index a0fb7c2e52..14e948380e 100644
--- a/development/docs/build_block_docs.py
+++ b/development/docs/build_block_docs.py
@@ -361,7 +361,7 @@ def format_block_connections(
connections = [
(
f"[`{block_type2manifest_type_identifier[connection]}`]"
- f"(/workflows/blocks/{camel_to_snake(block_type2manifest_type_identifier[connection])})"
+ f"(/workflows/blocks/{slugify_block_name(block_type2manifest_type_identifier[connection])})"
)
for connection in connections
]
diff --git a/docs/workflows/blocks.md b/docs/workflows/blocks.md
index 1c6290f45c..ed2f6ecccf 100644
--- a/docs/workflows/blocks.md
+++ b/docs/workflows/blocks.md
@@ -72,6 +72,11 @@ hide:
+
+
+
+
+
diff --git a/docs/workflows/create_workflow_block.md b/docs/workflows/create_workflow_block.md
index c6216f5dde..d14a757703 100644
--- a/docs/workflows/create_workflow_block.md
+++ b/docs/workflows/create_workflow_block.md
@@ -1050,7 +1050,7 @@ def run(self, predictions: List[dict]) -> BlockResult:
)
from inference.core.workflows.execution_engine.entities.types import (
StepOutputSelector,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
)
from inference.core.workflows.prototypes.block import (
BlockResult,
@@ -1063,7 +1063,7 @@ def run(self, predictions: List[dict]) -> BlockResult:
class BlockManifest(WorkflowBlockManifest):
type: Literal["my_plugin/fusion_of_predictions@v1"]
name: str
- predictions: List[StepOutputSelector(kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND])] = Field(
+ predictions: List[StepOutputSelector(kind=[OBJECT_DETECTION_PREDICTION_KIND])] = Field(
description="Selectors to step outputs",
examples=[["$steps.model_1.predictions", "$steps.model_2.predictions"]],
)
@@ -1073,7 +1073,7 @@ def run(self, predictions: List[dict]) -> BlockResult:
return [
OutputDefinition(
name="predictions",
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
]
@@ -1251,8 +1251,8 @@ the method signatures.
ImageParentMetadata,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_IMAGES_KIND,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ IMAGE_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1267,7 +1267,7 @@ the method signatures.
type: Literal["my_block/dynamic_crop@v1"]
image: Union[WorkflowImageSelector, StepOutputImageSelector]
predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
@classmethod
@@ -1277,7 +1277,7 @@ the method signatures.
@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
- OutputDefinition(name="crops", kind=[BATCH_OF_IMAGES_KIND]),
+ OutputDefinition(name="crops", kind=[IMAGE_KIND]),
]
@classmethod
@@ -1340,8 +1340,8 @@ the method signatures.
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_IMAGES_KIND,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ IMAGE_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1357,7 +1357,7 @@ the method signatures.
type: Literal["my_plugin/tile_detections@v1"]
crops: Union[WorkflowImageSelector, StepOutputImageSelector]
crops_predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND]
+ kind=[OBJECT_DETECTION_PREDICTION_KIND]
)
@classmethod
@@ -1367,7 +1367,7 @@ the method signatures.
@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
- OutputDefinition(name="visualisations", kind=[BATCH_OF_IMAGES_KIND]),
+ OutputDefinition(name="visualisations", kind=[IMAGE_KIND]),
]
@@ -1427,7 +1427,7 @@ the method signatures.
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1443,7 +1443,7 @@ the method signatures.
type: Literal["my_plugin/stitch@v1"]
image: Union[WorkflowImageSelector, StepOutputImageSelector]
image_predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
@classmethod
@@ -1463,7 +1463,7 @@ the method signatures.
OutputDefinition(
name="predictions",
kind=[
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
],
),
]
@@ -1526,8 +1526,8 @@ the method signatures.
Batch,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_IMAGES_KIND,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ IMAGE_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1542,7 +1542,7 @@ the method signatures.
type: Literal["my_block/dynamic_crop@v1"]
image: Union[WorkflowImageSelector, StepOutputImageSelector]
predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
@classmethod
@@ -1556,7 +1556,7 @@ the method signatures.
@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
- OutputDefinition(name="crops", kind=[BATCH_OF_IMAGES_KIND]),
+ OutputDefinition(name="crops", kind=[IMAGE_KIND]),
]
@classmethod
@@ -1629,8 +1629,8 @@ the method signatures.
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_IMAGES_KIND,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ IMAGE_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1646,7 +1646,7 @@ the method signatures.
type: Literal["my_plugin/tile_detections@v1"]
images_crops: Union[WorkflowImageSelector, StepOutputImageSelector]
crops_predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND]
+ kind=[OBJECT_DETECTION_PREDICTION_KIND]
)
@classmethod
@@ -1660,7 +1660,7 @@ the method signatures.
@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
- OutputDefinition(name="visualisations", kind=[BATCH_OF_IMAGES_KIND]),
+ OutputDefinition(name="visualisations", kind=[IMAGE_KIND]),
]
@@ -1726,7 +1726,7 @@ the method signatures.
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1742,7 +1742,7 @@ the method signatures.
type: Literal["my_plugin/stitch@v1"]
images: Union[WorkflowImageSelector, StepOutputImageSelector]
images_predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
@classmethod
@@ -1766,7 +1766,7 @@ the method signatures.
OutputDefinition(
name="predictions",
kind=[
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
],
),
]
diff --git a/docs/workflows/gallery_index.md b/docs/workflows/gallery_index.md
index d734e77111..f9dd8a1a60 100644
--- a/docs/workflows/gallery_index.md
+++ b/docs/workflows/gallery_index.md
@@ -7,6 +7,7 @@ Browse through the various categories to find inspiration and ideas for building
Workflows with multiple models
Workflows enhanced by Roboflow Platform
Workflows with classical Computer Vision methods
+ Workflows with Visual Language Models
Basic Workflows
Workflows with dynamic Python Blocks
Workflows with data transformations
diff --git a/docs/workflows/kinds.md b/docs/workflows/kinds.md
index 4a92216bf4..22482dfd85 100644
--- a/docs/workflows/kinds.md
+++ b/docs/workflows/kinds.md
@@ -23,49 +23,50 @@ for the presence of a mask in the input.
!!! Warning
- The list presented below contains elements with `Batch[X]` markers - those will
- get soon deprecated and we will use only `X` markers. For now, developers are asked
- to create their blocks using the `Batch[X]` markers, but raise the
- [issue here](https://github.com/roboflow/inference/issues/608). This GH issue will be used
- as a point of communication regarding deprecation process.
+ In `inference` release `0.18.0` we decided to make a drastic move to heal the ecosystem
+ from the problem of ambiguous kind names (`Batch[X]` vs `X` - see more
+ [here](https://github.com/roboflow/inference/issues/608)).
+
+ The change is breaking only if a remote Workflow plugin depends on imports
+ from the `inference.core.workflows.execution_engine.entities.types` module, which,
+ to the best of our knowledge, is not the case. We removed the problematic kinds as if they
+ never existed in the ecosystem and fixed all blocks from the `roboflow_core` plugin.
+ If anyone is impacted by the change, here is the
+ [migration guide](https://github.com/roboflow/inference/releases/tag/v0.18.0).
## Kinds declared in Roboflow plugins
-* [`zone`](/workflows/kinds/zone): Definition of polygon zone
-* [`Batch[dictionary]`](/workflows/kinds/batch_dictionary): Batch of dictionaries
-* [`dictionary`](/workflows/kinds/dictionary): Dictionary
-* [`point`](/workflows/kinds/point): Single point in 2D
-* [`Batch[parent_id]`](/workflows/kinds/batch_parent_id): Identifier of parent for step output
-* [`roboflow_model_id`](/workflows/kinds/roboflow_model_id): Roboflow model id
-* [`Batch[classification_prediction]`](/workflows/kinds/batch_classification_prediction): `'predictions'` key from Classification Model outputs
-* [`Batch[top_class]`](/workflows/kinds/batch_top_class): Batch of string values representing top class predicted by classification model
-* [`rgb_color`](/workflows/kinds/rgb_color): RGB color
-* [`Batch[keypoint_detection_prediction]`](/workflows/kinds/batch_keypoint_detection_prediction): `'predictions'` key from Keypoint Detection Model output
-* [`Batch[serialised_payloads]`](/workflows/kinds/batch_serialised_payloads): List of serialised elements that can be registered in the sink
+* [`bar_code_detection`](/workflows/kinds/bar_code_detection): Prediction with barcode detection
+* [`language_model_output`](/workflows/kinds/language_model_output): LLM / VLM output
+* [`top_class`](/workflows/kinds/top_class): String value representing top class predicted by classification model
+* [`prediction_type`](/workflows/kinds/prediction_type): String value with type of prediction
+* [`object_detection_prediction`](/workflows/kinds/object_detection_prediction): Prediction with detected bounding boxes in form of sv.Detections(...) object
+* [`qr_code_detection`](/workflows/kinds/qr_code_detection): Prediction with QR code detection
+* [`image_metadata`](/workflows/kinds/image_metadata): Dictionary with image metadata required by supervision
* [`float_zero_to_one`](/workflows/kinds/float_zero_to_one): `float` value in range `[0.0, 1.0]`
-* [`Batch[boolean]`](/workflows/kinds/batch_boolean): Boolean values batch
-* [`list_of_values`](/workflows/kinds/list_of_values): List of values of any types
-* [`Batch[instance_segmentation_prediction]`](/workflows/kinds/batch_instance_segmentation_prediction): `'predictions'` key from Instance Segmentation Model outputs
-* [`Batch[qr_code_detection]`](/workflows/kinds/batch_qr_code_detection): Prediction with QR code detection
+* [`parent_id`](/workflows/kinds/parent_id): Identifier of parent for step output
+* [`keypoint_detection_prediction`](/workflows/kinds/keypoint_detection_prediction): Prediction with detected bounding boxes and detected keypoints in form of sv.Detections(...) object
+* [`float`](/workflows/kinds/float): Float value
+* [`*`](/workflows/kinds/*): Equivalent of any element
* [`contours`](/workflows/kinds/contours): List of numpy arrays where each array represents contour points
-* [`Batch[image]`](/workflows/kinds/batch_image): Image in workflows
+* [`boolean`](/workflows/kinds/boolean): Boolean flag
* [`detection`](/workflows/kinds/detection): Single element of detections-based prediction (like `object_detection_prediction`)
-* [`Batch[prediction_type]`](/workflows/kinds/batch_prediction_type): String value with type of prediction
+* [`roboflow_project`](/workflows/kinds/roboflow_project): Roboflow project name
+* [`dictionary`](/workflows/kinds/dictionary): Dictionary
+* [`numpy_array`](/workflows/kinds/numpy_array): Numpy array
* [`roboflow_api_key`](/workflows/kinds/roboflow_api_key): Roboflow API key
* [`string`](/workflows/kinds/string): String value
-* [`*`](/workflows/kinds/*): Equivalent of any element
-* [`float`](/workflows/kinds/float): Float value
-* [`keypoint_detection_prediction`](/workflows/kinds/keypoint_detection_prediction): Prediction with detected bounding boxes and detected keypoints in form of sv.Detections(...) object
-* [`Batch[object_detection_prediction]`](/workflows/kinds/batch_object_detection_prediction): `'predictions'` key from Object Detection Model output
-* [`integer`](/workflows/kinds/integer): Integer value
-* [`roboflow_project`](/workflows/kinds/roboflow_project): Roboflow project name
-* [`Batch[string]`](/workflows/kinds/batch_string): Batch of string values
-* [`image`](/workflows/kinds/image): Image in workflows
-* [`Batch[bar_code_detection]`](/workflows/kinds/batch_bar_code_detection): Prediction with barcode detection
-* [`object_detection_prediction`](/workflows/kinds/object_detection_prediction): Prediction with detected bounding boxes in form of sv.Detections(...) object
-* [`boolean`](/workflows/kinds/boolean): Boolean flag
+* [`roboflow_model_id`](/workflows/kinds/roboflow_model_id): Roboflow model id
+* [`list_of_values`](/workflows/kinds/list_of_values): List of values of any types
* [`instance_segmentation_prediction`](/workflows/kinds/instance_segmentation_prediction): Prediction with detected bounding boxes and segmentation masks in form of sv.Detections(...) object
+* [`image`](/workflows/kinds/image): Image in workflows
+* [`video_metadata`](/workflows/kinds/video_metadata): Video image metadata
+* [`serialised_payloads`](/workflows/kinds/serialised_payloads): Serialised element that is usually accepted by sink
+* [`integer`](/workflows/kinds/integer): Integer value
+* [`rgb_color`](/workflows/kinds/rgb_color): RGB color
+* [`classification_prediction`](/workflows/kinds/classification_prediction): Predictions from classifier
* [`image_keypoints`](/workflows/kinds/image_keypoints): Image keypoints detected by classical Computer Vision method
-* [`Batch[image_metadata]`](/workflows/kinds/batch_image_metadata): Dictionary with image metadata required by supervision
+* [`point`](/workflows/kinds/point): Single point in 2D
+* [`zone`](/workflows/kinds/zone): Definition of polygon zone
diff --git a/inference/core/version.py b/inference/core/version.py
index 7dbc1800f2..b3b607f742 100644
--- a/inference/core/version.py
+++ b/inference/core/version.py
@@ -1,4 +1,4 @@
-__version__ = "0.17.1"
+__version__ = "0.18.0"
if __name__ == "__main__":
diff --git a/inference/core/workflows/core_steps/formatters/json_parser/__init__.py b/inference/core/workflows/core_steps/formatters/json_parser/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/formatters/json_parser/v1.py b/inference/core/workflows/core_steps/formatters/json_parser/v1.py
new file mode 100644
index 0000000000..0e1a0f1b3e
--- /dev/null
+++ b/inference/core/workflows/core_steps/formatters/json_parser/v1.py
@@ -0,0 +1,142 @@
+import json
+import logging
+import re
+from typing import List, Literal, Optional, Tuple, Type
+
+from pydantic import AfterValidator, ConfigDict, Field
+from typing_extensions import Annotated
+
+from inference.core.workflows.execution_engine.entities.base import OutputDefinition
+from inference.core.workflows.execution_engine.entities.types import (
+ BOOLEAN_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ StepOutputSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
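+# Matches a ```json ... ``` Markdown fence and captures its payload (non-greedy, case-insensitive).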
+JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE)
+
+LONG_DESCRIPTION = """
+The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and
+Visual Language Models (VLMs). Input is parsed to JSON, and its keys are exposed as block outputs.
+
+Accepted formats:
+- valid JSON strings
+- JSON documents wrapped with Markdown tags (very common for GPT responses)
+```
+{"my": "json"}
+```
+
+**Details regarding block behavior:**
+
+- `error_status` is set `True` whenever at least one of `expected_fields` cannot be retrieved from the input
+
+- in case of multiple markdown blocks with raw JSON content - only the first will be parsed and returned, while
+`error_status` will remain `False`
+"""
+
+SHORT_DESCRIPTION = "Parses raw string into JSON."
+
+
+def validate_reserved_fields(expected_fields: List[str]) -> List[str]:
+ if "error_status" in expected_fields:
+ raise ValueError(
+ "`error_status` is reserved field name and cannot be "
+ "used in `expected_fields` of `roboflow_core/json_parser@v1` block."
+ )
+ return expected_fields
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "JSON Parser",
+ "version": "v1",
+ "short_description": SHORT_DESCRIPTION,
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "formatter",
+ }
+ )
+ type: Literal["roboflow_core/json_parser@v1"]
+ raw_json: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field(
+ description="The string with raw JSON to parse.",
+ examples=[["$steps.lmm.output"]],
+ )
+ expected_fields: Annotated[List[str], AfterValidator(validate_reserved_fields)] = (
+ Field(
+ description="List of expected JSON fields. `error_status` field name is reserved and cannot be used.",
+ examples=[["field_a", "field_b"]],
+ )
+ )
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ OutputDefinition(name="*"),
+ ]
+
+ def get_actual_outputs(self) -> List[OutputDefinition]:
+ result = [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ ]
+ for field_name in self.expected_fields:
+ result.append(OutputDefinition(name=field_name))
+ return result
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class JSONParserBlockV1(WorkflowBlock):
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ def run(
+ self,
+ raw_json: str,
+ expected_fields: List[str],
+ ) -> BlockResult:
+ error_status, parsed_data = string2json(
+ raw_json=raw_json,
+ expected_fields=expected_fields,
+ )
+ parsed_data["error_status"] = error_status
+ return parsed_data
+
+
+def string2json(
+ raw_json: str,
+ expected_fields: List[str],
+) -> Tuple[bool, dict]:
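+ # Prefer the content of the first ```json ... ``` fence; otherwise try to parse the whole string.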
+ json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json)
+ if len(json_blocks_found) == 0:
+ return try_parse_json(raw_json, expected_fields=expected_fields)
+ first_block = json_blocks_found[0]
+ return try_parse_json(first_block, expected_fields=expected_fields)
+
+
+def try_parse_json(content: str, expected_fields: List[str]) -> Tuple[bool, dict]:
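+ # Returns (error_status, fields) - error_status is True when JSON parsing fails or any expected field is missing.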
+ try:
+ parsed_data = json.loads(content)
+ result = {}
+ all_fields_find = True
+ for field in expected_fields:
+ if field not in parsed_data:
+ all_fields_find = False
+ result[field] = parsed_data.get(field)
+ return not all_fields_find, result
+ except Exception as error:
+ logging.warning(
+ f"Could not parse JSON in `roboflow_core/json_parser@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return True, {field: None for field in expected_fields}
diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_classifier/__init__.py b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py
new file mode 100644
index 0000000000..7edce35af6
--- /dev/null
+++ b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py
@@ -0,0 +1,269 @@
+import json
+import logging
+import re
+from typing import Dict, List, Literal, Optional, Tuple, Type, Union
+from uuid import uuid4
+
+from pydantic import ConfigDict, Field
+
+from inference.core.workflows.execution_engine.entities.base import (
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ BOOLEAN_KIND,
+ CLASSIFICATION_PREDICTION_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ STRING_KIND,
+ StepOutputImageSelector,
+ StepOutputSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
+JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE)
+
+LONG_DESCRIPTION = """
+The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and
+Visual Language Models (VLMs). Input is parsed to classification prediction and returned as block output.
+
+Accepted formats:
+
+- valid JSON strings
+
+- JSON documents wrapped with Markdown tags (very common for GPT responses)
+
+Example:
+```
+{"my": "json"}
+```
+
+**Details regarding block behavior:**
+
+- `error_status` is set `True` whenever parsing cannot be completed
+
+- in case of multiple markdown blocks with raw JSON content - only the first will be parsed
+"""
+
+SHORT_DESCRIPTION = "Parses raw string into classification prediction."
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "VLM as Classifier",
+ "version": "v1",
+ "short_description": SHORT_DESCRIPTION,
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "formatter",
+ }
+ )
+ type: Literal["roboflow_core/vlm_as_classifier@v1"]
+ image: Union[WorkflowImageSelector, StepOutputImageSelector] = Field(
+ description="The image which was the base to generate VLM prediction",
+ examples=["$inputs.image", "$steps.cropping.crops"],
+ )
+ vlm_output: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field(
+ title="VLM Output",
+ description="The string with raw classification prediction to parse.",
+ examples=[["$steps.lmm.output"]],
+ )
+ classes: Union[
+ WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]),
+ StepOutputSelector(kind=[LIST_OF_VALUES_KIND]),
+ List[str],
+ ] = Field(
+ description="List of all classes used by the model, required to "
+ "generate mapping between class name and class id.",
+ examples=[["$steps.lmm.classes", "$inputs.classes", ["class_a", "class_b"]]],
+ )
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ OutputDefinition(name="predictions", kind=[CLASSIFICATION_PREDICTION_KIND]),
+ OutputDefinition(name="inference_id", kind=[STRING_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class VLMAsClassifierBlockV1(WorkflowBlock):
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ def run(
+ self,
+ image: WorkflowImageData,
+ vlm_output: str,
+ classes: List[str],
+ ) -> BlockResult:
+ inference_id = f"{uuid4()}"
+ error_status, parsed_data = string2json(
+ raw_json=vlm_output,
+ )
+ if error_status:
+ return {
+ "error_status": True,
+ "predictions": None,
+ "inference_id": inference_id,
+ }
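+ # Choose the parser based on the response schema: multi-class vs multi-label classification output.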
+ if "class_name" in parsed_data and "confidence" in parsed_data:
+ return parse_multi_class_classification_results(
+ image=image,
+ results=parsed_data,
+ classes=classes,
+ inference_id=inference_id,
+ )
+ if "predicted_classes" in parsed_data:
+ return parse_multi_label_classification_results(
+ image=image,
+ results=parsed_data,
+ classes=classes,
+ inference_id=inference_id,
+ )
+ return {
+ "error_status": True,
+ "predictions": None,
+ "inference_id": inference_id,
+ }
+
+
+def string2json(
+ raw_json: str,
+) -> Tuple[bool, dict]:
+ json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json)
+ if len(json_blocks_found) == 0:
+ return try_parse_json(raw_json)
+ first_block = json_blocks_found[0]
+ return try_parse_json(first_block)
+
+
+def try_parse_json(content: str) -> Tuple[bool, dict]:
+ try:
+ return False, json.loads(content)
+ except Exception as error:
+ logging.warning(
+ f"Could not parse JSON to dict in `roboflow_core/vlm_as_classifier@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return True, {}
+
+
+def parse_multi_class_classification_results(
+ image: WorkflowImageData,
+ results: dict,
+ classes: List[str],
+ inference_id: str,
+) -> dict:
+ try:
+ class2id_mapping = create_classes_index(classes=classes)
+ height, width = image.numpy_image.shape[:2]
+ top_class = results["class_name"]
+ confidences = {top_class: scale_confidence(results["confidence"])}
+ predictions = []
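+ # A top class outside the provided class list is reported with class_id -1.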
+ if top_class not in class2id_mapping:
+ predictions.append(
+ {
+ "class_name": top_class,
+ "class_id": -1,
+ "confidence": confidences.get(top_class, 0.0),
+ }
+ )
+ for class_name, class_id in class2id_mapping.items():
+ predictions.append(
+ {
+ "class_name": class_name,
+ "class_id": class_id,
+ "confidence": confidences.get(class_name, 0.0),
+ }
+ )
+ parsed_prediction = {
+ "image": {"width": width, "height": height},
+ "predictions": predictions,
+ "top": top_class,
+ "confidence": confidences[top_class],
+ "inference_id": inference_id,
+ "parent_id": image.parent_metadata.parent_id,
+ }
+ return {
+ "error_status": False,
+ "predictions": parsed_prediction,
+ "inference_id": inference_id,
+ }
+ except Exception as error:
+ logging.warning(
+ f"Could not parse multi-class classification results in `roboflow_core/vlm_as_classifier@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return {"error_status": True, "predictions": None, "inference_id": inference_id}
+
+
+def parse_multi_label_classification_results(
+ image: WorkflowImageData,
+ results: dict,
+ classes: List[str],
+ inference_id: str,
+) -> dict:
+ try:
+ class2id_mapping = create_classes_index(classes=classes)
+ height, width = image.numpy_image.shape[:2]
+ predicted_classes_confidences = {}
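+ # Classes missing from the provided list get class_id -1; duplicated entries keep the highest confidence.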
+ for prediction in results["predicted_classes"]:
+ if prediction["class"] not in class2id_mapping:
+ class2id_mapping[prediction["class"]] = -1
+ if prediction["class"] in predicted_classes_confidences:
+ old_confidence = predicted_classes_confidences[prediction["class"]]
+ new_confidence = scale_confidence(value=prediction["confidence"])
+ predicted_classes_confidences[prediction["class"]] = max(
+ old_confidence, new_confidence
+ )
+ else:
+ predicted_classes_confidences[prediction["class"]] = scale_confidence(
+ value=prediction["confidence"]
+ )
+ predictions = {
+ class_name: {
+ "confidence": predicted_classes_confidences.get(class_name, 0.0),
+ "class_id": class_id,
+ }
+ for class_name, class_id in class2id_mapping.items()
+ }
+ parsed_prediction = {
+ "image": {"width": width, "height": height},
+ "predictions": predictions,
+ "predicted_classes": list(predicted_classes_confidences.keys()),
+ "inference_id": inference_id,
+ "parent_id": image.parent_metadata.parent_id,
+ }
+ return {
+ "error_status": False,
+ "predictions": parsed_prediction,
+ "inference_id": inference_id,
+ }
+ except Exception as error:
+ logging.warning(
+ f"Could not parse multi-label classification results in `roboflow_core/vlm_as_classifier@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return {"error_status": True, "predictions": None, "inference_id": inference_id}
+
+
+def create_classes_index(classes: List[str]) -> Dict[str, int]:
+ return {class_name: idx for idx, class_name in enumerate(classes)}
+
+
+def scale_confidence(value: float) -> float:
+ return min(max(float(value), 0.0), 1.0)
diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_detector/__init__.py b/inference/core/workflows/core_steps/formatters/vlm_as_detector/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py b/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py
new file mode 100644
index 0000000000..3dbb7cf3dc
--- /dev/null
+++ b/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py
@@ -0,0 +1,261 @@
+import json
+import logging
+import re
+from typing import Dict, List, Literal, Optional, Tuple, Type, Union
+from uuid import uuid4
+
+import numpy as np
+import supervision as sv
+from pydantic import ConfigDict, Field, model_validator
+from supervision.config import CLASS_NAME_DATA_FIELD
+
+from inference.core.workflows.core_steps.common.utils import (
+ attach_parents_coordinates_to_sv_detections,
+)
+from inference.core.workflows.execution_engine.constants import (
+ DETECTION_ID_KEY,
+ IMAGE_DIMENSIONS_KEY,
+ INFERENCE_ID_KEY,
+ PREDICTION_TYPE_KEY,
+)
+from inference.core.workflows.execution_engine.entities.base import (
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ BOOLEAN_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
+ STRING_KIND,
+ StepOutputImageSelector,
+ StepOutputSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
+JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE)
+
+LONG_DESCRIPTION = """
+The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and
+Visual Language Models (VLMs). Input is parsed to object-detection prediction and returned as block output.
+
+Accepted formats:
+
+- valid JSON strings
+
+- JSON documents wrapped with Markdown tags
+
+Example
+```
+{"my": "json"}
+```
+
+**Details regarding block behavior:**
+
+- `error_status` is set `True` whenever parsing cannot be completed
+
+- in case of multiple markdown blocks with raw JSON content - only the first will be parsed
+"""
+
+SHORT_DESCRIPTION = "Parses raw string into object-detection prediction."
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "VLM as Detector",
+ "version": "v1",
+ "short_description": SHORT_DESCRIPTION,
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "formatter",
+ }
+ )
+ type: Literal["roboflow_core/vlm_as_detector@v1"]
+ image: Union[WorkflowImageSelector, StepOutputImageSelector] = Field(
+ description="The image which was the base to generate VLM prediction",
+ examples=["$inputs.image", "$steps.cropping.crops"],
+ )
+ vlm_output: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field(
+ title="VLM Output",
+ description="The string with raw classification prediction to parse.",
+ examples=[["$steps.lmm.output"]],
+ )
+ classes: Union[
+ WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]),
+ StepOutputSelector(kind=[LIST_OF_VALUES_KIND]),
+ List[str],
+ ] = Field(
+ description="List of all classes used by the model, required to "
+ "generate mapping between class name and class id.",
+ examples=[["$steps.lmm.classes", "$inputs.classes", ["class_a", "class_b"]]],
+ )
+ model_type: Literal["google-gemini", "anthropic-claude"] = Field(
+ description="Type of the model that generated prediction",
+ examples=[["google-gemini", "anthropic-claude"]],
+ )
+ task_type: Literal["object-detection"]
+
+ @model_validator(mode="after")
+ def validate(self) -> "BlockManifest":
+ if (self.model_type, self.task_type) not in REGISTERED_PARSERS:
+ raise ValueError(
+ f"Could not parse result of task {self.task_type} for model {self.model_type}"
+ )
+ return self
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ OutputDefinition(
+ name="predictions", kind=[OBJECT_DETECTION_PREDICTION_KIND]
+ ),
+ OutputDefinition(name="inference_id", kind=[STRING_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class VLMAsDetectorBlockV1(WorkflowBlock):
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ def run(
+ self,
+ image: WorkflowImageData,
+ vlm_output: str,
+ classes: List[str],
+ model_type: str,
+ task_type: str,
+ ) -> BlockResult:
+ inference_id = f"{uuid4()}"
+ error_status, parsed_data = string2json(
+ raw_json=vlm_output,
+ )
+ if error_status:
+ return {
+ "error_status": True,
+ "predictions": None,
+ "inference_id": inference_id,
+ }
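+ # Delegate parsing to the function registered for this (model_type, task_type) pair.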
+ try:
+ predictions = REGISTERED_PARSERS[(model_type, task_type)](
+ image=image,
+ parsed_data=parsed_data,
+ classes=classes,
+ inference_id=inference_id,
+ )
+ return {
+ "error_status": False,
+ "predictions": predictions,
+ "inference_id": inference_id,
+ }
+ except Exception as error:
+ logging.warning(
+ f"Could not parse VLM prediction for model {model_type} and task {task_type} "
+ f"in `roboflow_core/vlm_as_detector@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return {
+ "error_status": True,
+ "predictions": None,
+ "inference_id": inference_id,
+ }
+
+
+def string2json(
+ raw_json: str,
+) -> Tuple[bool, dict]:
+ json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json)
+ if len(json_blocks_found) == 0:
+ return try_parse_json(raw_json)
+ first_block = json_blocks_found[0]
+ return try_parse_json(first_block)
+
+
+def try_parse_json(content: str) -> Tuple[bool, dict]:
+ try:
+ return False, json.loads(content)
+ except Exception as error:
+ logging.warning(
+ f"Could not parse JSON to dict in `roboflow_core/vlm_as_detector@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return True, {}
+
+
+def parse_gemini_object_detection_response(
+ image: WorkflowImageData,
+ parsed_data: dict,
+ classes: List[str],
+ inference_id: str,
+) -> sv.Detections:
+ class_name2id = create_classes_index(classes=classes)
+ image_height, image_width = image.numpy_image.shape[:2]
+ if len(parsed_data["detections"]) == 0:
+ return sv.Detections.empty()
+ xyxy, class_id, class_name, confidence = [], [], [], []
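+ # The model reports coordinates normalised to [0.0, 1.0]; scale them to pixel values.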
+ for detection in parsed_data["detections"]:
+ xyxy.append(
+ [
+ detection["x_min"] * image_width,
+ detection["y_min"] * image_height,
+ detection["x_max"] * image_width,
+ detection["y_max"] * image_height,
+ ]
+ )
+ class_id.append(class_name2id.get(detection["class_name"], -1))
+ class_name.append(detection["class_name"])
+ confidence.append(scale_confidence(detection.get("confidence", 1.0)))
+ xyxy = np.array(xyxy).round(0) if len(xyxy) > 0 else np.empty((0, 4))
+ confidence = np.array(confidence) if len(confidence) > 0 else np.empty(0)
+ class_id = np.array(class_id).astype(int) if len(class_id) > 0 else np.empty(0)
+ class_name = np.array(class_name) if len(class_name) > 0 else np.empty(0)
+ detection_ids = np.array([str(uuid4()) for _ in range(len(xyxy))])
+ dimensions = np.array([[image_height, image_width]] * len(xyxy))
+ inference_ids = np.array([inference_id] * len(xyxy))
+ prediction_type = np.array(["object-detection"] * len(xyxy))
+ data = {
+ CLASS_NAME_DATA_FIELD: class_name,
+ IMAGE_DIMENSIONS_KEY: dimensions,
+ INFERENCE_ID_KEY: inference_ids,
+ DETECTION_ID_KEY: detection_ids,
+ PREDICTION_TYPE_KEY: prediction_type,
+ }
+ detections = sv.Detections(
+ xyxy=xyxy,
+ confidence=confidence,
+ class_id=class_id,
+ mask=None,
+ tracker_id=None,
+ data=data,
+ )
+ return attach_parents_coordinates_to_sv_detections(
+ detections=detections,
+ image=image,
+ )
+
+
+def create_classes_index(classes: List[str]) -> Dict[str, int]:
+ return {class_name: idx for idx, class_name in enumerate(classes)}
+
+
+def scale_confidence(value: float) -> float:
+ return min(max(float(value), 0.0), 1.0)
+
+
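+# Claude is prompted to produce the same detection JSON schema, so it reuses the Gemini parser.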
+REGISTERED_PARSERS = {
+ ("google-gemini", "object-detection"): parse_gemini_object_detection_response,
+ ("anthropic-claude", "object-detection"): parse_gemini_object_detection_response,
+}
diff --git a/inference/core/workflows/core_steps/loader.py b/inference/core/workflows/core_steps/loader.py
index 6cfca977d1..e09ef79df4 100644
--- a/inference/core/workflows/core_steps/loader.py
+++ b/inference/core/workflows/core_steps/loader.py
@@ -40,9 +40,18 @@
from inference.core.workflows.core_steps.formatters.first_non_empty_or_default.v1 import (
FirstNonEmptyOrDefaultBlockV1,
)
+from inference.core.workflows.core_steps.formatters.json_parser.v1 import (
+ JSONParserBlockV1,
+)
from inference.core.workflows.core_steps.formatters.property_definition.v1 import (
PropertyDefinitionBlockV1,
)
+from inference.core.workflows.core_steps.formatters.vlm_as_classifier.v1 import (
+ VLMAsClassifierBlockV1,
+)
+from inference.core.workflows.core_steps.formatters.vlm_as_detector.v1 import (
+ VLMAsDetectorBlockV1,
+)
from inference.core.workflows.core_steps.fusion.detections_classes_replacement.v1 import (
DetectionsClassesReplacementBlockV1,
)
@@ -55,6 +64,9 @@
from inference.core.workflows.core_steps.fusion.dimension_collapse.v1 import (
DimensionCollapseBlockV1,
)
+from inference.core.workflows.core_steps.models.foundation.anthropic_claude.v1 import (
+ AntropicClaudeBlockV1,
+)
from inference.core.workflows.core_steps.models.foundation.clip_comparison.v1 import (
ClipComparisonBlockV1,
)
@@ -64,6 +76,9 @@
from inference.core.workflows.core_steps.models.foundation.cog_vlm.v1 import (
CogVLMBlockV1,
)
+from inference.core.workflows.core_steps.models.foundation.google_gemini.v1 import (
+ GoogleGeminiBlockV1,
+)
from inference.core.workflows.core_steps.models.foundation.lmm.v1 import LMMBlockV1
from inference.core.workflows.core_steps.models.foundation.lmm_classifier.v1 import (
LMMForClassificationBlockV1,
@@ -72,6 +87,9 @@
from inference.core.workflows.core_steps.models.foundation.openai.v1 import (
OpenAIBlockV1,
)
+from inference.core.workflows.core_steps.models.foundation.openai.v2 import (
+ OpenAIBlockV2,
+)
from inference.core.workflows.core_steps.models.foundation.segment_anything2.v1 import (
SegmentAnything2BlockV1,
)
@@ -197,6 +215,7 @@
INSTANCE_SEGMENTATION_PREDICTION_KIND,
INTEGER_KIND,
KEYPOINT_DETECTION_PREDICTION_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
LIST_OF_VALUES_KIND,
NUMPY_ARRAY_KIND,
OBJECT_DETECTION_PREDICTION_KIND,
@@ -290,6 +309,12 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
ClipComparisonBlockV2,
CameraFocusBlockV1,
RoboflowDatasetUploadBlockV2,
+ OpenAIBlockV2,
+ JSONParserBlockV1,
+ VLMAsClassifierBlockV1,
+ GoogleGeminiBlockV1,
+ VLMAsDetectorBlockV1,
+ AntropicClaudeBlockV1,
]
@@ -320,6 +345,7 @@ def load_kinds() -> List[Kind]:
RGB_COLOR_KIND,
IMAGE_KEYPOINTS_KIND,
CONTOURS_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
NUMPY_ARRAY_KIND,
QR_CODE_DETECTION_KIND,
BAR_CODE_DETECTION_KIND,
diff --git a/inference/core/workflows/core_steps/models/foundation/anthropic_claude/__init__.py b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py
new file mode 100644
index 0000000000..370ea4cc72
--- /dev/null
+++ b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py
@@ -0,0 +1,657 @@
+import base64
+import json
+import re
+from functools import partial
+from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
+
+import anthropic
+from anthropic import NOT_GIVEN
+from pydantic import ConfigDict, Field, model_validator
+
+from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+from inference.core.managers.base import ModelManager
+from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image
+from inference.core.utils.preprocess import downscale_image_keeping_aspect_ratio
+from inference.core.workflows.core_steps.common.utils import run_in_parallel
+from inference.core.workflows.execution_engine.entities.base import (
+ Batch,
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ FLOAT_KIND,
+ INTEGER_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ STRING_KIND,
+ ImageInputField,
+ StepOutputImageSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
+LONG_DESCRIPTION = """
+Ask a question to the Anthropic Claude model with vision capabilities.
+
+You can specify arbitrary text prompts or use predefined ones; the block supports the following types of prompt:
+
+- `unconstrained` - any arbitrary prompt you like
+
+- `ocr` - predefined prompt to recognise text from an image
+
+- `visual-question-answering` - your prompt is supposed to provide a question and will be
+wrapped into a structure that is suited for the VQA task
+
+- `caption` - predefined prompt to generate a short caption of the image
+
+- `detailed-caption` - predefined prompt to generate an elaborate caption of the image
+
+- `classification` - predefined prompt to generate multi-class classification output (that can be parsed
+with `VLM as Classifier` block)
+
+- `multi-label-classification` - predefined prompt to generate multi-label classification output (that
+can be parsed with `VLM as Classifier` block)
+
+- `object-detection` - predefined prompt to generate object detection output (that can be parsed
+with `VLM as Detector` block)
+
+- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser`
+block.
+
+You need to provide your Anthropic API key to use the Claude model.
+"""
+
+TaskType = Literal[
+ "unconstrained",
+ "ocr",
+ "visual-question-answering",
+ "caption",
+ "detailed-caption",
+ "classification",
+ "multi-label-classification",
+ "structured-answering",
+ "object-detection",
+]
+
+TASKS_REQUIRING_PROMPT = {
+ "unconstrained",
+ "visual-question-answering",
+}
+
+TASKS_REQUIRING_CLASSES = {
+ "classification",
+ "multi-label-classification",
+ "object-detection",
+}
+
+TASKS_REQUIRING_OUTPUT_STRUCTURE = {
+ "structured-answering",
+}
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "Anthropic Claude",
+ "version": "v1",
+ "short_description": "Run Anthropic Claude model with vision capabilities",
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "model",
+ "search_keywords": ["LMM", "VLM", "Claude", "Anthropic"],
+ }
+ )
+ type: Literal["roboflow_core/anthropic_claude@v1"]
+ images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField
+ task_type: TaskType = Field(
+ description="Task type to be performed by model. Value of parameter determine set of fields "
+ "that are required. For `unconstrained`, `visual-question-answering`, "
+ " - `prompt` parameter must be provided."
+ "For `structured-answering` - `output-structure` must be provided. For "
+ "`classification`, `multi-label-classification` and `object-detection` - "
+ "`classes` must be filled. `ocr`, `caption`, `detailed-caption` do not"
+ "require any additional parameter.",
+ )
+ prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field(
+ default=None,
+ description="Text prompt to the Claude model",
+ examples=["my prompt", "$inputs.prompt"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True},
+ },
+ },
+ )
+ output_structure: Optional[Dict[str, str]] = Field(
+ default=None,
+ description="Dictionary with structure of expected JSON response",
+ examples=[{"my_key": "description"}, "$inputs.output_structure"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {"values": TASKS_REQUIRING_CLASSES, "required": True},
+ },
+ },
+ )
+ classes: Optional[
+ Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]]
+ ] = Field(
+ default=None,
+ description="List of classes to be used",
+ examples=[["class-a", "class-b"], "$inputs.classes"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {
+ "values": TASKS_REQUIRING_OUTPUT_STRUCTURE,
+ "required": True,
+ },
+ },
+ },
+ )
+ api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field(
+ description="Your Antropic API key",
+ examples=["xxx-xxx", "$inputs.antropics_api_key"],
+ private=True,
+ )
+ model_version: Union[
+ WorkflowParameterSelector(kind=[STRING_KIND]),
+ Literal[
+ "claude-3-5-sonnet", "claude-3-opus", "claude-3-sonnet", "claude-3-haiku"
+ ],
+ ] = Field(
+ default="claude-3-5-sonnet",
+ description="Model to be used",
+ examples=["claude-3-5-sonnet", "$inputs.claude"],
+ )
+ max_tokens: int = Field(
+ default=450,
+ description="Maximum number of tokens the model can generate in it's response.",
+ )
+ temperature: Optional[
+ Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])]
+ ] = Field(
+ default=None,
+ description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more "
+ 'random / "creative" the generations are.',
+ ge=0.0,
+ le=2.0,
+ )
+ max_image_size: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = Field(
+ description="Maximum size of the image - if input has larger side, it will be downscaled, keeping aspect ratio",
+ default=1024,
+ )
+ max_concurrent_requests: Optional[int] = Field(
+ default=None,
+ description="Number of concurrent requests that can be executed by block when batch of input images provided. "
+ "If not given - block defaults to value configured globally in Workflows Execution Engine. "
+ "Please restrict if you hit ANtropic API limits.",
+ )
+
+ @model_validator(mode="after")
+ def validate(self) -> "BlockManifest":
+ if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None:
+ raise ValueError(
+ f"`prompt` parameter required to be set for task `{self.task_type}`"
+ )
+ if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None:
+ raise ValueError(
+ f"`classes` parameter required to be set for task `{self.task_type}`"
+ )
+ if (
+ self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
+ and self.output_structure is None
+ ):
+ raise ValueError(
+ f"`output_structure` parameter required to be set for task `{self.task_type}`"
+ )
+ return self
+
+ @classmethod
+ def accepts_batch_input(cls) -> bool:
+ return True
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(
+ name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND]
+ ),
+ OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class AntropicClaudeBlockV1(WorkflowBlock):
+
+ def __init__(
+ self,
+ model_manager: ModelManager,
+ api_key: Optional[str],
+ ):
+ self._model_manager = model_manager
+ self._api_key = api_key
+
+ @classmethod
+ def get_init_parameters(cls) -> List[str]:
+ return ["model_manager", "api_key"]
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+ def run(
+ self,
+ images: Batch[WorkflowImageData],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ api_key: str,
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_image_size: int,
+ max_concurrent_requests: Optional[int],
+ ) -> BlockResult:
+ inference_images = [i.to_inference_format() for i in images]
+ raw_outputs = run_claude_prompting(
+ images=inference_images,
+ task_type=task_type,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ api_key=api_key,
+ model_version=model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_image_size=max_image_size,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+ return [
+ {"output": raw_output, "classes": classes} for raw_output in raw_outputs
+ ]
+
+
+def run_claude_prompting(
+ images: List[Dict[str, Any]],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ api_key: str,
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_image_size: int,
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ if task_type not in PROMPT_BUILDERS:
+ raise ValueError(f"Task type: {task_type} not supported.")
+ prompts = []
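+ # Build one (system_prompt, messages) payload per input image.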
+ for image in images:
+ loaded_image, _ = load_image(image)
+ loaded_image = downscale_image_keeping_aspect_ratio(
+ image=loaded_image, desired_size=(max_image_size, max_image_size)
+ )
+ base64_image = base64.b64encode(
+ encode_image_to_jpeg_bytes(loaded_image)
+ ).decode("ascii")
+ # Use a separate name so the original `prompt` argument is not overwritten between iterations.
+ prepared_prompt = PROMPT_BUILDERS[task_type](
+ base64_image=base64_image,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ )
+ prompts.append(prepared_prompt)
+ return execute_claude_requests(
+ api_key=api_key,
+ prompts=prompts,
+ model_version=model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+
+
+def execute_claude_requests(
+ api_key: str,
+ prompts: List[Tuple[Optional[str], List[dict]]],
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ tasks = [
+ partial(
+ execute_claude_request,
+ system_prompt=prompt[0],
+ messages=prompt[1],
+ model_version=model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ api_key=api_key,
+ )
+ for prompt in prompts
+ ]
+ max_workers = (
+ max_concurrent_requests
+ or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+ )
+ return run_in_parallel(
+ tasks=tasks,
+ max_workers=max_workers,
+ )
+
+
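+# Aliases map to pinned, dated Anthropic model identifiers.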
+EXACT_MODELS_VERSIONS_MAPPING = {
+ "claude-3-5-sonnet": "claude-3-5-sonnet-20240620",
+ "claude-3-opus": "claude-3-opus-20240229",
+ "claude-3-sonnet": "claude-3-sonnet-20240229",
+ "claude-3-haiku": "claude-3-haiku-20240307",
+}
+
+
+def execute_claude_request(
+ system_prompt: Optional[str],
+ messages: List[dict],
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ api_key: str,
+) -> str:
+ client = anthropic.Anthropic(api_key=api_key)
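+ # NOT_GIVEN is the Anthropic SDK sentinel used to omit optional request parameters.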
+ if system_prompt is None:
+ system_prompt = NOT_GIVEN
+ if temperature is None:
+ temperature = NOT_GIVEN
+ result = client.messages.create(
+ system=system_prompt,
+ messages=messages,
+ max_tokens=max_tokens,
+ model=EXACT_MODELS_VERSIONS_MAPPING[model_version],
+ temperature=temperature,
+ )
+ return result.content[0].text
+
+
+def prepare_unconstrained_prompt(
+ base64_image: str,
+ prompt: str,
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": prompt,
+ },
+ ],
+ }
+ ]
+ return None, messages
+
+
+def prepare_classification_prompt(
+ base64_image: str,
+ classes: List[str],
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ serialised_classes = ", ".join(classes)
+ system_prompt = (
+ "You act as single-class classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. '
+ "`class-name` must be one of the class names defined by user. You are only allowed to return "
+ "single JSON document, even if there are potentially multiple classes. You are not allowed to "
+ "return list."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_multi_label_classification_prompt(
+ base64_image: str,
+ classes: List[str],
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ serialised_classes = ", ".join(classes)
+ system_prompt = (
+ "You act as multi-label classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, '
+ '{"class": "class-name-2", "confidence": 0.7}]}.'
+ "`class-name-X` must be one of the class names defined by user and `confidence` is a float value "
+ "in range 0.0-1.0 that represents how sure you are that the class is present in the image. "
+ "Only return class names that are visible."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_vqa_prompt(
+ base64_image: str,
+ prompt: str,
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ system_prompt = (
+ "You act as Visual Question Answering model. Your task is to provide answer to question"
+ "submitted by user. If this is open-question - answer with few sentences, for ABCD question, "
+ "return only the indicator of the answer."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"Question: {prompt}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_ocr_prompt(
+ base64_image: str,
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ system_prompt = (
+ "You act as OCR model. Your task is to read text from the image and return it in "
+ "paragraphs representing the structure of texts in the image. You should only return "
+ "recognised text, nothing else."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_caption_prompt(
+ base64_image: str,
+ short_description: bool,
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ caption_detail_level = "Caption should be short."
+ if not short_description:
+ caption_detail_level = "Caption should be extensive."
+ system_prompt = (
+ f"You act as image caption model. Your task is to provide description of the image. "
+ f"{caption_detail_level}"
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_structured_answering_prompt(
+ base64_image: str,
+ output_structure: Dict[str, str],
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ output_structure_serialised = json.dumps(output_structure, indent=4)
+ system_prompt = (
+ "You are supposed to produce responses in JSON. User is to provide you dictionary with "
+ "keys and values. Each key must be present in your response. Values in user dictionary "
+ "represent descriptions for JSON fields to be generated. Provide only JSON in response."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"Specification of requirements regarding output fields: \n"
+ f"{output_structure_serialised}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_object_detection_prompt(
+ base64_image: str,
+ classes: List[str],
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ serialised_classes = ", ".join(classes)
+ system_prompt = (
+ "You act as object-detection model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"detections": [{"x_min": 0.1, "y_min": 0.2, "x_max": 0.3, "y_max": 0.4, "class_name": "my-class-X", "confidence": 0.7}]} '
+ "- remember to close top-level dictionary at the end. "
+ "`my-class-X` must be one of the class names defined by user. All coordinates must be in range 0.0-1.0, representing percentage of image dimensions. "
+ "`confidence` is a value in range 0.0-1.0 representing your confidence in prediction. You should detect all instances of classes provided by user."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
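+# Maps each supported `task_type` to the builder returning the (system_prompt, messages) pair
+# for the Anthropic Messages API; both caption variants reuse one builder via functools.partial.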
+PROMPT_BUILDERS = {
+ "unconstrained": prepare_unconstrained_prompt,
+ "ocr": prepare_ocr_prompt,
+ "visual-question-answering": prepare_vqa_prompt,
+ "caption": partial(prepare_caption_prompt, short_description=True),
+ "detailed-caption": partial(prepare_caption_prompt, short_description=False),
+ "classification": prepare_classification_prompt,
+ "multi-label-classification": prepare_multi_label_classification_prompt,
+ "structured-answering": prepare_structured_answering_prompt,
+ "object-detection": prepare_object_detection_prompt,
+}
diff --git a/inference/core/workflows/core_steps/models/foundation/google_gemini/__init__.py b/inference/core/workflows/core_steps/models/foundation/google_gemini/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py b/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py
new file mode 100644
index 0000000000..9fb2d6638a
--- /dev/null
+++ b/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py
@@ -0,0 +1,725 @@
+import base64
+import json
+import re
+from functools import partial
+from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+import requests
+from pydantic import ConfigDict, Field, model_validator
+from requests import Response
+
+from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+from inference.core.managers.base import ModelManager
+from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image
+from inference.core.workflows.core_steps.common.utils import run_in_parallel
+from inference.core.workflows.execution_engine.entities.base import (
+ Batch,
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ FLOAT_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ STRING_KIND,
+ ImageInputField,
+ StepOutputImageSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
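+# Pattern used to redact the `key=...` query parameter from request URLs, so the Google API key
+# does not leak into logs or exception messages when a request fails.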
+GOOGLE_API_KEY_PATTERN = re.compile(r"key=(.[^&]*)")
+GOOGLE_API_KEY_VALUE_GROUP = 1
+MIN_KEY_LENGTH_TO_REVEAL_PREFIX = 8
+
+LONG_DESCRIPTION = """
+Ask a question to Google's Gemini model with vision capabilities.
+
+You can specify arbitrary text prompts or use predefined ones; the block supports the following types of prompt:
+
+- `unconstrained` - any arbitrary prompt you like
+
+- `ocr` - predefined prompt to recognise text from image
+
+- `visual-question-answering` - your prompt is supposed to provide a question and will be
+wrapped into a structure suited for the VQA task
+
+- `caption` - predefined prompt to generate short caption of the image
+
+- `detailed-caption` - predefined prompt to generate elaborated caption of the image
+
+- `classification` - predefined prompt to generate multi-class classification output (that can be parsed
+with `VLM as Classifier` block)
+
+- `multi-label-classification` - predefined prompt to generate multi-label classification output (that
+can be parsed with `VLM as Classifier` block)
+
+- `object-detection` - predefined prompt to generate object detection output (that can be parsed
+with `VLM as Detector` block)
+
+- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser`
+block.
+
+You need to provide your Google AI API key to use the Gemini model.
+
+**WARNING!**
+
+This block makes use of `/v1beta` API of Google Gemini model - the implementation may change
+in the future, without guarantee of backward compatibility.
+"""
+
+TaskType = Literal[
+ "unconstrained",
+ "ocr",
+ "visual-question-answering",
+ "caption",
+ "detailed-caption",
+ "classification",
+ "multi-label-classification",
+ "structured-answering",
+ "object-detection",
+]
+
+TASKS_REQUIRING_PROMPT = {
+ "unconstrained",
+ "visual-question-answering",
+}
+
+TASKS_REQUIRING_CLASSES = {
+ "classification",
+ "multi-label-classification",
+ "object-detection",
+}
+
+TASKS_REQUIRING_OUTPUT_STRUCTURE = {
+ "structured-answering",
+}
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "Google Gemini",
+ "version": "v1",
+ "short_description": "Run Google's Gemini model with vision capabilities",
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "model",
+ "search_keywords": ["LMM", "VLM", "Gemini", "Google"],
+ "beta": True,
+ }
+ )
+ type: Literal["roboflow_core/google_gemini@v1"]
+ images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField
+ task_type: TaskType = Field(
+ description="Task type to be performed by model. Value of parameter determine set of fields "
+ "that are required. For `unconstrained`, `visual-question-answering`, "
+ " - `prompt` parameter must be provided."
+ "For `structured-answering` - `output-structure` must be provided. For "
+ "`classification`, `multi-label-classification` and `object-detection` - "
+ "`classes` must be filled. `ocr`, `caption`, `detailed-caption` do not"
+ "require any additional parameter.",
+ )
+ prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field(
+ default=None,
+ description="Text prompt to the Gemini model",
+ examples=["my prompt", "$inputs.prompt"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True},
+ },
+ },
+ )
+ output_structure: Optional[Dict[str, str]] = Field(
+ default=None,
+ description="Dictionary with structure of expected JSON response",
+ examples=[{"my_key": "description"}, "$inputs.output_structure"],
+        json_schema_extra={
+            "relevant_for": {
+                "task_type": {
+                    "values": TASKS_REQUIRING_OUTPUT_STRUCTURE,
+                    "required": True,
+                },
+            },
+        },
+ )
+ classes: Optional[
+ Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]]
+ ] = Field(
+ default=None,
+ description="List of classes to be used",
+ examples=[["class-a", "class-b"], "$inputs.classes"],
+        json_schema_extra={
+            "relevant_for": {
+                "task_type": {"values": TASKS_REQUIRING_CLASSES, "required": True},
+            },
+ },
+ )
+ api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field(
+ description="Your Google AI API key",
+ examples=["xxx-xxx", "$inputs.google_api_key"],
+ private=True,
+ )
+ model_version: Union[
+ WorkflowParameterSelector(kind=[STRING_KIND]),
+ Literal["gemini-1.5-flash", "gemini-1.5-pro"],
+ ] = Field(
+ default="gemini-1.5-flash",
+ description="Model to be used",
+ examples=["gemini-1.5-flash", "$inputs.gemini_model"],
+ )
+ max_tokens: int = Field(
+ default=450,
+ description="Maximum number of tokens the model can generate in it's response.",
+ )
+ temperature: Optional[
+ Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])]
+ ] = Field(
+ default=None,
+ description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more "
+ 'random / "creative" the generations are.',
+ ge=0.0,
+ le=2.0,
+ )
+ max_concurrent_requests: Optional[int] = Field(
+ default=None,
+ description="Number of concurrent requests that can be executed by block when batch of input images provided. "
+ "If not given - block defaults to value configured globally in Workflows Execution Engine. "
+ "Please restrict if you hit Google Gemini API limits.",
+ )
+
+ @model_validator(mode="after")
+ def validate(self) -> "BlockManifest":
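+        # Cross-field validation: make sure the parameters implied by `task_type` were provided.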
+ if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None:
+ raise ValueError(
+ f"`prompt` parameter required to be set for task `{self.task_type}`"
+ )
+ if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None:
+ raise ValueError(
+ f"`classes` parameter required to be set for task `{self.task_type}`"
+ )
+ if (
+ self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
+ and self.output_structure is None
+ ):
+ raise ValueError(
+ f"`output_structure` parameter required to be set for task `{self.task_type}`"
+ )
+ return self
+
+ @classmethod
+ def accepts_batch_input(cls) -> bool:
+ return True
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(
+ name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND]
+ ),
+ OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class GoogleGeminiBlockV1(WorkflowBlock):
+
+ def __init__(
+ self,
+ model_manager: ModelManager,
+ api_key: Optional[str],
+ ):
+ self._model_manager = model_manager
+ self._api_key = api_key
+
+ @classmethod
+ def get_init_parameters(cls) -> List[str]:
+ return ["model_manager", "api_key"]
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+ def run(
+ self,
+ images: Batch[WorkflowImageData],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ api_key: str,
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+ ) -> BlockResult:
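+        # Convert workflow images to the inference format and fan the whole batch out to Gemini;
+        # `classes` is echoed in every result so downstream parser blocks can map predictions back.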
+ inference_images = [i.to_inference_format() for i in images]
+ raw_outputs = run_gemini_prompting(
+ images=inference_images,
+ task_type=task_type,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ google_api_key=api_key,
+ model_version=model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+ return [
+ {"output": raw_output, "classes": classes} for raw_output in raw_outputs
+ ]
+
+
+def run_gemini_prompting(
+ images: List[Dict[str, Any]],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ google_api_key: Optional[str],
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
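+    # Build one request payload per input image (re-encoded as JPEG and base64-encoded),
+    # then dispatch all requests to the Gemini API in parallel.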
+ if task_type not in PROMPT_BUILDERS:
+ raise ValueError(f"Task type: {task_type} not supported.")
+ gemini_prompts = []
+ for image in images:
+ loaded_image, _ = load_image(image)
+ base64_image = base64.b64encode(
+ encode_image_to_jpeg_bytes(loaded_image)
+ ).decode("ascii")
+ prompt = PROMPT_BUILDERS[task_type](
+ base64_image=base64_image,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ )
+ gemini_prompts.append(prompt)
+ return execute_gemini_requests(
+ google_api_key=google_api_key,
+ gemini_prompts=gemini_prompts,
+ model_version=model_version,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+
+
+def execute_gemini_requests(
+ google_api_key: str,
+ gemini_prompts: List[dict],
+ model_version: str,
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ tasks = [
+ partial(
+ execute_gemini_request,
+ prompt=prompt,
+ model_version=model_version,
+ google_api_key=google_api_key,
+ )
+ for prompt in gemini_prompts
+ ]
+ max_workers = (
+ max_concurrent_requests
+ or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+ )
+ return run_in_parallel(
+ tasks=tasks,
+ max_workers=max_workers,
+ )
+
+
+def execute_gemini_request(
+ prompt: dict,
+ model_version: str,
+ google_api_key: str,
+) -> str:
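+    # Call the `generateContent` endpoint of the `/v1beta` REST API; the API key travels as
+    # a query parameter and is redacted from the URL if the request fails.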
+ response = requests.post(
+ f"https://generativelanguage.googleapis.com/v1beta/models/{model_version}:generateContent",
+ headers={
+ "Content-Type": "application/json",
+ },
+ params={
+ "key": google_api_key,
+ },
+ json=prompt,
+ )
+ response_data = response.json()
+ google_api_key_safe_raise_for_status(response=response)
+ return response_data["candidates"][0]["content"]["parts"][0]["text"]
+
+
+def prepare_unconstrained_prompt(
+ base64_image: str,
+ prompt: str,
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ return {
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": prompt,
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ }
+
+
+def prepare_classification_prompt(
+ base64_image: str,
+ classes: List[str],
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ serialised_classes = ", ".join(classes)
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as single-class classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. '
+ "`class-name` must be one of the class names defined by user. You are only allowed to return "
+ "single JSON document, even if there are potentially multiple classes. You are not allowed to "
+ "return list.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ response_mime_type="application/json",
+ ),
+ }
+
+
+def prepare_multi_label_classification_prompt(
+ base64_image: str,
+ classes: List[str],
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ serialised_classes = ", ".join(classes)
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as multi-label classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, '
+ '{"class": "class-name-2", "confidence": 0.7}]}. '
+ "`class-name-X` must be one of the class names defined by user and `confidence` is a float value "
+ "in range 0.0-1.0 that represents how sure you are that the class is present in the image. "
+ "Only return class names that are visible.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ response_mime_type="application/json",
+ ),
+ }
+
+
+def prepare_vqa_prompt(
+ base64_image: str,
+ prompt: str,
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as Visual Question Answering model. Your task is to provide answer to question"
+ "submitted by user. If this is open-question - answer with few sentences, for ABCD question, "
+ "return only the indicator of the answer.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"Question: {prompt}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ }
+
+
+def prepare_ocr_prompt(
+ base64_image: str,
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as OCR model. Your task is to read text from the image and return it in "
+ "paragraphs representing the structure of texts in the image. You should only return "
+ "recognised text, nothing else.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ }
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ }
+
+
+def prepare_caption_prompt(
+ base64_image: str,
+ short_description: bool,
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ caption_detail_level = "Caption should be short."
+ if not short_description:
+ caption_detail_level = "Caption should be extensive."
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": f"You act as image caption model. Your task is to provide description of the image. "
+ f"{caption_detail_level}",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ }
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ }
+
+
+def prepare_structured_answering_prompt(
+ base64_image: str,
+ output_structure: Dict[str, str],
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ output_structure_serialised = json.dumps(output_structure, indent=4)
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You are supposed to produce responses in JSON. User is to provide you dictionary with "
+ "keys and values. Each key must be present in your response. Values in user dictionary "
+ "represent descriptions for JSON fields to be generated. Provide only JSON in response.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"Specification of requirements regarding output fields: \n"
+ f"{output_structure_serialised}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ response_mime_type="application/json",
+ ),
+ }
+
+
+def prepare_object_detection_prompt(
+ base64_image: str,
+ classes: List[str],
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ serialised_classes = ", ".join(classes)
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as object-detection model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"detections": [{"x_min": 0.1, "y_min": 0.2, "x_max": 0.3, "y_max": 0.4, "class_name": "my-class-X", "confidence": 0.7}]}. '
+ "`my-class-X` must be one of the class names defined by user. All coordinates must be in range 0.0-1.0, representing percentage of image dimensions. "
+ "`confidence` is a value in range 0.0-1.0 representing your confidence in prediction. You should detect all instances of classes provided by user.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ response_mime_type="application/json",
+ ),
+ }
+
+
+def prepare_generation_config(
+ max_tokens: int,
+ temperature: Optional[float],
+ response_mime_type: str = "text/plain",
+) -> dict:
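+    # Generation settings shared by all prompt builders; JSON-producing tasks override
+    # `response_mime_type` with "application/json" to force structured output.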
+ result = {
+ "max_output_tokens": max_tokens,
+ "response_mime_type": response_mime_type,
+ "candidate_count": 1,
+ }
+ if temperature is not None:
+ result["temperature"] = temperature
+ return result
+
+
+def google_api_key_safe_raise_for_status(response: Response) -> None:
+ request_is_successful = response.status_code < 400
+ if request_is_successful:
+ return None
+ response.url = GOOGLE_API_KEY_PATTERN.sub(deduct_api_key, response.url)
+ response.raise_for_status()
+
+
+def deduct_api_key(match: re.Match) -> str:
+ key_value = match.group(GOOGLE_API_KEY_VALUE_GROUP)
+ if len(key_value) < MIN_KEY_LENGTH_TO_REVEAL_PREFIX:
+ return f"key=***"
+ key_prefix = key_value[:2]
+ key_postfix = key_value[-2:]
+ return f"key={key_prefix}***{key_postfix}"
+
+
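+# Maps each supported `task_type` to the function building the Gemini request body
+# (`contents`, `generationConfig` and, where applicable, `systemInstruction`).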
+PROMPT_BUILDERS = {
+ "unconstrained": prepare_unconstrained_prompt,
+ "ocr": prepare_ocr_prompt,
+ "visual-question-answering": prepare_vqa_prompt,
+ "caption": partial(prepare_caption_prompt, short_description=True),
+ "detailed-caption": partial(prepare_caption_prompt, short_description=False),
+ "classification": prepare_classification_prompt,
+ "multi-label-classification": prepare_multi_label_classification_prompt,
+ "structured-answering": prepare_structured_answering_prompt,
+ "object-detection": prepare_object_detection_prompt,
+}
diff --git a/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py b/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py
index 2c029dd506..3f468a3321 100644
--- a/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py
+++ b/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py
@@ -64,6 +64,7 @@ class BlockManifest(WorkflowBlockManifest):
"long_description": LONG_DESCRIPTION,
"license": "Apache-2.0",
"block_type": "model",
+ "deprecated": True,
}
)
type: Literal["roboflow_core/lmm_for_classification@v1", "LMMForClassification"]
diff --git a/inference/core/workflows/core_steps/models/foundation/openai/v2.py b/inference/core/workflows/core_steps/models/foundation/openai/v2.py
new file mode 100644
index 0000000000..406c462121
--- /dev/null
+++ b/inference/core/workflows/core_steps/models/foundation/openai/v2.py
@@ -0,0 +1,573 @@
+import base64
+import json
+from functools import partial
+from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+from openai import OpenAI
+from openai._types import NOT_GIVEN
+from pydantic import ConfigDict, Field, model_validator
+
+from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+from inference.core.managers.base import ModelManager
+from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image
+from inference.core.workflows.core_steps.common.utils import run_in_parallel
+from inference.core.workflows.execution_engine.entities.base import (
+ Batch,
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ FLOAT_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ STRING_KIND,
+ ImageInputField,
+ StepOutputImageSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
+LONG_DESCRIPTION = """
+Ask a question to OpenAI's GPT-4 with Vision model.
+
+You can specify arbitrary text prompts or use predefined ones; the block supports the following types of prompt:
+
+- `unconstrained` - any arbitrary prompt you like
+
+- `ocr` - predefined prompt to recognise text from image
+
+- `visual-question-answering` - your prompt is supposed to provide a question and will be
+wrapped into a structure suited for the VQA task
+
+- `caption` - predefined prompt to generate short caption of the image
+
+- `detailed-caption` - predefined prompt to generate elaborated caption of the image
+
+- `classification` - predefined prompt to generate multi-class classification output (that can be parsed
+with `VLM as Classifier` block)
+
+- `multi-label-classification` - predefined prompt to generate multi-label classification output (that
+can be parsed with `VLM as Classifier` block)
+
+- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser`
+block.
+
+You need to provide your OpenAI API key to use the GPT-4 with Vision model.
+"""
+
+TaskType = Literal[
+ "unconstrained",
+ "ocr",
+ "visual-question-answering",
+ "caption",
+ "detailed-caption",
+ "classification",
+ "multi-label-classification",
+ "structured-answering",
+]
+
+TASKS_REQUIRING_PROMPT = {
+ "unconstrained",
+ "visual-question-answering",
+}
+
+TASKS_REQUIRING_CLASSES = {
+ "classification",
+ "multi-label-classification",
+}
+
+TASKS_REQUIRING_OUTPUT_STRUCTURE = {
+ "structured-answering",
+}
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "OpenAI",
+ "version": "v2",
+ "short_description": "Run OpenAI's GPT-4 with Vision",
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "model",
+ "search_keywords": ["LMM", "VLM", "ChatGPT", "GPT", "OpenAI"],
+ }
+ )
+ type: Literal["roboflow_core/open_ai@v2"]
+ images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField
+ task_type: TaskType = Field(
+ description="Task type to be performed by model. Value of parameter determine set of fields "
+ "that are required. For `unconstrained`, `visual-question-answering`, "
+ " - `prompt` parameter must be provided."
+ "For `structured-answering` - `output-structure` must be provided. For "
+ "`classification`, `multi-label-classification` - "
+ "`classes` must be filled. `ocr`, `caption`, `detailed-caption` do not"
+ "require any additional parameter.",
+ )
+ prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field(
+ default=None,
+ description="Text prompt to the OpenAI model",
+ examples=["my prompt", "$inputs.prompt"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True},
+ },
+ },
+ )
+ output_structure: Optional[Dict[str, str]] = Field(
+ default=None,
+ description="Dictionary with structure of expected JSON response",
+ examples=[{"my_key": "description"}, "$inputs.output_structure"],
+        json_schema_extra={
+            "relevant_for": {
+                "task_type": {
+                    "values": TASKS_REQUIRING_OUTPUT_STRUCTURE,
+                    "required": True,
+                },
+            },
+        },
+ )
+ classes: Optional[
+ Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]]
+ ] = Field(
+ default=None,
+ description="List of classes to be used",
+ examples=[["class-a", "class-b"], "$inputs.classes"],
+        json_schema_extra={
+            "relevant_for": {
+                "task_type": {"values": TASKS_REQUIRING_CLASSES, "required": True},
+            },
+ },
+ )
+ api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field(
+ description="Your OpenAI API key",
+ examples=["xxx-xxx", "$inputs.openai_api_key"],
+ private=True,
+ )
+ model_version: Union[
+ WorkflowParameterSelector(kind=[STRING_KIND]), Literal["gpt-4o", "gpt-4o-mini"]
+ ] = Field(
+ default="gpt-4o",
+ description="Model to be used",
+ examples=["gpt-4o", "$inputs.openai_model"],
+ )
+ image_detail: Union[
+ WorkflowParameterSelector(kind=[STRING_KIND]), Literal["auto", "high", "low"]
+ ] = Field(
+ default="auto",
+ description="Indicates the image's quality, with 'high' suggesting it is of high resolution and should be processed or displayed with high fidelity.",
+ examples=["auto", "high", "low"],
+ )
+ max_tokens: int = Field(
+ default=450,
+ description="Maximum number of tokens the model can generate in it's response.",
+ )
+ temperature: Optional[
+ Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])]
+ ] = Field(
+ default=None,
+ description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more "
+ 'random / "creative" the generations are.',
+ ge=0.0,
+ le=2.0,
+ )
+ max_concurrent_requests: Optional[int] = Field(
+ default=None,
+ description="Number of concurrent requests that can be executed by block when batch of input images provided. "
+ "If not given - block defaults to value configured globally in Workflows Execution Engine. "
+ "Please restrict if you hit OpenAI limits.",
+ )
+
+ @model_validator(mode="after")
+ def validate(self) -> "BlockManifest":
+ if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None:
+ raise ValueError(
+ f"`prompt` parameter required to be set for task `{self.task_type}`"
+ )
+ if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None:
+ raise ValueError(
+ f"`classes` parameter required to be set for task `{self.task_type}`"
+ )
+ if (
+ self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
+ and self.output_structure is None
+ ):
+ raise ValueError(
+ f"`output_structure` parameter required to be set for task `{self.task_type}`"
+ )
+ return self
+
+ @classmethod
+ def accepts_batch_input(cls) -> bool:
+ return True
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(
+ name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND]
+ ),
+ OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class OpenAIBlockV2(WorkflowBlock):
+
+ def __init__(
+ self,
+ model_manager: ModelManager,
+ api_key: Optional[str],
+ ):
+ self._model_manager = model_manager
+ self._api_key = api_key
+
+ @classmethod
+ def get_init_parameters(cls) -> List[str]:
+ return ["model_manager", "api_key"]
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+ def run(
+ self,
+ images: Batch[WorkflowImageData],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ api_key: str,
+ model_version: str,
+ image_detail: Literal["low", "high", "auto"],
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+ ) -> BlockResult:
+ inference_images = [i.to_inference_format() for i in images]
+ raw_outputs = run_gpt_4v_llm_prompting(
+ images=inference_images,
+ task_type=task_type,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ openai_api_key=api_key,
+ gpt_model_version=model_version,
+ gpt_image_detail=image_detail,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+ return [
+ {"output": raw_output, "classes": classes} for raw_output in raw_outputs
+ ]
+
+
+def run_gpt_4v_llm_prompting(
+ images: List[Dict[str, Any]],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ openai_api_key: Optional[str],
+ gpt_model_version: str,
+ gpt_image_detail: Literal["auto", "high", "low"],
+ max_tokens: int,
+ temperature: Optional[int],
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ if task_type not in PROMPT_BUILDERS:
+ raise ValueError(f"Task type: {task_type} not supported.")
+ gpt4_prompts = []
+ for image in images:
+ loaded_image, _ = load_image(image)
+ base64_image = base64.b64encode(
+ encode_image_to_jpeg_bytes(loaded_image)
+ ).decode("ascii")
+ prompt = PROMPT_BUILDERS[task_type](
+ base64_image=base64_image,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ gpt_image_detail=gpt_image_detail,
+ )
+ gpt4_prompts.append(prompt)
+ return execute_gpt_4v_requests(
+ openai_api_key=openai_api_key,
+ gpt4_prompts=gpt4_prompts,
+ gpt_model_version=gpt_model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+
+
+def execute_gpt_4v_requests(
+ openai_api_key: str,
+ gpt4_prompts: List[List[dict]],
+ gpt_model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ client = OpenAI(api_key=openai_api_key)
+ tasks = [
+ partial(
+ execute_gpt_4v_request,
+ client=client,
+ prompt=prompt,
+ gpt_model_version=gpt_model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ )
+ for prompt in gpt4_prompts
+ ]
+ max_workers = (
+ max_concurrent_requests
+ or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+ )
+ return run_in_parallel(
+ tasks=tasks,
+ max_workers=max_workers,
+ )
+
+
+def execute_gpt_4v_request(
+ client: OpenAI,
+ prompt: List[dict],
+ gpt_model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+) -> str:
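+    # The OpenAI SDK distinguishes "not provided" from None, so fall back to NOT_GIVEN
+    # when no temperature was configured and let the API apply its default.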
+ if temperature is None:
+ temperature = NOT_GIVEN
+ response = client.chat.completions.create(
+ model=gpt_model_version,
+ messages=prompt,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ )
+ return response.choices[0].message.content
+
+
+def prepare_unconstrained_prompt(
+ base64_image: str,
+ prompt: str,
+ gpt_image_detail: str,
+ **kwargs,
+) -> List[dict]:
+ return [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": prompt},
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ }
+ ]
+
+
+def prepare_classification_prompt(
+ base64_image: str, classes: List[str], gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ serialised_classes = ", ".join(classes)
+ return [
+ {
+ "role": "system",
+ "content": "You act as single-class classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document in Markdown ```json [...]``` markers. "
+ 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. '
+ "`class-name` must be one of the class names defined by user. You are only allowed to return "
+ "single JSON document, even if there are potentially multiple classes. You are not allowed to return list.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_multi_label_classification_prompt(
+ base64_image: str, classes: List[str], gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ serialised_classes = ", ".join(classes)
+ return [
+ {
+ "role": "system",
+ "content": "You act as multi-label classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document in Markdown ```json [...]``` markers. "
+ 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, '
+ '{"class": "class-name-2", "confidence": 0.7}]}. '
+ "`class-name-X` must be one of the class names defined by user and `confidence` is a float value in range "
+ "0.0-1.0 that represent how sure you are that the class is present in the image. Only return class names "
+ "that are visible.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_vqa_prompt(
+ base64_image: str, prompt: str, gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ return [
+ {
+ "role": "system",
+ "content": "You act as Visual Question Answering model. Your task is to provide answer to question"
+ "submitted by user. If this is open-question - answer with few sentences, for ABCD question, "
+ "return only the indicator of the answer.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": f"Question: {prompt}"},
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_ocr_prompt(
+ base64_image: str, gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ return [
+ {
+ "role": "system",
+ "content": "You act as OCR model. Your task is to read text from the image and return it in "
+ "paragraphs representing the structure of texts in the image. You should only return "
+ "recognised text, nothing else.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_caption_prompt(
+ base64_image: str, gpt_image_detail: str, short_description: bool, **kwargs
+) -> List[dict]:
+ caption_detail_level = "Caption should be short."
+ if not short_description:
+ caption_detail_level = "Caption should be extensive."
+ return [
+ {
+ "role": "system",
+ "content": f"You act as image caption model. Your task is to provide description of the image. "
+ f"{caption_detail_level}",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_structured_answering_prompt(
+ base64_image: str, output_structure: Dict[str, str], gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ output_structure_serialised = json.dumps(output_structure, indent=4)
+ return [
+ {
+ "role": "system",
+ "content": "You are supposed to produce responses in JSON wrapped in Markdown markers: "
+ "```json\nyour-response\n```. User is to provide you dictionary with keys and values. "
+ "Each key must be present in your response. Values in user dictionary represent "
+ "descriptions for JSON fields to be generated. Provide only JSON Markdown in response.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": f"Specification of requirements regarding output fields: \n"
+ f"{output_structure_serialised}",
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
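+# Maps each supported `task_type` to the builder of the chat-completion message list;
+# there is no `object-detection` entry, as that task is not offered by this block.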
+PROMPT_BUILDERS = {
+ "unconstrained": prepare_unconstrained_prompt,
+ "ocr": prepare_ocr_prompt,
+ "visual-question-answering": prepare_vqa_prompt,
+ "caption": partial(prepare_caption_prompt, short_description=True),
+ "detailed-caption": partial(prepare_caption_prompt, short_description=False),
+ "classification": prepare_classification_prompt,
+ "multi-label-classification": prepare_multi_label_classification_prompt,
+ "structured-answering": prepare_structured_answering_prompt,
+}
diff --git a/inference/core/workflows/execution_engine/entities/types.py b/inference/core/workflows/execution_engine/entities/types.py
index bbca551054..286031e4e0 100644
--- a/inference/core/workflows/execution_engine/entities/types.py
+++ b/inference/core/workflows/execution_engine/entities/types.py
@@ -136,7 +136,7 @@ def __hash__(self) -> int:
"""
LIST_OF_VALUES_KIND = Kind(
name="list_of_values",
- description="List of values of any types",
+ description="List of values of any type",
docs=LIST_OF_VALUES_KIND_DOCS,
)
@@ -292,7 +292,7 @@ def __hash__(self) -> int:
"""
CLASSIFICATION_PREDICTION_KIND = Kind(
name="classification_prediction",
- description="`'predictions'` key from Classification Model output",
+ description="Predictions from classifier",
docs=CLASSIFICATION_PREDICTION_KIND_DOCS,
)
@@ -374,9 +374,75 @@ def __hash__(self) -> int:
confidence=array([ 0.84955, 0.74344, 0.45636, 0.86537]),
class_id=array([2, 7, 2, 0]),
tracker_id=None,
- data={'class_name': array(['car', 'truck', 'car', 'car'], dtype=' int:
confidence=array([ 0.95898]),
class_id=array([6]),
tracker_id=None,
- data={'class_name': array(['G'], dtype=' int:
tracker_id=None,
data={
'class_name': array(['G'], dtype=' int:
docs=KEYPOINT_DETECTION_PREDICTION_KIND_DOCS,
)
-QR_CODE_DETECTION_KIND_DOCS = f"""
+QR_CODE_DETECTION_KIND_DOCS = """
This kind represents batch of predictions regarding QR codes location and data their provide.
Example:
```
-# Each prediction in batch is list of dictionaries that contains detected QR codes (detections) and their metadata
-[
- [
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, ],
- [
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- ]
-]
+sv.Detections(
+ xyxy=array([
+ [ 865, 153.5, 1189, 422.5],
+ [ 192.5, 77.5, 995.5, 722.5],
+ [ 194, 82, 996, 726],
+ [ 460, 333, 704, 389]]
+ ),
+ mask=None,
+ confidence=array([ 1.0, 1.0, 1.0, 1.0]),
+ class_id=array([2, 7, 2, 0]),
+ tracker_id=None,
+ data={
+ 'class_name': array(['qr_code', 'qr_code', 'qr_code', 'qr_code'], dtype=' int:
docs=QR_CODE_DETECTION_KIND_DOCS,
)
-BAR_CODE_DETECTION_KIND_DOCS = f"""
+BAR_CODE_DETECTION_KIND_DOCS = """
This kind represents batch of predictions regarding barcodes location and data their provide.
Example:
```
-# Each prediction in batch is list of dictionaries that contains detected barcodes (detections) and their metadata
-[
- [
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, ],
- [
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- ]
-]
+sv.Detections(
+ xyxy=array([
+ [ 865, 153.5, 1189, 422.5],
+ [ 192.5, 77.5, 995.5, 722.5],
+ [ 194, 82, 996, 726],
+ [ 460, 333, 704, 389]]
+ ),
+ mask=None,
+ confidence=array([ 1.0, 1.0, 1.0, 1.0]),
+ class_id=array([2, 7, 2, 0]),
+ tracker_id=None,
+ data={
+ 'class_name': array(['barcode', 'barcode', 'barcode', 'barcode'], dtype=' int:
)
+LANGUAGE_MODEL_OUTPUT_KIND_DOCS = """
+This kind represents output generated by a language model. It is a Python string, which can be processed
+by blocks transforming LLM / VLM outputs into structured form.
+
+Examples:
+```
+{"predicted_class": "car", "confidence": 0.7} # which is example JSON with classification prediction
+"The is A." # which is example unstructured generation for VQA task
+```
+"""
+
+LANGUAGE_MODEL_OUTPUT_KIND = Kind(
+ name="language_model_output",
+ description="LLM / VLM output",
+ docs=LANGUAGE_MODEL_OUTPUT_KIND_DOCS,
+)
+
STEP_AS_SELECTED_ELEMENT = "step"
STEP_OUTPUT_AS_SELECTED_ELEMENT = "step_output"
diff --git a/requirements/_requirements.txt b/requirements/_requirements.txt
index 834247e30d..e781f29f1c 100644
--- a/requirements/_requirements.txt
+++ b/requirements/_requirements.txt
@@ -29,3 +29,4 @@ pydot>=2.0.0
shapely>=2.0.0,<2.1.0
tldextract~=5.1.2
packaging~=24.0
+anthropic~=0.34.2
\ No newline at end of file
diff --git a/tests/inference/hosted_platform_tests/conftest.py b/tests/inference/hosted_platform_tests/conftest.py
index faf859864f..95689e7823 100644
--- a/tests/inference/hosted_platform_tests/conftest.py
+++ b/tests/inference/hosted_platform_tests/conftest.py
@@ -79,6 +79,8 @@ class PlatformEnvironment(Enum):
ROBOFLOW_API_KEY = os.environ["HOSTED_PLATFORM_TESTS_API_KEY"]
OPENAI_KEY = os.getenv("OPENAI_KEY")
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
@pytest.fixture(scope="session")
diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py
new file mode 100644
index 0000000000..efaac597bc
--- /dev/null
+++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py
@@ -0,0 +1,242 @@
+import numpy as np
+import pytest
+
+from inference_sdk import InferenceHTTPClient
+from tests.inference.hosted_platform_tests.conftest import (
+ ANTHROPIC_API_KEY,
+ ROBOFLOW_API_KEY,
+)
+
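+# Workflow under test: Claude produces classification JSON, `vlm_as_classifier@v1` parses it
+# and `property_definition@v1` extracts the top class asserted on below.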
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.claude.output",
+ "classes": "$steps.claude.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "claude_result",
+ "selector": "$steps.claude.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Anthropic API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_classification_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=CLASSIFICATION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "claude_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["claude_result"], str)
+ and len(result[0]["claude_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.claude.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Anthropic API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_structured_parsing_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=STRUCTURED_PROMPTING_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": ANTHROPIC_API_KEY,
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
+
+
+OBJECT_DETECTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "object-detection",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "vlm_output": "$steps.claude.output",
+ "image": "$inputs.image",
+ "classes": "$steps.claude.classes",
+ "model_type": "anthropic-claude",
+ "task_type": "object-detection",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "claude_result",
+ "selector": "$steps.claude.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.predictions",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Anthropic API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_object_detection_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=OBJECT_DETECTION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "claude_result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert [e["class"] for e in result[0]["parsed_prediction"]["predictions"]] == [
+ "dog",
+ "dog",
+ ], "Expected 2 dogs to be detected"
diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py
new file mode 100644
index 0000000000..37044e862c
--- /dev/null
+++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py
@@ -0,0 +1,242 @@
+import numpy as np
+import pytest
+
+from inference_sdk import InferenceHTTPClient
+from tests.inference.hosted_platform_tests.conftest import (
+ GOOGLE_API_KEY,
+ ROBOFLOW_API_KEY,
+)
+
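+# These scenarios mirror the Claude tests: Gemini output is parsed by `vlm_as_classifier@v1`,
+# `json_parser@v1` or `vlm_as_detector@v1`, depending on the task type under test.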
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gemini.output",
+ "classes": "$steps.gemini.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gemini_result",
+ "selector": "$steps.gemini.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_classification_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=CLASSIFICATION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gemini_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["gemini_result"], str)
+ and len(result[0]["gemini_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.gemini.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_structured_parsing_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=STRUCTURED_PROMPTING_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": GOOGLE_API_KEY,
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
+
+
+OBJECT_DETECTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "object-detection",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "vlm_output": "$steps.gemini.output",
+ "image": "$inputs.image",
+ "classes": "$steps.gemini.classes",
+ "model_type": "google-gemini",
+ "task_type": "object-detection",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gemini_result",
+ "selector": "$steps.gemini.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.predictions",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_object_detection_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=OBJECT_DETECTION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gemini_result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert [e["class"] for e in result[0]["parsed_prediction"]["predictions"]] == [
+ "dog",
+ "dog",
+ ], "Expected 2 dogs to be detected"
diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py
index 66ee23cdf5..aaa10f3564 100644
--- a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py
+++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py
@@ -95,3 +95,161 @@ def test_image_description_workflow(
detection_confidences, [0.857235848903656, 0.5132315158843994], atol=1e-4
), "Expected predictions to match what was observed while test creation"
assert len(result[0]["description"]) > 0, "Expected some description"
+
+
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gpt.output",
+ "classes": "$steps.gpt.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gpt_result",
+ "selector": "$steps.gpt.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(OPENAI_KEY is None, reason="No OpenAI API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_classification_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=CLASSIFICATION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": OPENAI_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gpt_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["gpt_result"], str) and len(result[0]["gpt_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.gpt.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
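+# For reference, GPT is expected to answer with JSON along the lines of
+# {"dogs_count": 2, "cats_count": 0}; `roboflow_core/json_parser@v1` exposes each
+# expected field as a separate step output (e.g. $steps.parser.dogs_count), and the
+# ToString operation in the property_definition step yields the "2" asserted below.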
+
+
+@pytest.mark.skipif(OPENAI_KEY is None, reason="No OpenAI API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_structured_prompting_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=STRUCTURED_PROMPTING_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": OPENAI_KEY,
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py
new file mode 100644
index 0000000000..4e244e6b87
--- /dev/null
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py
@@ -0,0 +1,675 @@
+"""
+This test module requires an Anthropic API key passed via the env variable WORKFLOWS_TEST_ANTHROPIC_API_KEY.
+It is meant to be run only locally, as executing these tests in CI would be too costly.
+"""
+
+import os
+
+import numpy as np
+import pytest
+
+from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
+from inference.core.managers.base import ModelManager
+from inference.core.workflows.core_steps.common.entities import StepExecutionMode
+from inference.core.workflows.execution_engine.core import ExecutionEngine
+from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
+ add_to_workflows_gallery,
+)
+
+ANTHROPIC_API_KEY = os.getenv("WORKFLOWS_TEST_ANTHROPIC_API_KEY")
+
+UNCONSTRAINED_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "unconstrained",
+ "prompt": "Give me dominant color of the image",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.claude.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Prompting Anthropic Claude with arbitrary prompt",
+ use_case_description="""
+In this example, the Anthropic Claude model is prompted with arbitrary text provided by the user.
+ """,
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ workflow_name_in_app="claude-arbitrary-prompt",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_unconstrained_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "prompt": "What is the topic of the image?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+OCR_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "ocr",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.claude.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as OCR model",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as an OCR system. The user only selects the task type
+and does not need to provide any prompt.
+ """,
+ workflow_definition=OCR_WORKFLOW,
+ workflow_name_in_app="claude-ocr",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_ocr_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OCR_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": ANTHROPIC_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+VQA_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "prompt"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "visual-question-answering",
+ "prompt": "$inputs.prompt",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.claude.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as Visual Question Answering system",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as a VQA system. The user provides the question via the prompt.
+ """,
+ workflow_definition=VQA_WORKFLOW,
+ workflow_name_in_app="claude-vqa",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_vqa_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=VQA_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "prompt": "What are the brands of the cars?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CAPTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "caption",
+ "api_key": "$inputs.api_key",
+ "temperature": 1.0,
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.claude.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as Image Captioning system",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as an Image Captioning system.
+ """,
+ workflow_definition=CAPTION_WORKFLOW,
+ workflow_name_in_app="claude-captioning",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_captioning_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CAPTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": ANTHROPIC_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.claude.output",
+ "classes": "$steps.claude.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "claude_result",
+ "selector": "$steps.claude.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as multi-class classifier",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as a classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the model's output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="claude-multi-class-classifier",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_multi_class_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "claude_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["claude_result"], str)
+ and len(result[0]["claude_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+MULTI_LABEL_CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "multi-label-classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image", # requires image input to construct valid output compatible with "inference"
+ "vlm_output": "$steps.claude.output",
+ "classes": "$steps.claude.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as multi-label classifier",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as a multi-label classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the model's output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="claude-multi-label-classifier",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_multi_label_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["result"] == ["dog"]
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.claude.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude to provide structured JSON",
+ use_case_description="""
+In this example, the Anthropic Claude model is expected to provide structured JSON output, which is then
+parsed by the dedicated `roboflow_core/json_parser@v1` block. That block transforms the string into a dictionary
+and exposes its keys to other blocks for further processing. In this case, the parsed output is
+transformed using the `roboflow_core/property_definition@v1` block.
+ """,
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ workflow_name_in_app="claude-structured-prompting",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_structured_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
+
+
+OBJECT_DETECTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "object-detection",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "vlm_output": "$steps.claude.output",
+ "image": "$inputs.image",
+ "classes": "$steps.claude.classes",
+ "model_type": "anthropic-claude",
+ "task_type": "object-detection",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "claude_result",
+ "selector": "$steps.claude.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.predictions",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as object-detection model",
+ use_case_description="""
+In this example, the Anthropic Claude model is expected to provide output that is parsed by the dedicated
+`roboflow_core/vlm_as_detector@v1` block, which transforms the string into `sv.Detections`
+that can later be consumed by other blocks processing object-detection predictions.
+ """,
+ workflow_definition=OBJECT_DETECTION_WORKFLOW,
+ workflow_name_in_app="claude-object-detection",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_object_detection_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OBJECT_DETECTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "claude_result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["parsed_prediction"].data["class_name"].tolist() == [
+ "dog",
+ "dog",
+ ], "Expected 2 dogs to be detected"
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py b/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py
index e071d73974..6916fe8d40 100644
--- a/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py
@@ -15,7 +15,11 @@
"version": "1.0",
"inputs": [
{"type": "WorkflowImage", "name": "image"},
- {"type": "WorkflowParameter", "name": "model_id"},
+ {
+ "type": "WorkflowParameter",
+ "name": "model_id",
+ "default_value": "yolov8n-640",
+ },
],
"steps": [
{
@@ -228,31 +232,6 @@ def test_consensus_workflow_when_confidence_is_restricted_by_input_parameter(
), "Expected confidences to match what was validated manually as workflow outcome"
-def test_consensus_workflow_when_model_id_not_provided_in_input(
- model_manager: ModelManager,
- crowd_image: np.ndarray,
-) -> None:
- # given
- workflow_init_parameters = {
- "workflows_core.model_manager": model_manager,
- "workflows_core.api_key": None,
- "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
- }
- execution_engine = ExecutionEngine.init(
- workflow_definition=CONSENSUS_WORKFLOW,
- init_parameters=workflow_init_parameters,
- max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
- )
-
- # when
- with pytest.raises(RuntimeInputError):
- _ = execution_engine.run(
- runtime_parameters={
- "image": crowd_image,
- }
- )
-
-
def test_consensus_workflow_when_image_not_provided_in_input(
model_manager: ModelManager,
) -> None:
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py
new file mode 100644
index 0000000000..97943e1a3e
--- /dev/null
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py
@@ -0,0 +1,675 @@
+"""
+This test module requires a Google AI API key passed via the env variable WORKFLOWS_TEST_GOOGLE_API_KEY.
+It is meant to be run only locally, as executing these tests in CI would be too costly.
+"""
+
+import os
+
+import numpy as np
+import pytest
+
+from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
+from inference.core.managers.base import ModelManager
+from inference.core.workflows.core_steps.common.entities import StepExecutionMode
+from inference.core.workflows.execution_engine.core import ExecutionEngine
+from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
+ add_to_workflows_gallery,
+)
+
+GOOGLE_API_KEY = os.getenv("WORKFLOWS_TEST_GOOGLE_API_KEY")
+
+UNCONSTRAINED_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "unconstrained",
+ "prompt": "Give me dominant color of the image",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gemini.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Prompting Google's Gemini with arbitrary prompt",
+ use_case_description="""
+In this example, Google's Gemini model is prompted with arbitrary text provided by the user.
+ """,
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ workflow_name_in_app="gemini-arbitrary-prompt",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_unconstrained_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ "prompt": "What is the topic of the image?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+OCR_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "ocr",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gemini.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as OCR model",
+ use_case_description="""
+In this example, Google's Gemini model is used as an OCR system. The user only selects the task type
+and does not need to provide any prompt.
+ """,
+ workflow_definition=OCR_WORKFLOW,
+ workflow_name_in_app="gemini-ocr",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_ocr_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OCR_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": GOOGLE_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+VQA_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "prompt"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "visual-question-answering",
+ "prompt": "$inputs.prompt",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gemini.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as Visual Question Answering system",
+ use_case_description="""
+In this example, Google's Gemini model is used as a VQA system. The user provides the question via the prompt.
+ """,
+ workflow_definition=VQA_WORKFLOW,
+ workflow_name_in_app="gemini-vqa",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_vqa_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=VQA_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": GOOGLE_API_KEY,
+ "prompt": "What are the brands of the cars?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CAPTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "caption",
+ "api_key": "$inputs.api_key",
+ "temperature": 1.0,
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gemini.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as Image Captioning system",
+ use_case_description="""
+In this example, Google's Gemini model is used as an Image Captioning system.
+ """,
+ workflow_definition=CAPTION_WORKFLOW,
+ workflow_name_in_app="gemini-captioning",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_captioning_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CAPTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": GOOGLE_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gemini.output",
+ "classes": "$steps.gemini.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gemini_result",
+ "selector": "$steps.gemini.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
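+# The chain above: Gemini produces raw text, `vlm_as_classifier@v1` parses it into a
+# classification prediction, and the property_definition step extracts the "top_class"
+# field, so the workflow's "top_class" output is a plain class-name string.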
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as multi-class classifier",
+ use_case_description="""
+In this example, Google's Gemini model is used as a classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the model's output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="gemini-multi-class-classifier",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_multi_class_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gemini_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["gemini_result"], str)
+ and len(result[0]["gemini_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+MULTI_LABEL_CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "multi-label-classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gemini.output",
+ "classes": "$steps.gemini.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as multi-label classifier",
+ use_case_description="""
+In this example, Google's Gemini model is used as a multi-label classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the model's output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="gemini-multi-label-classifier",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_multi_label_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["result"] == ["dog"]
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.gemini.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini to provide structured JSON",
+ use_case_description="""
+In this example, Google's Gemini model is expected to provide structured JSON output, which is then
+parsed by the dedicated `roboflow_core/json_parser@v1` block. That block transforms the string into a dictionary
+and exposes its keys to other blocks for further processing. In this case, the parsed output is
+transformed using the `roboflow_core/property_definition@v1` block.
+ """,
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ workflow_name_in_app="gemini-structured-prompting",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_structured_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
+
+
+OBJECT_DETECTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "object-detection",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "vlm_output": "$steps.gemini.output",
+ "image": "$inputs.image",
+ "classes": "$steps.gemini.classes",
+ "model_type": "google-gemini",
+ "task_type": "object-detection",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gemini_result",
+ "selector": "$steps.gemini.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.predictions",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as object-detection model",
+ use_case_description="""
+In this example, Google's Gemini model is expected to provide output that is parsed by the dedicated
+`roboflow_core/vlm_as_detector@v1` block, which transforms the string into `sv.Detections`
+that can later be consumed by other blocks processing object-detection predictions.
+ """,
+ workflow_definition=OBJECT_DETECTION_WORKFLOW,
+ workflow_name_in_app="gemini-object-detection",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_object_detection_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OBJECT_DETECTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gemini_result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["parsed_prediction"].data["class_name"].tolist() == [
+ "dog",
+ "dog",
+ ], "Expected 2 dogs to be detected"
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py b/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py
index 02e4d1db62..db9312d59a 100644
--- a/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py
@@ -15,7 +15,11 @@
"version": "1.0",
"inputs": [
{"type": "WorkflowImage", "name": "image"},
- {"type": "WorkflowParameter", "name": "model_id", "default_value": "yolov8n-640"},
+ {
+ "type": "WorkflowParameter",
+ "name": "model_id",
+ "default_value": "yolov8n-640",
+ },
{"type": "WorkflowParameter", "name": "confidence", "default_value": 0.7},
{"type": "WorkflowParameter", "name": "x_center"},
{"type": "WorkflowParameter", "name": "y_center"},
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py
new file mode 100644
index 0000000000..b37850ff2d
--- /dev/null
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py
@@ -0,0 +1,584 @@
+"""
+This test module requires an OpenAI API key passed via the env variable WORKFLOWS_TEST_OPEN_AI_KEY.
+It is meant to be run only locally, as executing these tests in CI would be too costly.
+"""
+
+import os
+
+import numpy as np
+import pytest
+
+from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
+from inference.core.managers.base import ModelManager
+from inference.core.workflows.core_steps.common.entities import StepExecutionMode
+from inference.core.workflows.execution_engine.core import ExecutionEngine
+from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
+ add_to_workflows_gallery,
+)
+
+OPEN_AI_API_KEY = os.getenv("WORKFLOWS_TEST_OPEN_AI_KEY")
+
+UNCONSTRAINED_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "prompt"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "unconstrained",
+ "prompt": "$inputs.prompt",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gpt.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Prompting GPT with arbitrary prompt",
+ use_case_description="""
+In this example, the GPT model is prompted with arbitrary text provided by the user.
+ """,
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ workflow_name_in_app="gpt-arbitrary-prompt",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_unconstrained_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": OPEN_AI_API_KEY,
+ "prompt": "What is the topic of the image?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+OCR_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "ocr",
+ "api_key": "$inputs.api_key",
+ "model_version": "gpt-4o-mini",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gpt.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as OCR model",
+ use_case_description="""
+In this example, the GPT model is used as an OCR system. The user only selects the task type
+and does not need to provide any prompt.
+ """,
+ workflow_definition=OCR_WORKFLOW,
+ workflow_name_in_app="gpt-ocr",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_ocr_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OCR_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": OPEN_AI_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+VQA_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "prompt"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "visual-question-answering",
+ "prompt": "$inputs.prompt",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gpt.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as Visual Question Answering system",
+ use_case_description="""
+In this example, the GPT model is used as a VQA system. The user provides the question via the prompt.
+ """,
+ workflow_definition=VQA_WORKFLOW,
+ workflow_name_in_app="gpt-vqa",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_vqa_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=VQA_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": OPEN_AI_API_KEY,
+ "prompt": "What are the brands of the cars?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CAPTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "caption",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gpt.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as Image Captioning system",
+ use_case_description="""
+In this example, the GPT model is used as an Image Captioning system.
+ """,
+ workflow_definition=CAPTION_WORKFLOW,
+ workflow_name_in_app="gpt-captioning",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_captioning_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CAPTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": OPEN_AI_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gpt.output",
+ "classes": "$steps.gpt.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gpt_result",
+ "selector": "$steps.gpt.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as multi-class classifier",
+ use_case_description="""
+In this example, the GPT model is used as a classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the GPT output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="gpt-multi-class-classifier",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_multi_class_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": OPEN_AI_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gpt_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["gpt_result"], str) and len(result[0]["gpt_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+MULTI_LABEL_CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "multi-label-classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gpt.output",
+ "classes": "$steps.gpt.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as multi-label classifier",
+ use_case_description="""
+In this example, a GPT model is used as a multi-label classifier. The model output is parsed
+by the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the raw GPT text into
+a full-blown classification prediction that downstream blocks compatible with classification
+predictions can consume - in this case, we extract the top-class property.
+ """,
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="gpt-multi-label-classifier",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_multi_label_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": OPEN_AI_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["result"] == ["dog"]
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.gpt.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT to provide structured JSON",
+ use_case_description="""
+In this example, the GPT model is expected to provide structured output as JSON, which is then
+parsed by the dedicated `roboflow_core/json_parser@v1` block. That block transforms the string
+into a dictionary and exposes its keys to other blocks for further processing. Here, the parsed
+output is transformed using the `roboflow_core/property_definition@v1` block.
+ """,
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ workflow_name_in_app="gpt-structured-prompting",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_structured_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": OPEN_AI_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
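+    # "dogs_count" is parsed from GPT's JSON answer and converted with the ToString
+    # operation, so the assertion below expects the string "2" for the dogs_image fixture.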
+ assert result[0]["result"] == "2"
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py b/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py
index a8c1966bce..031c0e0d19 100644
--- a/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py
@@ -15,7 +15,11 @@
"version": "1.0",
"inputs": [
{"type": "WorkflowImage", "name": "image"},
- {"type": "WorkflowParameter", "name": "model_id", "default_value": "yolov8n-640"},
+ {
+ "type": "WorkflowParameter",
+ "name": "model_id",
+ "default_value": "yolov8n-640",
+ },
{"type": "WorkflowParameter", "name": "confidence", "default_value": 0.3},
],
"steps": [
diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py b/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py
new file mode 100644
index 0000000000..8907690232
--- /dev/null
+++ b/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py
@@ -0,0 +1,232 @@
+import json
+
+import pytest
+from pydantic import ValidationError
+
+from inference.core.workflows.core_steps.formatters.json_parser.v1 import (
+ BlockManifest,
+ JSONParserBlockV1,
+)
+from inference.core.workflows.execution_engine.entities.base import OutputDefinition
+from inference.core.workflows.execution_engine.entities.types import BOOLEAN_KIND
+
+
+def test_parsing_manifest_when_input_is_valid() -> None:
+ # given
+ raw_manifest = {
+ "name": "parser",
+ "type": "roboflow_core/json_parser@v1",
+ "raw_json": "$steps.some.a",
+ "expected_fields": ["a", "b", "c"],
+ }
+
+ # when
+ result = BlockManifest.model_validate(raw_manifest)
+
+ # then
+ assert result == BlockManifest(
+ name="parser",
+ type="roboflow_core/json_parser@v1",
+ raw_json="$steps.some.a",
+ expected_fields=["a", "b", "c"],
+ )
+
+
+def test_parsing_manifest_when_input_is_invalid() -> None:
+ # given
+ raw_manifest = {
+ "name": "parser",
+ "type": "roboflow_core/json_parser@v1",
+ "raw_json": "$steps.some.a",
+ "expected_fields": ["a", "b", "c", "error_status"],
+ }
+
+ # when
+ with pytest.raises(ValidationError):
+ _ = BlockManifest.model_validate(raw_manifest)
+
+
+def test_manifest_get_actual_outputs() -> None:
+ # given
+ manifest = BlockManifest(
+ name="parser",
+ type="roboflow_core/json_parser@v1",
+ raw_json="$steps.some.a",
+ expected_fields=["a", "b", "c"],
+ )
+
+ # when
+ result = manifest.get_actual_outputs()
+
+ # then
+ assert result == [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ OutputDefinition(name="a"),
+ OutputDefinition(name="b"),
+ OutputDefinition(name="c"),
+ ]
+
+
+def test_block_run_when_valid_json_given_and_all_fields_declared() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_valid_json_given_and_subset_of_fields_declared() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ }
+
+
+def test_block_run_when_valid_json_given_and_subset_of_declared_fields_found() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b", "c"])
+
+ # then
+ assert result == {
+ "error_status": True,
+ "a": "1",
+ "b": "2",
+ "c": None,
+ }
+
+
+def test_block_run_when_multiple_json_documents_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
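+    # The document is joined with itself below, so the block receives two JSON payloads
+    # in a single string; the assertions expect this to be reported as an error.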
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json="\n".join([raw_json] * 2), expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": True,
+ "a": None,
+ "b": None,
+ }
+
+
+def test_block_run_when_invalid_json_provided() -> None:
+ # given
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json="invalid", expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": True,
+ "a": None,
+ "b": None,
+ }
+
+
+def test_block_run_when_json_in_markdown_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ raw_json = f"```json\n{raw_json}\n```"
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_indented_json_in_markdown_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"}, indent=4)
+ raw_json = f"```json\n{raw_json}\n```"
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_json_in_markdown_uppercase_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ raw_json = f"```JSON\n{raw_json}\n```"
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_json_in_markdown_without_new_lines_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ raw_json = f"```JSON{raw_json}```"
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_multiple_jsons_in_markdown_provided() -> None:
+ # given
+ raw_json_1 = json.dumps({"a": "1", "b": "2"})
+ raw_json_2 = json.dumps({"a": "3", "b": "4"})
+    raw_json = f"```json\n{raw_json_1}\n```\n```json\n{raw_json_2}\n```"
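+    # Two fenced JSON documents are provided; the assertions below expect only the first
+    # one to be parsed.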
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py
new file mode 100644
index 0000000000..796c74cb3a
--- /dev/null
+++ b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py
@@ -0,0 +1,342 @@
+from typing import List, Union
+
+import numpy as np
+import pytest
+
+from inference.core.workflows.core_steps.formatters.vlm_as_classifier.v1 import (
+ BlockManifest,
+ VLMAsClassifierBlockV1,
+)
+from inference.core.workflows.execution_engine.entities.base import (
+ ImageParentMetadata,
+ WorkflowImageData,
+)
+
+
+@pytest.mark.parametrize("image", ["$inputs.image", "$steps.some.image"])
+@pytest.mark.parametrize(
+ "classes", ["$inputs.classes", "$steps.some.classes", ["a", "b"]]
+)
+def test_block_manifest_parsing_when_input_is_valid(
+ image: str, classes: Union[str, List[str]]
+) -> None:
+ # given
+ raw_manifest = {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "image": image,
+ "name": "parser",
+ "vlm_output": "$steps.vlm.output",
+ "classes": classes,
+ }
+
+ # when
+ result = BlockManifest.model_validate(raw_manifest)
+
+ # then
+ assert result == BlockManifest(
+ type="roboflow_core/vlm_as_classifier@v1",
+ name="parser",
+ image=image,
+ vlm_output="$steps.vlm.output",
+ classes=classes,
+ )
+
+
+def test_run_when_valid_json_given_for_multi_class_classification() -> None:
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+```json
+{"class_name": "car", "confidence": "0.7"}
+```
+ """
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=vlm_output, classes=["car", "cat"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == [
+ {"class_name": "car", "class_id": 0, "confidence": 0.7},
+ {"class_name": "cat", "class_id": 1, "confidence": 0.0},
+ ]
+ assert result["predictions"]["top"] == "car"
+ assert abs(result["predictions"]["confidence"] - 0.7) < 1e-5
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_valid_json_given_for_multi_class_classification_when_unknown_class_predicted() -> (
+ None
+):
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+```json
+{"class_name": "my_class", "confidence": "0.7"}
+```
+ """
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=vlm_output, classes=["car", "cat"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == [
+ {"class_name": "my_class", "class_id": -1, "confidence": 0.7},
+ {"class_name": "car", "class_id": 0, "confidence": 0.0},
+ {"class_name": "cat", "class_id": 1, "confidence": 0.0},
+ ]
+ assert result["predictions"]["top"] == "my_class"
+ assert abs(result["predictions"]["confidence"] - 0.7) < 1e-5
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_valid_json_given_for_multi_label_classification() -> None:
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+ {"predicted_classes": [
+ {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6},
+ {"class": "cat", "confidence": "0.7"}
+ ]}
+ """
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(
+ image=image, vlm_output=vlm_output, classes=["car", "cat", "dog"]
+ )
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.7, "class_id": 1},
+ "dog": {"confidence": 0.6, "class_id": 2},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"}
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_valid_json_given_for_multi_label_classification_when_unknown_class_provided() -> (
+ None
+):
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+ {"predicted_classes": [
+ {"class": "my_class_1", "confidence": 0.3}, {"class": "my_class_2", "confidence": 0.6},
+ {"class": "my_class_1", "confidence": 0.7}
+ ]}
+ """
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(
+ image=image, vlm_output=vlm_output, classes=["car", "cat", "dog"]
+ )
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.0, "class_id": 1},
+ "dog": {"confidence": 0.0, "class_id": 2},
+ "my_class_1": {"confidence": 0.7, "class_id": -1},
+ "my_class_2": {"confidence": 0.6, "class_id": -1},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {
+ "my_class_1",
+ "my_class_2",
+ }
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_valid_json_of_unknown_structure_given() -> None:
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(
+ image=image, vlm_output='{"some": "data"}', classes=["car", "cat"]
+ )
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
+
+
+def test_run_when_invalid_json_given() -> None:
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output="invalid_json", classes=["car", "cat"])
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
+
+
+def test_run_when_multiple_jsons_given() -> None:
+ # given
+ raw_json = """
+ {"predicted_classes": [
+ {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6},
+ {"class": "cat", "confidence": "0.7"}
+ ]}
+ {"predicted_classes": [
+ {"class": "cat", "confidence": 0.4}, {"class": "dog", "confidence": 0.7},
+ {"class": "cat", "confidence": "0.8"}
+ ]}
+ """
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat"])
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
+
+
+def test_run_when_json_in_markdown_block_given() -> None:
+ # given
+ raw_json = """
+```json
+{"predicted_classes": [
+ {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6},
+ {"class": "cat", "confidence": "0.7"}
+]}
+```
+```
+ """
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.7, "class_id": 1},
+ "dog": {"confidence": 0.6, "class_id": 2},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"}
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_json_in_markdown_block_without_new_lines_given() -> None:
+ # given
+ raw_json = """
+```json{"predicted_classes": [{"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6}, {"class": "cat", "confidence": "0.7"}]}```
+"""
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.7, "class_id": 1},
+ "dog": {"confidence": 0.6, "class_id": 2},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"}
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_multiple_jsons_in_markdown_block_given() -> None:
+ # given
+ raw_json = """
+```json
+{"predicted_classes": [
+ {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6},
+ {"class": "cat", "confidence": "0.7"}
+]}
+```
+```json
+{"predicted_classes": [
+ {"class": "cat", "confidence": 0.4}, {"class": "dog", "confidence": 0.7},
+ {"class": "cat", "confidence": "0.8"}
+]}
+```
+"""
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.7, "class_id": 1},
+ "dog": {"confidence": 0.6, "class_id": 2},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"}
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py
new file mode 100644
index 0000000000..10f013c26a
--- /dev/null
+++ b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py
@@ -0,0 +1,163 @@
+from typing import List, Union
+
+import numpy as np
+import pytest
+import supervision as sv
+
+from inference.core.workflows.core_steps.formatters.vlm_as_detector.v1 import (
+ BlockManifest,
+ VLMAsDetectorBlockV1,
+)
+from inference.core.workflows.execution_engine.entities.base import (
+ ImageParentMetadata,
+ WorkflowImageData,
+)
+
+
+@pytest.mark.parametrize("image", ["$inputs.image", "$steps.some.image"])
+@pytest.mark.parametrize(
+ "classes", ["$inputs.classes", "$steps.some.classes", ["a", "b"]]
+)
+def test_manifest_parsing_when_input_valid(
+ image: str, classes: Union[str, List[str]]
+) -> None:
+ # given
+ raw_manifest = {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "image": image,
+ "vlm_output": "$steps.vlm.output",
+ "classes": classes,
+ "model_type": "google-gemini",
+ "task_type": "object-detection",
+ }
+
+ # when
+ result = BlockManifest.model_validate(raw_manifest)
+
+ # then
+ assert result == BlockManifest(
+ type="roboflow_core/vlm_as_detector@v1",
+ name="parser",
+ image=image,
+ vlm_output="$steps.vlm.output",
+ classes=classes,
+ model_type="google-gemini",
+ task_type="object-detection",
+ )
+
+
+def test_run_method_for_claude_and_gemini_output() -> None:
+ # given
+ block = VLMAsDetectorBlockV1()
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+{"detections": [
+ {"x_min": 0.01, "y_min": 0.15, "x_max": 0.15, "y_max": 0.85, "class_name": "cat", "confidence": 1.98},
+ {"x_min": 0.17, "y_min": 0.25, "x_max": 0.32, "y_max": 0.85, "class_name": "dog", "confidence": 0.97},
+ {"x_min": 0.33, "y_min": 0.15, "x_max": 0.47, "y_max": 0.85, "class_name": "cat", "confidence": 0.99},
+ {"x_min": 0.49, "y_min": 0.30, "x_max": 0.65, "y_max": 0.85, "class_name": "dog", "confidence": 0.98},
+ {"x_min": 0.67, "y_min": 0.20, "x_max": 0.82, "y_max": 0.85, "class_name": "cat", "confidence": 0.99},
+ {"x_min": 0.84, "y_min": 0.25, "x_max": 0.99, "y_max": 0.85, "class_name": "unknown", "confidence": 0.97}
+]}
+ """
+
+ # when
+ result = block.run(
+ image=image,
+ vlm_output=vlm_output,
+ classes=["cat", "dog", "lion"],
+ model_type="google-gemini",
+ task_type="object-detection",
+ )
+
+ # then
+ assert result["error_status"] is False
+ assert isinstance(result["predictions"], sv.Detections)
+ assert len(result["inference_id"]) > 0
+ assert np.allclose(
+ result["predictions"].xyxy,
+ np.array(
+ [
+ [2, 29, 25, 163],
+ [29, 48, 54, 163],
+ [55, 29, 79, 163],
+ [82, 58, 109, 163],
+ [113, 38, 138, 163],
+ [141, 48, 166, 163],
+ ]
+ ),
+ atol=1.0,
+ )
+ assert np.allclose(result["predictions"].class_id, np.array([0, 1, 0, 1, 0, -1]))
+ assert np.allclose(
+ result["predictions"].confidence, np.array([1.0, 0.97, 0.99, 0.98, 0.99, 0.97])
+ )
+ assert "class_name" in result["predictions"].data
+ assert "image_dimensions" in result["predictions"].data
+ assert "prediction_type" in result["predictions"].data
+ assert "parent_coordinates" in result["predictions"].data
+ assert "parent_dimensions" in result["predictions"].data
+ assert "root_parent_coordinates" in result["predictions"].data
+ assert "root_parent_dimensions" in result["predictions"].data
+ assert "parent_id" in result["predictions"].data
+ assert "root_parent_id" in result["predictions"].data
+
+
+def test_run_method_for_invalid_claude_and_gemini_output() -> None:
+ # given
+ block = VLMAsDetectorBlockV1()
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+ {"detections": [
+ {"x_min": 0.01, "y_min": 0.15, "x_max": 0.15, "y_max": 0.85, "confidence": 1.98},
+ {"x_min": 0.17, "y_min": 0.25, "x_max": 0.32, "y_max": 0.85, "class_name": "dog", "confidence": 0.97},
+ {"x_min": 0.33, "y_min": 0.15, "x_max": 0.47, "y_max": 0.85, "class_name": "cat", "confidence": 0.99},
+ {"x_min": 0.49, "x_max": 0.65, "y_max": 0.85, "class_name": "dog", "confidence": 0.98},
+ {"x_min": 0.67, "y_min": 0.20, "x_max": 0.82, "y_max": 0.85, "class_name": "cat", "confidence": 0.99},
+ {"x_min": 0.84, "y_min": 0.25, "x_max": 0.99, "y_max": 0.85, "class_name": "unknown", "confidence": 0.97}
+ ]}
+ """
+
+ # when
+ result = block.run(
+ image=image,
+ vlm_output=vlm_output,
+ classes=["cat", "dog", "lion"],
+ model_type="google-gemini",
+ task_type="object-detection",
+ )
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
+
+
+def test_run_method_for_invalid_json() -> None:
+ # given
+ block = VLMAsDetectorBlockV1()
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+
+ # when
+ result = block.run(
+ image=image,
+ vlm_output="invalid",
+ classes=["cat", "dog", "lion"],
+ model_type="google-gemini",
+ task_type="object-detection",
+ )
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
diff --git a/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py b/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py
index feac4169ed..c4fc40237e 100644
--- a/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py
+++ b/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py
@@ -275,7 +275,6 @@ def test_correct_detections_with_keypoints():
src=src_polygon,
dst=dst_polygon,
)
-
# when
corrected_detections = correct_detections(
detections=detections,
diff --git a/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py b/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py
index 399860ccdd..768afa1726 100644
--- a/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py
+++ b/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py
@@ -218,7 +218,7 @@ def test_describe_available_blocks_when_valid_plugins_are_loaded(
assert result.blocks[0].manifest_class == plugin_with_valid_blocks.Block1Manifest
assert result.blocks[1].block_class == plugin_with_valid_blocks.Block2
assert result.blocks[1].manifest_class == plugin_with_valid_blocks.Block2Manifest
- assert len(result.declared_kinds) == 31
+ assert len(result.declared_kinds) > 0
@mock.patch.object(blocks_loader, "load_workflow_blocks")
@@ -259,7 +259,7 @@ def test_describe_available_blocks_when_valid_plugins_are_loaded_and_multiple_ve
result.blocks[2].manifest_class
== plugin_with_multiple_versions_of_blocks.Block2Manifest
)
- assert len(result.declared_kinds) == 31
+ assert len(result.declared_kinds) > 0
@mock.patch.object(blocks_loader, "load_workflow_blocks")