diff --git a/development/docs/build_block_docs.py b/development/docs/build_block_docs.py
index a0fb7c2e52..14e948380e 100644
--- a/development/docs/build_block_docs.py
+++ b/development/docs/build_block_docs.py
@@ -361,7 +361,7 @@ def format_block_connections(
connections = [
(
f"[`{block_type2manifest_type_identifier[connection]}`]"
- f"(/workflows/blocks/{camel_to_snake(block_type2manifest_type_identifier[connection])})"
+ f"(/workflows/blocks/{slugify_block_name(block_type2manifest_type_identifier[connection])})"
)
for connection in connections
]
diff --git a/docs/workflows/blocks.md b/docs/workflows/blocks.md
index 1c6290f45c..ed2f6ecccf 100644
--- a/docs/workflows/blocks.md
+++ b/docs/workflows/blocks.md
@@ -72,6 +72,11 @@ hide:
+
+
+
+
+
diff --git a/docs/workflows/create_workflow_block.md b/docs/workflows/create_workflow_block.md
index c6216f5dde..d14a757703 100644
--- a/docs/workflows/create_workflow_block.md
+++ b/docs/workflows/create_workflow_block.md
@@ -1050,7 +1050,7 @@ def run(self, predictions: List[dict]) -> BlockResult:
)
from inference.core.workflows.execution_engine.entities.types import (
StepOutputSelector,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
)
from inference.core.workflows.prototypes.block import (
BlockResult,
@@ -1063,7 +1063,7 @@ def run(self, predictions: List[dict]) -> BlockResult:
class BlockManifest(WorkflowBlockManifest):
type: Literal["my_plugin/fusion_of_predictions@v1"]
name: str
- predictions: List[StepOutputSelector(kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND])] = Field(
+ predictions: List[StepOutputSelector(kind=[OBJECT_DETECTION_PREDICTION_KIND])] = Field(
description="Selectors to step outputs",
examples=[["$steps.model_1.predictions", "$steps.model_2.predictions"]],
)
@@ -1073,7 +1073,7 @@ def run(self, predictions: List[dict]) -> BlockResult:
return [
OutputDefinition(
name="predictions",
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
]
@@ -1251,8 +1251,8 @@ the method signatures.
ImageParentMetadata,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_IMAGES_KIND,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ IMAGE_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1267,7 +1267,7 @@ the method signatures.
type: Literal["my_block/dynamic_crop@v1"]
image: Union[WorkflowImageSelector, StepOutputImageSelector]
predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
@classmethod
@@ -1277,7 +1277,7 @@ the method signatures.
@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
- OutputDefinition(name="crops", kind=[BATCH_OF_IMAGES_KIND]),
+ OutputDefinition(name="crops", kind=[IMAGE_KIND]),
]
@classmethod
@@ -1340,8 +1340,8 @@ the method signatures.
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_IMAGES_KIND,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ IMAGE_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1357,7 +1357,7 @@ the method signatures.
type: Literal["my_plugin/tile_detections@v1"]
crops: Union[WorkflowImageSelector, StepOutputImageSelector]
crops_predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND]
+ kind=[OBJECT_DETECTION_PREDICTION_KIND]
)
@classmethod
@@ -1367,7 +1367,7 @@ the method signatures.
@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
- OutputDefinition(name="visualisations", kind=[BATCH_OF_IMAGES_KIND]),
+ OutputDefinition(name="visualisations", kind=[IMAGE_KIND]),
]
@@ -1427,7 +1427,7 @@ the method signatures.
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1443,7 +1443,7 @@ the method signatures.
type: Literal["my_plugin/stitch@v1"]
image: Union[WorkflowImageSelector, StepOutputImageSelector]
image_predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
@classmethod
@@ -1463,7 +1463,7 @@ the method signatures.
OutputDefinition(
name="predictions",
kind=[
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
],
),
]
@@ -1526,8 +1526,8 @@ the method signatures.
Batch,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_IMAGES_KIND,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ IMAGE_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1542,7 +1542,7 @@ the method signatures.
type: Literal["my_block/dynamic_crop@v1"]
image: Union[WorkflowImageSelector, StepOutputImageSelector]
predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
@classmethod
@@ -1556,7 +1556,7 @@ the method signatures.
@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
- OutputDefinition(name="crops", kind=[BATCH_OF_IMAGES_KIND]),
+ OutputDefinition(name="crops", kind=[IMAGE_KIND]),
]
@classmethod
@@ -1629,8 +1629,8 @@ the method signatures.
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_IMAGES_KIND,
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ IMAGE_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1646,7 +1646,7 @@ the method signatures.
type: Literal["my_plugin/tile_detections@v1"]
images_crops: Union[WorkflowImageSelector, StepOutputImageSelector]
crops_predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND]
+ kind=[OBJECT_DETECTION_PREDICTION_KIND]
)
@classmethod
@@ -1660,7 +1660,7 @@ the method signatures.
@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
- OutputDefinition(name="visualisations", kind=[BATCH_OF_IMAGES_KIND]),
+ OutputDefinition(name="visualisations", kind=[IMAGE_KIND]),
]
@@ -1726,7 +1726,7 @@ the method signatures.
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
StepOutputImageSelector,
StepOutputSelector,
WorkflowImageSelector,
@@ -1742,7 +1742,7 @@ the method signatures.
type: Literal["my_plugin/stitch@v1"]
images: Union[WorkflowImageSelector, StepOutputImageSelector]
images_predictions: StepOutputSelector(
- kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND],
+ kind=[OBJECT_DETECTION_PREDICTION_KIND],
)
@classmethod
@@ -1766,7 +1766,7 @@ the method signatures.
OutputDefinition(
name="predictions",
kind=[
- BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
],
),
]
diff --git a/docs/workflows/gallery_index.md b/docs/workflows/gallery_index.md
index d734e77111..f9dd8a1a60 100644
--- a/docs/workflows/gallery_index.md
+++ b/docs/workflows/gallery_index.md
@@ -7,6 +7,7 @@ Browse through the various categories to find inspiration and ideas for building
Workflows with multiple models
Workflows enhanced by Roboflow Platform
Workflows with classical Computer Vision methods
+ Workflows with Visual Language Models
Basic Workflows
Workflows with dynamic Python Blocks
Workflows with data transformations
diff --git a/docs/workflows/kinds.md b/docs/workflows/kinds.md
index 4a92216bf4..22482dfd85 100644
--- a/docs/workflows/kinds.md
+++ b/docs/workflows/kinds.md
@@ -23,49 +23,50 @@ for the presence of a mask in the input.
!!! Warning
- The list presented below contains elements with `Batch[X]` markers - those will
- get soon deprecated and we will use only `X` markers. For now, developers are asked
- to create their blocks using the `Batch[X]` markers, but raise the
- [issue here](https://github.com/roboflow/inference/issues/608). This GH issue will be used
- as a point of communication regarding deprecation process.
+ In `inference` release `0.18.0` we decided to make a drastic move to heal the ecosystem
+ from the problem of ambiguous kind names (`Batch[X]` vs `X` - see more
+ [here](https://github.com/roboflow/inference/issues/608)).
+
+ The change is breaking only if a remote Workflow plugin depends on imports
+ from the `inference.core.workflows.execution_engine.entities.types` module, which,
+ to the best of our knowledge, is not the case. We removed the problematic kinds as if they
+ never existed in the ecosystem and fixed all blocks from the `roboflow_core` plugin.
+ If anyone is impacted by the change, here is the
+ [migration guide](https://github.com/roboflow/inference/releases/tag/v0.18.0).
## Kinds declared in Roboflow plugins
-* [`zone`](/workflows/kinds/zone): Definition of polygon zone
-* [`Batch[dictionary]`](/workflows/kinds/batch_dictionary): Batch of dictionaries
-* [`dictionary`](/workflows/kinds/dictionary): Dictionary
-* [`point`](/workflows/kinds/point): Single point in 2D
-* [`Batch[parent_id]`](/workflows/kinds/batch_parent_id): Identifier of parent for step output
-* [`roboflow_model_id`](/workflows/kinds/roboflow_model_id): Roboflow model id
-* [`Batch[classification_prediction]`](/workflows/kinds/batch_classification_prediction): `'predictions'` key from Classification Model outputs
-* [`Batch[top_class]`](/workflows/kinds/batch_top_class): Batch of string values representing top class predicted by classification model
-* [`rgb_color`](/workflows/kinds/rgb_color): RGB color
-* [`Batch[keypoint_detection_prediction]`](/workflows/kinds/batch_keypoint_detection_prediction): `'predictions'` key from Keypoint Detection Model output
-* [`Batch[serialised_payloads]`](/workflows/kinds/batch_serialised_payloads): List of serialised elements that can be registered in the sink
+* [`bar_code_detection`](/workflows/kinds/bar_code_detection): Prediction with barcode detection
+* [`language_model_output`](/workflows/kinds/language_model_output): LLM / VLM output
+* [`top_class`](/workflows/kinds/top_class): String value representing top class predicted by classification model
+* [`prediction_type`](/workflows/kinds/prediction_type): String value with type of prediction
+* [`object_detection_prediction`](/workflows/kinds/object_detection_prediction): Prediction with detected bounding boxes in form of sv.Detections(...) object
+* [`qr_code_detection`](/workflows/kinds/qr_code_detection): Prediction with QR code detection
+* [`image_metadata`](/workflows/kinds/image_metadata): Dictionary with image metadata required by supervision
* [`float_zero_to_one`](/workflows/kinds/float_zero_to_one): `float` value in range `[0.0, 1.0]`
-* [`Batch[boolean]`](/workflows/kinds/batch_boolean): Boolean values batch
-* [`list_of_values`](/workflows/kinds/list_of_values): List of values of any types
-* [`Batch[instance_segmentation_prediction]`](/workflows/kinds/batch_instance_segmentation_prediction): `'predictions'` key from Instance Segmentation Model outputs
-* [`Batch[qr_code_detection]`](/workflows/kinds/batch_qr_code_detection): Prediction with QR code detection
+* [`parent_id`](/workflows/kinds/parent_id): Identifier of parent for step output
+* [`keypoint_detection_prediction`](/workflows/kinds/keypoint_detection_prediction): Prediction with detected bounding boxes and detected keypoints in form of sv.Detections(...) object
+* [`float`](/workflows/kinds/float): Float value
+* [`*`](/workflows/kinds/*): Equivalent of any element
* [`contours`](/workflows/kinds/contours): List of numpy arrays where each array represents contour points
-* [`Batch[image]`](/workflows/kinds/batch_image): Image in workflows
+* [`boolean`](/workflows/kinds/boolean): Boolean flag
* [`detection`](/workflows/kinds/detection): Single element of detections-based prediction (like `object_detection_prediction`)
-* [`Batch[prediction_type]`](/workflows/kinds/batch_prediction_type): String value with type of prediction
+* [`roboflow_project`](/workflows/kinds/roboflow_project): Roboflow project name
+* [`dictionary`](/workflows/kinds/dictionary): Dictionary
+* [`numpy_array`](/workflows/kinds/numpy_array): Numpy array
* [`roboflow_api_key`](/workflows/kinds/roboflow_api_key): Roboflow API key
* [`string`](/workflows/kinds/string): String value
-* [`*`](/workflows/kinds/*): Equivalent of any element
-* [`float`](/workflows/kinds/float): Float value
-* [`keypoint_detection_prediction`](/workflows/kinds/keypoint_detection_prediction): Prediction with detected bounding boxes and detected keypoints in form of sv.Detections(...) object
-* [`Batch[object_detection_prediction]`](/workflows/kinds/batch_object_detection_prediction): `'predictions'` key from Object Detection Model output
-* [`integer`](/workflows/kinds/integer): Integer value
-* [`roboflow_project`](/workflows/kinds/roboflow_project): Roboflow project name
-* [`Batch[string]`](/workflows/kinds/batch_string): Batch of string values
-* [`image`](/workflows/kinds/image): Image in workflows
-* [`Batch[bar_code_detection]`](/workflows/kinds/batch_bar_code_detection): Prediction with barcode detection
-* [`object_detection_prediction`](/workflows/kinds/object_detection_prediction): Prediction with detected bounding boxes in form of sv.Detections(...) object
-* [`boolean`](/workflows/kinds/boolean): Boolean flag
+* [`roboflow_model_id`](/workflows/kinds/roboflow_model_id): Roboflow model id
+* [`list_of_values`](/workflows/kinds/list_of_values): List of values of any types
* [`instance_segmentation_prediction`](/workflows/kinds/instance_segmentation_prediction): Prediction with detected bounding boxes and segmentation masks in form of sv.Detections(...) object
+* [`image`](/workflows/kinds/image): Image in workflows
+* [`video_metadata`](/workflows/kinds/video_metadata): Video image metadata
+* [`serialised_payloads`](/workflows/kinds/serialised_payloads): Serialised element that is usually accepted by sink
+* [`integer`](/workflows/kinds/integer): Integer value
+* [`rgb_color`](/workflows/kinds/rgb_color): RGB color
+* [`classification_prediction`](/workflows/kinds/classification_prediction): Predictions from classifier
* [`image_keypoints`](/workflows/kinds/image_keypoints): Image keypoints detected by classical Computer Vision method
-* [`Batch[image_metadata]`](/workflows/kinds/batch_image_metadata): Dictionary with image metadata required by supervision
+* [`point`](/workflows/kinds/point): Single point in 2D
+* [`zone`](/workflows/kinds/zone): Definition of polygon zone
diff --git a/inference/core/version.py b/inference/core/version.py
index 7dbc1800f2..b3b607f742 100644
--- a/inference/core/version.py
+++ b/inference/core/version.py
@@ -1,4 +1,4 @@
-__version__ = "0.17.1"
+__version__ = "0.18.0"
if __name__ == "__main__":
diff --git a/inference/core/workflows/core_steps/formatters/json_parser/__init__.py b/inference/core/workflows/core_steps/formatters/json_parser/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/formatters/json_parser/v1.py b/inference/core/workflows/core_steps/formatters/json_parser/v1.py
new file mode 100644
index 0000000000..0e1a0f1b3e
--- /dev/null
+++ b/inference/core/workflows/core_steps/formatters/json_parser/v1.py
@@ -0,0 +1,142 @@
+import json
+import logging
+import re
+from typing import List, Literal, Optional, Tuple, Type
+
+from pydantic import AfterValidator, ConfigDict, Field
+from typing_extensions import Annotated
+
+from inference.core.workflows.execution_engine.entities.base import OutputDefinition
+from inference.core.workflows.execution_engine.entities.types import (
+ BOOLEAN_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ StepOutputSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
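+# Matches a ```json ... ``` Markdown fence and captures its payload (non-greedy, case-insensitive).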
+JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE)
+
+LONG_DESCRIPTION = """
+The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and
+Visual Language Models (VLMs). Input is parsed to JSON, and its keys are exposed as block outputs.
+
+Accepted formats:
+- valid JSON strings
+- JSON documents wrapped with Markdown tags (very common for GPT responses)
+```
+{"my": "json"}
+```
+
+**Details regarding block behavior:**
+
+- `error_status` is set `True` whenever at least one of `expected_fields` cannot be retrieved from the input
+
+- in case of multiple markdown blocks with raw JSON content - only the first will be parsed and returned, while
+`error_status` will remain `False`
+"""
+
+SHORT_DESCRIPTION = "Parses raw string into JSON."
+
+
+def validate_reserved_fields(expected_fields: List[str]) -> List[str]:
+ if "error_status" in expected_fields:
+ raise ValueError(
+ "`error_status` is reserved field name and cannot be "
+ "used in `expected_fields` of `roboflow_core/json_parser@v1` block."
+ )
+ return expected_fields
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "JSON Parser",
+ "version": "v1",
+ "short_description": SHORT_DESCRIPTION,
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "formatter",
+ }
+ )
+ type: Literal["roboflow_core/json_parser@v1"]
+ raw_json: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field(
+ description="The string with raw JSON to parse.",
+ examples=[["$steps.lmm.output"]],
+ )
+ expected_fields: Annotated[List[str], AfterValidator(validate_reserved_fields)] = (
+ Field(
+ description="List of expected JSON fields. `error_status` field name is reserved and cannot be used.",
+ examples=[["field_a", "field_b"]],
+ )
+ )
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ OutputDefinition(name="*"),
+ ]
+
+ def get_actual_outputs(self) -> List[OutputDefinition]:
+ result = [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ ]
+ for field_name in self.expected_fields:
+ result.append(OutputDefinition(name=field_name))
+ return result
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class JSONParserBlockV1(WorkflowBlock):
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ def run(
+ self,
+ raw_json: str,
+ expected_fields: List[str],
+ ) -> BlockResult:
+ error_status, parsed_data = string2json(
+ raw_json=raw_json,
+ expected_fields=expected_fields,
+ )
+ parsed_data["error_status"] = error_status
+ return parsed_data
+
+
+def string2json(
+ raw_json: str,
+ expected_fields: List[str],
+) -> Tuple[bool, dict]:
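+ # Prefer the content of the first ```json ... ``` fence; otherwise try to parse the whole string.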
+ json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json)
+ if len(json_blocks_found) == 0:
+ return try_parse_json(raw_json, expected_fields=expected_fields)
+ first_block = json_blocks_found[0]
+ return try_parse_json(first_block, expected_fields=expected_fields)
+
+
+def try_parse_json(content: str, expected_fields: List[str]) -> Tuple[bool, dict]:
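+ # Returns (error_status, fields) - error_status is True when JSON parsing fails or any expected field is missing.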
+ try:
+ parsed_data = json.loads(content)
+ result = {}
+ all_fields_find = True
+ for field in expected_fields:
+ if field not in parsed_data:
+ all_fields_find = False
+ result[field] = parsed_data.get(field)
+ return not all_fields_find, result
+ except Exception as error:
+ logging.warning(
+ f"Could not parse JSON in `roboflow_core/json_parser@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return True, {field: None for field in expected_fields}
diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_classifier/__init__.py b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py
new file mode 100644
index 0000000000..7edce35af6
--- /dev/null
+++ b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py
@@ -0,0 +1,269 @@
+import json
+import logging
+import re
+from typing import Dict, List, Literal, Optional, Tuple, Type, Union
+from uuid import uuid4
+
+from pydantic import ConfigDict, Field
+
+from inference.core.workflows.execution_engine.entities.base import (
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ BOOLEAN_KIND,
+ CLASSIFICATION_PREDICTION_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ STRING_KIND,
+ StepOutputImageSelector,
+ StepOutputSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
+JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE)
+
+LONG_DESCRIPTION = """
+The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and
+Visual Language Models (VLMs). Input is parsed to classification prediction and returned as block output.
+
+Accepted formats:
+
+- valid JSON strings
+
+- JSON documents wrapped with Markdown tags (very common for GPT responses)
+
+Example:
+```
+{"my": "json"}
+```
+
+**Details regarding block behavior:**
+
+- `error_status` is set `True` whenever parsing cannot be completed
+
+- in case of multiple markdown blocks with raw JSON content - only the first will be parsed
+"""
+
+SHORT_DESCRIPTION = "Parses raw string into classification prediction."
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "VLM as Classifier",
+ "version": "v1",
+ "short_description": SHORT_DESCRIPTION,
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "formatter",
+ }
+ )
+ type: Literal["roboflow_core/vlm_as_classifier@v1"]
+ image: Union[WorkflowImageSelector, StepOutputImageSelector] = Field(
+ description="The image which was the base to generate VLM prediction",
+ examples=["$inputs.image", "$steps.cropping.crops"],
+ )
+ vlm_output: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field(
+ title="VLM Output",
+ description="The string with raw classification prediction to parse.",
+ examples=[["$steps.lmm.output"]],
+ )
+ classes: Union[
+ WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]),
+ StepOutputSelector(kind=[LIST_OF_VALUES_KIND]),
+ List[str],
+ ] = Field(
+ description="List of all classes used by the model, required to "
+ "generate mapping between class name and class id.",
+ examples=[["$steps.lmm.classes", "$inputs.classes", ["class_a", "class_b"]]],
+ )
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ OutputDefinition(name="predictions", kind=[CLASSIFICATION_PREDICTION_KIND]),
+ OutputDefinition(name="inference_id", kind=[STRING_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class VLMAsClassifierBlockV1(WorkflowBlock):
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ def run(
+ self,
+ image: WorkflowImageData,
+ vlm_output: str,
+ classes: List[str],
+ ) -> BlockResult:
+ inference_id = f"{uuid4()}"
+ error_status, parsed_data = string2json(
+ raw_json=vlm_output,
+ )
+ if error_status:
+ return {
+ "error_status": True,
+ "predictions": None,
+ "inference_id": inference_id,
+ }
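+ # Choose the parser based on the response schema: multi-class vs multi-label classification output.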
+ if "class_name" in parsed_data and "confidence" in parsed_data:
+ return parse_multi_class_classification_results(
+ image=image,
+ results=parsed_data,
+ classes=classes,
+ inference_id=inference_id,
+ )
+ if "predicted_classes" in parsed_data:
+ return parse_multi_label_classification_results(
+ image=image,
+ results=parsed_data,
+ classes=classes,
+ inference_id=inference_id,
+ )
+ return {
+ "error_status": True,
+ "predictions": None,
+ "inference_id": inference_id,
+ }
+
+
+def string2json(
+ raw_json: str,
+) -> Tuple[bool, dict]:
+ json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json)
+ if len(json_blocks_found) == 0:
+ return try_parse_json(raw_json)
+ first_block = json_blocks_found[0]
+ return try_parse_json(first_block)
+
+
+def try_parse_json(content: str) -> Tuple[bool, dict]:
+ try:
+ return False, json.loads(content)
+ except Exception as error:
+ logging.warning(
+ f"Could not parse JSON to dict in `roboflow_core/vlm_as_classifier@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return True, {}
+
+
+def parse_multi_class_classification_results(
+ image: WorkflowImageData,
+ results: dict,
+ classes: List[str],
+ inference_id: str,
+) -> dict:
+ try:
+ class2id_mapping = create_classes_index(classes=classes)
+ height, width = image.numpy_image.shape[:2]
+ top_class = results["class_name"]
+ confidences = {top_class: scale_confidence(results["confidence"])}
+ predictions = []
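+ # A top class outside the provided class list is reported with class_id -1.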
+ if top_class not in class2id_mapping:
+ predictions.append(
+ {
+ "class_name": top_class,
+ "class_id": -1,
+ "confidence": confidences.get(top_class, 0.0),
+ }
+ )
+ for class_name, class_id in class2id_mapping.items():
+ predictions.append(
+ {
+ "class_name": class_name,
+ "class_id": class_id,
+ "confidence": confidences.get(class_name, 0.0),
+ }
+ )
+ parsed_prediction = {
+ "image": {"width": width, "height": height},
+ "predictions": predictions,
+ "top": top_class,
+ "confidence": confidences[top_class],
+ "inference_id": inference_id,
+ "parent_id": image.parent_metadata.parent_id,
+ }
+ return {
+ "error_status": False,
+ "predictions": parsed_prediction,
+ "inference_id": inference_id,
+ }
+ except Exception as error:
+ logging.warning(
+ f"Could not parse multi-class classification results in `roboflow_core/vlm_as_classifier@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return {"error_status": True, "predictions": None, "inference_id": inference_id}
+
+
+def parse_multi_label_classification_results(
+ image: WorkflowImageData,
+ results: dict,
+ classes: List[str],
+ inference_id: str,
+) -> dict:
+ try:
+ class2id_mapping = create_classes_index(classes=classes)
+ height, width = image.numpy_image.shape[:2]
+ predicted_classes_confidences = {}
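+ # Classes missing from the provided list get class_id -1; duplicated entries keep the highest confidence.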
+ for prediction in results["predicted_classes"]:
+ if prediction["class"] not in class2id_mapping:
+ class2id_mapping[prediction["class"]] = -1
+ if prediction["class"] in predicted_classes_confidences:
+ old_confidence = predicted_classes_confidences[prediction["class"]]
+ new_confidence = scale_confidence(value=prediction["confidence"])
+ predicted_classes_confidences[prediction["class"]] = max(
+ old_confidence, new_confidence
+ )
+ else:
+ predicted_classes_confidences[prediction["class"]] = scale_confidence(
+ value=prediction["confidence"]
+ )
+ predictions = {
+ class_name: {
+ "confidence": predicted_classes_confidences.get(class_name, 0.0),
+ "class_id": class_id,
+ }
+ for class_name, class_id in class2id_mapping.items()
+ }
+ parsed_prediction = {
+ "image": {"width": width, "height": height},
+ "predictions": predictions,
+ "predicted_classes": list(predicted_classes_confidences.keys()),
+ "inference_id": inference_id,
+ "parent_id": image.parent_metadata.parent_id,
+ }
+ return {
+ "error_status": False,
+ "predictions": parsed_prediction,
+ "inference_id": inference_id,
+ }
+ except Exception as error:
+ logging.warning(
+ f"Could not parse multi-label classification results in `roboflow_core/vlm_as_classifier@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return {"error_status": True, "predictions": None, "inference_id": inference_id}
+
+
+def create_classes_index(classes: List[str]) -> Dict[str, int]:
+ return {class_name: idx for idx, class_name in enumerate(classes)}
+
+
+def scale_confidence(value: float) -> float:
+ return min(max(float(value), 0.0), 1.0)
diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_detector/__init__.py b/inference/core/workflows/core_steps/formatters/vlm_as_detector/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py b/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py
new file mode 100644
index 0000000000..3dbb7cf3dc
--- /dev/null
+++ b/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py
@@ -0,0 +1,261 @@
+import json
+import logging
+import re
+from typing import Dict, List, Literal, Optional, Tuple, Type, Union
+from uuid import uuid4
+
+import numpy as np
+import supervision as sv
+from pydantic import ConfigDict, Field, model_validator
+from supervision.config import CLASS_NAME_DATA_FIELD
+
+from inference.core.workflows.core_steps.common.utils import (
+ attach_parents_coordinates_to_sv_detections,
+)
+from inference.core.workflows.execution_engine.constants import (
+ DETECTION_ID_KEY,
+ IMAGE_DIMENSIONS_KEY,
+ INFERENCE_ID_KEY,
+ PREDICTION_TYPE_KEY,
+)
+from inference.core.workflows.execution_engine.entities.base import (
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ BOOLEAN_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ OBJECT_DETECTION_PREDICTION_KIND,
+ STRING_KIND,
+ StepOutputImageSelector,
+ StepOutputSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
+JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE)
+
+LONG_DESCRIPTION = """
+The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and
+Visual Language Models (VLMs). Input is parsed to object-detection prediction and returned as block output.
+
+Accepted formats:
+
+- valid JSON strings
+
+- JSON documents wrapped with Markdown tags
+
+Example
+```
+{"my": "json"}
+```
+
+**Details regarding block behavior:**
+
+- `error_status` is set `True` whenever parsing cannot be completed
+
+- in case of multiple markdown blocks with raw JSON content - only the first will be parsed
+"""
+
+SHORT_DESCRIPTION = "Parses raw string into object-detection prediction."
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "VLM as Detector",
+ "version": "v1",
+ "short_description": SHORT_DESCRIPTION,
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "formatter",
+ }
+ )
+ type: Literal["roboflow_core/vlm_as_detector@v1"]
+ image: Union[WorkflowImageSelector, StepOutputImageSelector] = Field(
+ description="The image which was the base to generate VLM prediction",
+ examples=["$inputs.image", "$steps.cropping.crops"],
+ )
+ vlm_output: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field(
+ title="VLM Output",
+ description="The string with raw classification prediction to parse.",
+ examples=[["$steps.lmm.output"]],
+ )
+ classes: Union[
+ WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]),
+ StepOutputSelector(kind=[LIST_OF_VALUES_KIND]),
+ List[str],
+ ] = Field(
+ description="List of all classes used by the model, required to "
+ "generate mapping between class name and class id.",
+ examples=[["$steps.lmm.classes", "$inputs.classes", ["class_a", "class_b"]]],
+ )
+ model_type: Literal["google-gemini", "anthropic-claude"] = Field(
+ description="Type of the model that generated prediction",
+ examples=[["google-gemini", "anthropic-claude"]],
+ )
+ task_type: Literal["object-detection"]
+
+ @model_validator(mode="after")
+ def validate(self) -> "BlockManifest":
+ if (self.model_type, self.task_type) not in REGISTERED_PARSERS:
+ raise ValueError(
+ f"Could not parse result of task {self.task_type} for model {self.model_type}"
+ )
+ return self
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ OutputDefinition(
+ name="predictions", kind=[OBJECT_DETECTION_PREDICTION_KIND]
+ ),
+ OutputDefinition(name="inference_id", kind=[STRING_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class VLMAsDetectorBlockV1(WorkflowBlock):
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ def run(
+ self,
+ image: WorkflowImageData,
+ vlm_output: str,
+ classes: List[str],
+ model_type: str,
+ task_type: str,
+ ) -> BlockResult:
+ inference_id = f"{uuid4()}"
+ error_status, parsed_data = string2json(
+ raw_json=vlm_output,
+ )
+ if error_status:
+ return {
+ "error_status": True,
+ "predictions": None,
+ "inference_id": inference_id,
+ }
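+ # Delegate parsing to the function registered for this (model_type, task_type) pair.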
+ try:
+ predictions = REGISTERED_PARSERS[(model_type, task_type)](
+ image=image,
+ parsed_data=parsed_data,
+ classes=classes,
+ inference_id=inference_id,
+ )
+ return {
+ "error_status": False,
+ "predictions": predictions,
+ "inference_id": inference_id,
+ }
+ except Exception as error:
+ logging.warning(
+ f"Could not parse VLM prediction for model {model_type} and task {task_type} "
+ f"in `roboflow_core/vlm_as_detector@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return {
+ "error_status": True,
+ "predictions": None,
+ "inference_id": inference_id,
+ }
+
+
+def string2json(
+ raw_json: str,
+) -> Tuple[bool, dict]:
+ json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json)
+ if len(json_blocks_found) == 0:
+ return try_parse_json(raw_json)
+ first_block = json_blocks_found[0]
+ return try_parse_json(first_block)
+
+
+def try_parse_json(content: str) -> Tuple[bool, dict]:
+ try:
+ return False, json.loads(content)
+ except Exception as error:
+ logging.warning(
+ f"Could not parse JSON to dict in `roboflow_core/vlm_as_detector@v1` block. "
+ f"Error type: {error.__class__.__name__}. Details: {error}"
+ )
+ return True, {}
+
+
+def parse_gemini_object_detection_response(
+ image: WorkflowImageData,
+ parsed_data: dict,
+ classes: List[str],
+ inference_id: str,
+) -> sv.Detections:
+ class_name2id = create_classes_index(classes=classes)
+ image_height, image_width = image.numpy_image.shape[:2]
+ if len(parsed_data["detections"]) == 0:
+ return sv.Detections.empty()
+ xyxy, class_id, class_name, confidence = [], [], [], []
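+ # The model reports coordinates normalised to [0.0, 1.0]; scale them to pixel values.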
+ for detection in parsed_data["detections"]:
+ xyxy.append(
+ [
+ detection["x_min"] * image_width,
+ detection["y_min"] * image_height,
+ detection["x_max"] * image_width,
+ detection["y_max"] * image_height,
+ ]
+ )
+ class_id.append(class_name2id.get(detection["class_name"], -1))
+ class_name.append(detection["class_name"])
+ confidence.append(scale_confidence(detection.get("confidence", 1.0)))
+ xyxy = np.array(xyxy).round(0) if len(xyxy) > 0 else np.empty((0, 4))
+ confidence = np.array(confidence) if len(confidence) > 0 else np.empty(0)
+ class_id = np.array(class_id).astype(int) if len(class_id) > 0 else np.empty(0)
+ class_name = np.array(class_name) if len(class_name) > 0 else np.empty(0)
+ detection_ids = np.array([str(uuid4()) for _ in range(len(xyxy))])
+ dimensions = np.array([[image_height, image_width]] * len(xyxy))
+ inference_ids = np.array([inference_id] * len(xyxy))
+ prediction_type = np.array(["object-detection"] * len(xyxy))
+ data = {
+ CLASS_NAME_DATA_FIELD: class_name,
+ IMAGE_DIMENSIONS_KEY: dimensions,
+ INFERENCE_ID_KEY: inference_ids,
+ DETECTION_ID_KEY: detection_ids,
+ PREDICTION_TYPE_KEY: prediction_type,
+ }
+ detections = sv.Detections(
+ xyxy=xyxy,
+ confidence=confidence,
+ class_id=class_id,
+ mask=None,
+ tracker_id=None,
+ data=data,
+ )
+ return attach_parents_coordinates_to_sv_detections(
+ detections=detections,
+ image=image,
+ )
+
+
+def create_classes_index(classes: List[str]) -> Dict[str, int]:
+ return {class_name: idx for idx, class_name in enumerate(classes)}
+
+
+def scale_confidence(value: float) -> float:
+ return min(max(float(value), 0.0), 1.0)
+
+
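+# Claude is prompted to produce the same detection JSON schema, so it reuses the Gemini parser.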
+REGISTERED_PARSERS = {
+ ("google-gemini", "object-detection"): parse_gemini_object_detection_response,
+ ("anthropic-claude", "object-detection"): parse_gemini_object_detection_response,
+}
diff --git a/inference/core/workflows/core_steps/loader.py b/inference/core/workflows/core_steps/loader.py
index 6cfca977d1..e09ef79df4 100644
--- a/inference/core/workflows/core_steps/loader.py
+++ b/inference/core/workflows/core_steps/loader.py
@@ -40,9 +40,18 @@
from inference.core.workflows.core_steps.formatters.first_non_empty_or_default.v1 import (
FirstNonEmptyOrDefaultBlockV1,
)
+from inference.core.workflows.core_steps.formatters.json_parser.v1 import (
+ JSONParserBlockV1,
+)
from inference.core.workflows.core_steps.formatters.property_definition.v1 import (
PropertyDefinitionBlockV1,
)
+from inference.core.workflows.core_steps.formatters.vlm_as_classifier.v1 import (
+ VLMAsClassifierBlockV1,
+)
+from inference.core.workflows.core_steps.formatters.vlm_as_detector.v1 import (
+ VLMAsDetectorBlockV1,
+)
from inference.core.workflows.core_steps.fusion.detections_classes_replacement.v1 import (
DetectionsClassesReplacementBlockV1,
)
@@ -55,6 +64,9 @@
from inference.core.workflows.core_steps.fusion.dimension_collapse.v1 import (
DimensionCollapseBlockV1,
)
+from inference.core.workflows.core_steps.models.foundation.anthropic_claude.v1 import (
+ AntropicClaudeBlockV1,
+)
from inference.core.workflows.core_steps.models.foundation.clip_comparison.v1 import (
ClipComparisonBlockV1,
)
@@ -64,6 +76,9 @@
from inference.core.workflows.core_steps.models.foundation.cog_vlm.v1 import (
CogVLMBlockV1,
)
+from inference.core.workflows.core_steps.models.foundation.google_gemini.v1 import (
+ GoogleGeminiBlockV1,
+)
from inference.core.workflows.core_steps.models.foundation.lmm.v1 import LMMBlockV1
from inference.core.workflows.core_steps.models.foundation.lmm_classifier.v1 import (
LMMForClassificationBlockV1,
@@ -72,6 +87,9 @@
from inference.core.workflows.core_steps.models.foundation.openai.v1 import (
OpenAIBlockV1,
)
+from inference.core.workflows.core_steps.models.foundation.openai.v2 import (
+ OpenAIBlockV2,
+)
from inference.core.workflows.core_steps.models.foundation.segment_anything2.v1 import (
SegmentAnything2BlockV1,
)
@@ -197,6 +215,7 @@
INSTANCE_SEGMENTATION_PREDICTION_KIND,
INTEGER_KIND,
KEYPOINT_DETECTION_PREDICTION_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
LIST_OF_VALUES_KIND,
NUMPY_ARRAY_KIND,
OBJECT_DETECTION_PREDICTION_KIND,
@@ -290,6 +309,12 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
ClipComparisonBlockV2,
CameraFocusBlockV1,
RoboflowDatasetUploadBlockV2,
+ OpenAIBlockV2,
+ JSONParserBlockV1,
+ VLMAsClassifierBlockV1,
+ GoogleGeminiBlockV1,
+ VLMAsDetectorBlockV1,
+ AntropicClaudeBlockV1,
]
@@ -320,6 +345,7 @@ def load_kinds() -> List[Kind]:
RGB_COLOR_KIND,
IMAGE_KEYPOINTS_KIND,
CONTOURS_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
NUMPY_ARRAY_KIND,
QR_CODE_DETECTION_KIND,
BAR_CODE_DETECTION_KIND,
diff --git a/inference/core/workflows/core_steps/models/foundation/anthropic_claude/__init__.py b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py
new file mode 100644
index 0000000000..370ea4cc72
--- /dev/null
+++ b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py
@@ -0,0 +1,657 @@
+import base64
+import json
+import re
+from functools import partial
+from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
+
+import anthropic
+from anthropic import NOT_GIVEN
+from pydantic import ConfigDict, Field, model_validator
+
+from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+from inference.core.managers.base import ModelManager
+from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image
+from inference.core.utils.preprocess import downscale_image_keeping_aspect_ratio
+from inference.core.workflows.core_steps.common.utils import run_in_parallel
+from inference.core.workflows.execution_engine.entities.base import (
+ Batch,
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ FLOAT_KIND,
+ INTEGER_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ STRING_KIND,
+ ImageInputField,
+ StepOutputImageSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
+LONG_DESCRIPTION = """
+Ask a question to the Anthropic Claude model with vision capabilities.
+
+You can specify arbitrary text prompts or use predefined ones; the block supports the following types of prompt:
+
+- `unconstrained` - any arbitrary prompt you like
+
+- `ocr` - predefined prompt to recognise text from an image
+
+- `visual-question-answering` - your prompt is supposed to provide a question and will be
+wrapped into a structure that is suited for the VQA task
+
+- `caption` - predefined prompt to generate a short caption of the image
+
+- `detailed-caption` - predefined prompt to generate an elaborate caption of the image
+
+- `classification` - predefined prompt to generate multi-class classification output (that can be parsed
+with `VLM as Classifier` block)
+
+- `multi-label-classification` - predefined prompt to generate multi-label classification output (that
+can be parsed with `VLM as Classifier` block)
+
+- `object-detection` - predefined prompt to generate object detection output (that can be parsed
+with `VLM as Detector` block)
+
+- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser`
+block.
+
+You need to provide your Anthropic API key to use the Claude model.
+"""
+
+TaskType = Literal[
+ "unconstrained",
+ "ocr",
+ "visual-question-answering",
+ "caption",
+ "detailed-caption",
+ "classification",
+ "multi-label-classification",
+ "structured-answering",
+ "object-detection",
+]
+
+TASKS_REQUIRING_PROMPT = {
+ "unconstrained",
+ "visual-question-answering",
+}
+
+TASKS_REQUIRING_CLASSES = {
+ "classification",
+ "multi-label-classification",
+ "object-detection",
+}
+
+TASKS_REQUIRING_OUTPUT_STRUCTURE = {
+ "structured-answering",
+}
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "Anthropic Claude",
+ "version": "v1",
+ "short_description": "Run Anthropic Claude model with vision capabilities",
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "model",
+ "search_keywords": ["LMM", "VLM", "Claude", "Anthropic"],
+ }
+ )
+ type: Literal["roboflow_core/anthropic_claude@v1"]
+ images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField
+ task_type: TaskType = Field(
+ description="Task type to be performed by model. Value of parameter determine set of fields "
+ "that are required. For `unconstrained`, `visual-question-answering`, "
+ " - `prompt` parameter must be provided."
+ "For `structured-answering` - `output-structure` must be provided. For "
+ "`classification`, `multi-label-classification` and `object-detection` - "
+ "`classes` must be filled. `ocr`, `caption`, `detailed-caption` do not"
+ "require any additional parameter.",
+ )
+ prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field(
+ default=None,
+ description="Text prompt to the Claude model",
+ examples=["my prompt", "$inputs.prompt"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True},
+ },
+ },
+ )
+ output_structure: Optional[Dict[str, str]] = Field(
+ default=None,
+ description="Dictionary with structure of expected JSON response",
+ examples=[{"my_key": "description"}, "$inputs.output_structure"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {"values": TASKS_REQUIRING_CLASSES, "required": True},
+ },
+ },
+ )
+ classes: Optional[
+ Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]]
+ ] = Field(
+ default=None,
+ description="List of classes to be used",
+ examples=[["class-a", "class-b"], "$inputs.classes"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {
+ "values": TASKS_REQUIRING_OUTPUT_STRUCTURE,
+ "required": True,
+ },
+ },
+ },
+ )
+ api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field(
+ description="Your Antropic API key",
+ examples=["xxx-xxx", "$inputs.antropics_api_key"],
+ private=True,
+ )
+ model_version: Union[
+ WorkflowParameterSelector(kind=[STRING_KIND]),
+ Literal[
+ "claude-3-5-sonnet", "claude-3-opus", "claude-3-sonnet", "claude-3-haiku"
+ ],
+ ] = Field(
+ default="claude-3-5-sonnet",
+ description="Model to be used",
+ examples=["claude-3-5-sonnet", "$inputs.claude"],
+ )
+ max_tokens: int = Field(
+ default=450,
+ description="Maximum number of tokens the model can generate in it's response.",
+ )
+ temperature: Optional[
+ Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])]
+ ] = Field(
+ default=None,
+ description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more "
+ 'random / "creative" the generations are.',
+ ge=0.0,
+ le=2.0,
+ )
+ max_image_size: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = Field(
+ description="Maximum size of the image - if input has larger side, it will be downscaled, keeping aspect ratio",
+ default=1024,
+ )
+ max_concurrent_requests: Optional[int] = Field(
+ default=None,
+ description="Number of concurrent requests that can be executed by block when batch of input images provided. "
+ "If not given - block defaults to value configured globally in Workflows Execution Engine. "
+ "Please restrict if you hit ANtropic API limits.",
+ )
+
+ @model_validator(mode="after")
+ def validate(self) -> "BlockManifest":
+ if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None:
+ raise ValueError(
+ f"`prompt` parameter required to be set for task `{self.task_type}`"
+ )
+ if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None:
+ raise ValueError(
+ f"`classes` parameter required to be set for task `{self.task_type}`"
+ )
+ if (
+ self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
+ and self.output_structure is None
+ ):
+ raise ValueError(
+ f"`output_structure` parameter required to be set for task `{self.task_type}`"
+ )
+ return self
+
+ @classmethod
+ def accepts_batch_input(cls) -> bool:
+ return True
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(
+ name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND]
+ ),
+ OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class AntropicClaudeBlockV1(WorkflowBlock):
+
+ def __init__(
+ self,
+ model_manager: ModelManager,
+ api_key: Optional[str],
+ ):
+ self._model_manager = model_manager
+ self._api_key = api_key
+
+ @classmethod
+ def get_init_parameters(cls) -> List[str]:
+ return ["model_manager", "api_key"]
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+ def run(
+ self,
+ images: Batch[WorkflowImageData],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ api_key: str,
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_image_size: int,
+ max_concurrent_requests: Optional[int],
+ ) -> BlockResult:
+ inference_images = [i.to_inference_format() for i in images]
+ raw_outputs = run_claude_prompting(
+ images=inference_images,
+ task_type=task_type,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ api_key=api_key,
+ model_version=model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_image_size=max_image_size,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+ return [
+ {"output": raw_output, "classes": classes} for raw_output in raw_outputs
+ ]
+
+
+def run_claude_prompting(
+ images: List[Dict[str, Any]],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ api_key: str,
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_image_size: int,
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ if task_type not in PROMPT_BUILDERS:
+ raise ValueError(f"Task type: {task_type} not supported.")
+ prompts = []
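+ # Build one (system_prompt, messages) payload per input image.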
+ for image in images:
+ loaded_image, _ = load_image(image)
+ loaded_image = downscale_image_keeping_aspect_ratio(
+ image=loaded_image, desired_size=(max_image_size, max_image_size)
+ )
+ base64_image = base64.b64encode(
+ encode_image_to_jpeg_bytes(loaded_image)
+ ).decode("ascii")
+ # Use a separate name so the original `prompt` argument is not overwritten between iterations.
+ prepared_prompt = PROMPT_BUILDERS[task_type](
+ base64_image=base64_image,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ )
+ prompts.append(prepared_prompt)
+ return execute_claude_requests(
+ api_key=api_key,
+ prompts=prompts,
+ model_version=model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+
+
+def execute_claude_requests(
+ api_key: str,
+ prompts: List[Tuple[Optional[str], List[dict]]],
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ tasks = [
+ partial(
+ execute_claude_request,
+ system_prompt=prompt[0],
+ messages=prompt[1],
+ model_version=model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ api_key=api_key,
+ )
+ for prompt in prompts
+ ]
+ max_workers = (
+ max_concurrent_requests
+ or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+ )
+ return run_in_parallel(
+ tasks=tasks,
+ max_workers=max_workers,
+ )
+
+
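+# Aliases map to pinned, dated Anthropic model identifiers.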
+EXACT_MODELS_VERSIONS_MAPPING = {
+ "claude-3-5-sonnet": "claude-3-5-sonnet-20240620",
+ "claude-3-opus": "claude-3-opus-20240229",
+ "claude-3-sonnet": "claude-3-sonnet-20240229",
+ "claude-3-haiku": "claude-3-haiku-20240307",
+}
+
+
+def execute_claude_request(
+ system_prompt: Optional[str],
+ messages: List[dict],
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ api_key: str,
+) -> str:
+ client = anthropic.Anthropic(api_key=api_key)
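+ # NOT_GIVEN is the Anthropic SDK sentinel used to omit optional request parameters.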
+ if system_prompt is None:
+ system_prompt = NOT_GIVEN
+ if temperature is None:
+ temperature = NOT_GIVEN
+ result = client.messages.create(
+ system=system_prompt,
+ messages=messages,
+ max_tokens=max_tokens,
+ model=EXACT_MODELS_VERSIONS_MAPPING[model_version],
+ temperature=temperature,
+ )
+ return result.content[0].text
+
+
+def prepare_unconstrained_prompt(
+ base64_image: str,
+ prompt: str,
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": prompt,
+ },
+ ],
+ }
+ ]
+ return None, messages
+
+
+def prepare_classification_prompt(
+ base64_image: str,
+ classes: List[str],
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ serialised_classes = ", ".join(classes)
+ system_prompt = (
+ "You act as single-class classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. '
+ "`class-name` must be one of the class names defined by user. You are only allowed to return "
+ "single JSON document, even if there are potentially multiple classes. You are not allowed to "
+ "return list."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_multi_label_classification_prompt(
+ base64_image: str,
+ classes: List[str],
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ serialised_classes = ", ".join(classes)
+ system_prompt = (
+ "You act as multi-label classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, '
+ '{"class": "class-name-2", "confidence": 0.7}]}.'
+ "`class-name-X` must be one of the class names defined by user and `confidence` is a float value "
+ "in range 0.0-1.0 that represents how sure you are that the class is present in the image. "
+ "Only return class names that are visible."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_vqa_prompt(
+ base64_image: str,
+ prompt: str,
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ system_prompt = (
+ "You act as Visual Question Answering model. Your task is to provide answer to question"
+ "submitted by user. If this is open-question - answer with few sentences, for ABCD question, "
+ "return only the indicator of the answer."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"Question: {prompt}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_ocr_prompt(
+ base64_image: str,
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ system_prompt = (
+ "You act as OCR model. Your task is to read text from the image and return it in "
+ "paragraphs representing the structure of texts in the image. You should only return "
+ "recognised text, nothing else."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_caption_prompt(
+ base64_image: str,
+ short_description: bool,
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ caption_detail_level = "Caption should be short."
+ if not short_description:
+ caption_detail_level = "Caption should be extensive."
+ system_prompt = (
+ f"You act as image caption model. Your task is to provide description of the image. "
+ f"{caption_detail_level}"
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_structured_answering_prompt(
+ base64_image: str,
+ output_structure: Dict[str, str],
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ output_structure_serialised = json.dumps(output_structure, indent=4)
+ system_prompt = (
+ "You are supposed to produce responses in JSON. User is to provide you dictionary with "
+ "keys and values. Each key must be present in your response. Values in user dictionary "
+ "represent descriptions for JSON fields to be generated. Provide only JSON in response."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"Specification of requirements regarding output fields: \n"
+ f"{output_structure_serialised}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
+def prepare_object_detection_prompt(
+ base64_image: str,
+ classes: List[str],
+ **kwargs,
+) -> Tuple[Optional[str], List[dict]]:
+ serialised_classes = ", ".join(classes)
+ system_prompt = (
+ "You act as object-detection model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"detections": [{"x_min": 0.1, "y_min": 0.2, "x_max": 0.3, "y_max": 0.4, "class_name": "my-class-X", "confidence": 0.7}]} '
+ "- remember to close top-level dictionary at the end. "
+ "`my-class-X` must be one of the class names defined by user. All coordinates must be in range 0.0-1.0, representing percentage of image dimensions. "
+ "`confidence` is a value in range 0.0-1.0 representing your confidence in prediction. You should detect all instances of classes provided by user."
+ )
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/jpeg",
+ "data": base64_image,
+ },
+ },
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ }
+ ]
+ return system_prompt, messages
+
+
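+# Maps each supported `task_type` to the builder returning the (system_prompt, messages) pair
+# for the Anthropic Messages API; both caption variants reuse one builder via functools.partial.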
+PROMPT_BUILDERS = {
+ "unconstrained": prepare_unconstrained_prompt,
+ "ocr": prepare_ocr_prompt,
+ "visual-question-answering": prepare_vqa_prompt,
+ "caption": partial(prepare_caption_prompt, short_description=True),
+ "detailed-caption": partial(prepare_caption_prompt, short_description=False),
+ "classification": prepare_classification_prompt,
+ "multi-label-classification": prepare_multi_label_classification_prompt,
+ "structured-answering": prepare_structured_answering_prompt,
+ "object-detection": prepare_object_detection_prompt,
+}
diff --git a/inference/core/workflows/core_steps/models/foundation/google_gemini/__init__.py b/inference/core/workflows/core_steps/models/foundation/google_gemini/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py b/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py
new file mode 100644
index 0000000000..9fb2d6638a
--- /dev/null
+++ b/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py
@@ -0,0 +1,725 @@
+import base64
+import json
+import re
+from functools import partial
+from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+import requests
+from pydantic import ConfigDict, Field, model_validator
+from requests import Response
+
+from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+from inference.core.managers.base import ModelManager
+from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image
+from inference.core.workflows.core_steps.common.utils import run_in_parallel
+from inference.core.workflows.execution_engine.entities.base import (
+ Batch,
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ FLOAT_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ STRING_KIND,
+ ImageInputField,
+ StepOutputImageSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
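+# Pattern used to redact the `key=...` query parameter from request URLs, so the Google API key
+# does not leak into logs or exception messages when a request fails.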
+GOOGLE_API_KEY_PATTERN = re.compile(r"key=(.[^&]*)")
+GOOGLE_API_KEY_VALUE_GROUP = 1
+MIN_KEY_LENGTH_TO_REVEAL_PREFIX = 8
+
+LONG_DESCRIPTION = """
+Ask a question to Google's Gemini model with vision capabilities.
+
+You can specify arbitrary text prompts or use predefined ones; the block supports the following types of prompt:
+
+- `unconstrained` - any arbitrary prompt you like
+
+- `ocr` - predefined prompt to recognise text from image
+
+- `visual-question-answering` - your prompt is supposed to provide a question and will be
+wrapped into a structure suited for the VQA task
+
+- `caption` - predefined prompt to generate short caption of the image
+
+- `detailed-caption` - predefined prompt to generate elaborated caption of the image
+
+- `classification` - predefined prompt to generate multi-class classification output (that can be parsed
+with `VLM as Classifier` block)
+
+- `multi-label-classification` - predefined prompt to generate multi-label classification output (that
+can be parsed with `VLM as Classifier` block)
+
+- `object-detection` - predefined prompt to generate object detection output (that can be parsed
+with `VLM as Detector` block)
+
+- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser`
+block.
+
+You need to provide your Google AI API key to use the Gemini model.
+
+**WARNING!**
+
+This block makes use of `/v1beta` API of Google Gemini model - the implementation may change
+in the future, without guarantee of backward compatibility.
+"""
+
+TaskType = Literal[
+ "unconstrained",
+ "ocr",
+ "visual-question-answering",
+ "caption",
+ "detailed-caption",
+ "classification",
+ "multi-label-classification",
+ "structured-answering",
+ "object-detection",
+]
+
+TASKS_REQUIRING_PROMPT = {
+ "unconstrained",
+ "visual-question-answering",
+}
+
+TASKS_REQUIRING_CLASSES = {
+ "classification",
+ "multi-label-classification",
+ "object-detection",
+}
+
+TASKS_REQUIRING_OUTPUT_STRUCTURE = {
+ "structured-answering",
+}
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "Google Gemini",
+ "version": "v1",
+ "short_description": "Run Google's Gemini model with vision capabilities",
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "model",
+ "search_keywords": ["LMM", "VLM", "Gemini", "Google"],
+ "beta": True,
+ }
+ )
+ type: Literal["roboflow_core/google_gemini@v1"]
+ images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField
+ task_type: TaskType = Field(
+ description="Task type to be performed by model. Value of parameter determine set of fields "
+ "that are required. For `unconstrained`, `visual-question-answering`, "
+ " - `prompt` parameter must be provided."
+ "For `structured-answering` - `output-structure` must be provided. For "
+ "`classification`, `multi-label-classification` and `object-detection` - "
+ "`classes` must be filled. `ocr`, `caption`, `detailed-caption` do not"
+ "require any additional parameter.",
+ )
+ prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field(
+ default=None,
+ description="Text prompt to the Gemini model",
+ examples=["my prompt", "$inputs.prompt"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True},
+ },
+ },
+ )
+ output_structure: Optional[Dict[str, str]] = Field(
+ default=None,
+ description="Dictionary with structure of expected JSON response",
+ examples=[{"my_key": "description"}, "$inputs.output_structure"],
+        json_schema_extra={
+            "relevant_for": {
+                "task_type": {
+                    "values": TASKS_REQUIRING_OUTPUT_STRUCTURE,
+                    "required": True,
+                },
+            },
+        },
+ )
+ classes: Optional[
+ Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]]
+ ] = Field(
+ default=None,
+ description="List of classes to be used",
+ examples=[["class-a", "class-b"], "$inputs.classes"],
+        json_schema_extra={
+            "relevant_for": {
+                "task_type": {"values": TASKS_REQUIRING_CLASSES, "required": True},
+            },
+ },
+ )
+ api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field(
+ description="Your Google AI API key",
+ examples=["xxx-xxx", "$inputs.google_api_key"],
+ private=True,
+ )
+ model_version: Union[
+ WorkflowParameterSelector(kind=[STRING_KIND]),
+ Literal["gemini-1.5-flash", "gemini-1.5-pro"],
+ ] = Field(
+ default="gemini-1.5-flash",
+ description="Model to be used",
+ examples=["gemini-1.5-flash", "$inputs.gemini_model"],
+ )
+ max_tokens: int = Field(
+ default=450,
+ description="Maximum number of tokens the model can generate in it's response.",
+ )
+ temperature: Optional[
+ Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])]
+ ] = Field(
+ default=None,
+ description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more "
+ 'random / "creative" the generations are.',
+ ge=0.0,
+ le=2.0,
+ )
+ max_concurrent_requests: Optional[int] = Field(
+ default=None,
+ description="Number of concurrent requests that can be executed by block when batch of input images provided. "
+ "If not given - block defaults to value configured globally in Workflows Execution Engine. "
+ "Please restrict if you hit Google Gemini API limits.",
+ )
+
+ @model_validator(mode="after")
+ def validate(self) -> "BlockManifest":
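+        # Cross-field validation: make sure the parameters implied by `task_type` were provided.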
+ if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None:
+ raise ValueError(
+ f"`prompt` parameter required to be set for task `{self.task_type}`"
+ )
+ if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None:
+ raise ValueError(
+ f"`classes` parameter required to be set for task `{self.task_type}`"
+ )
+ if (
+ self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
+ and self.output_structure is None
+ ):
+ raise ValueError(
+ f"`output_structure` parameter required to be set for task `{self.task_type}`"
+ )
+ return self
+
+ @classmethod
+ def accepts_batch_input(cls) -> bool:
+ return True
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(
+ name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND]
+ ),
+ OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class GoogleGeminiBlockV1(WorkflowBlock):
+
+ def __init__(
+ self,
+ model_manager: ModelManager,
+ api_key: Optional[str],
+ ):
+ self._model_manager = model_manager
+ self._api_key = api_key
+
+ @classmethod
+ def get_init_parameters(cls) -> List[str]:
+ return ["model_manager", "api_key"]
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+ def run(
+ self,
+ images: Batch[WorkflowImageData],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ api_key: str,
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+ ) -> BlockResult:
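+        # Convert workflow images to the inference format and fan the whole batch out to Gemini;
+        # `classes` is echoed in every result so downstream parser blocks can map predictions back.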
+ inference_images = [i.to_inference_format() for i in images]
+ raw_outputs = run_gemini_prompting(
+ images=inference_images,
+ task_type=task_type,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ google_api_key=api_key,
+ model_version=model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+ return [
+ {"output": raw_output, "classes": classes} for raw_output in raw_outputs
+ ]
+
+
+def run_gemini_prompting(
+ images: List[Dict[str, Any]],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ google_api_key: Optional[str],
+ model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
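+    # Build one request payload per input image (re-encoded as JPEG and base64-encoded),
+    # then dispatch all requests to the Gemini API in parallel.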
+ if task_type not in PROMPT_BUILDERS:
+ raise ValueError(f"Task type: {task_type} not supported.")
+ gemini_prompts = []
+ for image in images:
+ loaded_image, _ = load_image(image)
+ base64_image = base64.b64encode(
+ encode_image_to_jpeg_bytes(loaded_image)
+ ).decode("ascii")
+ prompt = PROMPT_BUILDERS[task_type](
+ base64_image=base64_image,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ )
+ gemini_prompts.append(prompt)
+ return execute_gemini_requests(
+ google_api_key=google_api_key,
+ gemini_prompts=gemini_prompts,
+ model_version=model_version,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+
+
+def execute_gemini_requests(
+ google_api_key: str,
+ gemini_prompts: List[dict],
+ model_version: str,
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ tasks = [
+ partial(
+ execute_gemini_request,
+ prompt=prompt,
+ model_version=model_version,
+ google_api_key=google_api_key,
+ )
+ for prompt in gemini_prompts
+ ]
+ max_workers = (
+ max_concurrent_requests
+ or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+ )
+ return run_in_parallel(
+ tasks=tasks,
+ max_workers=max_workers,
+ )
+
+
+def execute_gemini_request(
+ prompt: dict,
+ model_version: str,
+ google_api_key: str,
+) -> str:
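+    # Call the `generateContent` endpoint of the `/v1beta` REST API; the API key travels as
+    # a query parameter and is redacted from the URL if the request fails.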
+ response = requests.post(
+ f"https://generativelanguage.googleapis.com/v1beta/models/{model_version}:generateContent",
+ headers={
+ "Content-Type": "application/json",
+ },
+ params={
+ "key": google_api_key,
+ },
+ json=prompt,
+ )
+ response_data = response.json()
+ google_api_key_safe_raise_for_status(response=response)
+ return response_data["candidates"][0]["content"]["parts"][0]["text"]
+
+
+def prepare_unconstrained_prompt(
+ base64_image: str,
+ prompt: str,
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ return {
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": prompt,
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ }
+
+
+def prepare_classification_prompt(
+ base64_image: str,
+ classes: List[str],
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ serialised_classes = ", ".join(classes)
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as single-class classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. '
+ "`class-name` must be one of the class names defined by user. You are only allowed to return "
+ "single JSON document, even if there are potentially multiple classes. You are not allowed to "
+ "return list.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ response_mime_type="application/json",
+ ),
+ }
+
+
+def prepare_multi_label_classification_prompt(
+ base64_image: str,
+ classes: List[str],
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ serialised_classes = ", ".join(classes)
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as multi-label classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, '
+ '{"class": "class-name-2", "confidence": 0.7}]}. '
+ "`class-name-X` must be one of the class names defined by user and `confidence` is a float value "
+ "in range 0.0-1.0 that represents how sure you are that the class is present in the image. "
+ "Only return class names that are visible.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ response_mime_type="application/json",
+ ),
+ }
+
+
+def prepare_vqa_prompt(
+ base64_image: str,
+ prompt: str,
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as Visual Question Answering model. Your task is to provide answer to question"
+ "submitted by user. If this is open-question - answer with few sentences, for ABCD question, "
+ "return only the indicator of the answer.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"Question: {prompt}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ }
+
+
+def prepare_ocr_prompt(
+ base64_image: str,
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as OCR model. Your task is to read text from the image and return it in "
+ "paragraphs representing the structure of texts in the image. You should only return "
+ "recognised text, nothing else.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ }
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ }
+
+
+def prepare_caption_prompt(
+ base64_image: str,
+ short_description: bool,
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ caption_detail_level = "Caption should be short."
+ if not short_description:
+ caption_detail_level = "Caption should be extensive."
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": f"You act as image caption model. Your task is to provide description of the image. "
+ f"{caption_detail_level}",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ }
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ }
+
+
+def prepare_structured_answering_prompt(
+ base64_image: str,
+ output_structure: Dict[str, str],
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ output_structure_serialised = json.dumps(output_structure, indent=4)
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You are supposed to produce responses in JSON. User is to provide you dictionary with "
+ "keys and values. Each key must be present in your response. Values in user dictionary "
+ "represent descriptions for JSON fields to be generated. Provide only JSON in response.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"Specification of requirements regarding output fields: \n"
+ f"{output_structure_serialised}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ response_mime_type="application/json",
+ ),
+ }
+
+
+def prepare_object_detection_prompt(
+ base64_image: str,
+ classes: List[str],
+ temperature: Optional[float],
+ max_tokens: int,
+ **kwargs,
+) -> dict:
+ serialised_classes = ", ".join(classes)
+ return {
+ "systemInstruction": {
+ "role": "system",
+ "parts": [
+ {
+ "text": "You act as object-detection model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document. "
+ 'Expected structure of json: {"detections": [{"x_min": 0.1, "y_min": 0.2, "x_max": 0.3, "y_max": 0.4, "class_name": "my-class-X", "confidence": 0.7}]}. '
+ "`my-class-X` must be one of the class names defined by user. All coordinates must be in range 0.0-1.0, representing percentage of image dimensions. "
+ "`confidence` is a value in range 0.0-1.0 representing your confidence in prediction. You should detect all instances of classes provided by user.",
+ }
+ ],
+ },
+ "contents": {
+ "parts": [
+ {
+ "inline_data": {
+ "mime_type": "image/jpeg",
+ "data": base64_image,
+ }
+ },
+ {
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ ],
+ "role": "user",
+ },
+ "generationConfig": prepare_generation_config(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ response_mime_type="application/json",
+ ),
+ }
+
+
+def prepare_generation_config(
+ max_tokens: int,
+ temperature: Optional[float],
+ response_mime_type: str = "text/plain",
+) -> dict:
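+    # Generation settings shared by all prompt builders; JSON-producing tasks override
+    # `response_mime_type` with "application/json" to force structured output.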
+ result = {
+ "max_output_tokens": max_tokens,
+ "response_mime_type": response_mime_type,
+ "candidate_count": 1,
+ }
+ if temperature is not None:
+ result["temperature"] = temperature
+ return result
+
+
+def google_api_key_safe_raise_for_status(response: Response) -> None:
+ request_is_successful = response.status_code < 400
+ if request_is_successful:
+ return None
+ response.url = GOOGLE_API_KEY_PATTERN.sub(deduct_api_key, response.url)
+ response.raise_for_status()
+
+
+def deduct_api_key(match: re.Match) -> str:
+ key_value = match.group(GOOGLE_API_KEY_VALUE_GROUP)
+ if len(key_value) < MIN_KEY_LENGTH_TO_REVEAL_PREFIX:
+ return f"key=***"
+ key_prefix = key_value[:2]
+ key_postfix = key_value[-2:]
+ return f"key={key_prefix}***{key_postfix}"
+
+
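+# Maps each supported `task_type` to the function building the Gemini request body
+# (`contents`, `generationConfig` and, where applicable, `systemInstruction`).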
+PROMPT_BUILDERS = {
+ "unconstrained": prepare_unconstrained_prompt,
+ "ocr": prepare_ocr_prompt,
+ "visual-question-answering": prepare_vqa_prompt,
+ "caption": partial(prepare_caption_prompt, short_description=True),
+ "detailed-caption": partial(prepare_caption_prompt, short_description=False),
+ "classification": prepare_classification_prompt,
+ "multi-label-classification": prepare_multi_label_classification_prompt,
+ "structured-answering": prepare_structured_answering_prompt,
+ "object-detection": prepare_object_detection_prompt,
+}
diff --git a/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py b/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py
index 2c029dd506..3f468a3321 100644
--- a/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py
+++ b/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py
@@ -64,6 +64,7 @@ class BlockManifest(WorkflowBlockManifest):
"long_description": LONG_DESCRIPTION,
"license": "Apache-2.0",
"block_type": "model",
+ "deprecated": True,
}
)
type: Literal["roboflow_core/lmm_for_classification@v1", "LMMForClassification"]
diff --git a/inference/core/workflows/core_steps/models/foundation/openai/v2.py b/inference/core/workflows/core_steps/models/foundation/openai/v2.py
new file mode 100644
index 0000000000..406c462121
--- /dev/null
+++ b/inference/core/workflows/core_steps/models/foundation/openai/v2.py
@@ -0,0 +1,573 @@
+import base64
+import json
+from functools import partial
+from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+from openai import OpenAI
+from openai._types import NOT_GIVEN
+from pydantic import ConfigDict, Field, model_validator
+
+from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+from inference.core.managers.base import ModelManager
+from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image
+from inference.core.workflows.core_steps.common.utils import run_in_parallel
+from inference.core.workflows.execution_engine.entities.base import (
+ Batch,
+ OutputDefinition,
+ WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+ FLOAT_KIND,
+ LANGUAGE_MODEL_OUTPUT_KIND,
+ LIST_OF_VALUES_KIND,
+ STRING_KIND,
+ ImageInputField,
+ StepOutputImageSelector,
+ WorkflowImageSelector,
+ WorkflowParameterSelector,
+)
+from inference.core.workflows.prototypes.block import (
+ BlockResult,
+ WorkflowBlock,
+ WorkflowBlockManifest,
+)
+
+LONG_DESCRIPTION = """
+Ask a question to OpenAI's GPT-4 with Vision model.
+
+You can specify arbitrary text prompts or use predefined ones; the block supports the following types of prompt:
+
+- `unconstrained` - any arbitrary prompt you like
+
+- `ocr` - predefined prompt to recognise text from image
+
+- `visual-question-answering` - your prompt is supposed to provide a question and will be
+wrapped into a structure suited for the VQA task
+
+- `caption` - predefined prompt to generate short caption of the image
+
+- `detailed-caption` - predefined prompt to generate elaborated caption of the image
+
+- `classification` - predefined prompt to generate multi-class classification output (that can be parsed
+with `VLM as Classifier` block)
+
+- `multi-label-classification` - predefined prompt to generate multi-label classification output (that
+can be parsed with `VLM as Classifier` block)
+
+- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser`
+block.
+
+You need to provide your OpenAI API key to use the GPT-4 with Vision model.
+"""
+
+TaskType = Literal[
+ "unconstrained",
+ "ocr",
+ "visual-question-answering",
+ "caption",
+ "detailed-caption",
+ "classification",
+ "multi-label-classification",
+ "structured-answering",
+]
+
+TASKS_REQUIRING_PROMPT = {
+ "unconstrained",
+ "visual-question-answering",
+}
+
+TASKS_REQUIRING_CLASSES = {
+ "classification",
+ "multi-label-classification",
+}
+
+TASKS_REQUIRING_OUTPUT_STRUCTURE = {
+ "structured-answering",
+}
+
+
+class BlockManifest(WorkflowBlockManifest):
+ model_config = ConfigDict(
+ json_schema_extra={
+ "name": "OpenAI",
+ "version": "v2",
+ "short_description": "Run OpenAI's GPT-4 with Vision",
+ "long_description": LONG_DESCRIPTION,
+ "license": "Apache-2.0",
+ "block_type": "model",
+ "search_keywords": ["LMM", "VLM", "ChatGPT", "GPT", "OpenAI"],
+ }
+ )
+ type: Literal["roboflow_core/open_ai@v2"]
+ images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField
+ task_type: TaskType = Field(
+ description="Task type to be performed by model. Value of parameter determine set of fields "
+ "that are required. For `unconstrained`, `visual-question-answering`, "
+ " - `prompt` parameter must be provided."
+ "For `structured-answering` - `output-structure` must be provided. For "
+ "`classification`, `multi-label-classification` - "
+ "`classes` must be filled. `ocr`, `caption`, `detailed-caption` do not"
+ "require any additional parameter.",
+ )
+ prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field(
+ default=None,
+ description="Text prompt to the OpenAI model",
+ examples=["my prompt", "$inputs.prompt"],
+ json_schema_extra={
+ "relevant_for": {
+ "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True},
+ },
+ },
+ )
+ output_structure: Optional[Dict[str, str]] = Field(
+ default=None,
+ description="Dictionary with structure of expected JSON response",
+ examples=[{"my_key": "description"}, "$inputs.output_structure"],
+        json_schema_extra={
+            "relevant_for": {
+                "task_type": {
+                    "values": TASKS_REQUIRING_OUTPUT_STRUCTURE,
+                    "required": True,
+                },
+            },
+        },
+ )
+ classes: Optional[
+ Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]]
+ ] = Field(
+ default=None,
+ description="List of classes to be used",
+ examples=[["class-a", "class-b"], "$inputs.classes"],
+        json_schema_extra={
+            "relevant_for": {
+                "task_type": {"values": TASKS_REQUIRING_CLASSES, "required": True},
+            },
+ },
+ )
+ api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field(
+ description="Your OpenAI API key",
+ examples=["xxx-xxx", "$inputs.openai_api_key"],
+ private=True,
+ )
+ model_version: Union[
+ WorkflowParameterSelector(kind=[STRING_KIND]), Literal["gpt-4o", "gpt-4o-mini"]
+ ] = Field(
+ default="gpt-4o",
+ description="Model to be used",
+ examples=["gpt-4o", "$inputs.openai_model"],
+ )
+ image_detail: Union[
+ WorkflowParameterSelector(kind=[STRING_KIND]), Literal["auto", "high", "low"]
+ ] = Field(
+ default="auto",
+ description="Indicates the image's quality, with 'high' suggesting it is of high resolution and should be processed or displayed with high fidelity.",
+ examples=["auto", "high", "low"],
+ )
+ max_tokens: int = Field(
+ default=450,
+ description="Maximum number of tokens the model can generate in it's response.",
+ )
+ temperature: Optional[
+ Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])]
+ ] = Field(
+ default=None,
+ description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more "
+ 'random / "creative" the generations are.',
+ ge=0.0,
+ le=2.0,
+ )
+ max_concurrent_requests: Optional[int] = Field(
+ default=None,
+ description="Number of concurrent requests that can be executed by block when batch of input images provided. "
+ "If not given - block defaults to value configured globally in Workflows Execution Engine. "
+ "Please restrict if you hit OpenAI limits.",
+ )
+
+ @model_validator(mode="after")
+ def validate(self) -> "BlockManifest":
+ if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None:
+ raise ValueError(
+ f"`prompt` parameter required to be set for task `{self.task_type}`"
+ )
+ if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None:
+ raise ValueError(
+ f"`classes` parameter required to be set for task `{self.task_type}`"
+ )
+ if (
+ self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
+ and self.output_structure is None
+ ):
+ raise ValueError(
+ f"`output_structure` parameter required to be set for task `{self.task_type}`"
+ )
+ return self
+
+ @classmethod
+ def accepts_batch_input(cls) -> bool:
+ return True
+
+ @classmethod
+ def describe_outputs(cls) -> List[OutputDefinition]:
+ return [
+ OutputDefinition(
+ name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND]
+ ),
+ OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]),
+ ]
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+
+class OpenAIBlockV2(WorkflowBlock):
+
+ def __init__(
+ self,
+ model_manager: ModelManager,
+ api_key: Optional[str],
+ ):
+ self._model_manager = model_manager
+ self._api_key = api_key
+
+ @classmethod
+ def get_init_parameters(cls) -> List[str]:
+ return ["model_manager", "api_key"]
+
+ @classmethod
+ def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+ return BlockManifest
+
+ @classmethod
+ def get_execution_engine_compatibility(cls) -> Optional[str]:
+ return ">=1.0.0,<2.0.0"
+
+ def run(
+ self,
+ images: Batch[WorkflowImageData],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ api_key: str,
+ model_version: str,
+ image_detail: Literal["low", "high", "auto"],
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+ ) -> BlockResult:
+ inference_images = [i.to_inference_format() for i in images]
+ raw_outputs = run_gpt_4v_llm_prompting(
+ images=inference_images,
+ task_type=task_type,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ openai_api_key=api_key,
+ gpt_model_version=model_version,
+ gpt_image_detail=image_detail,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+ return [
+ {"output": raw_output, "classes": classes} for raw_output in raw_outputs
+ ]
+
+
+def run_gpt_4v_llm_prompting(
+ images: List[Dict[str, Any]],
+ task_type: TaskType,
+ prompt: Optional[str],
+ output_structure: Optional[Dict[str, str]],
+ classes: Optional[List[str]],
+ openai_api_key: Optional[str],
+ gpt_model_version: str,
+ gpt_image_detail: Literal["auto", "high", "low"],
+ max_tokens: int,
+ temperature: Optional[int],
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ if task_type not in PROMPT_BUILDERS:
+ raise ValueError(f"Task type: {task_type} not supported.")
+ gpt4_prompts = []
+ for image in images:
+ loaded_image, _ = load_image(image)
+ base64_image = base64.b64encode(
+ encode_image_to_jpeg_bytes(loaded_image)
+ ).decode("ascii")
+ prompt = PROMPT_BUILDERS[task_type](
+ base64_image=base64_image,
+ prompt=prompt,
+ output_structure=output_structure,
+ classes=classes,
+ gpt_image_detail=gpt_image_detail,
+ )
+ gpt4_prompts.append(prompt)
+ return execute_gpt_4v_requests(
+ openai_api_key=openai_api_key,
+ gpt4_prompts=gpt4_prompts,
+ gpt_model_version=gpt_model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+
+
+def execute_gpt_4v_requests(
+ openai_api_key: str,
+ gpt4_prompts: List[List[dict]],
+ gpt_model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+ max_concurrent_requests: Optional[int],
+) -> List[str]:
+ client = OpenAI(api_key=openai_api_key)
+ tasks = [
+ partial(
+ execute_gpt_4v_request,
+ client=client,
+ prompt=prompt,
+ gpt_model_version=gpt_model_version,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ )
+ for prompt in gpt4_prompts
+ ]
+ max_workers = (
+ max_concurrent_requests
+ or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
+ )
+ return run_in_parallel(
+ tasks=tasks,
+ max_workers=max_workers,
+ )
+
+
+def execute_gpt_4v_request(
+ client: OpenAI,
+ prompt: List[dict],
+ gpt_model_version: str,
+ max_tokens: int,
+ temperature: Optional[float],
+) -> str:
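+    # The OpenAI SDK distinguishes "not provided" from None, so fall back to NOT_GIVEN
+    # when no temperature was configured and let the API apply its default.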
+ if temperature is None:
+ temperature = NOT_GIVEN
+ response = client.chat.completions.create(
+ model=gpt_model_version,
+ messages=prompt,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ )
+ return response.choices[0].message.content
+
+
+def prepare_unconstrained_prompt(
+ base64_image: str,
+ prompt: str,
+ gpt_image_detail: str,
+ **kwargs,
+) -> List[dict]:
+ return [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": prompt},
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ }
+ ]
+
+
+def prepare_classification_prompt(
+ base64_image: str, classes: List[str], gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ serialised_classes = ", ".join(classes)
+ return [
+ {
+ "role": "system",
+ "content": "You act as single-class classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document in Markdown ```json [...]``` markers. "
+ 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. '
+ "`class-name` must be one of the class names defined by user. You are only allowed to return "
+ "single JSON document, even if there are potentially multiple classes. You are not allowed to return list.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_multi_label_classification_prompt(
+ base64_image: str, classes: List[str], gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ serialised_classes = ", ".join(classes)
+ return [
+ {
+ "role": "system",
+ "content": "You act as multi-label classification model. You must provide reasonable predictions. "
+ "You are only allowed to produce JSON document in Markdown ```json [...]``` markers. "
+ 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, '
+ '{"class": "class-name-2", "confidence": 0.7}]}. '
+ "`class-name-X` must be one of the class names defined by user and `confidence` is a float value in range "
+ "0.0-1.0 that represent how sure you are that the class is present in the image. Only return class names "
+ "that are visible.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": f"List of all classes to be recognised by model: {serialised_classes}",
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_vqa_prompt(
+ base64_image: str, prompt: str, gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ return [
+ {
+ "role": "system",
+ "content": "You act as Visual Question Answering model. Your task is to provide answer to question"
+ "submitted by user. If this is open-question - answer with few sentences, for ABCD question, "
+ "return only the indicator of the answer.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": f"Question: {prompt}"},
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_ocr_prompt(
+ base64_image: str, gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ return [
+ {
+ "role": "system",
+ "content": "You act as OCR model. Your task is to read text from the image and return it in "
+ "paragraphs representing the structure of texts in the image. You should only return "
+ "recognised text, nothing else.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_caption_prompt(
+ base64_image: str, gpt_image_detail: str, short_description: bool, **kwargs
+) -> List[dict]:
+ caption_detail_level = "Caption should be short."
+ if not short_description:
+ caption_detail_level = "Caption should be extensive."
+ return [
+ {
+ "role": "system",
+ "content": f"You act as image caption model. Your task is to provide description of the image. "
+ f"{caption_detail_level}",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
+def prepare_structured_answering_prompt(
+ base64_image: str, output_structure: Dict[str, str], gpt_image_detail: str, **kwargs
+) -> List[dict]:
+ output_structure_serialised = json.dumps(output_structure, indent=4)
+ return [
+ {
+ "role": "system",
+ "content": "You are supposed to produce responses in JSON wrapped in Markdown markers: "
+ "```json\nyour-response\n```. User is to provide you dictionary with keys and values. "
+ "Each key must be present in your response. Values in user dictionary represent "
+ "descriptions for JSON fields to be generated. Provide only JSON Markdown in response.",
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": f"Specification of requirements regarding output fields: \n"
+ f"{output_structure_serialised}",
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_image}",
+ "detail": gpt_image_detail,
+ },
+ },
+ ],
+ },
+ ]
+
+
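+# Maps each supported `task_type` to the builder of the chat-completion message list;
+# there is no `object-detection` entry, as that task is not offered by this block.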
+PROMPT_BUILDERS = {
+ "unconstrained": prepare_unconstrained_prompt,
+ "ocr": prepare_ocr_prompt,
+ "visual-question-answering": prepare_vqa_prompt,
+ "caption": partial(prepare_caption_prompt, short_description=True),
+ "detailed-caption": partial(prepare_caption_prompt, short_description=False),
+ "classification": prepare_classification_prompt,
+ "multi-label-classification": prepare_multi_label_classification_prompt,
+ "structured-answering": prepare_structured_answering_prompt,
+}
diff --git a/inference/core/workflows/execution_engine/entities/types.py b/inference/core/workflows/execution_engine/entities/types.py
index bbca551054..286031e4e0 100644
--- a/inference/core/workflows/execution_engine/entities/types.py
+++ b/inference/core/workflows/execution_engine/entities/types.py
@@ -136,7 +136,7 @@ def __hash__(self) -> int:
"""
LIST_OF_VALUES_KIND = Kind(
name="list_of_values",
- description="List of values of any types",
+ description="List of values of any type",
docs=LIST_OF_VALUES_KIND_DOCS,
)
@@ -292,7 +292,7 @@ def __hash__(self) -> int:
"""
CLASSIFICATION_PREDICTION_KIND = Kind(
name="classification_prediction",
- description="`'predictions'` key from Classification Model output",
+ description="Predictions from classifier",
docs=CLASSIFICATION_PREDICTION_KIND_DOCS,
)
@@ -374,9 +374,75 @@ def __hash__(self) -> int:
confidence=array([ 0.84955, 0.74344, 0.45636, 0.86537]),
class_id=array([2, 7, 2, 0]),
tracker_id=None,
- data={'class_name': array(['car', 'truck', 'car', 'car'], dtype=' int:
confidence=array([ 0.95898]),
class_id=array([6]),
tracker_id=None,
- data={'class_name': array(['G'], dtype=' int:
tracker_id=None,
data={
'class_name': array(['G'], dtype=' int:
docs=KEYPOINT_DETECTION_PREDICTION_KIND_DOCS,
)
-QR_CODE_DETECTION_KIND_DOCS = f"""
+QR_CODE_DETECTION_KIND_DOCS = """
This kind represents batch of predictions regarding QR codes location and data their provide.
Example:
```
-# Each prediction in batch is list of dictionaries that contains detected QR codes (detections) and their metadata
-[
- [
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, ],
- [
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- ]
-]
+sv.Detections(
+ xyxy=array([
+ [ 865, 153.5, 1189, 422.5],
+ [ 192.5, 77.5, 995.5, 722.5],
+ [ 194, 82, 996, 726],
+ [ 460, 333, 704, 389]]
+ ),
+ mask=None,
+ confidence=array([ 1.0, 1.0, 1.0, 1.0]),
+ class_id=array([2, 7, 2, 0]),
+ tracker_id=None,
+ data={
+ 'class_name': array(['qr_code', 'qr_code', 'qr_code', 'qr_code'], dtype=' int:
docs=QR_CODE_DETECTION_KIND_DOCS,
)
-BAR_CODE_DETECTION_KIND_DOCS = f"""
+BAR_CODE_DETECTION_KIND_DOCS = """
This kind represents batch of predictions regarding barcodes location and data their provide.
Example:
```
-# Each prediction in batch is list of dictionaries that contains detected barcodes (detections) and their metadata
-[
- [
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, ],
- [
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}},
- ]
-]
+sv.Detections(
+ xyxy=array([
+ [ 865, 153.5, 1189, 422.5],
+ [ 192.5, 77.5, 995.5, 722.5],
+ [ 194, 82, 996, 726],
+ [ 460, 333, 704, 389]]
+ ),
+ mask=None,
+ confidence=array([ 1.0, 1.0, 1.0, 1.0]),
+ class_id=array([2, 7, 2, 0]),
+ tracker_id=None,
+ data={
+ 'class_name': array(['barcode', 'barcode', 'barcode', 'barcode'], dtype=' int:
)
+LANGUAGE_MODEL_OUTPUT_KIND_DOCS = """
+This kind represents output generated by a language model. It is a Python string, which can be processed
+by blocks transforming LLM / VLM outputs into structured form.
+
+Examples:
+```
+{"predicted_class": "car", "confidence": 0.7} # which is example JSON with classification prediction
+"The is A." # which is example unstructured generation for VQA task
+```
+"""
+
+LANGUAGE_MODEL_OUTPUT_KIND = Kind(
+ name="language_model_output",
+ description="LLM / VLM output",
+ docs=LANGUAGE_MODEL_OUTPUT_KIND_DOCS,
+)
+
STEP_AS_SELECTED_ELEMENT = "step"
STEP_OUTPUT_AS_SELECTED_ELEMENT = "step_output"
diff --git a/requirements/_requirements.txt b/requirements/_requirements.txt
index 834247e30d..e781f29f1c 100644
--- a/requirements/_requirements.txt
+++ b/requirements/_requirements.txt
@@ -29,3 +29,4 @@ pydot>=2.0.0
shapely>=2.0.0,<2.1.0
tldextract~=5.1.2
packaging~=24.0
+anthropic~=0.34.2
\ No newline at end of file
diff --git a/tests/inference/hosted_platform_tests/conftest.py b/tests/inference/hosted_platform_tests/conftest.py
index faf859864f..95689e7823 100644
--- a/tests/inference/hosted_platform_tests/conftest.py
+++ b/tests/inference/hosted_platform_tests/conftest.py
@@ -79,6 +79,8 @@ class PlatformEnvironment(Enum):
ROBOFLOW_API_KEY = os.environ["HOSTED_PLATFORM_TESTS_API_KEY"]
OPENAI_KEY = os.getenv("OPENAI_KEY")
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
@pytest.fixture(scope="session")
diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py
new file mode 100644
index 0000000000..efaac597bc
--- /dev/null
+++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py
@@ -0,0 +1,242 @@
+import numpy as np
+import pytest
+
+from inference_sdk import InferenceHTTPClient
+from tests.inference.hosted_platform_tests.conftest import (
+ ANTHROPIC_API_KEY,
+ ROBOFLOW_API_KEY,
+)
+
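+# Workflow under test: Claude produces classification JSON, `vlm_as_classifier@v1` parses it
+# and `property_definition@v1` extracts the top class asserted on below.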
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.claude.output",
+ "classes": "$steps.claude.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "claude_result",
+ "selector": "$steps.claude.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Anthropic API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_classification_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=CLASSIFICATION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "claude_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["claude_result"], str)
+ and len(result[0]["claude_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.claude.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Anthropic API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_structured_parsing_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=STRUCTURED_PROMPTING_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": ANTHROPIC_API_KEY,
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
+
+
+OBJECT_DETECTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "object-detection",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "vlm_output": "$steps.claude.output",
+ "image": "$inputs.image",
+ "classes": "$steps.claude.classes",
+ "model_type": "anthropic-claude",
+ "task_type": "object-detection",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "claude_result",
+ "selector": "$steps.claude.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.predictions",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Anthropic API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_object_detection_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=OBJECT_DETECTION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "claude_result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert [e["class"] for e in result[0]["parsed_prediction"]["predictions"]] == [
+ "dog",
+ "dog",
+ ], "Expected 2 dogs to be detected"
diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py
new file mode 100644
index 0000000000..37044e862c
--- /dev/null
+++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py
@@ -0,0 +1,242 @@
+import numpy as np
+import pytest
+
+from inference_sdk import InferenceHTTPClient
+from tests.inference.hosted_platform_tests.conftest import (
+ GOOGLE_API_KEY,
+ ROBOFLOW_API_KEY,
+)
+
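+# These scenarios mirror the Claude tests: Gemini output is parsed by `vlm_as_classifier@v1`,
+# `json_parser@v1` or `vlm_as_detector@v1`, depending on the task type under test.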
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gemini.output",
+ "classes": "$steps.gemini.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gemini_result",
+ "selector": "$steps.gemini.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_classification_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=CLASSIFICATION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gemini_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["gemini_result"], str)
+ and len(result[0]["gemini_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.gemini.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_structured_parsing_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=STRUCTURED_PROMPTING_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": GOOGLE_API_KEY,
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
+
+
+OBJECT_DETECTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "object-detection",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "vlm_output": "$steps.gemini.output",
+ "image": "$inputs.image",
+ "classes": "$steps.gemini.classes",
+ "model_type": "google-gemini",
+ "task_type": "object-detection",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gemini_result",
+ "selector": "$steps.gemini.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.predictions",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_object_detection_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=OBJECT_DETECTION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gemini_result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert [e["class"] for e in result[0]["parsed_prediction"]["predictions"]] == [
+ "dog",
+ "dog",
+ ], "Expected 2 dogs to be detected"
diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py
index 66ee23cdf5..aaa10f3564 100644
--- a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py
+++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py
@@ -95,3 +95,161 @@ def test_image_description_workflow(
detection_confidences, [0.857235848903656, 0.5132315158843994], atol=1e-4
), "Expected predictions to match what was observed while test creation"
assert len(result[0]["description"]) > 0, "Expected some description"
+
+
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gpt.output",
+ "classes": "$steps.gpt.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gpt_result",
+ "selector": "$steps.gpt.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@pytest.mark.skipif(OPENAI_KEY is None, reason="No OpenAI API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_classification_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=CLASSIFICATION_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": OPENAI_KEY,
+ "classes": ["cat", "dog"],
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gpt_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["gpt_result"], str) and len(result[0]["gpt_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.gpt.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
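+# For reference, GPT is expected to answer with JSON along the lines of
+# {"dogs_count": 2, "cats_count": 0}; `roboflow_core/json_parser@v1` exposes each
+# expected field as a separate step output (e.g. $steps.parser.dogs_count), and the
+# ToString operation in the property_definition step yields the "2" asserted below.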
+
+
+@pytest.mark.skipif(OPENAI_KEY is None, reason="No OpenAI API key provided")
+@pytest.mark.flaky(retries=4, delay=1)
+def test_structured_prompting_workflow(
+ object_detection_service_url: str,
+ dogs_image: np.ndarray,
+) -> None:
+ client = InferenceHTTPClient(
+ api_url=object_detection_service_url,
+ api_key=ROBOFLOW_API_KEY,
+ )
+
+ # when
+ result = client.run_workflow(
+ specification=STRUCTURED_PROMPTING_WORKFLOW,
+ images={
+ "image": dogs_image,
+ },
+ parameters={
+ "api_key": OPENAI_KEY,
+ },
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py
new file mode 100644
index 0000000000..4e244e6b87
--- /dev/null
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py
@@ -0,0 +1,675 @@
+"""
+This test module requires an Anthropic API key passed via the env variable WORKFLOWS_TEST_ANTHROPIC_API_KEY.
+It is meant to be run only locally, as executing these tests in CI would be too costly.
+"""
+
+import os
+
+import numpy as np
+import pytest
+
+from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
+from inference.core.managers.base import ModelManager
+from inference.core.workflows.core_steps.common.entities import StepExecutionMode
+from inference.core.workflows.execution_engine.core import ExecutionEngine
+from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
+ add_to_workflows_gallery,
+)
+
+ANTHROPIC_API_KEY = os.getenv("WORKFLOWS_TEST_ANTHROPIC_API_KEY")
+
+UNCONSTRAINED_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "unconstrained",
+ "prompt": "Give me dominant color of the image",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.claude.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Prompting Anthropic Claude with arbitrary prompt",
+ use_case_description="""
+In this example, the Anthropic Claude model is prompted with arbitrary text provided by the user.
+ """,
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ workflow_name_in_app="claude-arbitrary-prompt",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_unconstrained_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "prompt": "What is the topic of the image?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+OCR_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "ocr",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.claude.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as OCR model",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as an OCR system. The user only selects the task type
+and does not need to provide any prompt.
+ """,
+ workflow_definition=OCR_WORKFLOW,
+ workflow_name_in_app="claude-ocr",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_ocr_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OCR_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": ANTHROPIC_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+VQA_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "prompt"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "visual-question-answering",
+ "prompt": "$inputs.prompt",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.claude.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as Visual Question Answering system",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as a VQA system. The user provides the question via the prompt.
+ """,
+ workflow_definition=VQA_WORKFLOW,
+ workflow_name_in_app="claude-vqa",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_vqa_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=VQA_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "prompt": "What are the brands of the cars?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CAPTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "caption",
+ "api_key": "$inputs.api_key",
+ "temperature": 1.0,
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.claude.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as Image Captioning system",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as an Image Captioning system.
+ """,
+ workflow_definition=CAPTION_WORKFLOW,
+ workflow_name_in_app="claude-captioning",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_captioning_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CAPTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": ANTHROPIC_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.claude.output",
+ "classes": "$steps.claude.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "claude_result",
+ "selector": "$steps.claude.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as multi-class classifier",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as a classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the model's output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="claude-multi-class-classifier",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_multi_class_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "claude_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["claude_result"], str)
+ and len(result[0]["claude_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+MULTI_LABEL_CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "multi-label-classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image", # requires image input to construct valid output compatible with "inference"
+ "vlm_output": "$steps.claude.output",
+ "classes": "$steps.claude.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as multi-label classifier",
+ use_case_description="""
+In this example, the Anthropic Claude model is used as a multi-label classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the model's output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="claude-multi-label-classifier",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_multi_label_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["result"] == ["dog"]
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.claude.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude to provide structured JSON",
+ use_case_description="""
+In this example, the Anthropic Claude model is expected to provide structured JSON output, which is then
+parsed by the dedicated `roboflow_core/json_parser@v1` block. That block transforms the string into a dictionary
+and exposes its keys to other blocks for further processing. In this case, the parsed output is
+transformed using the `roboflow_core/property_definition@v1` block.
+ """,
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ workflow_name_in_app="claude-structured-prompting",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_structured_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
+
+
+OBJECT_DETECTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/anthropic_claude@v1",
+ "name": "claude",
+ "images": "$inputs.image",
+ "task_type": "object-detection",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "vlm_output": "$steps.claude.output",
+ "image": "$inputs.image",
+ "classes": "$steps.claude.classes",
+ "model_type": "anthropic-claude",
+ "task_type": "object-detection",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "claude_result",
+ "selector": "$steps.claude.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.predictions",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Anthropic Claude as object-detection model",
+ use_case_description="""
+In this example, the Anthropic Claude model is expected to provide output that is parsed by the dedicated
+`roboflow_core/vlm_as_detector@v1` block, which transforms the string into `sv.Detections`
+that can later be consumed by other blocks processing object-detection predictions.
+ """,
+ workflow_definition=OBJECT_DETECTION_WORKFLOW,
+ workflow_name_in_app="claude-object-detection",
+)
+@pytest.mark.skipif(
+ condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided"
+)
+def test_workflow_with_object_detection_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OBJECT_DETECTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": ANTHROPIC_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "claude_result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["parsed_prediction"].data["class_name"].tolist() == [
+ "dog",
+ "dog",
+ ], "Expected 2 dogs to be detected"
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py b/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py
index e071d73974..6916fe8d40 100644
--- a/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py
@@ -15,7 +15,11 @@
"version": "1.0",
"inputs": [
{"type": "WorkflowImage", "name": "image"},
- {"type": "WorkflowParameter", "name": "model_id"},
+ {
+ "type": "WorkflowParameter",
+ "name": "model_id",
+ "default_value": "yolov8n-640",
+ },
],
"steps": [
{
@@ -228,31 +232,6 @@ def test_consensus_workflow_when_confidence_is_restricted_by_input_parameter(
), "Expected confidences to match what was validated manually as workflow outcome"
-def test_consensus_workflow_when_model_id_not_provided_in_input(
- model_manager: ModelManager,
- crowd_image: np.ndarray,
-) -> None:
- # given
- workflow_init_parameters = {
- "workflows_core.model_manager": model_manager,
- "workflows_core.api_key": None,
- "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
- }
- execution_engine = ExecutionEngine.init(
- workflow_definition=CONSENSUS_WORKFLOW,
- init_parameters=workflow_init_parameters,
- max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
- )
-
- # when
- with pytest.raises(RuntimeInputError):
- _ = execution_engine.run(
- runtime_parameters={
- "image": crowd_image,
- }
- )
-
-
def test_consensus_workflow_when_image_not_provided_in_input(
model_manager: ModelManager,
) -> None:
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py
new file mode 100644
index 0000000000..97943e1a3e
--- /dev/null
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py
@@ -0,0 +1,675 @@
+"""
+This test module requires a Google AI API key passed via the env variable WORKFLOWS_TEST_GOOGLE_API_KEY.
+It is meant to be run only locally, as executing these tests in CI would be too costly.
+"""
+
+import os
+
+import numpy as np
+import pytest
+
+from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
+from inference.core.managers.base import ModelManager
+from inference.core.workflows.core_steps.common.entities import StepExecutionMode
+from inference.core.workflows.execution_engine.core import ExecutionEngine
+from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
+ add_to_workflows_gallery,
+)
+
+GOOGLE_API_KEY = os.getenv("WORKFLOWS_TEST_GOOGLE_API_KEY")
+
+UNCONSTRAINED_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "unconstrained",
+ "prompt": "Give me dominant color of the image",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gemini.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Prompting Google's Gemini with arbitrary prompt",
+ use_case_description="""
+In this example, Google's Gemini model is prompted with arbitrary text provided by the user.
+ """,
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ workflow_name_in_app="gemini-arbitrary-prompt",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_unconstrained_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ "prompt": "What is the topic of the image?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+OCR_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "ocr",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gemini.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as OCR model",
+ use_case_description="""
+In this example, Google's Gemini model is used as an OCR system. The user only selects the task type
+and does not need to provide any prompt.
+ """,
+ workflow_definition=OCR_WORKFLOW,
+ workflow_name_in_app="gemini-ocr",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_ocr_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OCR_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": GOOGLE_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+VQA_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "prompt"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "visual-question-answering",
+ "prompt": "$inputs.prompt",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gemini.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as Visual Question Answering system",
+ use_case_description="""
+In this example, Google's Gemini model is used as a VQA system. The user provides the question via the prompt.
+ """,
+ workflow_definition=VQA_WORKFLOW,
+ workflow_name_in_app="gemini-vqa",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_vqa_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=VQA_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": GOOGLE_API_KEY,
+ "prompt": "What are the brands of the cars?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CAPTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "caption",
+ "api_key": "$inputs.api_key",
+ "temperature": 1.0,
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gemini.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as Image Captioning system",
+ use_case_description="""
+In this example, Google's Gemini model is used as an Image Captioning system.
+ """,
+ workflow_definition=CAPTION_WORKFLOW,
+ workflow_name_in_app="gemini-captioning",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_captioning_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CAPTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": GOOGLE_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gemini.output",
+ "classes": "$steps.gemini.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gemini_result",
+ "selector": "$steps.gemini.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
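+# The chain above: Gemini produces raw text, `vlm_as_classifier@v1` parses it into a
+# classification prediction, and the property_definition step extracts the "top_class"
+# field, so the workflow's "top_class" output is a plain class-name string.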
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as multi-class classifier",
+ use_case_description="""
+In this example, Google's Gemini model is used as a classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the model's output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="gemini-multi-class-classifier",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_multi_class_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gemini_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["gemini_result"], str)
+ and len(result[0]["gemini_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+MULTI_LABEL_CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "multi-label-classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gemini.output",
+ "classes": "$steps.gemini.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as multi-label classifier",
+ use_case_description="""
+In this example, Google's Gemini model is used as a multi-label classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the model's output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="gemini-multi-label-classifier",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_multi_label_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["result"] == ["dog"]
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.gemini.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini to provide structured JSON",
+ use_case_description="""
+In this example, Google's Gemini model is expected to provide structured JSON output, which is then
+parsed by the dedicated `roboflow_core/json_parser@v1` block. That block transforms the string into a dictionary
+and exposes its keys to other blocks for further processing. In this case, the parsed output is
+transformed using the `roboflow_core/property_definition@v1` block.
+ """,
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ workflow_name_in_app="gemini-structured-prompting",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_structured_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert result[0]["result"] == "2"
+
+
+OBJECT_DETECTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/google_gemini@v1",
+ "name": "gemini",
+ "images": "$inputs.image",
+ "task_type": "object-detection",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "vlm_output": "$steps.gemini.output",
+ "image": "$inputs.image",
+ "classes": "$steps.gemini.classes",
+ "model_type": "google-gemini",
+ "task_type": "object-detection",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gemini_result",
+ "selector": "$steps.gemini.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.predictions",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using Google's Gemini as object-detection model",
+ use_case_description="""
+In this example, Google's Gemini model is expected to provide output that is parsed by the dedicated
+`roboflow_core/vlm_as_detector@v1` block, which transforms the string into `sv.Detections`
+that can later be consumed by other blocks processing object-detection predictions.
+ """,
+ workflow_definition=OBJECT_DETECTION_WORKFLOW,
+ workflow_name_in_app="gemini-object-detection",
+)
+@pytest.mark.skipif(
+ condition=GOOGLE_API_KEY is None, reason="Google API key not provided"
+)
+def test_workflow_with_object_detection_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OBJECT_DETECTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": GOOGLE_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gemini_result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["parsed_prediction"].data["class_name"].tolist() == [
+ "dog",
+ "dog",
+ ], "Expected 2 dogs to be detected"
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py b/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py
index 02e4d1db62..db9312d59a 100644
--- a/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py
@@ -15,7 +15,11 @@
"version": "1.0",
"inputs": [
{"type": "WorkflowImage", "name": "image"},
- {"type": "WorkflowParameter", "name": "model_id", "default_value": "yolov8n-640"},
+ {
+ "type": "WorkflowParameter",
+ "name": "model_id",
+ "default_value": "yolov8n-640",
+ },
{"type": "WorkflowParameter", "name": "confidence", "default_value": 0.7},
{"type": "WorkflowParameter", "name": "x_center"},
{"type": "WorkflowParameter", "name": "y_center"},
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py
new file mode 100644
index 0000000000..b37850ff2d
--- /dev/null
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py
@@ -0,0 +1,584 @@
+"""
+This test module requires an OpenAI API key passed via the env variable WORKFLOWS_TEST_OPEN_AI_KEY.
+It is meant to be run only locally, as executing these tests in CI would be too costly.
+"""
+
+import os
+
+import numpy as np
+import pytest
+
+from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
+from inference.core.managers.base import ModelManager
+from inference.core.workflows.core_steps.common.entities import StepExecutionMode
+from inference.core.workflows.execution_engine.core import ExecutionEngine
+from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
+ add_to_workflows_gallery,
+)
+
+OPEN_AI_API_KEY = os.getenv("WORKFLOWS_TEST_OPEN_AI_KEY")
+
+UNCONSTRAINED_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "prompt"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "unconstrained",
+ "prompt": "$inputs.prompt",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gpt.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Prompting GPT with arbitrary prompt",
+ use_case_description="""
+In this example, the GPT model is prompted with arbitrary text provided by the user.
+ """,
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ workflow_name_in_app="gpt-arbitrary-prompt",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_unconstrained_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=UNCONSTRAINED_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": OPEN_AI_API_KEY,
+ "prompt": "What is the topic of the image?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+OCR_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "ocr",
+ "api_key": "$inputs.api_key",
+ "model_version": "gpt-4o-mini",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gpt.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as OCR model",
+ use_case_description="""
+In this example, the GPT model is used as an OCR system. The user only selects the task type
+and does not need to provide any prompt.
+ """,
+ workflow_definition=OCR_WORKFLOW,
+ workflow_name_in_app="gpt-ocr",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_ocr_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=OCR_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": OPEN_AI_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+VQA_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "prompt"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "visual-question-answering",
+ "prompt": "$inputs.prompt",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gpt.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as Visual Question Answering system",
+ use_case_description="""
+In this example, the GPT model is used as a VQA system. The user provides the question via the prompt.
+ """,
+ workflow_definition=VQA_WORKFLOW,
+ workflow_name_in_app="gpt-vqa",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_vqa_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=VQA_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": OPEN_AI_API_KEY,
+ "prompt": "What are the brands of the cars?",
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CAPTION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "caption",
+ "api_key": "$inputs.api_key",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.gpt.output",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as Image Captioning system",
+ use_case_description="""
+In this example, the GPT model is used as an Image Captioning system.
+ """,
+ workflow_definition=CAPTION_WORKFLOW,
+ workflow_name_in_app="gpt-captioning",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_captioning_prompt(
+ model_manager: ModelManager,
+ license_plate_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CAPTION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [license_plate_image],
+ "api_key": OPEN_AI_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0
+ ), "Expected non-empty string generated"
+
+
+CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gpt.output",
+ "classes": "$steps.gpt.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "gpt_result",
+ "selector": "$steps.gpt.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "top_class",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as multi-class classifier",
+ use_case_description="""
+In this example, the GPT model is used as a classifier. The output from the model is parsed by
+the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the GPT output text into
+a full-blown prediction that can later be used by other blocks compatible with
+classification predictions - in this case we extract the top-class property.
+ """,
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="gpt-multi-class-classifier",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_multi_class_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": OPEN_AI_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "gpt_result",
+ "top_class",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert (
+ isinstance(result[0]["gpt_result"], str) and len(result[0]["gpt_result"]) > 0
+ ), "Expected non-empty string generated"
+ assert result[0]["top_class"] == "dog"
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+MULTI_LABEL_CLASSIFICATION_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ {"type": "WorkflowParameter", "name": "classes"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "multi-label-classification",
+ "classes": "$inputs.classes",
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "name": "parser",
+ "image": "$inputs.image",
+ "vlm_output": "$steps.gpt.output",
+ "classes": "$steps.gpt.classes",
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "top_class",
+ "operations": [
+ {"type": "ClassificationPropertyExtract", "property_name": "top_class"}
+ ],
+ "data": "$steps.parser.predictions",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.top_class.output",
+ },
+ {
+ "type": "JsonField",
+ "name": "parsed_prediction",
+ "selector": "$steps.parser.*",
+ },
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT as multi-label classifier",
+ use_case_description="""
+In this example, a GPT model is used as a multi-label classifier. The model output is parsed
+by the dedicated `roboflow_core/vlm_as_classifier@v1` block, which turns the raw GPT text into
+a full-blown classification prediction that downstream blocks compatible with classification
+predictions can consume - in this case, we extract the top-class property.
+ """,
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ workflow_name_in_app="gpt-multi-label-classifier",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_multi_label_classifier_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": OPEN_AI_API_KEY,
+ "classes": ["cat", "dog"],
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {
+ "result",
+ "parsed_prediction",
+ }, "Expected all outputs to be delivered"
+ assert result[0]["result"] == ["dog"]
+ assert result[0]["parsed_prediction"]["error_status"] is False
+
+
+STRUCTURED_PROMPTING_WORKFLOW = {
+ "version": "1.0",
+ "inputs": [
+ {"type": "WorkflowImage", "name": "image"},
+ {"type": "WorkflowParameter", "name": "api_key"},
+ ],
+ "steps": [
+ {
+ "type": "roboflow_core/open_ai@v2",
+ "name": "gpt",
+ "images": "$inputs.image",
+ "task_type": "structured-answering",
+ "output_structure": {
+ "dogs_count": "count of dogs instances in the image",
+ "cats_count": "count of cats instances in the image",
+ },
+ "api_key": "$inputs.api_key",
+ },
+ {
+ "type": "roboflow_core/json_parser@v1",
+ "name": "parser",
+ "raw_json": "$steps.gpt.output",
+ "expected_fields": ["dogs_count", "cats_count"],
+ },
+ {
+ "type": "roboflow_core/property_definition@v1",
+ "name": "property_definition",
+ "operations": [{"type": "ToString"}],
+ "data": "$steps.parser.dogs_count",
+ },
+ ],
+ "outputs": [
+ {
+ "type": "JsonField",
+ "name": "result",
+ "selector": "$steps.property_definition.output",
+ }
+ ],
+}
+
+
+@add_to_workflows_gallery(
+ category="Workflows with Visual Language Models",
+ use_case_title="Using GPT to provide structured JSON",
+ use_case_description="""
+In this example, the GPT model is expected to provide structured output as JSON, which is then
+parsed by the dedicated `roboflow_core/json_parser@v1` block. That block transforms the string
+into a dictionary and exposes its keys to other blocks for further processing. Here, the parsed
+output is transformed using the `roboflow_core/property_definition@v1` block.
+ """,
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ workflow_name_in_app="gpt-structured-prompting",
+)
+@pytest.mark.skipif(
+ condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided"
+)
+def test_workflow_with_structured_prompt(
+ model_manager: ModelManager,
+ dogs_image: np.ndarray,
+) -> None:
+ # given
+ workflow_init_parameters = {
+ "workflows_core.model_manager": model_manager,
+ "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
+ }
+ execution_engine = ExecutionEngine.init(
+ workflow_definition=STRUCTURED_PROMPTING_WORKFLOW,
+ init_parameters=workflow_init_parameters,
+ max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
+ )
+
+ # when
+ result = execution_engine.run(
+ runtime_parameters={
+ "image": [dogs_image],
+ "api_key": OPEN_AI_API_KEY,
+ }
+ )
+
+ # then
+ assert len(result) == 1, "Single image given, expected single output"
+ assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered"
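+    # "dogs_count" is parsed from GPT's JSON answer and converted with the ToString
+    # operation, so the assertion below expects the string "2" for the dogs_image fixture.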
+ assert result[0]["result"] == "2"
diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py b/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py
index a8c1966bce..031c0e0d19 100644
--- a/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py
+++ b/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py
@@ -15,7 +15,11 @@
"version": "1.0",
"inputs": [
{"type": "WorkflowImage", "name": "image"},
- {"type": "WorkflowParameter", "name": "model_id", "default_value": "yolov8n-640"},
+ {
+ "type": "WorkflowParameter",
+ "name": "model_id",
+ "default_value": "yolov8n-640",
+ },
{"type": "WorkflowParameter", "name": "confidence", "default_value": 0.3},
],
"steps": [
diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py b/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py
new file mode 100644
index 0000000000..8907690232
--- /dev/null
+++ b/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py
@@ -0,0 +1,232 @@
+import json
+
+import pytest
+from pydantic import ValidationError
+
+from inference.core.workflows.core_steps.formatters.json_parser.v1 import (
+ BlockManifest,
+ JSONParserBlockV1,
+)
+from inference.core.workflows.execution_engine.entities.base import OutputDefinition
+from inference.core.workflows.execution_engine.entities.types import BOOLEAN_KIND
+
+
+def test_parsing_manifest_when_input_is_valid() -> None:
+ # given
+ raw_manifest = {
+ "name": "parser",
+ "type": "roboflow_core/json_parser@v1",
+ "raw_json": "$steps.some.a",
+ "expected_fields": ["a", "b", "c"],
+ }
+
+ # when
+ result = BlockManifest.model_validate(raw_manifest)
+
+ # then
+ assert result == BlockManifest(
+ name="parser",
+ type="roboflow_core/json_parser@v1",
+ raw_json="$steps.some.a",
+ expected_fields=["a", "b", "c"],
+ )
+
+
+def test_parsing_manifest_when_input_is_invalid() -> None:
+ # given
+ raw_manifest = {
+ "name": "parser",
+ "type": "roboflow_core/json_parser@v1",
+ "raw_json": "$steps.some.a",
+ "expected_fields": ["a", "b", "c", "error_status"],
+ }
+
+ # when
+ with pytest.raises(ValidationError):
+ _ = BlockManifest.model_validate(raw_manifest)
+
+
+def test_manifest_get_actual_outputs() -> None:
+ # given
+ manifest = BlockManifest(
+ name="parser",
+ type="roboflow_core/json_parser@v1",
+ raw_json="$steps.some.a",
+ expected_fields=["a", "b", "c"],
+ )
+
+ # when
+ result = manifest.get_actual_outputs()
+
+ # then
+ assert result == [
+ OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
+ OutputDefinition(name="a"),
+ OutputDefinition(name="b"),
+ OutputDefinition(name="c"),
+ ]
+
+
+def test_block_run_when_valid_json_given_and_all_fields_declared() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_valid_json_given_and_subset_of_fields_declared() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ }
+
+
+def test_block_run_when_valid_json_given_and_subset_of_declared_fields_found() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b", "c"])
+
+ # then
+ assert result == {
+ "error_status": True,
+ "a": "1",
+ "b": "2",
+ "c": None,
+ }
+
+
+def test_block_run_when_multiple_json_documents_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
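+    # The document is joined with itself below, so the block receives two JSON payloads
+    # in a single string; the assertions expect this to be reported as an error.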
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json="\n".join([raw_json] * 2), expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": True,
+ "a": None,
+ "b": None,
+ }
+
+
+def test_block_run_when_invalid_json_provided() -> None:
+ # given
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json="invalid", expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": True,
+ "a": None,
+ "b": None,
+ }
+
+
+def test_block_run_when_json_in_markdown_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ raw_json = f"```json\n{raw_json}\n```"
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_indented_json_in_markdown_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"}, indent=4)
+ raw_json = f"```json\n{raw_json}\n```"
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_json_in_markdown_uppercase_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ raw_json = f"```JSON\n{raw_json}\n```"
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_json_in_markdown_without_new_lines_provided() -> None:
+ # given
+ raw_json = json.dumps({"a": "1", "b": "2"})
+ raw_json = f"```JSON{raw_json}```"
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
+
+
+def test_block_run_when_multiple_jsons_in_markdown_provided() -> None:
+ # given
+ raw_json_1 = json.dumps({"a": "1", "b": "2"})
+ raw_json_2 = json.dumps({"a": "3", "b": "4"})
+    raw_json = f"```json\n{raw_json_1}\n```\n```json\n{raw_json_2}\n```"
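+    # Two fenced JSON documents are provided; the assertions below expect only the first
+    # one to be parsed.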
+ block = JSONParserBlockV1()
+
+ # when
+ result = block.run(raw_json=raw_json, expected_fields=["a", "b"])
+
+ # then
+ assert result == {
+ "error_status": False,
+ "a": "1",
+ "b": "2",
+ }
diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py
new file mode 100644
index 0000000000..796c74cb3a
--- /dev/null
+++ b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py
@@ -0,0 +1,342 @@
+from typing import List, Union
+
+import numpy as np
+import pytest
+
+from inference.core.workflows.core_steps.formatters.vlm_as_classifier.v1 import (
+ BlockManifest,
+ VLMAsClassifierBlockV1,
+)
+from inference.core.workflows.execution_engine.entities.base import (
+ ImageParentMetadata,
+ WorkflowImageData,
+)
+
+
+@pytest.mark.parametrize("image", ["$inputs.image", "$steps.some.image"])
+@pytest.mark.parametrize(
+ "classes", ["$inputs.classes", "$steps.some.classes", ["a", "b"]]
+)
+def test_block_manifest_parsing_when_input_is_valid(
+ image: str, classes: Union[str, List[str]]
+) -> None:
+ # given
+ raw_manifest = {
+ "type": "roboflow_core/vlm_as_classifier@v1",
+ "image": image,
+ "name": "parser",
+ "vlm_output": "$steps.vlm.output",
+ "classes": classes,
+ }
+
+ # when
+ result = BlockManifest.model_validate(raw_manifest)
+
+ # then
+ assert result == BlockManifest(
+ type="roboflow_core/vlm_as_classifier@v1",
+ name="parser",
+ image=image,
+ vlm_output="$steps.vlm.output",
+ classes=classes,
+ )
+
+
+def test_run_when_valid_json_given_for_multi_class_classification() -> None:
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+```json
+{"class_name": "car", "confidence": "0.7"}
+```
+ """
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=vlm_output, classes=["car", "cat"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == [
+ {"class_name": "car", "class_id": 0, "confidence": 0.7},
+ {"class_name": "cat", "class_id": 1, "confidence": 0.0},
+ ]
+ assert result["predictions"]["top"] == "car"
+ assert abs(result["predictions"]["confidence"] - 0.7) < 1e-5
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_valid_json_given_for_multi_class_classification_when_unknown_class_predicted() -> (
+ None
+):
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+```json
+{"class_name": "my_class", "confidence": "0.7"}
+```
+ """
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=vlm_output, classes=["car", "cat"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == [
+ {"class_name": "my_class", "class_id": -1, "confidence": 0.7},
+ {"class_name": "car", "class_id": 0, "confidence": 0.0},
+ {"class_name": "cat", "class_id": 1, "confidence": 0.0},
+ ]
+ assert result["predictions"]["top"] == "my_class"
+ assert abs(result["predictions"]["confidence"] - 0.7) < 1e-5
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_valid_json_given_for_multi_label_classification() -> None:
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+ {"predicted_classes": [
+ {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6},
+ {"class": "cat", "confidence": "0.7"}
+ ]}
+ """
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(
+ image=image, vlm_output=vlm_output, classes=["car", "cat", "dog"]
+ )
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.7, "class_id": 1},
+ "dog": {"confidence": 0.6, "class_id": 2},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"}
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_valid_json_given_for_multi_label_classification_when_unknown_class_provided() -> (
+ None
+):
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+ {"predicted_classes": [
+ {"class": "my_class_1", "confidence": 0.3}, {"class": "my_class_2", "confidence": 0.6},
+ {"class": "my_class_1", "confidence": 0.7}
+ ]}
+ """
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(
+ image=image, vlm_output=vlm_output, classes=["car", "cat", "dog"]
+ )
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.0, "class_id": 1},
+ "dog": {"confidence": 0.0, "class_id": 2},
+ "my_class_1": {"confidence": 0.7, "class_id": -1},
+ "my_class_2": {"confidence": 0.6, "class_id": -1},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {
+ "my_class_1",
+ "my_class_2",
+ }
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_valid_json_of_unknown_structure_given() -> None:
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(
+ image=image, vlm_output='{"some": "data"}', classes=["car", "cat"]
+ )
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
+
+
+def test_run_when_invalid_json_given() -> None:
+ # given
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output="invalid_json", classes=["car", "cat"])
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
+
+
+def test_run_when_multiple_jsons_given() -> None:
+ # given
+ raw_json = """
+ {"predicted_classes": [
+ {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6},
+ {"class": "cat", "confidence": "0.7"}
+ ]}
+ {"predicted_classes": [
+ {"class": "cat", "confidence": 0.4}, {"class": "dog", "confidence": 0.7},
+ {"class": "cat", "confidence": "0.8"}
+ ]}
+ """
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat"])
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
+
+
+def test_run_when_json_in_markdown_block_given() -> None:
+ # given
+ raw_json = """
+```json
+{"predicted_classes": [
+ {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6},
+ {"class": "cat", "confidence": "0.7"}
+]}
+```
+```
+ """
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.7, "class_id": 1},
+ "dog": {"confidence": 0.6, "class_id": 2},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"}
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_json_in_markdown_block_without_new_lines_given() -> None:
+ # given
+ raw_json = """
+```json{"predicted_classes": [{"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6}, {"class": "cat", "confidence": "0.7"}]}```
+"""
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.7, "class_id": 1},
+ "dog": {"confidence": 0.6, "class_id": 2},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"}
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
+
+
+def test_run_when_multiple_jsons_in_markdown_block_given() -> None:
+ # given
+ raw_json = """
+```json
+{"predicted_classes": [
+ {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6},
+ {"class": "cat", "confidence": "0.7"}
+]}
+```
+```json
+{"predicted_classes": [
+ {"class": "cat", "confidence": 0.4}, {"class": "dog", "confidence": 0.7},
+ {"class": "cat", "confidence": "0.8"}
+]}
+```
+"""
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ block = VLMAsClassifierBlockV1()
+
+ # when
+ result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"])
+
+ # then
+ assert result["error_status"] is False
+ assert result["predictions"]["image"] == {"width": 168, "height": 192}
+ assert result["predictions"]["predictions"] == {
+ "car": {"confidence": 0.0, "class_id": 0},
+ "cat": {"confidence": 0.7, "class_id": 1},
+ "dog": {"confidence": 0.6, "class_id": 2},
+ }
+ assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"}
+ assert result["predictions"]["parent_id"] == "parent"
+ assert len(result["inference_id"]) > 0
+ assert result["inference_id"] == result["predictions"]["inference_id"]
diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py
new file mode 100644
index 0000000000..10f013c26a
--- /dev/null
+++ b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py
@@ -0,0 +1,163 @@
+from typing import List, Union
+
+import numpy as np
+import pytest
+import supervision as sv
+
+from inference.core.workflows.core_steps.formatters.vlm_as_detector.v1 import (
+ BlockManifest,
+ VLMAsDetectorBlockV1,
+)
+from inference.core.workflows.execution_engine.entities.base import (
+ ImageParentMetadata,
+ WorkflowImageData,
+)
+
+
+@pytest.mark.parametrize("image", ["$inputs.image", "$steps.some.image"])
+@pytest.mark.parametrize(
+ "classes", ["$inputs.classes", "$steps.some.classes", ["a", "b"]]
+)
+def test_manifest_parsing_when_input_valid(
+ image: str, classes: Union[str, List[str]]
+) -> None:
+ # given
+ raw_manifest = {
+ "type": "roboflow_core/vlm_as_detector@v1",
+ "name": "parser",
+ "image": image,
+ "vlm_output": "$steps.vlm.output",
+ "classes": classes,
+ "model_type": "google-gemini",
+ "task_type": "object-detection",
+ }
+
+ # when
+ result = BlockManifest.model_validate(raw_manifest)
+
+ # then
+ assert result == BlockManifest(
+ type="roboflow_core/vlm_as_detector@v1",
+ name="parser",
+ image=image,
+ vlm_output="$steps.vlm.output",
+ classes=classes,
+ model_type="google-gemini",
+ task_type="object-detection",
+ )
+
+
+def test_run_method_for_claude_and_gemini_output() -> None:
+ # given
+ block = VLMAsDetectorBlockV1()
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+{"detections": [
+ {"x_min": 0.01, "y_min": 0.15, "x_max": 0.15, "y_max": 0.85, "class_name": "cat", "confidence": 1.98},
+ {"x_min": 0.17, "y_min": 0.25, "x_max": 0.32, "y_max": 0.85, "class_name": "dog", "confidence": 0.97},
+ {"x_min": 0.33, "y_min": 0.15, "x_max": 0.47, "y_max": 0.85, "class_name": "cat", "confidence": 0.99},
+ {"x_min": 0.49, "y_min": 0.30, "x_max": 0.65, "y_max": 0.85, "class_name": "dog", "confidence": 0.98},
+ {"x_min": 0.67, "y_min": 0.20, "x_max": 0.82, "y_max": 0.85, "class_name": "cat", "confidence": 0.99},
+ {"x_min": 0.84, "y_min": 0.25, "x_max": 0.99, "y_max": 0.85, "class_name": "unknown", "confidence": 0.97}
+]}
+ """
+
+ # when
+ result = block.run(
+ image=image,
+ vlm_output=vlm_output,
+ classes=["cat", "dog", "lion"],
+ model_type="google-gemini",
+ task_type="object-detection",
+ )
+
+ # then
+ assert result["error_status"] is False
+ assert isinstance(result["predictions"], sv.Detections)
+ assert len(result["inference_id"]) > 0
+ assert np.allclose(
+ result["predictions"].xyxy,
+ np.array(
+ [
+ [2, 29, 25, 163],
+ [29, 48, 54, 163],
+ [55, 29, 79, 163],
+ [82, 58, 109, 163],
+ [113, 38, 138, 163],
+ [141, 48, 166, 163],
+ ]
+ ),
+ atol=1.0,
+ )
+ assert np.allclose(result["predictions"].class_id, np.array([0, 1, 0, 1, 0, -1]))
+ assert np.allclose(
+ result["predictions"].confidence, np.array([1.0, 0.97, 0.99, 0.98, 0.99, 0.97])
+ )
+ assert "class_name" in result["predictions"].data
+ assert "image_dimensions" in result["predictions"].data
+ assert "prediction_type" in result["predictions"].data
+ assert "parent_coordinates" in result["predictions"].data
+ assert "parent_dimensions" in result["predictions"].data
+ assert "root_parent_coordinates" in result["predictions"].data
+ assert "root_parent_dimensions" in result["predictions"].data
+ assert "parent_id" in result["predictions"].data
+ assert "root_parent_id" in result["predictions"].data
+
+
+def test_run_method_for_invalid_claude_and_gemini_output() -> None:
+ # given
+ block = VLMAsDetectorBlockV1()
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+ vlm_output = """
+ {"detections": [
+ {"x_min": 0.01, "y_min": 0.15, "x_max": 0.15, "y_max": 0.85, "confidence": 1.98},
+ {"x_min": 0.17, "y_min": 0.25, "x_max": 0.32, "y_max": 0.85, "class_name": "dog", "confidence": 0.97},
+ {"x_min": 0.33, "y_min": 0.15, "x_max": 0.47, "y_max": 0.85, "class_name": "cat", "confidence": 0.99},
+ {"x_min": 0.49, "x_max": 0.65, "y_max": 0.85, "class_name": "dog", "confidence": 0.98},
+ {"x_min": 0.67, "y_min": 0.20, "x_max": 0.82, "y_max": 0.85, "class_name": "cat", "confidence": 0.99},
+ {"x_min": 0.84, "y_min": 0.25, "x_max": 0.99, "y_max": 0.85, "class_name": "unknown", "confidence": 0.97}
+ ]}
+ """
+
+ # when
+ result = block.run(
+ image=image,
+ vlm_output=vlm_output,
+ classes=["cat", "dog", "lion"],
+ model_type="google-gemini",
+ task_type="object-detection",
+ )
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
+
+
+def test_run_method_for_invalid_json() -> None:
+ # given
+ block = VLMAsDetectorBlockV1()
+ image = WorkflowImageData(
+ numpy_image=np.zeros((192, 168, 3), dtype=np.uint8),
+ parent_metadata=ImageParentMetadata(parent_id="parent"),
+ )
+
+ # when
+ result = block.run(
+ image=image,
+ vlm_output="invalid",
+ classes=["cat", "dog", "lion"],
+ model_type="google-gemini",
+ task_type="object-detection",
+ )
+
+ # then
+ assert result["error_status"] is True
+ assert result["predictions"] is None
+ assert len(result["inference_id"]) > 0
diff --git a/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py b/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py
index feac4169ed..c4fc40237e 100644
--- a/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py
+++ b/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py
@@ -275,7 +275,6 @@ def test_correct_detections_with_keypoints():
src=src_polygon,
dst=dst_polygon,
)
-
# when
corrected_detections = correct_detections(
detections=detections,
diff --git a/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py b/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py
index 399860ccdd..768afa1726 100644
--- a/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py
+++ b/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py
@@ -218,7 +218,7 @@ def test_describe_available_blocks_when_valid_plugins_are_loaded(
assert result.blocks[0].manifest_class == plugin_with_valid_blocks.Block1Manifest
assert result.blocks[1].block_class == plugin_with_valid_blocks.Block2
assert result.blocks[1].manifest_class == plugin_with_valid_blocks.Block2Manifest
- assert len(result.declared_kinds) == 31
+ assert len(result.declared_kinds) > 0
@mock.patch.object(blocks_loader, "load_workflow_blocks")
@@ -259,7 +259,7 @@ def test_describe_available_blocks_when_valid_plugins_are_loaded_and_multiple_ve
result.blocks[2].manifest_class
== plugin_with_multiple_versions_of_blocks.Block2Manifest
)
- assert len(result.declared_kinds) == 31
+ assert len(result.declared_kinds) > 0
@mock.patch.object(blocks_loader, "load_workflow_blocks")