diff --git a/development/docs/build_block_docs.py b/development/docs/build_block_docs.py index a0fb7c2e52..14e948380e 100644 --- a/development/docs/build_block_docs.py +++ b/development/docs/build_block_docs.py @@ -361,7 +361,7 @@ def format_block_connections( connections = [ ( f"[`{block_type2manifest_type_identifier[connection]}`]" - f"(/workflows/blocks/{camel_to_snake(block_type2manifest_type_identifier[connection])})" + f"(/workflows/blocks/{slugify_block_name(block_type2manifest_type_identifier[connection])})" ) for connection in connections ] diff --git a/docs/workflows/blocks.md b/docs/workflows/blocks.md index 1c6290f45c..ed2f6ecccf 100644 --- a/docs/workflows/blocks.md +++ b/docs/workflows/blocks.md @@ -72,6 +72,11 @@ hide:

+

+

+

+

+

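The first hunk above (in `development/docs/build_block_docs.py`) switches the connection-link slugs from `camel_to_snake` to `slugify_block_name`. A minimal sketch of the difference, assuming simplified implementations of both helpers and a hypothetical block display name — the repository's actual functions may differ:

```python
# Sketch only: assumed behaviours of the two helpers, not the repo's real code.
import re


def camel_to_snake(name: str) -> str:
    # Assumed behaviour: only splits CamelCase word boundaries.
    return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()


def slugify_block_name(name: str) -> str:
    # Assumed behaviour: lower-cases and collapses non-alphanumeric runs to "_".
    return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")


display_name = "VLM as Detector"  # hypothetical display name with spaces
print(camel_to_snake(display_name))      # 'v_l_m as _detector' -> spaces break the URL anchor
print(slugify_block_name(display_name))  # 'vlm_as_detector'    -> valid /workflows/blocks/... slug
```

With display names that contain spaces or other non-CamelCase characters, only the slugified form produces a working documentation link, which is what the change above addresses.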
diff --git a/docs/workflows/create_workflow_block.md b/docs/workflows/create_workflow_block.md index c6216f5dde..d14a757703 100644 --- a/docs/workflows/create_workflow_block.md +++ b/docs/workflows/create_workflow_block.md @@ -1050,7 +1050,7 @@ def run(self, predictions: List[dict]) -> BlockResult: ) from inference.core.workflows.execution_engine.entities.types import ( StepOutputSelector, - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + OBJECT_DETECTION_PREDICTION_KIND, ) from inference.core.workflows.prototypes.block import ( BlockResult, @@ -1063,7 +1063,7 @@ def run(self, predictions: List[dict]) -> BlockResult: class BlockManifest(WorkflowBlockManifest): type: Literal["my_plugin/fusion_of_predictions@v1"] name: str - predictions: List[StepOutputSelector(kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND])] = Field( + predictions: List[StepOutputSelector(kind=[OBJECT_DETECTION_PREDICTION_KIND])] = Field( description="Selectors to step outputs", examples=[["$steps.model_1.predictions", "$steps.model_2.predictions"]], ) @@ -1073,7 +1073,7 @@ def run(self, predictions: List[dict]) -> BlockResult: return [ OutputDefinition( name="predictions", - kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND], + kind=[OBJECT_DETECTION_PREDICTION_KIND], ) ] @@ -1251,8 +1251,8 @@ the method signatures. ImageParentMetadata, ) from inference.core.workflows.execution_engine.entities.types import ( - BATCH_OF_IMAGES_KIND, - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + IMAGE_KIND, + OBJECT_DETECTION_PREDICTION_KIND, StepOutputImageSelector, StepOutputSelector, WorkflowImageSelector, @@ -1267,7 +1267,7 @@ the method signatures. type: Literal["my_block/dynamic_crop@v1"] image: Union[WorkflowImageSelector, StepOutputImageSelector] predictions: StepOutputSelector( - kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND], + kind=[OBJECT_DETECTION_PREDICTION_KIND], ) @classmethod @@ -1277,7 +1277,7 @@ the method signatures. @classmethod def describe_outputs(cls) -> List[OutputDefinition]: return [ - OutputDefinition(name="crops", kind=[BATCH_OF_IMAGES_KIND]), + OutputDefinition(name="crops", kind=[IMAGE_KIND]), ] @classmethod @@ -1340,8 +1340,8 @@ the method signatures. WorkflowImageData, ) from inference.core.workflows.execution_engine.entities.types import ( - BATCH_OF_IMAGES_KIND, - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + IMAGE_KIND, + OBJECT_DETECTION_PREDICTION_KIND, StepOutputImageSelector, StepOutputSelector, WorkflowImageSelector, @@ -1357,7 +1357,7 @@ the method signatures. type: Literal["my_plugin/tile_detections@v1"] crops: Union[WorkflowImageSelector, StepOutputImageSelector] crops_predictions: StepOutputSelector( - kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND] + kind=[OBJECT_DETECTION_PREDICTION_KIND] ) @classmethod @@ -1367,7 +1367,7 @@ the method signatures. @classmethod def describe_outputs(cls) -> List[OutputDefinition]: return [ - OutputDefinition(name="visualisations", kind=[BATCH_OF_IMAGES_KIND]), + OutputDefinition(name="visualisations", kind=[IMAGE_KIND]), ] @@ -1427,7 +1427,7 @@ the method signatures. WorkflowImageData, ) from inference.core.workflows.execution_engine.entities.types import ( - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + OBJECT_DETECTION_PREDICTION_KIND, StepOutputImageSelector, StepOutputSelector, WorkflowImageSelector, @@ -1443,7 +1443,7 @@ the method signatures. 
type: Literal["my_plugin/stitch@v1"] image: Union[WorkflowImageSelector, StepOutputImageSelector] image_predictions: StepOutputSelector( - kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND], + kind=[OBJECT_DETECTION_PREDICTION_KIND], ) @classmethod @@ -1463,7 +1463,7 @@ the method signatures. OutputDefinition( name="predictions", kind=[ - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + OBJECT_DETECTION_PREDICTION_KIND, ], ), ] @@ -1526,8 +1526,8 @@ the method signatures. Batch, ) from inference.core.workflows.execution_engine.entities.types import ( - BATCH_OF_IMAGES_KIND, - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + IMAGE_KIND, + OBJECT_DETECTION_PREDICTION_KIND, StepOutputImageSelector, StepOutputSelector, WorkflowImageSelector, @@ -1542,7 +1542,7 @@ the method signatures. type: Literal["my_block/dynamic_crop@v1"] image: Union[WorkflowImageSelector, StepOutputImageSelector] predictions: StepOutputSelector( - kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND], + kind=[OBJECT_DETECTION_PREDICTION_KIND], ) @classmethod @@ -1556,7 +1556,7 @@ the method signatures. @classmethod def describe_outputs(cls) -> List[OutputDefinition]: return [ - OutputDefinition(name="crops", kind=[BATCH_OF_IMAGES_KIND]), + OutputDefinition(name="crops", kind=[IMAGE_KIND]), ] @classmethod @@ -1629,8 +1629,8 @@ the method signatures. WorkflowImageData, ) from inference.core.workflows.execution_engine.entities.types import ( - BATCH_OF_IMAGES_KIND, - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + IMAGE_KIND, + OBJECT_DETECTION_PREDICTION_KIND, StepOutputImageSelector, StepOutputSelector, WorkflowImageSelector, @@ -1646,7 +1646,7 @@ the method signatures. type: Literal["my_plugin/tile_detections@v1"] images_crops: Union[WorkflowImageSelector, StepOutputImageSelector] crops_predictions: StepOutputSelector( - kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND] + kind=[OBJECT_DETECTION_PREDICTION_KIND] ) @classmethod @@ -1660,7 +1660,7 @@ the method signatures. @classmethod def describe_outputs(cls) -> List[OutputDefinition]: return [ - OutputDefinition(name="visualisations", kind=[BATCH_OF_IMAGES_KIND]), + OutputDefinition(name="visualisations", kind=[IMAGE_KIND]), ] @@ -1726,7 +1726,7 @@ the method signatures. WorkflowImageData, ) from inference.core.workflows.execution_engine.entities.types import ( - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + OBJECT_DETECTION_PREDICTION_KIND, StepOutputImageSelector, StepOutputSelector, WorkflowImageSelector, @@ -1742,7 +1742,7 @@ the method signatures. type: Literal["my_plugin/stitch@v1"] images: Union[WorkflowImageSelector, StepOutputImageSelector] images_predictions: StepOutputSelector( - kind=[BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND], + kind=[OBJECT_DETECTION_PREDICTION_KIND], ) @classmethod @@ -1766,7 +1766,7 @@ the method signatures. OutputDefinition( name="predictions", kind=[ - BATCH_OF_OBJECT_DETECTION_PREDICTION_KIND, + OBJECT_DETECTION_PREDICTION_KIND, ], ), ] diff --git a/docs/workflows/gallery_index.md b/docs/workflows/gallery_index.md index d734e77111..f9dd8a1a60 100644 --- a/docs/workflows/gallery_index.md +++ b/docs/workflows/gallery_index.md @@ -7,6 +7,7 @@ Browse through the various categories to find inspiration and ideas for building
  • Workflows with multiple models
  • Workflows enhanced by Roboflow Platform
  • Workflows with classical Computer Vision methods
  • + Workflows with Visual Language Models
  • Basic Workflows
  • Workflows with dynamic Python Blocks
  • Workflows with data transformations
  • diff --git a/docs/workflows/kinds.md b/docs/workflows/kinds.md index 4a92216bf4..22482dfd85 100644 --- a/docs/workflows/kinds.md +++ b/docs/workflows/kinds.md @@ -23,49 +23,50 @@ for the presence of a mask in the input. !!! Warning - The list presented below contains elements with `Batch[X]` markers - those will - get soon deprecated and we will use only `X` markers. For now, developers are asked - to create their blocks using the `Batch[X]` markers, but raise the - [issue here](https://github.com/roboflow/inference/issues/608). This GH issue will be used - as a point of communication regarding deprecation process. + In `inference` release `0.18.0` we decided to make drastic move to heal the ecosystem + from the problem with ambiguous kinds names (`Batch[X]` vs `X` - see more + [here](https://github.com/roboflow/inference/issues/608)). + + The change is breaking only if there is remote Workflow plugin depending on imports + from `inference.core.workflows.execution_engine.entities.types` module, which is + not the case to the best of our knowledge. We removed problematic kinds as if they + never existed in the ecosystem and fixed all blocks from `roboflow_core` plugin. + If there is anyone impacted by the change - here is the + [migration guide](https://github.com/roboflow/inference/releases/tag/v0.18.0). ## Kinds declared in Roboflow plugins -* [`zone`](/workflows/kinds/zone): Definition of polygon zone -* [`Batch[dictionary]`](/workflows/kinds/batch_dictionary): Batch of dictionaries -* [`dictionary`](/workflows/kinds/dictionary): Dictionary -* [`point`](/workflows/kinds/point): Single point in 2D -* [`Batch[parent_id]`](/workflows/kinds/batch_parent_id): Identifier of parent for step output -* [`roboflow_model_id`](/workflows/kinds/roboflow_model_id): Roboflow model id -* [`Batch[classification_prediction]`](/workflows/kinds/batch_classification_prediction): `'predictions'` key from Classification Model outputs -* [`Batch[top_class]`](/workflows/kinds/batch_top_class): Batch of string values representing top class predicted by classification model -* [`rgb_color`](/workflows/kinds/rgb_color): RGB color -* [`Batch[keypoint_detection_prediction]`](/workflows/kinds/batch_keypoint_detection_prediction): `'predictions'` key from Keypoint Detection Model output -* [`Batch[serialised_payloads]`](/workflows/kinds/batch_serialised_payloads): List of serialised elements that can be registered in the sink +* [`bar_code_detection`](/workflows/kinds/bar_code_detection): Prediction with barcode detection +* [`language_model_output`](/workflows/kinds/language_model_output): LLM / VLM output +* [`top_class`](/workflows/kinds/top_class): String value representing top class predicted by classification model +* [`prediction_type`](/workflows/kinds/prediction_type): String value with type of prediction +* [`object_detection_prediction`](/workflows/kinds/object_detection_prediction): Prediction with detected bounding boxes in form of sv.Detections(...) 
object +* [`qr_code_detection`](/workflows/kinds/qr_code_detection): Prediction with QR code detection +* [`image_metadata`](/workflows/kinds/image_metadata): Dictionary with image metadata required by supervision * [`float_zero_to_one`](/workflows/kinds/float_zero_to_one): `float` value in range `[0.0, 1.0]` -* [`Batch[boolean]`](/workflows/kinds/batch_boolean): Boolean values batch -* [`list_of_values`](/workflows/kinds/list_of_values): List of values of any types -* [`Batch[instance_segmentation_prediction]`](/workflows/kinds/batch_instance_segmentation_prediction): `'predictions'` key from Instance Segmentation Model outputs -* [`Batch[qr_code_detection]`](/workflows/kinds/batch_qr_code_detection): Prediction with QR code detection +* [`parent_id`](/workflows/kinds/parent_id): Identifier of parent for step output +* [`keypoint_detection_prediction`](/workflows/kinds/keypoint_detection_prediction): Prediction with detected bounding boxes and detected keypoints in form of sv.Detections(...) object +* [`float`](/workflows/kinds/float): Float value +* [`*`](/workflows/kinds/*): Equivalent of any element * [`contours`](/workflows/kinds/contours): List of numpy arrays where each array represents contour points -* [`Batch[image]`](/workflows/kinds/batch_image): Image in workflows +* [`boolean`](/workflows/kinds/boolean): Boolean flag * [`detection`](/workflows/kinds/detection): Single element of detections-based prediction (like `object_detection_prediction`) -* [`Batch[prediction_type]`](/workflows/kinds/batch_prediction_type): String value with type of prediction +* [`roboflow_project`](/workflows/kinds/roboflow_project): Roboflow project name +* [`dictionary`](/workflows/kinds/dictionary): Dictionary +* [`numpy_array`](/workflows/kinds/numpy_array): Numpy array * [`roboflow_api_key`](/workflows/kinds/roboflow_api_key): Roboflow API key * [`string`](/workflows/kinds/string): String value -* [`*`](/workflows/kinds/*): Equivalent of any element -* [`float`](/workflows/kinds/float): Float value -* [`keypoint_detection_prediction`](/workflows/kinds/keypoint_detection_prediction): Prediction with detected bounding boxes and detected keypoints in form of sv.Detections(...) object -* [`Batch[object_detection_prediction]`](/workflows/kinds/batch_object_detection_prediction): `'predictions'` key from Object Detection Model output -* [`integer`](/workflows/kinds/integer): Integer value -* [`roboflow_project`](/workflows/kinds/roboflow_project): Roboflow project name -* [`Batch[string]`](/workflows/kinds/batch_string): Batch of string values -* [`image`](/workflows/kinds/image): Image in workflows -* [`Batch[bar_code_detection]`](/workflows/kinds/batch_bar_code_detection): Prediction with barcode detection -* [`object_detection_prediction`](/workflows/kinds/object_detection_prediction): Prediction with detected bounding boxes in form of sv.Detections(...) object -* [`boolean`](/workflows/kinds/boolean): Boolean flag +* [`roboflow_model_id`](/workflows/kinds/roboflow_model_id): Roboflow model id +* [`list_of_values`](/workflows/kinds/list_of_values): List of values of any types * [`instance_segmentation_prediction`](/workflows/kinds/instance_segmentation_prediction): Prediction with detected bounding boxes and segmentation masks in form of sv.Detections(...) 
object +* [`image`](/workflows/kinds/image): Image in workflows +* [`video_metadata`](/workflows/kinds/video_metadata): Video image metadata +* [`serialised_payloads`](/workflows/kinds/serialised_payloads): Serialised element that is usually accepted by sink +* [`integer`](/workflows/kinds/integer): Integer value +* [`rgb_color`](/workflows/kinds/rgb_color): RGB color +* [`classification_prediction`](/workflows/kinds/classification_prediction): Predictions from classifier * [`image_keypoints`](/workflows/kinds/image_keypoints): Image keypoints detected by classical Computer Vision method -* [`Batch[image_metadata]`](/workflows/kinds/batch_image_metadata): Dictionary with image metadata required by supervision +* [`point`](/workflows/kinds/point): Single point in 2D +* [`zone`](/workflows/kinds/zone): Definition of polygon zone diff --git a/inference/core/version.py b/inference/core/version.py index 7dbc1800f2..b3b607f742 100644 --- a/inference/core/version.py +++ b/inference/core/version.py @@ -1,4 +1,4 @@ -__version__ = "0.17.1" +__version__ = "0.18.0" if __name__ == "__main__": diff --git a/inference/core/workflows/core_steps/formatters/json_parser/__init__.py b/inference/core/workflows/core_steps/formatters/json_parser/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/inference/core/workflows/core_steps/formatters/json_parser/v1.py b/inference/core/workflows/core_steps/formatters/json_parser/v1.py new file mode 100644 index 0000000000..0e1a0f1b3e --- /dev/null +++ b/inference/core/workflows/core_steps/formatters/json_parser/v1.py @@ -0,0 +1,142 @@ +import json +import logging +import re +from typing import List, Literal, Optional, Tuple, Type + +from pydantic import AfterValidator, ConfigDict, Field +from typing_extensions import Annotated + +from inference.core.workflows.execution_engine.entities.base import OutputDefinition +from inference.core.workflows.execution_engine.entities.types import ( + BOOLEAN_KIND, + LANGUAGE_MODEL_OUTPUT_KIND, + StepOutputSelector, +) +from inference.core.workflows.prototypes.block import ( + BlockResult, + WorkflowBlock, + WorkflowBlockManifest, +) + +JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE) + +LONG_DESCRIPTION = """ +The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and +Visual Language Models (VLMs). Input is parsed to JSON, and its keys are exposed as block outputs. + +Accepted formats: +- valid JSON strings +- JSON documents wrapped with Markdown tags (very common for GPT responses) +``` +{"my": "json"} +``` + +**Details regarding block behavior:** + +- `error_status` is set `True` whenever at least one of `expected_fields` cannot be retrieved from input + +- in case of multiple markdown blocks with raw JSON content - only first will be parsed and returned, while +`error_status` will remain `False` +""" + +SHORT_DESCRIPTION = "Parses raw string into JSON." + + +def validate_reserved_fields(expected_fields: List[str]) -> List[str]: + if "error_status" in expected_fields: + raise ValueError( + "`error_status` is reserved field name and cannot be " + "used in `expected_fields` of `roboflow_core/json_parser@v1` block." 
+ ) + return expected_fields + + +class BlockManifest(WorkflowBlockManifest): + model_config = ConfigDict( + json_schema_extra={ + "name": "JSON Parser", + "version": "v1", + "short_description": SHORT_DESCRIPTION, + "long_description": LONG_DESCRIPTION, + "license": "Apache-2.0", + "block_type": "formatter", + } + ) + type: Literal["roboflow_core/json_parser@v1"] + raw_json: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field( + description="The string with raw JSON to parse.", + examples=[["$steps.lmm.output"]], + ) + expected_fields: Annotated[List[str], AfterValidator(validate_reserved_fields)] = ( + Field( + description="List of expected JSON fields. `error_status` field name is reserved and cannot be used.", + examples=[["field_a", "field_b"]], + ) + ) + + @classmethod + def describe_outputs(cls) -> List[OutputDefinition]: + return [ + OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]), + OutputDefinition(name="*"), + ] + + def get_actual_outputs(self) -> List[OutputDefinition]: + result = [ + OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]), + ] + for field_name in self.expected_fields: + result.append(OutputDefinition(name=field_name)) + return result + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + +class JSONParserBlockV1(WorkflowBlock): + + @classmethod + def get_manifest(cls) -> Type[WorkflowBlockManifest]: + return BlockManifest + + def run( + self, + raw_json: str, + expected_fields: List[str], + ) -> BlockResult: + error_status, parsed_data = string2json( + raw_json=raw_json, + expected_fields=expected_fields, + ) + parsed_data["error_status"] = error_status + return parsed_data + + +def string2json( + raw_json: str, + expected_fields: List[str], +) -> Tuple[bool, dict]: + json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json) + if len(json_blocks_found) == 0: + return try_parse_json(raw_json, expected_fields=expected_fields) + first_block = json_blocks_found[0] + return try_parse_json(first_block, expected_fields=expected_fields) + + +def try_parse_json(content: str, expected_fields: List[str]) -> Tuple[bool, dict]: + try: + parsed_data = json.loads(content) + result = {} + all_fields_find = True + for field in expected_fields: + if field not in parsed_data: + all_fields_find = False + result[field] = parsed_data.get(field) + return not all_fields_find, result + except Exception as error: + logging.warning( + f"Could not parse JSON in `roboflow_core/json_parser@v1` block. " + f"Error type: {error.__class__.__name__}. 
Details: {error}" + ) + return True, {field: None for field in expected_fields} diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_classifier/__init__.py b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py new file mode 100644 index 0000000000..7edce35af6 --- /dev/null +++ b/inference/core/workflows/core_steps/formatters/vlm_as_classifier/v1.py @@ -0,0 +1,269 @@ +import json +import logging +import re +from typing import Dict, List, Literal, Optional, Tuple, Type, Union +from uuid import uuid4 + +from pydantic import ConfigDict, Field + +from inference.core.workflows.execution_engine.entities.base import ( + OutputDefinition, + WorkflowImageData, +) +from inference.core.workflows.execution_engine.entities.types import ( + BOOLEAN_KIND, + CLASSIFICATION_PREDICTION_KIND, + LANGUAGE_MODEL_OUTPUT_KIND, + LIST_OF_VALUES_KIND, + STRING_KIND, + StepOutputImageSelector, + StepOutputSelector, + WorkflowImageSelector, + WorkflowParameterSelector, +) +from inference.core.workflows.prototypes.block import ( + BlockResult, + WorkflowBlock, + WorkflowBlockManifest, +) + +JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE) + +LONG_DESCRIPTION = """ +The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and +Visual Language Models (VLMs). Input is parsed to classification prediction and returned as block output. + +Accepted formats: + +- valid JSON strings + +- JSON documents wrapped with Markdown tags (very common for GPT responses) + +Example: +``` +{"my": "json"} +``` + +**Details regarding block behavior:** + +- `error_status` is set `True` whenever parsing cannot be completed + +- in case of multiple markdown blocks with raw JSON content - only first will be parsed +""" + +SHORT_DESCRIPTION = "Parses raw string into classification prediction." 
+ + +class BlockManifest(WorkflowBlockManifest): + model_config = ConfigDict( + json_schema_extra={ + "name": "VLM as Classifier", + "version": "v1", + "short_description": SHORT_DESCRIPTION, + "long_description": LONG_DESCRIPTION, + "license": "Apache-2.0", + "block_type": "formatter", + } + ) + type: Literal["roboflow_core/vlm_as_classifier@v1"] + image: Union[WorkflowImageSelector, StepOutputImageSelector] = Field( + description="The image which was the base to generate VLM prediction", + examples=["$inputs.image", "$steps.cropping.crops"], + ) + vlm_output: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field( + title="VLM Output", + description="The string with raw classification prediction to parse.", + examples=[["$steps.lmm.output"]], + ) + classes: Union[ + WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), + StepOutputSelector(kind=[LIST_OF_VALUES_KIND]), + List[str], + ] = Field( + description="List of all classes used by the model, required to " + "generate mapping between class name and class id.", + examples=[["$steps.lmm.classes", "$inputs.classes", ["class_a", "class_b"]]], + ) + + @classmethod + def describe_outputs(cls) -> List[OutputDefinition]: + return [ + OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]), + OutputDefinition(name="predictions", kind=[CLASSIFICATION_PREDICTION_KIND]), + OutputDefinition(name="inference_id", kind=[STRING_KIND]), + ] + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + +class VLMAsClassifierBlockV1(WorkflowBlock): + + @classmethod + def get_manifest(cls) -> Type[WorkflowBlockManifest]: + return BlockManifest + + def run( + self, + image: WorkflowImageData, + vlm_output: str, + classes: List[str], + ) -> BlockResult: + inference_id = f"{uuid4()}" + error_status, parsed_data = string2json( + raw_json=vlm_output, + ) + if error_status: + return { + "error_status": True, + "predictions": None, + "inference_id": inference_id, + } + if "class_name" in parsed_data and "confidence" in parsed_data: + return parse_multi_class_classification_results( + image=image, + results=parsed_data, + classes=classes, + inference_id=inference_id, + ) + if "predicted_classes" in parsed_data: + return parse_multi_label_classification_results( + image=image, + results=parsed_data, + classes=classes, + inference_id=inference_id, + ) + return { + "error_status": True, + "predictions": None, + "inference_id": inference_id, + } + + +def string2json( + raw_json: str, +) -> Tuple[bool, dict]: + json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json) + if len(json_blocks_found) == 0: + return try_parse_json(raw_json) + first_block = json_blocks_found[0] + return try_parse_json(first_block) + + +def try_parse_json(content: str) -> Tuple[bool, dict]: + try: + return False, json.loads(content) + except Exception as error: + logging.warning( + f"Could not parse JSON to dict in `roboflow_core/vlm_as_classifier@v1` block. " + f"Error type: {error.__class__.__name__}. 
Details: {error}" + ) + return True, {} + + +def parse_multi_class_classification_results( + image: WorkflowImageData, + results: dict, + classes: List[str], + inference_id: str, +) -> dict: + try: + class2id_mapping = create_classes_index(classes=classes) + height, width = image.numpy_image.shape[:2] + top_class = results["class_name"] + confidences = {top_class: scale_confidence(results["confidence"])} + predictions = [] + if top_class not in class2id_mapping: + predictions.append( + { + "class_name": top_class, + "class_id": -1, + "confidence": confidences.get(top_class, 0.0), + } + ) + for class_name, class_id in class2id_mapping.items(): + predictions.append( + { + "class_name": class_name, + "class_id": class_id, + "confidence": confidences.get(class_name, 0.0), + } + ) + parsed_prediction = { + "image": {"width": width, "height": height}, + "predictions": predictions, + "top": top_class, + "confidence": confidences[top_class], + "inference_id": inference_id, + "parent_id": image.parent_metadata.parent_id, + } + return { + "error_status": False, + "predictions": parsed_prediction, + "inference_id": inference_id, + } + except Exception as error: + logging.warning( + f"Could not parse multi-class classification results in `roboflow_core/vlm_as_classifier@v1` block. " + f"Error type: {error.__class__.__name__}. Details: {error}" + ) + return {"error_status": True, "predictions": None, "inference_id": inference_id} + + +def parse_multi_label_classification_results( + image: WorkflowImageData, + results: dict, + classes: List[str], + inference_id: str, +) -> dict: + try: + class2id_mapping = create_classes_index(classes=classes) + height, width = image.numpy_image.shape[:2] + predicted_classes_confidences = {} + for prediction in results["predicted_classes"]: + if prediction["class"] not in class2id_mapping: + class2id_mapping[prediction["class"]] = -1 + if prediction["class"] in predicted_classes_confidences: + old_confidence = predicted_classes_confidences[prediction["class"]] + new_confidence = scale_confidence(value=prediction["confidence"]) + predicted_classes_confidences[prediction["class"]] = max( + old_confidence, new_confidence + ) + else: + predicted_classes_confidences[prediction["class"]] = scale_confidence( + value=prediction["confidence"] + ) + predictions = { + class_name: { + "confidence": predicted_classes_confidences.get(class_name, 0.0), + "class_id": class_id, + } + for class_name, class_id in class2id_mapping.items() + } + parsed_prediction = { + "image": {"width": width, "height": height}, + "predictions": predictions, + "predicted_classes": list(predicted_classes_confidences.keys()), + "inference_id": inference_id, + "parent_id": image.parent_metadata.parent_id, + } + return { + "error_status": False, + "predictions": parsed_prediction, + "inference_id": inference_id, + } + except Exception as error: + logging.warning( + f"Could not parse multi-label classification results in `roboflow_core/vlm_as_classifier@v1` block. " + f"Error type: {error.__class__.__name__}. 
Details: {error}" + ) + return {"error_status": True, "predictions": None, "inference_id": inference_id} + + +def create_classes_index(classes: List[str]) -> Dict[str, int]: + return {class_name: idx for idx, class_name in enumerate(classes)} + + +def scale_confidence(value: float) -> float: + return min(max(float(value), 0.0), 1.0) diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_detector/__init__.py b/inference/core/workflows/core_steps/formatters/vlm_as_detector/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py b/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py new file mode 100644 index 0000000000..3dbb7cf3dc --- /dev/null +++ b/inference/core/workflows/core_steps/formatters/vlm_as_detector/v1.py @@ -0,0 +1,261 @@ +import json +import logging +import re +from typing import Dict, List, Literal, Optional, Tuple, Type, Union +from uuid import uuid4 + +import numpy as np +import supervision as sv +from pydantic import ConfigDict, Field, model_validator +from supervision.config import CLASS_NAME_DATA_FIELD + +from inference.core.workflows.core_steps.common.utils import ( + attach_parents_coordinates_to_sv_detections, +) +from inference.core.workflows.execution_engine.constants import ( + DETECTION_ID_KEY, + IMAGE_DIMENSIONS_KEY, + INFERENCE_ID_KEY, + PREDICTION_TYPE_KEY, +) +from inference.core.workflows.execution_engine.entities.base import ( + OutputDefinition, + WorkflowImageData, +) +from inference.core.workflows.execution_engine.entities.types import ( + BOOLEAN_KIND, + LANGUAGE_MODEL_OUTPUT_KIND, + LIST_OF_VALUES_KIND, + OBJECT_DETECTION_PREDICTION_KIND, + STRING_KIND, + StepOutputImageSelector, + StepOutputSelector, + WorkflowImageSelector, + WorkflowParameterSelector, +) +from inference.core.workflows.prototypes.block import ( + BlockResult, + WorkflowBlock, + WorkflowBlockManifest, +) + +JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE) + +LONG_DESCRIPTION = """ +The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and +Visual Language Models (VLMs). Input is parsed to object-detection prediction and returned as block output. + +Accepted formats: + +- valid JSON strings + +- JSON documents wrapped with Markdown tags + +Example +``` +{"my": "json"} +``` + +**Details regarding block behavior:** + +- `error_status` is set `True` whenever parsing cannot be completed + +- in case of multiple markdown blocks with raw JSON content - only first will be parsed +""" + +SHORT_DESCRIPTION = "Parses raw string into object-detection prediction." 
+ + +class BlockManifest(WorkflowBlockManifest): + model_config = ConfigDict( + json_schema_extra={ + "name": "VLM as Detector", + "version": "v1", + "short_description": SHORT_DESCRIPTION, + "long_description": LONG_DESCRIPTION, + "license": "Apache-2.0", + "block_type": "formatter", + } + ) + type: Literal["roboflow_core/vlm_as_detector@v1"] + image: Union[WorkflowImageSelector, StepOutputImageSelector] = Field( + description="The image which was the base to generate VLM prediction", + examples=["$inputs.image", "$steps.cropping.crops"], + ) + vlm_output: StepOutputSelector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field( + title="VLM Output", + description="The string with raw classification prediction to parse.", + examples=[["$steps.lmm.output"]], + ) + classes: Union[ + WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), + StepOutputSelector(kind=[LIST_OF_VALUES_KIND]), + List[str], + ] = Field( + description="List of all classes used by the model, required to " + "generate mapping between class name and class id.", + examples=[["$steps.lmm.classes", "$inputs.classes", ["class_a", "class_b"]]], + ) + model_type: Literal["google-gemini", "anthropic-claude"] = Field( + description="Type of the model that generated prediction", + examples=[["google-gemini", "anthropic-claude"]], + ) + task_type: Literal["object-detection"] + + @model_validator(mode="after") + def validate(self) -> "BlockManifest": + if (self.model_type, self.task_type) not in REGISTERED_PARSERS: + raise ValueError( + f"Could not parse result of task {self.task_type} for model {self.model_type}" + ) + return self + + @classmethod + def describe_outputs(cls) -> List[OutputDefinition]: + return [ + OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]), + OutputDefinition( + name="predictions", kind=[OBJECT_DETECTION_PREDICTION_KIND] + ), + OutputDefinition(name="inference_id", kind=[STRING_KIND]), + ] + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + +class VLMAsDetectorBlockV1(WorkflowBlock): + + @classmethod + def get_manifest(cls) -> Type[WorkflowBlockManifest]: + return BlockManifest + + def run( + self, + image: WorkflowImageData, + vlm_output: str, + classes: List[str], + model_type: str, + task_type: str, + ) -> BlockResult: + inference_id = f"{uuid4()}" + error_status, parsed_data = string2json( + raw_json=vlm_output, + ) + if error_status: + return { + "error_status": True, + "predictions": None, + "inference_id": inference_id, + } + try: + predictions = REGISTERED_PARSERS[(model_type, task_type)]( + image=image, + parsed_data=parsed_data, + classes=classes, + inference_id=inference_id, + ) + return { + "error_status": False, + "predictions": predictions, + "inference_id": inference_id, + } + except Exception as error: + logging.warning( + f"Could not parse VLM prediction for model {model_type} and task {task_type} " + f"in `roboflow_core/vlm_as_detector@v1` block. " + f"Error type: {error.__class__.__name__}. 
Details: {error}" + ) + return { + "error_status": True, + "predictions": None, + "inference_id": inference_id, + } + + +def string2json( + raw_json: str, +) -> Tuple[bool, dict]: + json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json) + if len(json_blocks_found) == 0: + return try_parse_json(raw_json) + first_block = json_blocks_found[0] + return try_parse_json(first_block) + + +def try_parse_json(content: str) -> Tuple[bool, dict]: + try: + return False, json.loads(content) + except Exception as error: + logging.warning( + f"Could not parse JSON to dict in `roboflow_core/vlm_as_detector@v1` block. " + f"Error type: {error.__class__.__name__}. Details: {error}" + ) + return True, {} + + +def parse_gemini_object_detection_response( + image: WorkflowImageData, + parsed_data: dict, + classes: List[str], + inference_id: str, +) -> sv.Detections: + class_name2id = create_classes_index(classes=classes) + image_height, image_width = image.numpy_image.shape[:2] + if len(parsed_data["detections"]) == 0: + return sv.Detections.empty() + xyxy, class_id, class_name, confidence = [], [], [], [] + for detection in parsed_data["detections"]: + xyxy.append( + [ + detection["x_min"] * image_width, + detection["y_min"] * image_height, + detection["x_max"] * image_width, + detection["y_max"] * image_height, + ] + ) + class_id.append(class_name2id.get(detection["class_name"], -1)) + class_name.append(detection["class_name"]) + confidence.append(scale_confidence(detection.get("confidence", 1.0))) + xyxy = np.array(xyxy).round(0) if len(xyxy) > 0 else np.empty((0, 4)) + confidence = np.array(confidence) if len(confidence) > 0 else np.empty(0) + class_id = np.array(class_id).astype(int) if len(class_id) > 0 else np.empty(0) + class_name = np.array(class_name) if len(class_name) > 0 else np.empty(0) + detection_ids = np.array([str(uuid4()) for _ in range(len(xyxy))]) + dimensions = np.array([[image_height, image_width]] * len(xyxy)) + inference_ids = np.array([inference_id] * len(xyxy)) + prediction_type = np.array(["object-detection"] * len(xyxy)) + data = { + CLASS_NAME_DATA_FIELD: class_name, + IMAGE_DIMENSIONS_KEY: dimensions, + INFERENCE_ID_KEY: inference_ids, + DETECTION_ID_KEY: detection_ids, + PREDICTION_TYPE_KEY: prediction_type, + } + detections = sv.Detections( + xyxy=xyxy, + confidence=confidence, + class_id=class_id, + mask=None, + tracker_id=None, + data=data, + ) + return attach_parents_coordinates_to_sv_detections( + detections=detections, + image=image, + ) + + +def create_classes_index(classes: List[str]) -> Dict[str, int]: + return {class_name: idx for idx, class_name in enumerate(classes)} + + +def scale_confidence(value: float) -> float: + return min(max(float(value), 0.0), 1.0) + + +REGISTERED_PARSERS = { + ("google-gemini", "object-detection"): parse_gemini_object_detection_response, + ("anthropic-claude", "object-detection"): parse_gemini_object_detection_response, +} diff --git a/inference/core/workflows/core_steps/loader.py b/inference/core/workflows/core_steps/loader.py index 6cfca977d1..e09ef79df4 100644 --- a/inference/core/workflows/core_steps/loader.py +++ b/inference/core/workflows/core_steps/loader.py @@ -40,9 +40,18 @@ from inference.core.workflows.core_steps.formatters.first_non_empty_or_default.v1 import ( FirstNonEmptyOrDefaultBlockV1, ) +from inference.core.workflows.core_steps.formatters.json_parser.v1 import ( + JSONParserBlockV1, +) from inference.core.workflows.core_steps.formatters.property_definition.v1 import ( PropertyDefinitionBlockV1, ) +from 
inference.core.workflows.core_steps.formatters.vlm_as_classifier.v1 import ( + VLMAsClassifierBlockV1, +) +from inference.core.workflows.core_steps.formatters.vlm_as_detector.v1 import ( + VLMAsDetectorBlockV1, +) from inference.core.workflows.core_steps.fusion.detections_classes_replacement.v1 import ( DetectionsClassesReplacementBlockV1, ) @@ -55,6 +64,9 @@ from inference.core.workflows.core_steps.fusion.dimension_collapse.v1 import ( DimensionCollapseBlockV1, ) +from inference.core.workflows.core_steps.models.foundation.anthropic_claude.v1 import ( + AntropicClaudeBlockV1, +) from inference.core.workflows.core_steps.models.foundation.clip_comparison.v1 import ( ClipComparisonBlockV1, ) @@ -64,6 +76,9 @@ from inference.core.workflows.core_steps.models.foundation.cog_vlm.v1 import ( CogVLMBlockV1, ) +from inference.core.workflows.core_steps.models.foundation.google_gemini.v1 import ( + GoogleGeminiBlockV1, +) from inference.core.workflows.core_steps.models.foundation.lmm.v1 import LMMBlockV1 from inference.core.workflows.core_steps.models.foundation.lmm_classifier.v1 import ( LMMForClassificationBlockV1, @@ -72,6 +87,9 @@ from inference.core.workflows.core_steps.models.foundation.openai.v1 import ( OpenAIBlockV1, ) +from inference.core.workflows.core_steps.models.foundation.openai.v2 import ( + OpenAIBlockV2, +) from inference.core.workflows.core_steps.models.foundation.segment_anything2.v1 import ( SegmentAnything2BlockV1, ) @@ -197,6 +215,7 @@ INSTANCE_SEGMENTATION_PREDICTION_KIND, INTEGER_KIND, KEYPOINT_DETECTION_PREDICTION_KIND, + LANGUAGE_MODEL_OUTPUT_KIND, LIST_OF_VALUES_KIND, NUMPY_ARRAY_KIND, OBJECT_DETECTION_PREDICTION_KIND, @@ -290,6 +309,12 @@ def load_blocks() -> List[Type[WorkflowBlock]]: ClipComparisonBlockV2, CameraFocusBlockV1, RoboflowDatasetUploadBlockV2, + OpenAIBlockV2, + JSONParserBlockV1, + VLMAsClassifierBlockV1, + GoogleGeminiBlockV1, + VLMAsDetectorBlockV1, + AntropicClaudeBlockV1, ] @@ -320,6 +345,7 @@ def load_kinds() -> List[Kind]: RGB_COLOR_KIND, IMAGE_KEYPOINTS_KIND, CONTOURS_KIND, + LANGUAGE_MODEL_OUTPUT_KIND, NUMPY_ARRAY_KIND, QR_CODE_DETECTION_KIND, BAR_CODE_DETECTION_KIND, diff --git a/inference/core/workflows/core_steps/models/foundation/anthropic_claude/__init__.py b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py new file mode 100644 index 0000000000..370ea4cc72 --- /dev/null +++ b/inference/core/workflows/core_steps/models/foundation/anthropic_claude/v1.py @@ -0,0 +1,657 @@ +import base64 +import json +import re +from functools import partial +from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union + +import anthropic +from anthropic import NOT_GIVEN +from pydantic import ConfigDict, Field, model_validator + +from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS +from inference.core.managers.base import ModelManager +from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image +from inference.core.utils.preprocess import downscale_image_keeping_aspect_ratio +from inference.core.workflows.core_steps.common.utils import run_in_parallel +from inference.core.workflows.execution_engine.entities.base import ( + Batch, + OutputDefinition, + WorkflowImageData, +) +from inference.core.workflows.execution_engine.entities.types import ( + 
FLOAT_KIND, + INTEGER_KIND, + LANGUAGE_MODEL_OUTPUT_KIND, + LIST_OF_VALUES_KIND, + STRING_KIND, + ImageInputField, + StepOutputImageSelector, + WorkflowImageSelector, + WorkflowParameterSelector, +) +from inference.core.workflows.prototypes.block import ( + BlockResult, + WorkflowBlock, + WorkflowBlockManifest, +) + +LONG_DESCRIPTION = """ +Ask a question to Anthropic Claude model with vision capabilities. + +You can specify arbitrary text prompts or predefined ones, the block supports the following types of prompt: + +- `unconstrained` - any arbitrary prompt you like + +- `ocr`- predefined prompt to recognise text from image + +- `visual-question-answering` - your prompt is supposed to provide question and will be +wrapped into structure that is suited for VQA task + +- `caption` - predefined prompt to generate short caption of the image + +- `detailed-caption` - predefined prompt to generate elaborated caption of the image + +- `classification` - predefined prompt to generate multi-class classification output (that can be parsed +with `VLM as Classifier` block) + +- `multi-label-classification` - predefined prompt to generate multi-label classification output (that +can be parsed with `VLM as Classifier` block) + +- `object-detection` - predefined prompt to generate object detection output (that can be parsed +with `VLM as Detector` block) + +- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser` +block. + +You need to provide your Anthropic API key to use the Claude model. +""" + +TaskType = Literal[ + "unconstrained", + "ocr", + "visual-question-answering", + "caption", + "detailed-caption", + "classification", + "multi-label-classification", + "structured-answering", + "object-detection", +] + +TASKS_REQUIRING_PROMPT = { + "unconstrained", + "visual-question-answering", +} + +TASKS_REQUIRING_CLASSES = { + "classification", + "multi-label-classification", + "object-detection", +} + +TASKS_REQUIRING_OUTPUT_STRUCTURE = { + "structured-answering", +} + + +class BlockManifest(WorkflowBlockManifest): + model_config = ConfigDict( + json_schema_extra={ + "name": "Anthropic Claude", + "version": "v1", + "short_description": "Run Anthropic Claude model with vision capabilities", + "long_description": LONG_DESCRIPTION, + "license": "Apache-2.0", + "block_type": "model", + "search_keywords": ["LMM", "VLM", "Claude", "Anthropic"], + } + ) + type: Literal["roboflow_core/anthropic_claude@v1"] + images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField + task_type: TaskType = Field( + description="Task type to be performed by model. Value of parameter determine set of fields " + "that are required. For `unconstrained`, `visual-question-answering`, " + " - `prompt` parameter must be provided." + "For `structured-answering` - `output-structure` must be provided. For " + "`classification`, `multi-label-classification` and `object-detection` - " + "`classes` must be filled. 
`ocr`, `caption`, `detailed-caption` do not" + "require any additional parameter.", + ) + prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field( + default=None, + description="Text prompt to the Claude model", + examples=["my prompt", "$inputs.prompt"], + json_schema_extra={ + "relevant_for": { + "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True}, + }, + }, + ) + output_structure: Optional[Dict[str, str]] = Field( + default=None, + description="Dictionary with structure of expected JSON response", + examples=[{"my_key": "description"}, "$inputs.output_structure"], + json_schema_extra={ + "relevant_for": { + "task_type": {"values": TASKS_REQUIRING_CLASSES, "required": True}, + }, + }, + ) + classes: Optional[ + Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]] + ] = Field( + default=None, + description="List of classes to be used", + examples=[["class-a", "class-b"], "$inputs.classes"], + json_schema_extra={ + "relevant_for": { + "task_type": { + "values": TASKS_REQUIRING_OUTPUT_STRUCTURE, + "required": True, + }, + }, + }, + ) + api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field( + description="Your Antropic API key", + examples=["xxx-xxx", "$inputs.antropics_api_key"], + private=True, + ) + model_version: Union[ + WorkflowParameterSelector(kind=[STRING_KIND]), + Literal[ + "claude-3-5-sonnet", "claude-3-opus", "claude-3-sonnet", "claude-3-haiku" + ], + ] = Field( + default="claude-3-5-sonnet", + description="Model to be used", + examples=["claude-3-5-sonnet", "$inputs.claude"], + ) + max_tokens: int = Field( + default=450, + description="Maximum number of tokens the model can generate in it's response.", + ) + temperature: Optional[ + Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])] + ] = Field( + default=None, + description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more " + 'random / "creative" the generations are.', + ge=0.0, + le=2.0, + ) + max_image_size: Union[int, WorkflowParameterSelector(kind=[INTEGER_KIND])] = Field( + description="Maximum size of the image - if input has larger side, it will be downscaled, keeping aspect ratio", + default=1024, + ) + max_concurrent_requests: Optional[int] = Field( + default=None, + description="Number of concurrent requests that can be executed by block when batch of input images provided. " + "If not given - block defaults to value configured globally in Workflows Execution Engine. 
" + "Please restrict if you hit ANtropic API limits.", + ) + + @model_validator(mode="after") + def validate(self) -> "BlockManifest": + if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None: + raise ValueError( + f"`prompt` parameter required to be set for task `{self.task_type}`" + ) + if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None: + raise ValueError( + f"`classes` parameter required to be set for task `{self.task_type}`" + ) + if ( + self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE + and self.output_structure is None + ): + raise ValueError( + f"`output_structure` parameter required to be set for task `{self.task_type}`" + ) + return self + + @classmethod + def accepts_batch_input(cls) -> bool: + return True + + @classmethod + def describe_outputs(cls) -> List[OutputDefinition]: + return [ + OutputDefinition( + name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND] + ), + OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]), + ] + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + +class AntropicClaudeBlockV1(WorkflowBlock): + + def __init__( + self, + model_manager: ModelManager, + api_key: Optional[str], + ): + self._model_manager = model_manager + self._api_key = api_key + + @classmethod + def get_init_parameters(cls) -> List[str]: + return ["model_manager", "api_key"] + + @classmethod + def get_manifest(cls) -> Type[WorkflowBlockManifest]: + return BlockManifest + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + def run( + self, + images: Batch[WorkflowImageData], + task_type: TaskType, + prompt: Optional[str], + output_structure: Optional[Dict[str, str]], + classes: Optional[List[str]], + api_key: str, + model_version: str, + max_tokens: int, + temperature: Optional[float], + max_image_size: int, + max_concurrent_requests: Optional[int], + ) -> BlockResult: + inference_images = [i.to_inference_format() for i in images] + raw_outputs = run_claude_prompting( + images=inference_images, + task_type=task_type, + prompt=prompt, + output_structure=output_structure, + classes=classes, + api_key=api_key, + model_version=model_version, + max_tokens=max_tokens, + temperature=temperature, + max_image_size=max_image_size, + max_concurrent_requests=max_concurrent_requests, + ) + return [ + {"output": raw_output, "classes": classes} for raw_output in raw_outputs + ] + + +def run_claude_prompting( + images: List[Dict[str, Any]], + task_type: TaskType, + prompt: Optional[str], + output_structure: Optional[Dict[str, str]], + classes: Optional[List[str]], + api_key: str, + model_version: str, + max_tokens: int, + temperature: Optional[float], + max_image_size: int, + max_concurrent_requests: Optional[int], +) -> List[str]: + if task_type not in PROMPT_BUILDERS: + raise ValueError(f"Task type: {task_type} not supported.") + prompts = [] + for image in images: + loaded_image, _ = load_image(image) + loaded_image = downscale_image_keeping_aspect_ratio( + image=loaded_image, desired_size=(max_image_size, max_image_size) + ) + base64_image = base64.b64encode( + encode_image_to_jpeg_bytes(loaded_image) + ).decode("ascii") + prompt = PROMPT_BUILDERS[task_type]( + base64_image=base64_image, + prompt=prompt, + output_structure=output_structure, + classes=classes, + ) + prompts.append(prompt) + return execute_claude_requests( + api_key=api_key, + prompts=prompts, + model_version=model_version, + max_tokens=max_tokens, + 
temperature=temperature, + max_concurrent_requests=max_concurrent_requests, + ) + + +def execute_claude_requests( + api_key: str, + prompts: List[Tuple[Optional[str], List[dict]]], + model_version: str, + max_tokens: int, + temperature: Optional[float], + max_concurrent_requests: Optional[int], +) -> List[str]: + tasks = [ + partial( + execute_claude_request, + system_prompt=prompt[0], + messages=prompt[1], + model_version=model_version, + max_tokens=max_tokens, + temperature=temperature, + api_key=api_key, + ) + for prompt in prompts + ] + max_workers = ( + max_concurrent_requests + or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS + ) + return run_in_parallel( + tasks=tasks, + max_workers=max_workers, + ) + + +EXACT_MODELS_VERSIONS_MAPPING = { + "claude-3-5-sonnet": "claude-3-5-sonnet-20240620", + "claude-3-opus": "claude-3-opus-20240229", + "claude-3-sonnet": "claude-3-sonnet-20240229", + "claude-3-haiku": "claude-3-haiku-20240307", +} + + +def execute_claude_request( + system_prompt: Optional[str], + messages: List[dict], + model_version: str, + max_tokens: int, + temperature: Optional[float], + api_key: str, +) -> str: + client = anthropic.Anthropic(api_key=api_key) + if system_prompt is None: + system_prompt = NOT_GIVEN + if temperature is None: + temperature = NOT_GIVEN + result = client.messages.create( + system=system_prompt, + messages=messages, + max_tokens=max_tokens, + model=EXACT_MODELS_VERSIONS_MAPPING[model_version], + temperature=temperature, + ) + return result.content[0].text + + +def prepare_unconstrained_prompt( + base64_image: str, + prompt: str, + **kwargs, +) -> Tuple[Optional[str], List[dict]]: + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_image, + }, + }, + { + "type": "text", + "text": prompt, + }, + ], + } + ] + return None, messages + + +def prepare_classification_prompt( + base64_image: str, + classes: List[str], + **kwargs, +) -> Tuple[Optional[str], List[dict]]: + serialised_classes = ", ".join(classes) + system_prompt = ( + "You act as single-class classification model. You must provide reasonable predictions. " + "You are only allowed to produce JSON document. " + 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. ' + "`class-name` must be one of the class names defined by user. You are only allowed to return " + "single JSON document, even if there are potentially multiple classes. You are not allowed to " + "return list." + ) + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_image, + }, + }, + { + "type": "text", + "text": f"List of all classes to be recognised by model: {serialised_classes}", + }, + ], + } + ] + return system_prompt, messages + + +def prepare_multi_label_classification_prompt( + base64_image: str, + classes: List[str], + **kwargs, +) -> Tuple[Optional[str], List[dict]]: + serialised_classes = ", ".join(classes) + system_prompt = ( + "You act as multi-label classification model. You must provide reasonable predictions. " + "You are only allowed to produce JSON document. " + 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, ' + '{"class": "class-name-2", "confidence": 0.7}]}.' 
+ "`class-name-X` must be one of the class names defined by user and `confidence` is a float value " + "in range 0.0-1.0 that represents how sure you are that the class is present in the image. " + "Only return class names that are visible." + ) + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_image, + }, + }, + { + "type": "text", + "text": f"List of all classes to be recognised by model: {serialised_classes}", + }, + ], + } + ] + return system_prompt, messages + + +def prepare_vqa_prompt( + base64_image: str, + prompt: str, + **kwargs, +) -> Tuple[Optional[str], List[dict]]: + system_prompt = ( + "You act as Visual Question Answering model. Your task is to provide answer to question" + "submitted by user. If this is open-question - answer with few sentences, for ABCD question, " + "return only the indicator of the answer." + ) + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_image, + }, + }, + { + "type": "text", + "text": f"Question: {prompt}", + }, + ], + } + ] + return system_prompt, messages + + +def prepare_ocr_prompt( + base64_image: str, + **kwargs, +) -> Tuple[Optional[str], List[dict]]: + system_prompt = ( + "You act as OCR model. Your task is to read text from the image and return it in " + "paragraphs representing the structure of texts in the image. You should only return " + "recognised text, nothing else." + ) + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_image, + }, + }, + ], + } + ] + return system_prompt, messages + + +def prepare_caption_prompt( + base64_image: str, + short_description: bool, + **kwargs, +) -> Tuple[Optional[str], List[dict]]: + caption_detail_level = "Caption should be short." + if not short_description: + caption_detail_level = "Caption should be extensive." + system_prompt = ( + f"You act as image caption model. Your task is to provide description of the image. " + f"{caption_detail_level}" + ) + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_image, + }, + }, + ], + } + ] + return system_prompt, messages + + +def prepare_structured_answering_prompt( + base64_image: str, + output_structure: Dict[str, str], + **kwargs, +) -> Tuple[Optional[str], List[dict]]: + output_structure_serialised = json.dumps(output_structure, indent=4) + system_prompt = ( + "You are supposed to produce responses in JSON. User is to provide you dictionary with " + "keys and values. Each key must be present in your response. Values in user dictionary " + "represent descriptions for JSON fields to be generated. Provide only JSON in response." + ) + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_image, + }, + }, + { + "type": "text", + "text": f"Specification of requirements regarding output fields: \n" + f"{output_structure_serialised}", + }, + ], + } + ] + return system_prompt, messages + + +def prepare_object_detection_prompt( + base64_image: str, + classes: List[str], + **kwargs, +) -> Tuple[Optional[str], List[dict]]: + serialised_classes = ", ".join(classes) + system_prompt = ( + "You act as object-detection model. 
You must provide reasonable predictions. " + "You are only allowed to produce JSON document. " + 'Expected structure of json: {"detections": [{"x_min": 0.1, "y_min": 0.2, "x_max": 0.3, "y_max": 0.4, "class_name": "my-class-X", "confidence": 0.7}]} ' + "- remember to close top-level dictionary at the end. " + "`my-class-X` must be one of the class names defined by user. All coordinates must be in range 0.0-1.0, representing percentage of image dimensions. " + "`confidence` is a value in range 0.0-1.0 representing your confidence in prediction. You should detect all instances of classes provided by user." + ) + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_image, + }, + }, + { + "type": "text", + "text": f"List of all classes to be recognised by model: {serialised_classes}", + }, + ], + } + ] + return system_prompt, messages + + +PROMPT_BUILDERS = { + "unconstrained": prepare_unconstrained_prompt, + "ocr": prepare_ocr_prompt, + "visual-question-answering": prepare_vqa_prompt, + "caption": partial(prepare_caption_prompt, short_description=True), + "detailed-caption": partial(prepare_caption_prompt, short_description=False), + "classification": prepare_classification_prompt, + "multi-label-classification": prepare_multi_label_classification_prompt, + "structured-answering": prepare_structured_answering_prompt, + "object-detection": prepare_object_detection_prompt, +} diff --git a/inference/core/workflows/core_steps/models/foundation/google_gemini/__init__.py b/inference/core/workflows/core_steps/models/foundation/google_gemini/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py b/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py new file mode 100644 index 0000000000..9fb2d6638a --- /dev/null +++ b/inference/core/workflows/core_steps/models/foundation/google_gemini/v1.py @@ -0,0 +1,725 @@ +import base64 +import json +import re +from functools import partial +from typing import Any, Dict, List, Literal, Optional, Type, Union + +import requests +from pydantic import ConfigDict, Field, model_validator +from requests import Response + +from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS +from inference.core.managers.base import ModelManager +from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image +from inference.core.workflows.core_steps.common.utils import run_in_parallel +from inference.core.workflows.execution_engine.entities.base import ( + Batch, + OutputDefinition, + WorkflowImageData, +) +from inference.core.workflows.execution_engine.entities.types import ( + FLOAT_KIND, + LANGUAGE_MODEL_OUTPUT_KIND, + LIST_OF_VALUES_KIND, + STRING_KIND, + ImageInputField, + StepOutputImageSelector, + WorkflowImageSelector, + WorkflowParameterSelector, +) +from inference.core.workflows.prototypes.block import ( + BlockResult, + WorkflowBlock, + WorkflowBlockManifest, +) + +GOOGLE_API_KEY_PATTERN = re.compile(r"key=(.[^&]*)") +GOOGLE_API_KEY_VALUE_GROUP = 1 +MIN_KEY_LENGTH_TO_REVEAL_PREFIX = 8 + +LONG_DESCRIPTION = """ +Ask a question to Google's Gemini model with vision capabilities. 
+ +You can specify arbitrary text prompts or predefined ones, the block supports the following types of prompt: + +- `unconstrained` - any arbitrary prompt you like + +- `ocr`- predefined prompt to recognise text from image + +- `visual-question-answering` - your prompt is supposed to provide question and will be +wrapped into structure that is suited for VQA task + +- `caption` - predefined prompt to generate short caption of the image + +- `detailed-caption` - predefined prompt to generate elaborated caption of the image + +- `classification` - predefined prompt to generate multi-class classification output (that can be parsed +with `VLM as Classifier` block) + +- `multi-label-classification` - predefined prompt to generate multi-label classification output (that +can be parsed with `VLM as Classifier` block) + +- `object-detection` - predefined prompt to generate object detection output (that can be parsed +with `VLM as Detector` block) + +- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser` +block. + +You need to provide your Google AI API key to use the Gemini model. + +**WARNING!** + +This block makes use of `/v1beta` API of Google Gemini model - the implementation may change +in the future, without guarantee of backward compatibility. +""" + +TaskType = Literal[ + "unconstrained", + "ocr", + "visual-question-answering", + "caption", + "detailed-caption", + "classification", + "multi-label-classification", + "structured-answering", + "object-detection", +] + +TASKS_REQUIRING_PROMPT = { + "unconstrained", + "visual-question-answering", +} + +TASKS_REQUIRING_CLASSES = { + "classification", + "multi-label-classification", + "object-detection", +} + +TASKS_REQUIRING_OUTPUT_STRUCTURE = { + "structured-answering", +} + + +class BlockManifest(WorkflowBlockManifest): + model_config = ConfigDict( + json_schema_extra={ + "name": "Google Gemini", + "version": "v1", + "short_description": "Run Google's Gemini model with vision capabilities", + "long_description": LONG_DESCRIPTION, + "license": "Apache-2.0", + "block_type": "model", + "search_keywords": ["LMM", "VLM", "Gemini", "Google"], + "beta": True, + } + ) + type: Literal["roboflow_core/google_gemini@v1"] + images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField + task_type: TaskType = Field( + description="Task type to be performed by model. Value of parameter determine set of fields " + "that are required. For `unconstrained`, `visual-question-answering`, " + " - `prompt` parameter must be provided." + "For `structured-answering` - `output-structure` must be provided. For " + "`classification`, `multi-label-classification` and `object-detection` - " + "`classes` must be filled. 
`ocr`, `caption`, `detailed-caption` do not " + "require any additional parameter.", + ) + prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field( + default=None, + description="Text prompt to the Gemini model", + examples=["my prompt", "$inputs.prompt"], + json_schema_extra={ + "relevant_for": { + "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True}, + }, + }, + ) + output_structure: Optional[Dict[str, str]] = Field( + default=None, + description="Dictionary with structure of expected JSON response", + examples=[{"my_key": "description"}, "$inputs.output_structure"], + json_schema_extra={ + "relevant_for": { + "task_type": {"values": TASKS_REQUIRING_OUTPUT_STRUCTURE, "required": True}, + }, + }, + ) + classes: Optional[ + Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]] + ] = Field( + default=None, + description="List of classes to be used", + examples=[["class-a", "class-b"], "$inputs.classes"], + json_schema_extra={ + "relevant_for": { + "task_type": { + "values": TASKS_REQUIRING_CLASSES, + "required": True, + }, + }, + }, + ) + api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field( + description="Your Google AI API key", + examples=["xxx-xxx", "$inputs.google_api_key"], + private=True, + ) + model_version: Union[ + WorkflowParameterSelector(kind=[STRING_KIND]), + Literal["gemini-1.5-flash", "gemini-1.5-pro"], + ] = Field( + default="gemini-1.5-flash", + description="Model to be used", + examples=["gemini-1.5-flash", "$inputs.gemini_model"], + ) + max_tokens: int = Field( + default=450, + description="Maximum number of tokens the model can generate in its response.", + ) + temperature: Optional[ + Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])] + ] = Field( + default=None, + description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more " + 'random / "creative" the generations are.', + ge=0.0, + le=2.0, + ) + max_concurrent_requests: Optional[int] = Field( + default=None, + description="Number of concurrent requests that can be executed by block when batch of input images provided. " + "If not given - block defaults to value configured globally in Workflows Execution Engine. 
" + "Please restrict if you hit Google Gemini API limits.", + ) + + @model_validator(mode="after") + def validate(self) -> "BlockManifest": + if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None: + raise ValueError( + f"`prompt` parameter required to be set for task `{self.task_type}`" + ) + if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None: + raise ValueError( + f"`classes` parameter required to be set for task `{self.task_type}`" + ) + if ( + self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE + and self.output_structure is None + ): + raise ValueError( + f"`output_structure` parameter required to be set for task `{self.task_type}`" + ) + return self + + @classmethod + def accepts_batch_input(cls) -> bool: + return True + + @classmethod + def describe_outputs(cls) -> List[OutputDefinition]: + return [ + OutputDefinition( + name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND] + ), + OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]), + ] + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + +class GoogleGeminiBlockV1(WorkflowBlock): + + def __init__( + self, + model_manager: ModelManager, + api_key: Optional[str], + ): + self._model_manager = model_manager + self._api_key = api_key + + @classmethod + def get_init_parameters(cls) -> List[str]: + return ["model_manager", "api_key"] + + @classmethod + def get_manifest(cls) -> Type[WorkflowBlockManifest]: + return BlockManifest + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + def run( + self, + images: Batch[WorkflowImageData], + task_type: TaskType, + prompt: Optional[str], + output_structure: Optional[Dict[str, str]], + classes: Optional[List[str]], + api_key: str, + model_version: str, + max_tokens: int, + temperature: Optional[float], + max_concurrent_requests: Optional[int], + ) -> BlockResult: + inference_images = [i.to_inference_format() for i in images] + raw_outputs = run_gemini_prompting( + images=inference_images, + task_type=task_type, + prompt=prompt, + output_structure=output_structure, + classes=classes, + google_api_key=api_key, + model_version=model_version, + max_tokens=max_tokens, + temperature=temperature, + max_concurrent_requests=max_concurrent_requests, + ) + return [ + {"output": raw_output, "classes": classes} for raw_output in raw_outputs + ] + + +def run_gemini_prompting( + images: List[Dict[str, Any]], + task_type: TaskType, + prompt: Optional[str], + output_structure: Optional[Dict[str, str]], + classes: Optional[List[str]], + google_api_key: Optional[str], + model_version: str, + max_tokens: int, + temperature: Optional[float], + max_concurrent_requests: Optional[int], +) -> List[str]: + if task_type not in PROMPT_BUILDERS: + raise ValueError(f"Task type: {task_type} not supported.") + gemini_prompts = [] + for image in images: + loaded_image, _ = load_image(image) + base64_image = base64.b64encode( + encode_image_to_jpeg_bytes(loaded_image) + ).decode("ascii") + prompt = PROMPT_BUILDERS[task_type]( + base64_image=base64_image, + prompt=prompt, + output_structure=output_structure, + classes=classes, + temperature=temperature, + max_tokens=max_tokens, + ) + gemini_prompts.append(prompt) + return execute_gemini_requests( + google_api_key=google_api_key, + gemini_prompts=gemini_prompts, + model_version=model_version, + max_concurrent_requests=max_concurrent_requests, + ) + + +def execute_gemini_requests( + google_api_key: str, + gemini_prompts: 
List[dict], + model_version: str, + max_concurrent_requests: Optional[int], +) -> List[str]: + tasks = [ + partial( + execute_gemini_request, + prompt=prompt, + model_version=model_version, + google_api_key=google_api_key, + ) + for prompt in gemini_prompts + ] + max_workers = ( + max_concurrent_requests + or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS + ) + return run_in_parallel( + tasks=tasks, + max_workers=max_workers, + ) + + +def execute_gemini_request( + prompt: dict, + model_version: str, + google_api_key: str, +) -> str: + response = requests.post( + f"https://generativelanguage.googleapis.com/v1beta/models/{model_version}:generateContent", + headers={ + "Content-Type": "application/json", + }, + params={ + "key": google_api_key, + }, + json=prompt, + ) + response_data = response.json() + google_api_key_safe_raise_for_status(response=response) + return response_data["candidates"][0]["content"]["parts"][0]["text"] + + +def prepare_unconstrained_prompt( + base64_image: str, + prompt: str, + temperature: Optional[float], + max_tokens: int, + **kwargs, +) -> dict: + return { + "contents": { + "parts": [ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": base64_image, + } + }, + { + "text": prompt, + }, + ], + "role": "user", + }, + "generationConfig": prepare_generation_config( + max_tokens=max_tokens, + temperature=temperature, + ), + } + + +def prepare_classification_prompt( + base64_image: str, + classes: List[str], + temperature: Optional[float], + max_tokens: int, + **kwargs, +) -> dict: + serialised_classes = ", ".join(classes) + return { + "systemInstruction": { + "role": "system", + "parts": [ + { + "text": "You act as single-class classification model. You must provide reasonable predictions. " + "You are only allowed to produce JSON document. " + 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. ' + "`class-name` must be one of the class names defined by user. You are only allowed to return " + "single JSON document, even if there are potentially multiple classes. You are not allowed to " + "return list.", + } + ], + }, + "contents": { + "parts": [ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": base64_image, + } + }, + { + "text": f"List of all classes to be recognised by model: {serialised_classes}", + }, + ], + "role": "user", + }, + "generationConfig": prepare_generation_config( + max_tokens=max_tokens, + temperature=temperature, + response_mime_type="application/json", + ), + } + + +def prepare_multi_label_classification_prompt( + base64_image: str, + classes: List[str], + temperature: Optional[float], + max_tokens: int, + **kwargs, +) -> dict: + serialised_classes = ", ".join(classes) + return { + "systemInstruction": { + "role": "system", + "parts": [ + { + "text": "You act as multi-label classification model. You must provide reasonable predictions. " + "You are only allowed to produce JSON document. " + 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, ' + '{"class": "class-name-2", "confidence": 0.7}]}. ' + "`class-name-X` must be one of the class names defined by user and `confidence` is a float value " + "in range 0.0-1.0 that represents how sure you are that the class is present in the image. 
" + "Only return class names that are visible.", + } + ], + }, + "contents": { + "parts": [ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": base64_image, + } + }, + { + "text": f"List of all classes to be recognised by model: {serialised_classes}", + }, + ], + "role": "user", + }, + "generationConfig": prepare_generation_config( + max_tokens=max_tokens, + temperature=temperature, + response_mime_type="application/json", + ), + } + + +def prepare_vqa_prompt( + base64_image: str, + prompt: str, + temperature: Optional[float], + max_tokens: int, + **kwargs, +) -> dict: + return { + "systemInstruction": { + "role": "system", + "parts": [ + { + "text": "You act as Visual Question Answering model. Your task is to provide answer to question" + "submitted by user. If this is open-question - answer with few sentences, for ABCD question, " + "return only the indicator of the answer.", + } + ], + }, + "contents": { + "parts": [ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": base64_image, + } + }, + { + "text": f"Question: {prompt}", + }, + ], + "role": "user", + }, + "generationConfig": prepare_generation_config( + max_tokens=max_tokens, + temperature=temperature, + ), + } + + +def prepare_ocr_prompt( + base64_image: str, + temperature: Optional[float], + max_tokens: int, + **kwargs, +) -> dict: + return { + "systemInstruction": { + "role": "system", + "parts": [ + { + "text": "You act as OCR model. Your task is to read text from the image and return it in " + "paragraphs representing the structure of texts in the image. You should only return " + "recognised text, nothing else.", + } + ], + }, + "contents": { + "parts": [ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": base64_image, + } + } + ], + "role": "user", + }, + "generationConfig": prepare_generation_config( + max_tokens=max_tokens, + temperature=temperature, + ), + } + + +def prepare_caption_prompt( + base64_image: str, + short_description: bool, + temperature: Optional[float], + max_tokens: int, + **kwargs, +) -> dict: + caption_detail_level = "Caption should be short." + if not short_description: + caption_detail_level = "Caption should be extensive." + return { + "systemInstruction": { + "role": "system", + "parts": [ + { + "text": f"You act as image caption model. Your task is to provide description of the image. " + f"{caption_detail_level}", + } + ], + }, + "contents": { + "parts": [ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": base64_image, + } + } + ], + "role": "user", + }, + "generationConfig": prepare_generation_config( + max_tokens=max_tokens, + temperature=temperature, + ), + } + + +def prepare_structured_answering_prompt( + base64_image: str, + output_structure: Dict[str, str], + temperature: Optional[float], + max_tokens: int, + **kwargs, +) -> dict: + output_structure_serialised = json.dumps(output_structure, indent=4) + return { + "systemInstruction": { + "role": "system", + "parts": [ + { + "text": "You are supposed to produce responses in JSON. User is to provide you dictionary with " + "keys and values. Each key must be present in your response. Values in user dictionary " + "represent descriptions for JSON fields to be generated. 
Provide only JSON in response.", + } + ], + }, + "contents": { + "parts": [ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": base64_image, + } + }, + { + "text": f"Specification of requirements regarding output fields: \n" + f"{output_structure_serialised}", + }, + ], + "role": "user", + }, + "generationConfig": prepare_generation_config( + max_tokens=max_tokens, + temperature=temperature, + response_mime_type="application/json", + ), + } + + +def prepare_object_detection_prompt( + base64_image: str, + classes: List[str], + temperature: Optional[float], + max_tokens: int, + **kwargs, +) -> dict: + serialised_classes = ", ".join(classes) + return { + "systemInstruction": { + "role": "system", + "parts": [ + { + "text": "You act as object-detection model. You must provide reasonable predictions. " + "You are only allowed to produce JSON document. " + 'Expected structure of json: {"detections": [{"x_min": 0.1, "y_min": 0.2, "x_max": 0.3, "y_max": 0.4, "class_name": "my-class-X", "confidence": 0.7}]}. ' + "`my-class-X` must be one of the class names defined by user. All coordinates must be in range 0.0-1.0, representing percentage of image dimensions. " + "`confidence` is a value in range 0.0-1.0 representing your confidence in prediction. You should detect all instances of classes provided by user.", + } + ], + }, + "contents": { + "parts": [ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": base64_image, + } + }, + { + "text": f"List of all classes to be recognised by model: {serialised_classes}", + }, + ], + "role": "user", + }, + "generationConfig": prepare_generation_config( + max_tokens=max_tokens, + temperature=temperature, + response_mime_type="application/json", + ), + } + + +def prepare_generation_config( + max_tokens: int, + temperature: Optional[float], + response_mime_type: str = "text/plain", +) -> dict: + result = { + "max_output_tokens": max_tokens, + "response_mime_type": response_mime_type, + "candidate_count": 1, + } + if temperature is not None: + result["temperature"] = temperature + return result + + +def google_api_key_safe_raise_for_status(response: Response) -> None: + request_is_successful = response.status_code < 400 + if request_is_successful: + return None + response.url = GOOGLE_API_KEY_PATTERN.sub(deduct_api_key, response.url) + response.raise_for_status() + + +def deduct_api_key(match: re.Match) -> str: + key_value = match.group(GOOGLE_API_KEY_VALUE_GROUP) + if len(key_value) < MIN_KEY_LENGTH_TO_REVEAL_PREFIX: + return f"key=***" + key_prefix = key_value[:2] + key_postfix = key_value[-2:] + return f"key={key_prefix}***{key_postfix}" + + +PROMPT_BUILDERS = { + "unconstrained": prepare_unconstrained_prompt, + "ocr": prepare_ocr_prompt, + "visual-question-answering": prepare_vqa_prompt, + "caption": partial(prepare_caption_prompt, short_description=True), + "detailed-caption": partial(prepare_caption_prompt, short_description=False), + "classification": prepare_classification_prompt, + "multi-label-classification": prepare_multi_label_classification_prompt, + "structured-answering": prepare_structured_answering_prompt, + "object-detection": prepare_object_detection_prompt, +} diff --git a/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py b/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py index 2c029dd506..3f468a3321 100644 --- a/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py +++ b/inference/core/workflows/core_steps/models/foundation/lmm_classifier/v1.py @@ -64,6 
+64,7 @@ class BlockManifest(WorkflowBlockManifest): "long_description": LONG_DESCRIPTION, "license": "Apache-2.0", "block_type": "model", + "deprecated": True, } ) type: Literal["roboflow_core/lmm_for_classification@v1", "LMMForClassification"] diff --git a/inference/core/workflows/core_steps/models/foundation/openai/v2.py b/inference/core/workflows/core_steps/models/foundation/openai/v2.py new file mode 100644 index 0000000000..406c462121 --- /dev/null +++ b/inference/core/workflows/core_steps/models/foundation/openai/v2.py @@ -0,0 +1,573 @@ +import base64 +import json +from functools import partial +from typing import Any, Dict, List, Literal, Optional, Type, Union + +from openai import OpenAI +from openai._types import NOT_GIVEN +from pydantic import ConfigDict, Field, model_validator + +from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS +from inference.core.managers.base import ModelManager +from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image +from inference.core.workflows.core_steps.common.utils import run_in_parallel +from inference.core.workflows.execution_engine.entities.base import ( + Batch, + OutputDefinition, + WorkflowImageData, +) +from inference.core.workflows.execution_engine.entities.types import ( + FLOAT_KIND, + LANGUAGE_MODEL_OUTPUT_KIND, + LIST_OF_VALUES_KIND, + STRING_KIND, + ImageInputField, + StepOutputImageSelector, + WorkflowImageSelector, + WorkflowParameterSelector, +) +from inference.core.workflows.prototypes.block import ( + BlockResult, + WorkflowBlock, + WorkflowBlockManifest, +) + +LONG_DESCRIPTION = """ +Ask a question to OpenAI's GPT-4 with Vision model. + +You can specify arbitrary text prompts or predefined ones, the block supports the following types of prompt: + +- `unconstrained` - any arbitrary prompt you like + +- `ocr`- predefined prompt to recognise text from image + +- `visual-question-answering` - your prompt is supposed to provide question and will be +wrapped into structure that is suited for VQA task + +- `caption` - predefined prompt to generate short caption of the image + +- `detailed-caption` - predefined prompt to generate elaborated caption of the image + +- `classification` - predefined prompt to generate multi-class classification output (that can be parsed +with `VLM as Classifier` block) + +- `multi-label-classification` - predefined prompt to generate multi-label classification output (that +can be parsed with `VLM as Classifier` block) + +- `structured-answering` - your input defines expected JSON output fields that can be parsed with `JSON Parser` +block. + +You need to provide your OpenAI API key to use the GPT-4 with Vision model. 
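For orientation, a workflow step using this block looks roughly like the sketch below; it is not part of the patch, the field names follow the `BlockManifest` declared in this file, and the concrete values are placeholders.

```python
# Hypothetical step entry for a workflow definition using the open_ai@v2 block
# with a free-form ("unconstrained") prompt; values are illustrative.
example_openai_step = {
    "type": "roboflow_core/open_ai@v2",
    "name": "gpt",
    "images": "$inputs.image",            # image provided as workflow input
    "task_type": "unconstrained",         # free-form prompting, so `prompt` is required
    "prompt": "Describe the image in one sentence",
    "api_key": "$inputs.openai_api_key",  # key passed in as a workflow parameter
    "model_version": "gpt-4o",            # default; "gpt-4o-mini" is also accepted
}
```

The step's raw text is then available under `$steps.gpt.output`; for the structured task types that output is meant to be handed to parser blocks such as `VLM as Classifier` or `JSON Parser`.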
+""" + +TaskType = Literal[ + "unconstrained", + "ocr", + "visual-question-answering", + "caption", + "detailed-caption", + "classification", + "multi-label-classification", + "structured-answering", +] + +TASKS_REQUIRING_PROMPT = { + "unconstrained", + "visual-question-answering", +} + +TASKS_REQUIRING_CLASSES = { + "classification", + "multi-label-classification", +} + +TASKS_REQUIRING_OUTPUT_STRUCTURE = { + "structured-answering", +} + + +class BlockManifest(WorkflowBlockManifest): + model_config = ConfigDict( + json_schema_extra={ + "name": "OpenAI", + "version": "v2", + "short_description": "Run OpenAI's GPT-4 with Vision", + "long_description": LONG_DESCRIPTION, + "license": "Apache-2.0", + "block_type": "model", + "search_keywords": ["LMM", "VLM", "ChatGPT", "GPT", "OpenAI"], + } + ) + type: Literal["roboflow_core/open_ai@v2"] + images: Union[WorkflowImageSelector, StepOutputImageSelector] = ImageInputField + task_type: TaskType = Field( + description="Task type to be performed by model. Value of parameter determine set of fields " + "that are required. For `unconstrained`, `visual-question-answering`, " + " - `prompt` parameter must be provided. " + "For `structured-answering` - `output-structure` must be provided. For " + "`classification`, `multi-label-classification` - " + "`classes` must be filled. `ocr`, `caption`, `detailed-caption` do not " + "require any additional parameter.", + ) + prompt: Optional[Union[WorkflowParameterSelector(kind=[STRING_KIND]), str]] = Field( + default=None, + description="Text prompt to the OpenAI model", + examples=["my prompt", "$inputs.prompt"], + json_schema_extra={ + "relevant_for": { + "task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True}, + }, + }, + ) + output_structure: Optional[Dict[str, str]] = Field( + default=None, + description="Dictionary with structure of expected JSON response", + examples=[{"my_key": "description"}, "$inputs.output_structure"], + json_schema_extra={ + "relevant_for": { + "task_type": {"values": TASKS_REQUIRING_OUTPUT_STRUCTURE, "required": True}, + }, + }, + ) + classes: Optional[ + Union[WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]), List[str]] + ] = Field( + default=None, + description="List of classes to be used", + examples=[["class-a", "class-b"], "$inputs.classes"], + json_schema_extra={ + "relevant_for": { + "task_type": { + "values": TASKS_REQUIRING_CLASSES, + "required": True, + }, + }, + }, + ) + api_key: Union[WorkflowParameterSelector(kind=[STRING_KIND]), str] = Field( + description="Your OpenAI API key", + examples=["xxx-xxx", "$inputs.openai_api_key"], + private=True, + ) + model_version: Union[ + WorkflowParameterSelector(kind=[STRING_KIND]), Literal["gpt-4o", "gpt-4o-mini"] + ] = Field( + default="gpt-4o", + description="Model to be used", + examples=["gpt-4o", "$inputs.openai_model"], + ) + image_detail: Union[ + WorkflowParameterSelector(kind=[STRING_KIND]), Literal["auto", "high", "low"] + ] = Field( + default="auto", + description="Indicates the image's quality, with 'high' suggesting it is of high resolution and should be processed or displayed with high fidelity.", + examples=["auto", "high", "low"], + ) + max_tokens: int = Field( + default=450, + description="Maximum number of tokens the model can generate in its response.", + ) + temperature: Optional[ + Union[float, WorkflowParameterSelector(kind=[FLOAT_KIND])] + ] = Field( + default=None, + description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more " + 'random / "creative" the 
generations are.', + ge=0.0, + le=2.0, + ) + max_concurrent_requests: Optional[int] = Field( + default=None, + description="Number of concurrent requests that can be executed by block when batch of input images provided. " + "If not given - block defaults to value configured globally in Workflows Execution Engine. " + "Please restrict if you hit OpenAI limits.", + ) + + @model_validator(mode="after") + def validate(self) -> "BlockManifest": + if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None: + raise ValueError( + f"`prompt` parameter required to be set for task `{self.task_type}`" + ) + if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None: + raise ValueError( + f"`classes` parameter required to be set for task `{self.task_type}`" + ) + if ( + self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE + and self.output_structure is None + ): + raise ValueError( + f"`output_structure` parameter required to be set for task `{self.task_type}`" + ) + return self + + @classmethod + def accepts_batch_input(cls) -> bool: + return True + + @classmethod + def describe_outputs(cls) -> List[OutputDefinition]: + return [ + OutputDefinition( + name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND] + ), + OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]), + ] + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + +class OpenAIBlockV2(WorkflowBlock): + + def __init__( + self, + model_manager: ModelManager, + api_key: Optional[str], + ): + self._model_manager = model_manager + self._api_key = api_key + + @classmethod + def get_init_parameters(cls) -> List[str]: + return ["model_manager", "api_key"] + + @classmethod + def get_manifest(cls) -> Type[WorkflowBlockManifest]: + return BlockManifest + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.0.0,<2.0.0" + + def run( + self, + images: Batch[WorkflowImageData], + task_type: TaskType, + prompt: Optional[str], + output_structure: Optional[Dict[str, str]], + classes: Optional[List[str]], + api_key: str, + model_version: str, + image_detail: Literal["low", "high", "auto"], + max_tokens: int, + temperature: Optional[float], + max_concurrent_requests: Optional[int], + ) -> BlockResult: + inference_images = [i.to_inference_format() for i in images] + raw_outputs = run_gpt_4v_llm_prompting( + images=inference_images, + task_type=task_type, + prompt=prompt, + output_structure=output_structure, + classes=classes, + openai_api_key=api_key, + gpt_model_version=model_version, + gpt_image_detail=image_detail, + max_tokens=max_tokens, + temperature=temperature, + max_concurrent_requests=max_concurrent_requests, + ) + return [ + {"output": raw_output, "classes": classes} for raw_output in raw_outputs + ] + + +def run_gpt_4v_llm_prompting( + images: List[Dict[str, Any]], + task_type: TaskType, + prompt: Optional[str], + output_structure: Optional[Dict[str, str]], + classes: Optional[List[str]], + openai_api_key: Optional[str], + gpt_model_version: str, + gpt_image_detail: Literal["auto", "high", "low"], + max_tokens: int, + temperature: Optional[int], + max_concurrent_requests: Optional[int], +) -> List[str]: + if task_type not in PROMPT_BUILDERS: + raise ValueError(f"Task type: {task_type} not supported.") + gpt4_prompts = [] + for image in images: + loaded_image, _ = load_image(image) + base64_image = base64.b64encode( + encode_image_to_jpeg_bytes(loaded_image) + ).decode("ascii") + prompt = PROMPT_BUILDERS[task_type]( + 
base64_image=base64_image, + prompt=prompt, + output_structure=output_structure, + classes=classes, + gpt_image_detail=gpt_image_detail, + ) + gpt4_prompts.append(prompt) + return execute_gpt_4v_requests( + openai_api_key=openai_api_key, + gpt4_prompts=gpt4_prompts, + gpt_model_version=gpt_model_version, + max_tokens=max_tokens, + temperature=temperature, + max_concurrent_requests=max_concurrent_requests, + ) + + +def execute_gpt_4v_requests( + openai_api_key: str, + gpt4_prompts: List[List[dict]], + gpt_model_version: str, + max_tokens: int, + temperature: Optional[float], + max_concurrent_requests: Optional[int], +) -> List[str]: + client = OpenAI(api_key=openai_api_key) + tasks = [ + partial( + execute_gpt_4v_request, + client=client, + prompt=prompt, + gpt_model_version=gpt_model_version, + max_tokens=max_tokens, + temperature=temperature, + ) + for prompt in gpt4_prompts + ] + max_workers = ( + max_concurrent_requests + or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS + ) + return run_in_parallel( + tasks=tasks, + max_workers=max_workers, + ) + + +def execute_gpt_4v_request( + client: OpenAI, + prompt: List[dict], + gpt_model_version: str, + max_tokens: int, + temperature: Optional[float], +) -> str: + if temperature is None: + temperature = NOT_GIVEN + response = client.chat.completions.create( + model=gpt_model_version, + messages=prompt, + max_tokens=max_tokens, + temperature=temperature, + ) + return response.choices[0].message.content + + +def prepare_unconstrained_prompt( + base64_image: str, + prompt: str, + gpt_image_detail: str, + **kwargs, +) -> List[dict]: + return [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": gpt_image_detail, + }, + }, + ], + } + ] + + +def prepare_classification_prompt( + base64_image: str, classes: List[str], gpt_image_detail: str, **kwargs +) -> List[dict]: + serialised_classes = ", ".join(classes) + return [ + { + "role": "system", + "content": "You act as single-class classification model. You must provide reasonable predictions. " + "You are only allowed to produce JSON document in Markdown ```json [...]``` markers. " + 'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. ' + "`class-name` must be one of the class names defined by user. You are only allowed to return " + "single JSON document, even if there are potentially multiple classes. You are not allowed to return list.", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"List of all classes to be recognised by model: {serialised_classes}", + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": gpt_image_detail, + }, + }, + ], + }, + ] + + +def prepare_multi_label_classification_prompt( + base64_image: str, classes: List[str], gpt_image_detail: str, **kwargs +) -> List[dict]: + serialised_classes = ", ".join(classes) + return [ + { + "role": "system", + "content": "You act as multi-label classification model. You must provide reasonable predictions. " + "You are only allowed to produce JSON document in Markdown ```json [...]``` markers. " + 'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, ' + '{"class": "class-name-2", "confidence": 0.7}]}. 
' + "`class-name-X` must be one of the class names defined by user and `confidence` is a float value in range " + "0.0-1.0 that represent how sure you are that the class is present in the image. Only return class names " + "that are visible.", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"List of all classes to be recognised by model: {serialised_classes}", + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": gpt_image_detail, + }, + }, + ], + }, + ] + + +def prepare_vqa_prompt( + base64_image: str, prompt: str, gpt_image_detail: str, **kwargs +) -> List[dict]: + return [ + { + "role": "system", + "content": "You act as Visual Question Answering model. Your task is to provide answer to question" + "submitted by user. If this is open-question - answer with few sentences, for ABCD question, " + "return only the indicator of the answer.", + }, + { + "role": "user", + "content": [ + {"type": "text", "text": f"Question: {prompt}"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": gpt_image_detail, + }, + }, + ], + }, + ] + + +def prepare_ocr_prompt( + base64_image: str, gpt_image_detail: str, **kwargs +) -> List[dict]: + return [ + { + "role": "system", + "content": "You act as OCR model. Your task is to read text from the image and return it in " + "paragraphs representing the structure of texts in the image. You should only return " + "recognised text, nothing else.", + }, + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": gpt_image_detail, + }, + }, + ], + }, + ] + + +def prepare_caption_prompt( + base64_image: str, gpt_image_detail: str, short_description: bool, **kwargs +) -> List[dict]: + caption_detail_level = "Caption should be short." + if not short_description: + caption_detail_level = "Caption should be extensive." + return [ + { + "role": "system", + "content": f"You act as image caption model. Your task is to provide description of the image. " + f"{caption_detail_level}", + }, + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": gpt_image_detail, + }, + }, + ], + }, + ] + + +def prepare_structured_answering_prompt( + base64_image: str, output_structure: Dict[str, str], gpt_image_detail: str, **kwargs +) -> List[dict]: + output_structure_serialised = json.dumps(output_structure, indent=4) + return [ + { + "role": "system", + "content": "You are supposed to produce responses in JSON wrapped in Markdown markers: " + "```json\nyour-response\n```. User is to provide you dictionary with keys and values. " + "Each key must be present in your response. Values in user dictionary represent " + "descriptions for JSON fields to be generated. 
Provide only JSON Markdown in response.", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Specification of requirements regarding output fields: \n" + f"{output_structure_serialised}", + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": gpt_image_detail, + }, + }, + ], + }, + ] + + +PROMPT_BUILDERS = { + "unconstrained": prepare_unconstrained_prompt, + "ocr": prepare_ocr_prompt, + "visual-question-answering": prepare_vqa_prompt, + "caption": partial(prepare_caption_prompt, short_description=True), + "detailed-caption": partial(prepare_caption_prompt, short_description=False), + "classification": prepare_classification_prompt, + "multi-label-classification": prepare_multi_label_classification_prompt, + "structured-answering": prepare_structured_answering_prompt, +} diff --git a/inference/core/workflows/execution_engine/entities/types.py b/inference/core/workflows/execution_engine/entities/types.py index bbca551054..286031e4e0 100644 --- a/inference/core/workflows/execution_engine/entities/types.py +++ b/inference/core/workflows/execution_engine/entities/types.py @@ -136,7 +136,7 @@ def __hash__(self) -> int: """ LIST_OF_VALUES_KIND = Kind( name="list_of_values", - description="List of values of any types", + description="List of values of any type", docs=LIST_OF_VALUES_KIND_DOCS, ) @@ -292,7 +292,7 @@ def __hash__(self) -> int: """ CLASSIFICATION_PREDICTION_KIND = Kind( name="classification_prediction", - description="`'predictions'` key from Classification Model output", + description="Predictions from classifier", docs=CLASSIFICATION_PREDICTION_KIND_DOCS, ) @@ -374,9 +374,75 @@ def __hash__(self) -> int: confidence=array([ 0.84955, 0.74344, 0.45636, 0.86537]), class_id=array([2, 7, 2, 0]), tracker_id=None, - data={'class_name': array(['car', 'truck', 'car', 'car'], dtype=' int: confidence=array([ 0.95898]), class_id=array([6]), tracker_id=None, - data={'class_name': array(['G'], dtype=' int: tracker_id=None, data={ 'class_name': array(['G'], dtype=' int: docs=KEYPOINT_DETECTION_PREDICTION_KIND_DOCS, ) -QR_CODE_DETECTION_KIND_DOCS = f""" +QR_CODE_DETECTION_KIND_DOCS = """ This kind represents batch of predictions regarding QR codes location and data their provide. 
Example: ``` -# Each prediction in batch is list of dictionaries that contains detected QR codes (detections) and their metadata -[ - [ - {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, - {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, ], - [ - {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, - {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "qr_code", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, - ] -] +sv.Detections( + xyxy=array([ + [ 865, 153.5, 1189, 422.5], + [ 192.5, 77.5, 995.5, 722.5], + [ 194, 82, 996, 726], + [ 460, 333, 704, 389]] + ), + mask=None, + confidence=array([ 1.0, 1.0, 1.0, 1.0]), + class_id=array([2, 7, 2, 0]), + tracker_id=None, + data={ + 'class_name': array(['qr_code', 'qr_code', 'qr_code', 'qr_code'], dtype=' int: docs=QR_CODE_DETECTION_KIND_DOCS, ) -BAR_CODE_DETECTION_KIND_DOCS = f""" +BAR_CODE_DETECTION_KIND_DOCS = """ This kind represents batch of predictions regarding barcodes location and data their provide. Example: ``` -# Each prediction in batch is list of dictionaries that contains detected barcodes (detections) and their metadata -[ - [ - {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, - {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, ], - [ - {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, - {{"x": 300, "y": 400, "width": 100, "height" 50, "confidence": 1.0, "class": "barcode", "class_id": 0.1, "detection_id": "random-uuid", "data": ""}}, - ] -] +sv.Detections( + xyxy=array([ + [ 865, 153.5, 1189, 422.5], + [ 192.5, 77.5, 995.5, 722.5], + [ 194, 82, 996, 726], + [ 460, 333, 704, 389]] + ), + mask=None, + confidence=array([ 1.0, 1.0, 1.0, 1.0]), + class_id=array([2, 7, 2, 0]), + tracker_id=None, + data={ + 'class_name': array(['barcode', 'barcode', 'barcode', 'barcode'], dtype=' int: ) +LANGUAGE_MODEL_OUTPUT_KIND_DOCS = """ +This kind represent output generated by language model. It is Python string, which can be processed +by blocks transforming LLMs / VLMs output into structured form. + +Examples: +``` +{"predicted_class": "car", "confidence": 0.7} # which is example JSON with classification prediction +"The is A." 
# which is example unstructured generation for VQA task +``` +""" + +LANGUAGE_MODEL_OUTPUT_KIND = Kind( + name="language_model_output", + description="LLM / VLM output", + docs=LANGUAGE_MODEL_OUTPUT_KIND_DOCS, +) + STEP_AS_SELECTED_ELEMENT = "step" STEP_OUTPUT_AS_SELECTED_ELEMENT = "step_output" diff --git a/requirements/_requirements.txt b/requirements/_requirements.txt index 834247e30d..e781f29f1c 100644 --- a/requirements/_requirements.txt +++ b/requirements/_requirements.txt @@ -29,3 +29,4 @@ pydot>=2.0.0 shapely>=2.0.0,<2.1.0 tldextract~=5.1.2 packaging~=24.0 +anthropic~=0.34.2 \ No newline at end of file diff --git a/tests/inference/hosted_platform_tests/conftest.py b/tests/inference/hosted_platform_tests/conftest.py index faf859864f..95689e7823 100644 --- a/tests/inference/hosted_platform_tests/conftest.py +++ b/tests/inference/hosted_platform_tests/conftest.py @@ -79,6 +79,8 @@ class PlatformEnvironment(Enum): ROBOFLOW_API_KEY = os.environ["HOSTED_PLATFORM_TESTS_API_KEY"] OPENAI_KEY = os.getenv("OPENAI_KEY") +GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") +ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") @pytest.fixture(scope="session") diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py new file mode 100644 index 0000000000..efaac597bc --- /dev/null +++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_claude.py @@ -0,0 +1,242 @@ +import numpy as np +import pytest + +from inference_sdk import InferenceHTTPClient +from tests.inference.hosted_platform_tests.conftest import ( + ANTHROPIC_API_KEY, + ROBOFLOW_API_KEY, +) + +CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", + "vlm_output": "$steps.claude.output", + "classes": "$steps.claude.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "claude_result", + "selector": "$steps.claude.output", + }, + { + "type": "JsonField", + "name": "top_class", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Antropic API key provided") +@pytest.mark.flaky(retries=4, delay=1) +def test_classification_workflow( + object_detection_service_url: str, + dogs_image: np.ndarray, +) -> None: + client = InferenceHTTPClient( + api_url=object_detection_service_url, + api_key=ROBOFLOW_API_KEY, + ) + + # when + result = client.run_workflow( + specification=CLASSIFICATION_WORKFLOW, + images={ + "image": dogs_image, + }, + parameters={ + "api_key": ANTHROPIC_API_KEY, + "classes": ["cat", "dog"], + }, + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + 
"claude_result", + "top_class", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["claude_result"], str) + and len(result[0]["claude_result"]) > 0 + ), "Expected non-empty string generated" + assert result[0]["top_class"] == "dog" + assert result[0]["parsed_prediction"]["error_status"] is False + + +STRUCTURED_PROMPTING_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "structured-answering", + "output_structure": { + "dogs_count": "count of dogs instances in the image", + "cats_count": "count of cats instances in the image", + }, + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/json_parser@v1", + "name": "parser", + "raw_json": "$steps.claude.output", + "expected_fields": ["dogs_count", "cats_count"], + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "property_definition", + "operations": [{"type": "ToString"}], + "data": "$steps.parser.dogs_count", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.property_definition.output", + } + ], +} + + +@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Antropic API key provided") +@pytest.mark.flaky(retries=4, delay=1) +def test_structured_parsing_workflow( + object_detection_service_url: str, + dogs_image: np.ndarray, +) -> None: + client = InferenceHTTPClient( + api_url=object_detection_service_url, + api_key=ROBOFLOW_API_KEY, + ) + + # when + result = client.run_workflow( + specification=STRUCTURED_PROMPTING_WORKFLOW, + images={ + "image": dogs_image, + }, + parameters={ + "api_key": ANTHROPIC_API_KEY, + }, + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert result[0]["result"] == "2" + + +OBJECT_DETECTION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "object-detection", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_detector@v1", + "name": "parser", + "vlm_output": "$steps.claude.output", + "image": "$inputs.image", + "classes": "$steps.claude.classes", + "model_type": "anthropic-claude", + "task_type": "object-detection", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "claude_result", + "selector": "$steps.claude.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.predictions", + }, + ], +} + + +@pytest.mark.skipif(ANTHROPIC_API_KEY is None, reason="No Antropic API key provided") +@pytest.mark.flaky(retries=4, delay=1) +def test_object_detection_workflow( + object_detection_service_url: str, + dogs_image: np.ndarray, +) -> None: + client = InferenceHTTPClient( + api_url=object_detection_service_url, + api_key=ROBOFLOW_API_KEY, + ) + + # when + result = client.run_workflow( + specification=OBJECT_DETECTION_WORKFLOW, + images={ + "image": dogs_image, + }, + parameters={ + "api_key": ANTHROPIC_API_KEY, + "classes": ["cat", "dog"], + }, + ) + + # then + assert 
len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "claude_result", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert [e["class"] for e in result[0]["parsed_prediction"]["predictions"]] == [ + "dog", + "dog", + ], "Expected 2 dogs to be detected" diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py new file mode 100644 index 0000000000..37044e862c --- /dev/null +++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_gemini.py @@ -0,0 +1,242 @@ +import numpy as np +import pytest + +from inference_sdk import InferenceHTTPClient +from tests.inference.hosted_platform_tests.conftest import ( + GOOGLE_API_KEY, + ROBOFLOW_API_KEY, +) + +CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", + "vlm_output": "$steps.gemini.output", + "classes": "$steps.gemini.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "gemini_result", + "selector": "$steps.gemini.output", + }, + { + "type": "JsonField", + "name": "top_class", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided") +@pytest.mark.flaky(retries=4, delay=1) +def test_classification_workflow( + object_detection_service_url: str, + dogs_image: np.ndarray, +) -> None: + client = InferenceHTTPClient( + api_url=object_detection_service_url, + api_key=ROBOFLOW_API_KEY, + ) + + # when + result = client.run_workflow( + specification=CLASSIFICATION_WORKFLOW, + images={ + "image": dogs_image, + }, + parameters={ + "api_key": GOOGLE_API_KEY, + "classes": ["cat", "dog"], + }, + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "gemini_result", + "top_class", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["gemini_result"], str) + and len(result[0]["gemini_result"]) > 0 + ), "Expected non-empty string generated" + assert result[0]["top_class"] == "dog" + assert result[0]["parsed_prediction"]["error_status"] is False + + +STRUCTURED_PROMPTING_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "structured-answering", + "output_structure": { + "dogs_count": "count of dogs instances in the image", + "cats_count": "count of cats instances in the image", + }, + "api_key": "$inputs.api_key", + }, + { + "type": 
"roboflow_core/json_parser@v1", + "name": "parser", + "raw_json": "$steps.gemini.output", + "expected_fields": ["dogs_count", "cats_count"], + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "property_definition", + "operations": [{"type": "ToString"}], + "data": "$steps.parser.dogs_count", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.property_definition.output", + } + ], +} + + +@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided") +@pytest.mark.flaky(retries=4, delay=1) +def test_structured_parsing_workflow( + object_detection_service_url: str, + dogs_image: np.ndarray, +) -> None: + client = InferenceHTTPClient( + api_url=object_detection_service_url, + api_key=ROBOFLOW_API_KEY, + ) + + # when + result = client.run_workflow( + specification=STRUCTURED_PROMPTING_WORKFLOW, + images={ + "image": dogs_image, + }, + parameters={ + "api_key": GOOGLE_API_KEY, + }, + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert result[0]["result"] == "2" + + +OBJECT_DETECTION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "object-detection", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_detector@v1", + "name": "parser", + "vlm_output": "$steps.gemini.output", + "image": "$inputs.image", + "classes": "$steps.gemini.classes", + "model_type": "google-gemini", + "task_type": "object-detection", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "gemini_result", + "selector": "$steps.gemini.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.predictions", + }, + ], +} + + +@pytest.mark.skipif(GOOGLE_API_KEY is None, reason="No Google API key provided") +@pytest.mark.flaky(retries=4, delay=1) +def test_object_detection_workflow( + object_detection_service_url: str, + dogs_image: np.ndarray, +) -> None: + client = InferenceHTTPClient( + api_url=object_detection_service_url, + api_key=ROBOFLOW_API_KEY, + ) + + # when + result = client.run_workflow( + specification=OBJECT_DETECTION_WORKFLOW, + images={ + "image": dogs_image, + }, + parameters={ + "api_key": GOOGLE_API_KEY, + "classes": ["cat", "dog"], + }, + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "gemini_result", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert [e["class"] for e in result[0]["parsed_prediction"]["predictions"]] == [ + "dog", + "dog", + ], "Expected 2 dogs to be detected" diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py index 66ee23cdf5..aaa10f3564 100644 --- a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py +++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_openai.py @@ -95,3 +95,161 @@ def test_image_description_workflow( detection_confidences, [0.857235848903656, 0.5132315158843994], atol=1e-4 ), "Expected predictions to 
match what was observed while test creation" assert len(result[0]["description"]) > 0, "Expected some description" + + +CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", + "vlm_output": "$steps.gpt.output", + "classes": "$steps.gpt.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "gpt_result", + "selector": "$steps.gpt.output", + }, + { + "type": "JsonField", + "name": "top_class", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@pytest.mark.skipif(OPENAI_KEY is None, reason="No OpenAI API key provided") +@pytest.mark.flaky(retries=4, delay=1) +def test_classification_workflow( + object_detection_service_url: str, + dogs_image: np.ndarray, +) -> None: + client = InferenceHTTPClient( + api_url=object_detection_service_url, + api_key=ROBOFLOW_API_KEY, + ) + + # when + result = client.run_workflow( + specification=CLASSIFICATION_WORKFLOW, + images={ + "image": dogs_image, + }, + parameters={ + "api_key": OPENAI_KEY, + "classes": ["cat", "dog"], + }, + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "gpt_result", + "top_class", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["gpt_result"], str) and len(result[0]["gpt_result"]) > 0 + ), "Expected non-empty string generated" + assert result[0]["top_class"] == "dog" + assert result[0]["parsed_prediction"]["error_status"] is False + + +STRUCTURED_PROMPTING_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "structured-answering", + "output_structure": { + "dogs_count": "count of dogs instances in the image", + "cats_count": "count of cats instances in the image", + }, + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/json_parser@v1", + "name": "parser", + "raw_json": "$steps.gpt.output", + "expected_fields": ["dogs_count", "cats_count"], + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "property_definition", + "operations": [{"type": "ToString"}], + "data": "$steps.parser.dogs_count", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.property_definition.output", + } + ], +} + + +@pytest.mark.skipif(OPENAI_KEY is None, reason="No OpenAI API key provided") +@pytest.mark.flaky(retries=4, delay=1) +def test_structured_prompting_workflow( + object_detection_service_url: str, + dogs_image: np.ndarray, +) -> None: + client = InferenceHTTPClient( + api_url=object_detection_service_url, + api_key=ROBOFLOW_API_KEY, + ) + + # when + result = 
client.run_workflow( + specification=STRUCTURED_PROMPTING_WORKFLOW, + images={ + "image": dogs_image, + }, + parameters={ + "api_key": OPENAI_KEY, + }, + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert result[0]["result"] == "2" diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py new file mode 100644 index 0000000000..4e244e6b87 --- /dev/null +++ b/tests/workflows/integration_tests/execution/test_workflow_with_claude_models.py @@ -0,0 +1,675 @@ +""" +This test module requires Anthropic AI API key passed via env variable WORKFLOWS_TEST_ANTHROPIC_API_KEY. +This is supposed to be used only locally, as that would be too much of a cost in CI +""" + +import os + +import numpy as np +import pytest + +from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS +from inference.core.managers.base import ModelManager +from inference.core.workflows.core_steps.common.entities import StepExecutionMode +from inference.core.workflows.execution_engine.core import ExecutionEngine +from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import ( + add_to_workflows_gallery, +) + +ANTHROPIC_API_KEY = os.getenv("WORKFLOWS_TEST_ANTHROPIC_API_KEY") + +UNCONSTRAINED_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "unconstrained", + "prompt": "Give me dominant color of the image", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.claude.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Prompting Anthropic Claude with arbitrary prompt", + use_case_description=""" +In this example, Anthropic Claude model is prompted with arbitrary text from user + """, + workflow_definition=UNCONSTRAINED_WORKFLOW, + workflow_name_in_app="claude-arbitrary-prompt", +) +@pytest.mark.skipif( + condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided" +) +def test_workflow_with_unconstrained_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=UNCONSTRAINED_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": ANTHROPIC_API_KEY, + "prompt": "What is the topic of the image?", + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +OCR_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": 
"roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "ocr", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.claude.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Anthropic Claude as OCR model", + use_case_description=""" +In this example, Anthropic Claude model is used as OCR system. User just points task type and do not need to provide +any prompt. + """, + workflow_definition=OCR_WORKFLOW, + workflow_name_in_app="claude-ocr", +) +@pytest.mark.skipif( + condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided" +) +def test_workflow_with_ocr_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=OCR_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": ANTHROPIC_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +VQA_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "prompt"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "visual-question-answering", + "prompt": "$inputs.prompt", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.claude.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Anthropic Claude as Visual Question Answering system", + use_case_description=""" +In this example, Anthropic Claude model is used as VQA system. User provides question via prompt. 
+ """, + workflow_definition=VQA_WORKFLOW, + workflow_name_in_app="claude-vqa", +) +@pytest.mark.skipif( + condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided" +) +def test_workflow_with_vqa_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=VQA_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": ANTHROPIC_API_KEY, + "prompt": "What are the brands of the cars?", + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +CAPTION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "caption", + "api_key": "$inputs.api_key", + "temperature": 1.0, + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.claude.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Anthropic Claude as Image Captioning system", + use_case_description=""" +In this example, Anthropic Claude model is used as Image Captioning system. 
+ """, + workflow_definition=CAPTION_WORKFLOW, + workflow_name_in_app="claude-captioning", +) +@pytest.mark.skipif( + condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided" +) +def test_workflow_with_captioning_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CAPTION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": ANTHROPIC_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", + "vlm_output": "$steps.claude.output", + "classes": "$steps.claude.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "claude_result", + "selector": "$steps.claude.output", + }, + { + "type": "JsonField", + "name": "top_class", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Anthropic Claude as multi-class classifier", + use_case_description=""" +In this example, Anthropic Claude model is used as classifier. Output from the model is parsed by +special `roboflow_core/vlm_as_classifier@v1` block which turns model output text into +full-blown prediction, which can later be used by other blocks compatible with +classification predictions - in this case we extract top-class property. 
+ """, + workflow_definition=CLASSIFICATION_WORKFLOW, + workflow_name_in_app="claude-multi-class-classifier", +) +@pytest.mark.skipif( + condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided" +) +def test_workflow_with_multi_class_classifier_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CLASSIFICATION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": ANTHROPIC_API_KEY, + "classes": ["cat", "dog"], + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "claude_result", + "top_class", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["claude_result"], str) + and len(result[0]["claude_result"]) > 0 + ), "Expected non-empty string generated" + assert result[0]["top_class"] == "dog" + assert result[0]["parsed_prediction"]["error_status"] is False + + +MULTI_LABEL_CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "multi-label-classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", # requires image input to construct valid output compatible with "inference" + "vlm_output": "$steps.claude.output", + "classes": "$steps.claude.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Anthropic Claude as multi-label classifier", + use_case_description=""" +In this example, Anthropic Claude model is used as multi-label classifier. Output from the model is parsed by +special `roboflow_core/vlm_as_classifier@v1` block which turns model output text into +full-blown prediction, which can later be used by other blocks compatible with +classification predictions - in this case we extract top-class property. 
+ """, + workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW, + workflow_name_in_app="claude-multi-label-classifier", +) +@pytest.mark.skipif( + condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided" +) +def test_workflow_with_multi_label_classifier_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": ANTHROPIC_API_KEY, + "classes": ["cat", "dog"], + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "result", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert result[0]["result"] == ["dog"] + assert result[0]["parsed_prediction"]["error_status"] is False + + +STRUCTURED_PROMPTING_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "structured-answering", + "output_structure": { + "dogs_count": "count of dogs instances in the image", + "cats_count": "count of cats instances in the image", + }, + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/json_parser@v1", + "name": "parser", + "raw_json": "$steps.claude.output", + "expected_fields": ["dogs_count", "cats_count"], + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "property_definition", + "operations": [{"type": "ToString"}], + "data": "$steps.parser.dogs_count", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.property_definition.output", + } + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Anthropic Claude to provide structured JSON", + use_case_description=""" +In this example, Anthropic Claude model is expected to provide structured output in JSON, which can later be +parsed by dedicated `roboflow_core/json_parser@v1` block which transforms string into dictionary +and expose it's keys to other blocks for further processing. In this case, parsed output is +transformed using `roboflow_core/property_definition@v1` block. 
+ """, + workflow_definition=STRUCTURED_PROMPTING_WORKFLOW, + workflow_name_in_app="claude-structured-prompting", +) +@pytest.mark.skipif( + condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided" +) +def test_workflow_with_structured_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=STRUCTURED_PROMPTING_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": ANTHROPIC_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert result[0]["result"] == "2" + + +OBJECT_DETECTION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/anthropic_claude@v1", + "name": "claude", + "images": "$inputs.image", + "task_type": "object-detection", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_detector@v1", + "name": "parser", + "vlm_output": "$steps.claude.output", + "image": "$inputs.image", + "classes": "$steps.claude.classes", + "model_type": "anthropic-claude", + "task_type": "object-detection", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "claude_result", + "selector": "$steps.claude.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.predictions", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Anthropic Claude as object-detection model", + use_case_description=""" +In this example, Anthropic Claude model is expected to provide output, which can later be +parsed by dedicated `roboflow_core/vlm_as_detector@v1` block which transforms string into `sv.Detections`, +which can later be used by other blocks processing object-detection predictions. 
+ """, + workflow_definition=OBJECT_DETECTION_WORKFLOW, + workflow_name_in_app="claude-object-detection", +) +@pytest.mark.skipif( + condition=ANTHROPIC_API_KEY is None, reason="Anthropic API key not provided" +) +def test_workflow_with_object_detection_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=OBJECT_DETECTION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": ANTHROPIC_API_KEY, + "classes": ["cat", "dog"], + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "claude_result", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert result[0]["parsed_prediction"].data["class_name"].tolist() == [ + "dog", + "dog", + ], "Expected 2 dogs to be detected" diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py b/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py index e071d73974..6916fe8d40 100644 --- a/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py +++ b/tests/workflows/integration_tests/execution/test_workflow_with_detections_consensus_block.py @@ -15,7 +15,11 @@ "version": "1.0", "inputs": [ {"type": "WorkflowImage", "name": "image"}, - {"type": "WorkflowParameter", "name": "model_id"}, + { + "type": "WorkflowParameter", + "name": "model_id", + "default_value": "yolov8n-640", + }, ], "steps": [ { @@ -228,31 +232,6 @@ def test_consensus_workflow_when_confidence_is_restricted_by_input_parameter( ), "Expected confidences to match what was validated manually as workflow outcome" -def test_consensus_workflow_when_model_id_not_provided_in_input( - model_manager: ModelManager, - crowd_image: np.ndarray, -) -> None: - # given - workflow_init_parameters = { - "workflows_core.model_manager": model_manager, - "workflows_core.api_key": None, - "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, - } - execution_engine = ExecutionEngine.init( - workflow_definition=CONSENSUS_WORKFLOW, - init_parameters=workflow_init_parameters, - max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, - ) - - # when - with pytest.raises(RuntimeInputError): - _ = execution_engine.run( - runtime_parameters={ - "image": crowd_image, - } - ) - - def test_consensus_workflow_when_image_not_provided_in_input( model_manager: ModelManager, ) -> None: diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py new file mode 100644 index 0000000000..97943e1a3e --- /dev/null +++ b/tests/workflows/integration_tests/execution/test_workflow_with_gemini_models.py @@ -0,0 +1,675 @@ +""" +This test module requires Google AI API key passed via env variable WORKFLOWS_TEST_GOOGLE_API_KEY. 
+This is supposed to be used only locally, as that would be too much of a cost in CI +""" + +import os + +import numpy as np +import pytest + +from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS +from inference.core.managers.base import ModelManager +from inference.core.workflows.core_steps.common.entities import StepExecutionMode +from inference.core.workflows.execution_engine.core import ExecutionEngine +from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import ( + add_to_workflows_gallery, +) + +GOOGLE_API_KEY = os.getenv("WORKFLOWS_TEST_GOOGLE_API_KEY") + +UNCONSTRAINED_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "unconstrained", + "prompt": "Give me dominant color of the image", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.gemini.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Prompting Google's Gemini with arbitrary prompt", + use_case_description=""" +In this example, Google's Gemini model is prompted with arbitrary text from user + """, + workflow_definition=UNCONSTRAINED_WORKFLOW, + workflow_name_in_app="gemini-arbitrary-prompt", +) +@pytest.mark.skipif( + condition=GOOGLE_API_KEY is None, reason="Google API key not provided" +) +def test_workflow_with_unconstrained_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=UNCONSTRAINED_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": GOOGLE_API_KEY, + "prompt": "What is the topic of the image?", + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +OCR_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "ocr", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.gemini.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Google's Gemini as OCR model", + use_case_description=""" +In this example, Google's Gemini model is used as OCR system. User just points task type and do not need to provide +any prompt. 
+ """, + workflow_definition=OCR_WORKFLOW, + workflow_name_in_app="gemini-ocr", +) +@pytest.mark.skipif( + condition=GOOGLE_API_KEY is None, reason="Google API key not provided" +) +def test_workflow_with_ocr_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=OCR_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": GOOGLE_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +VQA_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "prompt"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "visual-question-answering", + "prompt": "$inputs.prompt", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.gemini.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Google's Gemini as Visual Question Answering system", + use_case_description=""" +In this example, Google's Gemini model is used as VQA system. User provides question via prompt. 
+ """, + workflow_definition=VQA_WORKFLOW, + workflow_name_in_app="gemini-vqa", +) +@pytest.mark.skipif( + condition=GOOGLE_API_KEY is None, reason="Google API key not provided" +) +def test_workflow_with_vqa_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=VQA_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": GOOGLE_API_KEY, + "prompt": "What are the brands of the cars?", + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +CAPTION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "caption", + "api_key": "$inputs.api_key", + "temperature": 1.0, + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.gemini.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Google's Gemini as Image Captioning system", + use_case_description=""" +In this example, Google's Gemini model is used as Image Captioning system. 
+ """, + workflow_definition=CAPTION_WORKFLOW, + workflow_name_in_app="gemini-captioning", +) +@pytest.mark.skipif( + condition=GOOGLE_API_KEY is None, reason="Google API key not provided" +) +def test_workflow_with_captioning_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CAPTION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": GOOGLE_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", + "vlm_output": "$steps.gemini.output", + "classes": "$steps.gemini.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "gemini_result", + "selector": "$steps.gemini.output", + }, + { + "type": "JsonField", + "name": "top_class", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Google's Gemini as multi-class classifier", + use_case_description=""" +In this example, Google's Gemini model is used as classifier. Output from the model is parsed by +special `roboflow_core/vlm_as_classifier@v1` block which turns model output text into +full-blown prediction, which can later be used by other blocks compatible with +classification predictions - in this case we extract top-class property. 
+ """, + workflow_definition=CLASSIFICATION_WORKFLOW, + workflow_name_in_app="gemini-multi-class-classifier", +) +@pytest.mark.skipif( + condition=GOOGLE_API_KEY is None, reason="Google API key not provided" +) +def test_workflow_with_multi_class_classifier_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CLASSIFICATION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": GOOGLE_API_KEY, + "classes": ["cat", "dog"], + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "gemini_result", + "top_class", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["gemini_result"], str) + and len(result[0]["gemini_result"]) > 0 + ), "Expected non-empty string generated" + assert result[0]["top_class"] == "dog" + assert result[0]["parsed_prediction"]["error_status"] is False + + +MULTI_LABEL_CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "multi-label-classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", + "vlm_output": "$steps.gemini.output", + "classes": "$steps.gemini.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Google's Gemini as multi-label classifier", + use_case_description=""" +In this example, Google's Gemini model is used as multi-label classifier. Output from the model is parsed by +special `roboflow_core/vlm_as_classifier@v1` block which turns model output text into +full-blown prediction, which can later be used by other blocks compatible with +classification predictions - in this case we extract top-class property. 
+ """, + workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW, + workflow_name_in_app="gemini-multi-label-classifier", +) +@pytest.mark.skipif( + condition=GOOGLE_API_KEY is None, reason="Google API key not provided" +) +def test_workflow_with_multi_label_classifier_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": GOOGLE_API_KEY, + "classes": ["cat", "dog"], + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "result", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert result[0]["result"] == ["dog"] + assert result[0]["parsed_prediction"]["error_status"] is False + + +STRUCTURED_PROMPTING_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "structured-answering", + "output_structure": { + "dogs_count": "count of dogs instances in the image", + "cats_count": "count of cats instances in the image", + }, + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/json_parser@v1", + "name": "parser", + "raw_json": "$steps.gemini.output", + "expected_fields": ["dogs_count", "cats_count"], + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "property_definition", + "operations": [{"type": "ToString"}], + "data": "$steps.parser.dogs_count", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.property_definition.output", + } + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Google's Gemini to provide structured JSON", + use_case_description=""" +In this example, Google's Gemini model is expected to provide structured output in JSON, which can later be +parsed by dedicated `roboflow_core/json_parser@v1` block which transforms string into dictionary +and expose it's keys to other blocks for further processing. In this case, parsed output is +transformed using `roboflow_core/property_definition@v1` block. 
+ """, + workflow_definition=STRUCTURED_PROMPTING_WORKFLOW, + workflow_name_in_app="gemini-structured-prompting", +) +@pytest.mark.skipif( + condition=GOOGLE_API_KEY is None, reason="Google API key not provided" +) +def test_workflow_with_structured_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=STRUCTURED_PROMPTING_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": GOOGLE_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert result[0]["result"] == "2" + + +OBJECT_DETECTION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/google_gemini@v1", + "name": "gemini", + "images": "$inputs.image", + "task_type": "object-detection", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_detector@v1", + "name": "parser", + "vlm_output": "$steps.gemini.output", + "image": "$inputs.image", + "classes": "$steps.gemini.classes", + "model_type": "google-gemini", + "task_type": "object-detection", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "gemini_result", + "selector": "$steps.gemini.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.predictions", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using Google's Gemini as object-detection model", + use_case_description=""" +In this example, Google's Gemini model is expected to provide output, which can later be +parsed by dedicated `roboflow_core/vlm_as_detector@v1` block which transforms string into `sv.Detections`, +which can later be used by other blocks processing object-detection predictions. 
+ """, + workflow_definition=OBJECT_DETECTION_WORKFLOW, + workflow_name_in_app="gemini-object-detection", +) +@pytest.mark.skipif( + condition=GOOGLE_API_KEY is None, reason="Google API key not provided" +) +def test_workflow_with_object_detection_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=OBJECT_DETECTION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": GOOGLE_API_KEY, + "classes": ["cat", "dog"], + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "gemini_result", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert result[0]["parsed_prediction"].data["class_name"].tolist() == [ + "dog", + "dog", + ], "Expected 2 dogs to be detected" diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py b/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py index 02e4d1db62..db9312d59a 100644 --- a/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py +++ b/tests/workflows/integration_tests/execution/test_workflow_with_model_running_on_absolute_static_crop.py @@ -15,7 +15,11 @@ "version": "1.0", "inputs": [ {"type": "WorkflowImage", "name": "image"}, - {"type": "WorkflowParameter", "name": "model_id", "default_value": "yolov8n-640"}, + { + "type": "WorkflowParameter", + "name": "model_id", + "default_value": "yolov8n-640", + }, {"type": "WorkflowParameter", "name": "confidence", "default_value": 0.7}, {"type": "WorkflowParameter", "name": "x_center"}, {"type": "WorkflowParameter", "name": "y_center"}, diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py b/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py new file mode 100644 index 0000000000..b37850ff2d --- /dev/null +++ b/tests/workflows/integration_tests/execution/test_workflow_with_open_ai_models.py @@ -0,0 +1,584 @@ +""" +This test module requires OpenAI API key passed via env variable WORKFLOWS_TEST_OPEN_AI_KEY. 
+This is supposed to be used only locally, as that would be too much of a cost in CI +""" + +import os + +import numpy as np +import pytest + +from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS +from inference.core.managers.base import ModelManager +from inference.core.workflows.core_steps.common.entities import StepExecutionMode +from inference.core.workflows.execution_engine.core import ExecutionEngine +from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import ( + add_to_workflows_gallery, +) + +OPEN_AI_API_KEY = os.getenv("WORKFLOWS_TEST_OPEN_AI_KEY") + +UNCONSTRAINED_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "prompt"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "unconstrained", + "prompt": "$inputs.prompt", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.gpt.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Prompting GPT with arbitrary prompt", + use_case_description=""" +In this example, GPT model is prompted with arbitrary text from user + """, + workflow_definition=UNCONSTRAINED_WORKFLOW, + workflow_name_in_app="gpt-arbitrary-prompt", +) +@pytest.mark.skipif( + condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided" +) +def test_workflow_with_unconstrained_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=UNCONSTRAINED_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": OPEN_AI_API_KEY, + "prompt": "What is the topic of the image?", + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +OCR_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "ocr", + "api_key": "$inputs.api_key", + "model_version": "gpt-4o-mini", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.gpt.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using GPT as OCR model", + use_case_description=""" +In this example, GPT model is used as OCR system. User just points task type and do not need to provide +any prompt. 
+ """, + workflow_definition=OCR_WORKFLOW, + workflow_name_in_app="gpt-ocr", +) +@pytest.mark.skipif( + condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided" +) +def test_workflow_with_ocr_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=OCR_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": OPEN_AI_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +VQA_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "prompt"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "visual-question-answering", + "prompt": "$inputs.prompt", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.gpt.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using GPT as Visual Question Answering system", + use_case_description=""" +In this example, GPT model is used as VQA system. User provides question via prompt. 
+ """, + workflow_definition=VQA_WORKFLOW, + workflow_name_in_app="gpt-vqa", +) +@pytest.mark.skipif( + condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided" +) +def test_workflow_with_vqa_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=VQA_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": OPEN_AI_API_KEY, + "prompt": "What are the brands of the cars?", + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +CAPTION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "caption", + "api_key": "$inputs.api_key", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.gpt.output", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using GPT as Image Captioning system", + use_case_description=""" +In this example, GPT model is used as Image Captioning system. 
+ """, + workflow_definition=CAPTION_WORKFLOW, + workflow_name_in_app="gpt-captioning", +) +@pytest.mark.skipif( + condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided" +) +def test_workflow_with_captioning_prompt( + model_manager: ModelManager, + license_plate_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CAPTION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [license_plate_image], + "api_key": OPEN_AI_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["result"], str) and len(result[0]["result"]) > 0 + ), "Expected non-empty string generated" + + +CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", + "vlm_output": "$steps.gpt.output", + "classes": "$steps.gpt.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "gpt_result", + "selector": "$steps.gpt.output", + }, + { + "type": "JsonField", + "name": "top_class", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using GPT as multi-class classifier", + use_case_description=""" +In this example, GPT model is used as classifier. Output from the model is parsed by +special `roboflow_core/vlm_as_classifier@v1` block which turns GPT output text into +full-blown prediction, which can later be used by other blocks compatible with +classification predictions - in this case we extract top-class property. 
+ """, + workflow_definition=CLASSIFICATION_WORKFLOW, + workflow_name_in_app="gpt-multi-class-classifier", +) +@pytest.mark.skipif( + condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided" +) +def test_workflow_with_multi_class_classifier_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CLASSIFICATION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": OPEN_AI_API_KEY, + "classes": ["cat", "dog"], + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "gpt_result", + "top_class", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert ( + isinstance(result[0]["gpt_result"], str) and len(result[0]["gpt_result"]) > 0 + ), "Expected non-empty string generated" + assert result[0]["top_class"] == "dog" + assert result[0]["parsed_prediction"]["error_status"] is False + + +MULTI_LABEL_CLASSIFICATION_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + {"type": "WorkflowParameter", "name": "classes"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "multi-label-classification", + "classes": "$inputs.classes", + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/vlm_as_classifier@v1", + "name": "parser", + "image": "$inputs.image", + "vlm_output": "$steps.gpt.output", + "classes": "$steps.gpt.classes", + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "top_class", + "operations": [ + {"type": "ClassificationPropertyExtract", "property_name": "top_class"} + ], + "data": "$steps.parser.predictions", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.top_class.output", + }, + { + "type": "JsonField", + "name": "parsed_prediction", + "selector": "$steps.parser.*", + }, + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using GPT as multi-label classifier", + use_case_description=""" +In this example, GPT model is used as multi-label classifier. Output from the model is parsed by +special `roboflow_core/vlm_as_classifier@v1` block which turns GPT output text into +full-blown prediction, which can later be used by other blocks compatible with +classification predictions - in this case we extract top-class property. 
+ """, + workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW, + workflow_name_in_app="gpt-multi-label-classifier", +) +@pytest.mark.skipif( + condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided" +) +def test_workflow_with_multi_label_classifier_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=MULTI_LABEL_CLASSIFICATION_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": OPEN_AI_API_KEY, + "classes": ["cat", "dog"], + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == { + "result", + "parsed_prediction", + }, "Expected all outputs to be delivered" + assert result[0]["result"] == ["dog"] + assert result[0]["parsed_prediction"]["error_status"] is False + + +STRUCTURED_PROMPTING_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowImage", "name": "image"}, + {"type": "WorkflowParameter", "name": "api_key"}, + ], + "steps": [ + { + "type": "roboflow_core/open_ai@v2", + "name": "gpt", + "images": "$inputs.image", + "task_type": "structured-answering", + "output_structure": { + "dogs_count": "count of dogs instances in the image", + "cats_count": "count of cats instances in the image", + }, + "api_key": "$inputs.api_key", + }, + { + "type": "roboflow_core/json_parser@v1", + "name": "parser", + "raw_json": "$steps.gpt.output", + "expected_fields": ["dogs_count", "cats_count"], + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "property_definition", + "operations": [{"type": "ToString"}], + "data": "$steps.parser.dogs_count", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "result", + "selector": "$steps.property_definition.output", + } + ], +} + + +@add_to_workflows_gallery( + category="Workflows with Visual Language Models", + use_case_title="Using GPT to provide structured JSON", + use_case_description=""" +In this example, GPT model is expected to provide structured output in JSON, which can later be +parsed by dedicated `roboflow_core/json_parser@v1` block which transforms string into dictionary +and expose it's keys to other blocks for further processing. In this case, parsed output is +transformed using `roboflow_core/property_definition@v1` block. 
+ """, + workflow_definition=STRUCTURED_PROMPTING_WORKFLOW, + workflow_name_in_app="gpt-structured-prompting", +) +@pytest.mark.skipif( + condition=OPEN_AI_API_KEY is None, reason="OpenAI API key not provided" +) +def test_workflow_with_structured_prompt( + model_manager: ModelManager, + dogs_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=STRUCTURED_PROMPTING_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image": [dogs_image], + "api_key": OPEN_AI_API_KEY, + } + ) + + # then + assert len(result) == 1, "Single image given, expected single output" + assert set(result[0].keys()) == {"result"}, "Expected all outputs to be delivered" + assert result[0]["result"] == "2" diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py b/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py index a8c1966bce..031c0e0d19 100644 --- a/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py +++ b/tests/workflows/integration_tests/execution/test_workflow_with_single_model.py @@ -15,7 +15,11 @@ "version": "1.0", "inputs": [ {"type": "WorkflowImage", "name": "image"}, - {"type": "WorkflowParameter", "name": "model_id", "default_value": "yolov8n-640"}, + { + "type": "WorkflowParameter", + "name": "model_id", + "default_value": "yolov8n-640", + }, {"type": "WorkflowParameter", "name": "confidence", "default_value": 0.3}, ], "steps": [ diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py b/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py new file mode 100644 index 0000000000..8907690232 --- /dev/null +++ b/tests/workflows/unit_tests/core_steps/formatters/test_json_parser.py @@ -0,0 +1,232 @@ +import json + +import pytest +from pydantic import ValidationError + +from inference.core.workflows.core_steps.formatters.json_parser.v1 import ( + BlockManifest, + JSONParserBlockV1, +) +from inference.core.workflows.execution_engine.entities.base import OutputDefinition +from inference.core.workflows.execution_engine.entities.types import BOOLEAN_KIND + + +def test_parsing_manifest_when_input_is_valid() -> None: + # given + raw_manifest = { + "name": "parser", + "type": "roboflow_core/json_parser@v1", + "raw_json": "$steps.some.a", + "expected_fields": ["a", "b", "c"], + } + + # when + result = BlockManifest.model_validate(raw_manifest) + + # then + assert result == BlockManifest( + name="parser", + type="roboflow_core/json_parser@v1", + raw_json="$steps.some.a", + expected_fields=["a", "b", "c"], + ) + + +def test_parsing_manifest_when_input_is_invalid() -> None: + # given + raw_manifest = { + "name": "parser", + "type": "roboflow_core/json_parser@v1", + "raw_json": "$steps.some.a", + "expected_fields": ["a", "b", "c", "error_status"], + } + + # when + with pytest.raises(ValidationError): + _ = BlockManifest.model_validate(raw_manifest) + + +def test_manifest_get_actual_outputs() -> None: + # given + manifest = BlockManifest( + name="parser", + type="roboflow_core/json_parser@v1", + raw_json="$steps.some.a", + expected_fields=["a", "b", "c"], + ) + + # when + result = manifest.get_actual_outputs() + + # then + assert result == [ + 
OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]), + OutputDefinition(name="a"), + OutputDefinition(name="b"), + OutputDefinition(name="c"), + ] + + +def test_block_run_when_valid_json_given_and_all_fields_declared() -> None: + # given + raw_json = json.dumps({"a": "1", "b": "2"}) + block = JSONParserBlockV1() + + # when + result = block.run(raw_json=raw_json, expected_fields=["a", "b"]) + + # then + assert result == { + "error_status": False, + "a": "1", + "b": "2", + } + + +def test_block_run_when_valid_json_given_and_subset_of_fields_declared() -> None: + # given + raw_json = json.dumps({"a": "1", "b": "2"}) + block = JSONParserBlockV1() + + # when + result = block.run(raw_json=raw_json, expected_fields=["a"]) + + # then + assert result == { + "error_status": False, + "a": "1", + } + + +def test_block_run_when_valid_json_given_and_subset_of_declared_fields_found() -> None: + # given + raw_json = json.dumps({"a": "1", "b": "2"}) + block = JSONParserBlockV1() + + # when + result = block.run(raw_json=raw_json, expected_fields=["a", "b", "c"]) + + # then + assert result == { + "error_status": True, + "a": "1", + "b": "2", + "c": None, + } + + +def test_block_run_when_multiple_json_documents_provided() -> None: + # given + raw_json = json.dumps({"a": "1", "b": "2"}) + block = JSONParserBlockV1() + + # when + result = block.run(raw_json="\n".join([raw_json] * 2), expected_fields=["a", "b"]) + + # then + assert result == { + "error_status": True, + "a": None, + "b": None, + } + + +def test_block_run_when_invalid_json_provided() -> None: + # given + block = JSONParserBlockV1() + + # when + result = block.run(raw_json="invalid", expected_fields=["a", "b"]) + + # then + assert result == { + "error_status": True, + "a": None, + "b": None, + } + + +def test_block_run_when_json_in_markdown_provided() -> None: + # given + raw_json = json.dumps({"a": "1", "b": "2"}) + raw_json = f"```json\n{raw_json}\n```" + block = JSONParserBlockV1() + + # when + result = block.run(raw_json=raw_json, expected_fields=["a", "b"]) + + # then + assert result == { + "error_status": False, + "a": "1", + "b": "2", + } + + +def test_block_run_when_indented_json_in_markdown_provided() -> None: + # given + raw_json = json.dumps({"a": "1", "b": "2"}, indent=4) + raw_json = f"```json\n{raw_json}\n```" + block = JSONParserBlockV1() + + # when + result = block.run(raw_json=raw_json, expected_fields=["a", "b"]) + + # then + assert result == { + "error_status": False, + "a": "1", + "b": "2", + } + + +def test_block_run_when_json_in_markdown_uppercase_provided() -> None: + # given + raw_json = json.dumps({"a": "1", "b": "2"}) + raw_json = f"```JSON\n{raw_json}\n```" + block = JSONParserBlockV1() + + # when + result = block.run(raw_json=raw_json, expected_fields=["a", "b"]) + + # then + assert result == { + "error_status": False, + "a": "1", + "b": "2", + } + + +def test_block_run_when_json_in_markdown_without_new_lines_provided() -> None: + # given + raw_json = json.dumps({"a": "1", "b": "2"}) + raw_json = f"```JSON{raw_json}```" + block = JSONParserBlockV1() + + # when + result = block.run(raw_json=raw_json, expected_fields=["a", "b"]) + + # then + assert result == { + "error_status": False, + "a": "1", + "b": "2", + } + + +def test_block_run_when_multiple_jsons_in_markdown_provided() -> None: + # given + raw_json_1 = json.dumps({"a": "1", "b": "2"}) + raw_json_2 = json.dumps({"a": "3", "b": "4"}) + raw_json = f"```json\n{raw_json_1}\n```\n``json\n{raw_json_2}\n```" + block = JSONParserBlockV1() + + # when + result = 
block.run(raw_json=raw_json, expected_fields=["a", "b"]) + + # then + assert result == { + "error_status": False, + "a": "1", + "b": "2", + } diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py new file mode 100644 index 0000000000..796c74cb3a --- /dev/null +++ b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_classifier.py @@ -0,0 +1,342 @@ +from typing import List, Union + +import numpy as np +import pytest + +from inference.core.workflows.core_steps.formatters.vlm_as_classifier.v1 import ( + BlockManifest, + VLMAsClassifierBlockV1, +) +from inference.core.workflows.execution_engine.entities.base import ( + ImageParentMetadata, + WorkflowImageData, +) + + +@pytest.mark.parametrize("image", ["$inputs.image", "$steps.some.image"]) +@pytest.mark.parametrize( + "classes", ["$inputs.classes", "$steps.some.classes", ["a", "b"]] +) +def test_block_manifest_parsing_when_input_is_valid( + image: str, classes: Union[str, List[str]] +) -> None: + # given + raw_manifest = { + "type": "roboflow_core/vlm_as_classifier@v1", + "image": image, + "name": "parser", + "vlm_output": "$steps.vlm.output", + "classes": classes, + } + + # when + result = BlockManifest.model_validate(raw_manifest) + + # then + assert result == BlockManifest( + type="roboflow_core/vlm_as_classifier@v1", + name="parser", + image=image, + vlm_output="$steps.vlm.output", + classes=classes, + ) + + +def test_run_when_valid_json_given_for_multi_class_classification() -> None: + # given + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + vlm_output = """ +```json +{"class_name": "car", "confidence": "0.7"} +``` + """ + block = VLMAsClassifierBlockV1() + + # when + result = block.run(image=image, vlm_output=vlm_output, classes=["car", "cat"]) + + # then + assert result["error_status"] is False + assert result["predictions"]["image"] == {"width": 168, "height": 192} + assert result["predictions"]["predictions"] == [ + {"class_name": "car", "class_id": 0, "confidence": 0.7}, + {"class_name": "cat", "class_id": 1, "confidence": 0.0}, + ] + assert result["predictions"]["top"] == "car" + assert abs(result["predictions"]["confidence"] - 0.7) < 1e-5 + assert result["predictions"]["parent_id"] == "parent" + assert len(result["inference_id"]) > 0 + assert result["inference_id"] == result["predictions"]["inference_id"] + + +def test_run_when_valid_json_given_for_multi_class_classification_when_unknown_class_predicted() -> ( + None +): + # given + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + vlm_output = """ +```json +{"class_name": "my_class", "confidence": "0.7"} +``` + """ + block = VLMAsClassifierBlockV1() + + # when + result = block.run(image=image, vlm_output=vlm_output, classes=["car", "cat"]) + + # then + assert result["error_status"] is False + assert result["predictions"]["image"] == {"width": 168, "height": 192} + assert result["predictions"]["predictions"] == [ + {"class_name": "my_class", "class_id": -1, "confidence": 0.7}, + {"class_name": "car", "class_id": 0, "confidence": 0.0}, + {"class_name": "cat", "class_id": 1, "confidence": 0.0}, + ] + assert result["predictions"]["top"] == "my_class" + assert abs(result["predictions"]["confidence"] - 0.7) < 1e-5 + assert result["predictions"]["parent_id"] == "parent" + 
assert len(result["inference_id"]) > 0 + assert result["inference_id"] == result["predictions"]["inference_id"] + + +def test_run_when_valid_json_given_for_multi_label_classification() -> None: + # given + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + vlm_output = """ + {"predicted_classes": [ + {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6}, + {"class": "cat", "confidence": "0.7"} + ]} + """ + block = VLMAsClassifierBlockV1() + + # when + result = block.run( + image=image, vlm_output=vlm_output, classes=["car", "cat", "dog"] + ) + + # then + assert result["error_status"] is False + assert result["predictions"]["image"] == {"width": 168, "height": 192} + assert result["predictions"]["predictions"] == { + "car": {"confidence": 0.0, "class_id": 0}, + "cat": {"confidence": 0.7, "class_id": 1}, + "dog": {"confidence": 0.6, "class_id": 2}, + } + assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"} + assert result["predictions"]["parent_id"] == "parent" + assert len(result["inference_id"]) > 0 + assert result["inference_id"] == result["predictions"]["inference_id"] + + +def test_run_when_valid_json_given_for_multi_label_classification_when_unknown_class_provided() -> ( + None +): + # given + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + vlm_output = """ + {"predicted_classes": [ + {"class": "my_class_1", "confidence": 0.3}, {"class": "my_class_2", "confidence": 0.6}, + {"class": "my_class_1", "confidence": 0.7} + ]} + """ + block = VLMAsClassifierBlockV1() + + # when + result = block.run( + image=image, vlm_output=vlm_output, classes=["car", "cat", "dog"] + ) + + # then + assert result["error_status"] is False + assert result["predictions"]["image"] == {"width": 168, "height": 192} + assert result["predictions"]["predictions"] == { + "car": {"confidence": 0.0, "class_id": 0}, + "cat": {"confidence": 0.0, "class_id": 1}, + "dog": {"confidence": 0.0, "class_id": 2}, + "my_class_1": {"confidence": 0.7, "class_id": -1}, + "my_class_2": {"confidence": 0.6, "class_id": -1}, + } + assert set(result["predictions"]["predicted_classes"]) == { + "my_class_1", + "my_class_2", + } + assert result["predictions"]["parent_id"] == "parent" + assert len(result["inference_id"]) > 0 + assert result["inference_id"] == result["predictions"]["inference_id"] + + +def test_run_when_valid_json_of_unknown_structure_given() -> None: + # given + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + block = VLMAsClassifierBlockV1() + + # when + result = block.run( + image=image, vlm_output='{"some": "data"}', classes=["car", "cat"] + ) + + # then + assert result["error_status"] is True + assert result["predictions"] is None + assert len(result["inference_id"]) > 0 + + +def test_run_when_invalid_json_given() -> None: + # given + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + block = VLMAsClassifierBlockV1() + + # when + result = block.run(image=image, vlm_output="invalid_json", classes=["car", "cat"]) + + # then + assert result["error_status"] is True + assert result["predictions"] is None + assert len(result["inference_id"]) > 0 + + +def test_run_when_multiple_jsons_given() -> None: + # given 
+ raw_json = """ + {"predicted_classes": [ + {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6}, + {"class": "cat", "confidence": "0.7"} + ]} + {"predicted_classes": [ + {"class": "cat", "confidence": 0.4}, {"class": "dog", "confidence": 0.7}, + {"class": "cat", "confidence": "0.8"} + ]} + """ + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + block = VLMAsClassifierBlockV1() + + # when + result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat"]) + + # then + assert result["error_status"] is True + assert result["predictions"] is None + assert len(result["inference_id"]) > 0 + + +def test_run_when_json_in_markdown_block_given() -> None: + # given + raw_json = """ +```json +{"predicted_classes": [ + {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6}, + {"class": "cat", "confidence": "0.7"} +]} +``` +``` + """ + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + block = VLMAsClassifierBlockV1() + + # when + result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"]) + + # then + assert result["error_status"] is False + assert result["predictions"]["image"] == {"width": 168, "height": 192} + assert result["predictions"]["predictions"] == { + "car": {"confidence": 0.0, "class_id": 0}, + "cat": {"confidence": 0.7, "class_id": 1}, + "dog": {"confidence": 0.6, "class_id": 2}, + } + assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"} + assert result["predictions"]["parent_id"] == "parent" + assert len(result["inference_id"]) > 0 + assert result["inference_id"] == result["predictions"]["inference_id"] + + +def test_run_when_json_in_markdown_block_without_new_lines_given() -> None: + # given + raw_json = """ +```json{"predicted_classes": [{"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6}, {"class": "cat", "confidence": "0.7"}]}``` +""" + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + block = VLMAsClassifierBlockV1() + + # when + result = block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"]) + + # then + assert result["error_status"] is False + assert result["predictions"]["image"] == {"width": 168, "height": 192} + assert result["predictions"]["predictions"] == { + "car": {"confidence": 0.0, "class_id": 0}, + "cat": {"confidence": 0.7, "class_id": 1}, + "dog": {"confidence": 0.6, "class_id": 2}, + } + assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"} + assert result["predictions"]["parent_id"] == "parent" + assert len(result["inference_id"]) > 0 + assert result["inference_id"] == result["predictions"]["inference_id"] + + +def test_run_when_multiple_jsons_in_markdown_block_given() -> None: + # given + raw_json = """ +```json +{"predicted_classes": [ + {"class": "cat", "confidence": 0.3}, {"class": "dog", "confidence": 0.6}, + {"class": "cat", "confidence": "0.7"} +]} +``` +```json +{"predicted_classes": [ + {"class": "cat", "confidence": 0.4}, {"class": "dog", "confidence": 0.7}, + {"class": "cat", "confidence": "0.8"} +]} +``` +""" + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + block = VLMAsClassifierBlockV1() + + # when + result = 
block.run(image=image, vlm_output=raw_json, classes=["car", "cat", "dog"]) + + # then + assert result["error_status"] is False + assert result["predictions"]["image"] == {"width": 168, "height": 192} + assert result["predictions"]["predictions"] == { + "car": {"confidence": 0.0, "class_id": 0}, + "cat": {"confidence": 0.7, "class_id": 1}, + "dog": {"confidence": 0.6, "class_id": 2}, + } + assert set(result["predictions"]["predicted_classes"]) == {"cat", "dog"} + assert result["predictions"]["parent_id"] == "parent" + assert len(result["inference_id"]) > 0 + assert result["inference_id"] == result["predictions"]["inference_id"] diff --git a/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py new file mode 100644 index 0000000000..10f013c26a --- /dev/null +++ b/tests/workflows/unit_tests/core_steps/formatters/test_vlm_as_detector.py @@ -0,0 +1,163 @@ +from typing import List, Union + +import numpy as np +import pytest +import supervision as sv + +from inference.core.workflows.core_steps.formatters.vlm_as_detector.v1 import ( + BlockManifest, + VLMAsDetectorBlockV1, +) +from inference.core.workflows.execution_engine.entities.base import ( + ImageParentMetadata, + WorkflowImageData, +) + + +@pytest.mark.parametrize("image", ["$inputs.image", "$steps.some.image"]) +@pytest.mark.parametrize( + "classes", ["$inputs.classes", "$steps.some.classes", ["a", "b"]] +) +def test_manifest_parsing_when_input_valid( + image: str, classes: Union[str, List[str]] +) -> None: + # given + raw_manifest = { + "type": "roboflow_core/vlm_as_detector@v1", + "name": "parser", + "image": image, + "vlm_output": "$steps.vlm.output", + "classes": classes, + "model_type": "google-gemini", + "task_type": "object-detection", + } + + # when + result = BlockManifest.model_validate(raw_manifest) + + # then + assert result == BlockManifest( + type="roboflow_core/vlm_as_detector@v1", + name="parser", + image=image, + vlm_output="$steps.vlm.output", + classes=classes, + model_type="google-gemini", + task_type="object-detection", + ) + + +def test_run_method_for_claude_and_gemini_output() -> None: + # given + block = VLMAsDetectorBlockV1() + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + vlm_output = """ +{"detections": [ + {"x_min": 0.01, "y_min": 0.15, "x_max": 0.15, "y_max": 0.85, "class_name": "cat", "confidence": 1.98}, + {"x_min": 0.17, "y_min": 0.25, "x_max": 0.32, "y_max": 0.85, "class_name": "dog", "confidence": 0.97}, + {"x_min": 0.33, "y_min": 0.15, "x_max": 0.47, "y_max": 0.85, "class_name": "cat", "confidence": 0.99}, + {"x_min": 0.49, "y_min": 0.30, "x_max": 0.65, "y_max": 0.85, "class_name": "dog", "confidence": 0.98}, + {"x_min": 0.67, "y_min": 0.20, "x_max": 0.82, "y_max": 0.85, "class_name": "cat", "confidence": 0.99}, + {"x_min": 0.84, "y_min": 0.25, "x_max": 0.99, "y_max": 0.85, "class_name": "unknown", "confidence": 0.97} +]} + """ + + # when + result = block.run( + image=image, + vlm_output=vlm_output, + classes=["cat", "dog", "lion"], + model_type="google-gemini", + task_type="object-detection", + ) + + # then + assert result["error_status"] is False + assert isinstance(result["predictions"], sv.Detections) + assert len(result["inference_id"]) > 0 + assert np.allclose( + result["predictions"].xyxy, + np.array( + [ + [2, 29, 25, 163], + [29, 48, 54, 163], + [55, 29, 79, 163], + [82, 58, 109, 163], + [113, 38, 138, 
163], + [141, 48, 166, 163], + ] + ), + atol=1.0, + ) + assert np.allclose(result["predictions"].class_id, np.array([0, 1, 0, 1, 0, -1])) + assert np.allclose( + result["predictions"].confidence, np.array([1.0, 0.97, 0.99, 0.98, 0.99, 0.97]) + ) + assert "class_name" in result["predictions"].data + assert "image_dimensions" in result["predictions"].data + assert "prediction_type" in result["predictions"].data + assert "parent_coordinates" in result["predictions"].data + assert "parent_dimensions" in result["predictions"].data + assert "root_parent_coordinates" in result["predictions"].data + assert "root_parent_dimensions" in result["predictions"].data + assert "parent_id" in result["predictions"].data + assert "root_parent_id" in result["predictions"].data + + +def test_run_method_for_invalid_claude_and_gemini_output() -> None: + # given + block = VLMAsDetectorBlockV1() + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + vlm_output = """ + {"detections": [ + {"x_min": 0.01, "y_min": 0.15, "x_max": 0.15, "y_max": 0.85, "confidence": 1.98}, + {"x_min": 0.17, "y_min": 0.25, "x_max": 0.32, "y_max": 0.85, "class_name": "dog", "confidence": 0.97}, + {"x_min": 0.33, "y_min": 0.15, "x_max": 0.47, "y_max": 0.85, "class_name": "cat", "confidence": 0.99}, + {"x_min": 0.49, "x_max": 0.65, "y_max": 0.85, "class_name": "dog", "confidence": 0.98}, + {"x_min": 0.67, "y_min": 0.20, "x_max": 0.82, "y_max": 0.85, "class_name": "cat", "confidence": 0.99}, + {"x_min": 0.84, "y_min": 0.25, "x_max": 0.99, "y_max": 0.85, "class_name": "unknown", "confidence": 0.97} + ]} + """ + + # when + result = block.run( + image=image, + vlm_output=vlm_output, + classes=["cat", "dog", "lion"], + model_type="google-gemini", + task_type="object-detection", + ) + + # then + assert result["error_status"] is True + assert result["predictions"] is None + assert len(result["inference_id"]) > 0 + + +def test_run_method_for_invalid_json() -> None: + # given + block = VLMAsDetectorBlockV1() + image = WorkflowImageData( + numpy_image=np.zeros((192, 168, 3), dtype=np.uint8), + parent_metadata=ImageParentMetadata(parent_id="parent"), + ) + + # when + result = block.run( + image=image, + vlm_output="invalid", + classes=["cat", "dog", "lion"], + model_type="google-gemini", + task_type="object-detection", + ) + + # then + assert result["error_status"] is True + assert result["predictions"] is None + assert len(result["inference_id"]) > 0 diff --git a/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py b/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py index feac4169ed..c4fc40237e 100644 --- a/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py +++ b/tests/workflows/unit_tests/core_steps/transformations/test_perspective_correction.py @@ -275,7 +275,6 @@ def test_correct_detections_with_keypoints(): src=src_polygon, dst=dst_polygon, ) - # when corrected_detections = correct_detections( detections=detections, diff --git a/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py b/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py index 399860ccdd..768afa1726 100644 --- a/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py +++ b/tests/workflows/unit_tests/execution_engine/introspection/test_blocks_loader.py @@ -218,7 +218,7 @@ def 
test_describe_available_blocks_when_valid_plugins_are_loaded( assert result.blocks[0].manifest_class == plugin_with_valid_blocks.Block1Manifest assert result.blocks[1].block_class == plugin_with_valid_blocks.Block2 assert result.blocks[1].manifest_class == plugin_with_valid_blocks.Block2Manifest - assert len(result.declared_kinds) == 31 + assert len(result.declared_kinds) > 0 @mock.patch.object(blocks_loader, "load_workflow_blocks") @@ -259,7 +259,7 @@ def test_describe_available_blocks_when_valid_plugins_are_loaded_and_multiple_ve result.blocks[2].manifest_class == plugin_with_multiple_versions_of_blocks.Block2Manifest ) - assert len(result.declared_kinds) == 31 + assert len(result.declared_kinds) > 0 @mock.patch.object(blocks_loader, "load_workflow_blocks")
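
Editorial note on the new `test_json_parser.py` suite above: taken together, its assertions pin down a small parsing contract for `roboflow_core/json_parser@v1` - an optional Markdown code fence labelled `json` is stripped (case-insensitively, with or without surrounding newlines, and apparently the first fenced block wins), the remainder must be a single JSON document, only the declared `expected_fields` are returned, and `error_status` flags any missing field or parse failure ("error_status" itself is a reserved output name and is rejected as an expected field). The sketch below is only an illustration of that tested behaviour, not the actual code in `inference/core/workflows/core_steps/formatters/json_parser/v1.py`; the helper name `parse_raw_json` is invented for the sketch.

```python
import json
import re
from typing import Any, Dict, List

# First fenced block labelled "json" (case-insensitive), with or without newlines around the body.
_JSON_MARKDOWN_BLOCK = re.compile(r"```json(.*?)```", re.IGNORECASE | re.DOTALL)


def parse_raw_json(raw_json: str, expected_fields: List[str]) -> Dict[str, Any]:
    """Illustrative only: mirrors the assertions in test_json_parser.py."""
    match = _JSON_MARKDOWN_BLOCK.search(raw_json)
    if match:
        # Keep only the content of the first ```json ... ``` block.
        raw_json = match.group(1)
    try:
        data = json.loads(raw_json)
    except json.JSONDecodeError:
        data = None
    if not isinstance(data, dict):
        # Invalid JSON, multiple concatenated documents, or a non-object payload.
        return {"error_status": True, **{field: None for field in expected_fields}}
    result: Dict[str, Any] = {field: data.get(field) for field in expected_fields}
    # Any declared field missing from the parsed document marks the result as erroneous.
    result["error_status"] = any(field not in data for field in expected_fields)
    return result
```

Run against the fixtures in the tests above (plain JSON, fenced, upper-case fence, fence without newlines, missing fields, concatenated documents), this sketch reproduces the expected dictionaries.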
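Similarly, the expected `xyxy` array in `test_run_method_for_claude_and_gemini_output` is easier to review once you notice that, within the declared `atol=1.0`, it is just the normalized boxes from `vlm_output` scaled by the 168x192 test image (width for x, height for y). The quick check below is an assumption about how those expected values were derived, not a description of `VLMAsDetectorBlockV1` internals; the fixture's first confidence of 1.98 being asserted as 1.0 also suggests confidences are clamped to the [0.0, 1.0] range.

```python
import numpy as np

# Normalized (x_min, y_min, x_max, y_max) boxes copied from the test's vlm_output fixture.
normalized = np.array(
    [
        [0.01, 0.15, 0.15, 0.85],
        [0.17, 0.25, 0.32, 0.85],
        [0.33, 0.15, 0.47, 0.85],
        [0.49, 0.30, 0.65, 0.85],
        [0.67, 0.20, 0.82, 0.85],
        [0.84, 0.25, 0.99, 0.85],
    ]
)
# The test image is np.zeros((192, 168, 3)), i.e. height=192, width=168.
width, height = 168, 192
xyxy = normalized * np.array([width, height, width, height])
print(np.round(xyxy).astype(int))
# First row: [0.01*168, 0.15*192, 0.15*168, 0.85*192] ~= [2, 29, 25, 163]; every row rounds to the
# corresponding row of the expected array asserted in the test (within atol=1.0).
```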