Commit

Merge branch 'main' into feature/pass-measured-fps-when-handling-live-stream
grzegorz-roboflow authored Nov 14, 2024
2 parents 1d0e796 + 1f235f3 commit fc11d3e
Showing 62 changed files with 4,445 additions and 91 deletions.
26 changes: 14 additions & 12 deletions docker/dockerfiles/Dockerfile.onnx.jetson.5.1.1.stream_manager
@@ -6,6 +6,8 @@ ENV LANG en_US.UTF-8
RUN apt-get update -y && apt-get install -y \
lshw \
git \
+    python3.9 \
+    python3.9-dev \
python3-pip \
python3-matplotlib \
gfortran \
@@ -24,31 +26,31 @@ COPY requirements/requirements.clip.txt \
requirements/_requirements.txt \
./

-RUN pip3 install --ignore-installed PyYAML && rm -rf ~/.cache/pip
+RUN python3.9 -m pip install --ignore-installed PyYAML && rm -rf ~/.cache/pip

# We needed to take the statically compiled library from the last known stable build and host it ourselves.
# That was due to faulty builds starting 26.06.2024, probably caused by the release of a new version
# of pybind11, which gets pulled automatically during the build of the zxing_cpp library, making
# cmake fail
-RUN wget https://storage.googleapis.com/roboflow-tests-assets/zxing_cpp_library_compiled_for_inference_v0.12.1_python_3.8.tar.gz \
-    && tar -xvzf zxing_cpp_library_compiled_for_inference_v0.12.1_python_3.8.tar.gz \
-    && mv zxing_cpp-2.2.0.dist-info /usr/local/lib/python3.8/dist-packages/zxing_cpp-2.2.0.dist-info \
-    && mv zxingcpp.cpython-38-aarch64-linux-gnu.so /usr/local/lib/python3.8/dist-packages/ \
-    && rm zxing_cpp_library_compiled_for_inference_v0.12.1_python_3.8.tar.gz
+RUN wget https://storage.googleapis.com/roboflow-tests-assets/zxing_cpp_library_compiled_for_inference_v0.12.1.tar.gz \
+    && tar -xvzf zxing_cpp_library_compiled_for_inference_v0.12.1.tar.gz \
+    && mv zxing_cpp-2.2.0.dist-info /usr/local/lib/python3.9/dist-packages/zxing_cpp-2.2.0.dist-info \
+    && mv zxingcpp.cpython-39-aarch64-linux-gnu.so /usr/local/lib/python3.9/dist-packages/ \
+    && rm zxing_cpp_library_compiled_for_inference_v0.12.1.tar.gz

-RUN pip3 install --upgrade pip && pip3 install \
+RUN python3.9 -m pip install --upgrade pip && python3.9 -m pip install \
-r _requirements.txt \
-r requirements.clip.txt \
-r requirements.http.txt \
"setuptools<=75.5.0" \
--upgrade \
&& rm -rf ~/.cache/pip

-RUN pip3 uninstall --yes onnxruntime
-RUN wget https://nvidia.box.com/shared/static/iizg3ggrtdkqawkmebbfixo7sce6j365.whl -O onnxruntime_gpu-1.16.0-cp38-cp38-linux_aarch64.whl
-RUN pip3 install onnxruntime_gpu-1.16.0-cp38-cp38-linux_aarch64.whl "opencv-python-headless<4.3" \
+RUN python3.9 -m pip uninstall --yes onnxruntime
+RUN wget https://nvidia.box.com/shared/static/67zek28z497hs9aev7xg2c1wngdeyv4h.whl -O onnxruntime_gpu-1.16.0-cp39-cp39-linux_aarch64.whl
+RUN python3.9 -m pip install onnxruntime_gpu-1.16.0-cp39-cp39-linux_aarch64.whl "opencv-python-headless>4" \
     && rm -rf ~/.cache/pip \
-    && rm onnxruntime_gpu-1.16.0-cp38-cp38-linux_aarch64.whl
+    && rm onnxruntime_gpu-1.16.0-cp39-cp39-linux_aarch64.whl

WORKDIR /app/
COPY inference inference
@@ -66,4 +68,4 @@ ENV WORKFLOWS_STEP_EXECUTION_MODE=local
ENV WORKFLOWS_MAX_CONCURRENT_STEPS=1
ENV SUPERVISON_DEPRECATION_WARNING=0

-ENTRYPOINT ["python3", "-m", "inference.enterprise.stream_management.manager.app"]
+ENTRYPOINT ["python3.9", "-m", "inference.enterprise.stream_management.manager.app"]
267 changes: 267 additions & 0 deletions inference/core/workflows/core_steps/formatters/vlm_as_classifier/v2.py
@@ -0,0 +1,267 @@
import json
import logging
import re
from typing import Dict, List, Literal, Optional, Tuple, Type, Union
from uuid import uuid4

from pydantic import ConfigDict, Field

from inference.core.workflows.execution_engine.entities.base import (
OutputDefinition,
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
BOOLEAN_KIND,
CLASSIFICATION_PREDICTION_KIND,
IMAGE_KIND,
INFERENCE_ID_KIND,
LANGUAGE_MODEL_OUTPUT_KIND,
LIST_OF_VALUES_KIND,
Selector,
)
from inference.core.workflows.prototypes.block import (
BlockResult,
WorkflowBlock,
WorkflowBlockManifest,
)

JSON_MARKDOWN_BLOCK_PATTERN = re.compile(r"```json([\s\S]*?)```", flags=re.IGNORECASE)

LONG_DESCRIPTION = """
The block expects string input that would be produced by blocks exposing Large Language Models (LLMs) and
Visual Language Models (VLMs). The input is parsed into a classification prediction and returned as the block output.
Accepted formats:
- valid JSON strings
- JSON documents wrapped with Markdown tags (very common for GPT responses)
Example:
```
{"my": "json"}
```
**Details regarding block behavior:**
- `error_status` is set to `True` whenever parsing cannot be completed
- in case of multiple Markdown blocks with raw JSON content, only the first will be parsed
"""

SHORT_DESCRIPTION = "Parses a raw string into a classification prediction."


class BlockManifest(WorkflowBlockManifest):
model_config = ConfigDict(
json_schema_extra={
"name": "VLM as Classifier",
"version": "v2",
"short_description": SHORT_DESCRIPTION,
"long_description": LONG_DESCRIPTION,
"license": "Apache-2.0",
"block_type": "formatter",
}
)
type: Literal["roboflow_core/vlm_as_classifier@v2"]
image: Selector(kind=[IMAGE_KIND]) = Field(
description="The image which was the base to generate VLM prediction",
examples=["$inputs.image", "$steps.cropping.crops"],
)
vlm_output: Selector(kind=[LANGUAGE_MODEL_OUTPUT_KIND]) = Field(
title="VLM Output",
description="The string with raw classification prediction to parse.",
examples=[["$steps.lmm.output"]],
)
classes: Union[
        Selector(kind=[LIST_OF_VALUES_KIND]),
List[str],
] = Field(
description="List of all classes used by the model, required to "
"generate mapping between class name and class id.",
examples=[["$steps.lmm.classes", "$inputs.classes", ["class_a", "class_b"]]],
)

@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
OutputDefinition(name="error_status", kind=[BOOLEAN_KIND]),
OutputDefinition(name="predictions", kind=[CLASSIFICATION_PREDICTION_KIND]),
OutputDefinition(name="inference_id", kind=[INFERENCE_ID_KIND]),
]

@classmethod
def get_execution_engine_compatibility(cls) -> Optional[str]:
return ">=1.3.0,<2.0.0"


class VLMAsClassifierBlockV2(WorkflowBlock):

@classmethod
def get_manifest(cls) -> Type[WorkflowBlockManifest]:
return BlockManifest

def run(
self,
image: WorkflowImageData,
vlm_output: str,
classes: List[str],
) -> BlockResult:
inference_id = f"{uuid4()}"
error_status, parsed_data = string2json(
raw_json=vlm_output,
)
if error_status:
return {
"error_status": True,
"predictions": None,
"inference_id": inference_id,
}
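        # Dispatch on the parsed keys: `class_name` + `confidence` marks a multi-class result,
        # while a `predicted_classes` list marks a multi-label result (see the parsers below).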
if "class_name" in parsed_data and "confidence" in parsed_data:
return parse_multi_class_classification_results(
image=image,
results=parsed_data,
classes=classes,
inference_id=inference_id,
)
if "predicted_classes" in parsed_data:
return parse_multi_label_classification_results(
image=image,
results=parsed_data,
classes=classes,
inference_id=inference_id,
)
return {
"error_status": True,
"predictions": None,
"inference_id": inference_id,
}


def string2json(
raw_json: str,
) -> Tuple[bool, dict]:
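    # Prefer the first Markdown-fenced JSON block when one is present; otherwise try to parse the
    # whole string. Returns (error_status, parsed_dict), with an empty dict when parsing fails.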
json_blocks_found = JSON_MARKDOWN_BLOCK_PATTERN.findall(raw_json)
if len(json_blocks_found) == 0:
return try_parse_json(raw_json)
first_block = json_blocks_found[0]
return try_parse_json(first_block)


def try_parse_json(content: str) -> Tuple[bool, dict]:
try:
return False, json.loads(content)
except Exception as error:
logging.warning(
f"Could not parse JSON to dict in `roboflow_core/vlm_as_classifier@v1` block. "
f"Error type: {error.__class__.__name__}. Details: {error}"
)
return True, {}


def parse_multi_class_classification_results(
image: WorkflowImageData,
results: dict,
classes: List[str],
inference_id: str,
) -> dict:
try:
class2id_mapping = create_classes_index(classes=classes)
height, width = image.numpy_image.shape[:2]
top_class = results["class_name"]
confidences = {top_class: scale_confidence(results["confidence"])}
predictions = []
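        # If the predicted top class is not on the provided class list, it is still reported with class_id -1.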
if top_class not in class2id_mapping:
predictions.append(
{
"class": top_class,
"class_id": -1,
"confidence": confidences.get(top_class, 0.0),
}
)
for class_name, class_id in class2id_mapping.items():
predictions.append(
{
"class": class_name,
"class_id": class_id,
"confidence": confidences.get(class_name, 0.0),
}
)
parsed_prediction = {
"image": {"width": width, "height": height},
"predictions": predictions,
"top": top_class,
"confidence": confidences[top_class],
"inference_id": inference_id,
"parent_id": image.parent_metadata.parent_id,
}
return {
"error_status": False,
"predictions": parsed_prediction,
"inference_id": inference_id,
}
except Exception as error:
logging.warning(
f"Could not parse multi-class classification results in `roboflow_core/vlm_as_classifier@v1` block. "
f"Error type: {error.__class__.__name__}. Details: {error}"
)
return {"error_status": True, "predictions": None, "inference_id": inference_id}


def parse_multi_label_classification_results(
image: WorkflowImageData,
results: dict,
classes: List[str],
inference_id: str,
) -> dict:
try:
class2id_mapping = create_classes_index(classes=classes)
height, width = image.numpy_image.shape[:2]
predicted_classes_confidences = {}
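        # Classes unknown to the provided list are mapped to class_id -1; duplicate entries keep their highest confidence.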
for prediction in results["predicted_classes"]:
if prediction["class"] not in class2id_mapping:
class2id_mapping[prediction["class"]] = -1
if prediction["class"] in predicted_classes_confidences:
old_confidence = predicted_classes_confidences[prediction["class"]]
new_confidence = scale_confidence(value=prediction["confidence"])
predicted_classes_confidences[prediction["class"]] = max(
old_confidence, new_confidence
)
else:
predicted_classes_confidences[prediction["class"]] = scale_confidence(
value=prediction["confidence"]
)
predictions = {
class_name: {
"confidence": predicted_classes_confidences.get(class_name, 0.0),
"class_id": class_id,
}
for class_name, class_id in class2id_mapping.items()
}
parsed_prediction = {
"image": {"width": width, "height": height},
"predictions": predictions,
"predicted_classes": list(predicted_classes_confidences.keys()),
"inference_id": inference_id,
"parent_id": image.parent_metadata.parent_id,
}
return {
"error_status": False,
"predictions": parsed_prediction,
"inference_id": inference_id,
}
except Exception as error:
logging.warning(
f"Could not parse multi-label classification results in `roboflow_core/vlm_as_classifier@v1` block. "
f"Error type: {error.__class__.__name__}. Details: {error}"
)
return {"error_status": True, "predictions": None, "inference_id": inference_id}


def create_classes_index(classes: List[str]) -> Dict[str, int]:
return {class_name: idx for idx, class_name in enumerate(classes)}


def scale_confidence(value: float) -> float:
return min(max(float(value), 0.0), 1.0)