diff --git a/preprocessors/object-detection-llm/object-detection-llm.py b/preprocessors/object-detection-llm/object-detection-llm.py
index b2930c8c..011a6e15 100644
--- a/preprocessors/object-detection-llm/object-detection-llm.py
+++ b/preprocessors/object-detection-llm/object-detection-llm.py
@@ -68,33 +68,30 @@ def normalize_bbox(bbox, width, height):
     ]


-def process_objects(objects, threshold):
+def process_objects(qwen_output, width, height, threshold):
     """
-    Process detected objects by filtering, transforming, and enriching them.
+    Transform Qwen object detection output to IMAGE schema format.

-    - Filters objects by confidence threshold
+    - Transforms from Qwen format (bbox_2d, label) to IMAGE format
+    - Normalizes bounding boxes to [0,1] range
+    - Assigns the confidence threshold to all objects
     - Normalizes labels (replaces underscores with spaces)
-    - Renumbers IDs sequentially
     - Calculates geometric properties (area, centroid)

     Args:
-        objects (list): List of detected objects with confidence scores
-        threshold (float): Minimum confidence score (0-1)
+        qwen_output (list): Qwen detection output with bbox_2d and label
+        width (int): Image width in pixels for normalization
+        height (int): Image height in pixels for normalization
+        threshold (float): Confidence score (0-1) assigned to every object

     Returns:
         list: Processed objects with computed properties
     """
     processed = []
-    for obj in objects:
-        if obj.get("confidence", 0) >= threshold:
-            obj['type'] = obj['type'].replace('_', ' ')
-            processed.append(obj)
-
-    # Renumber IDs sequentially after filtering
-    for idx, obj in enumerate(processed):
-        obj['ID'] = idx
-
-        x1, y1, x2, y2 = obj["dimensions"]
+    for idx, item in enumerate(qwen_output):
+        # Normalize bounding box to the [0,1] range
+        x1, y1, x2, y2 = normalize_bbox(item["bbox_2d"], width, height)

         # Calculate area (width * height)
         area = (x2 - x1) * (y2 - y1)
@@ -103,13 +100,20 @@ def process_objects(objects, threshold):
         centroid_x = (x1 + x2) / 2
         centroid_y = (y1 + y2) / 2

-        # Create object entry according to schema
-        obj["area"] = area
-        obj["centroid"] = [centroid_x, centroid_y]
+        # Create object entry according to IMAGE schema
+        obj = {
+            "ID": idx,
+            "type": item["label"].replace('_', ' '),
+            "dimensions": [x1, y1, x2, y2],
+            "confidence": threshold,
+            "area": area,
+            "centroid": [centroid_x, centroid_y]
+        }
+
+        processed.append(obj)

     logging.debug(
-        f"Processed {len(objects)} objects to {len(processed)} "
-        f"objects with confidence >= {threshold}"
+        f"Processed {len(qwen_output)} objects from Qwen output"
     )

     return processed
@@ -155,35 +159,42 @@ def detect_objects():
     if error:
         return jsonify(error), error["code"]

+    stop_tokens = [
+        "<|im_end|>",  # Qwen's end token
+        "<|endoftext|>",  # Alternative end token
+        "\n\n\n",  # Triple newline
+        "```",  # Code block end
+    ]
+
     try:
         # Get object info
-        object_json = llm_client.chat_completion(
+        qwen_output = llm_client.chat_completion(
             prompt=OBJECT_DETECTION_PROMPT,
             image_base64=base64_image,
             json_schema=BBOX_RESPONSE_SCHEMA,
-            temperature=0.0,
-            parse_json=True
+            temperature=0.5,
+            parse_json=True,
+            stop=stop_tokens
         )

-        if object_json is None or len(object_json.get("objects", [])) == 0:
+        logging.debug(f"Qwen output received: {qwen_output}")
+
+        if qwen_output is None or len(qwen_output) == 0:
             logging.error("Failed to extract objects from the graphic.")
             return jsonify({"error": "No objects extracted"}), 204

-        # Normalize bounding boxes
+        # Transform Qwen format to IMAGE schema format
         width, height = pil_image.size
-        for obj in object_json["objects"]:
-            # Normalize bounding boxes
-            obj["dimensions"] = normalize_bbox(
-                obj["dimensions"], width, height
-            )
-
-        # Filter objects by confidence threshold, add area and centroid,
-        # remove underscores from labels, and renumber IDs
-        object_json["objects"] = process_objects(
-            object_json["objects"],
+        processed_objects = process_objects(
+            qwen_output,
+            width,
+            height,
             CONF_THRESHOLD
         )

+        # Wrap in "objects" for schema compliance
+        object_json = {"objects": processed_objects}
+
         logging.pii(f"Normalized output: {object_json}")

         # Data schema validation
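To make the transformation concrete, here is a minimal standalone sketch of what the reworked `process_objects` does to a single Qwen detection. The body of `normalize_bbox` lies outside this diff, so the divide-by-image-size behaviour below is an assumption, and `0.5` merely stands in for `CONF_THRESHOLD`:

```python
# Sketch only: mirrors the new process_objects() on sample data.
def normalize_bbox(bbox, width, height):
    # Assumed behaviour; the real implementation is not part of this diff.
    x1, y1, x2, y2 = bbox
    return [x1 / width, y1 / height, x2 / width, y2 / height]

qwen_output = [{"bbox_2d": [120, 200, 300, 450], "label": "sports_car"}]
width, height = 640, 480  # pil_image.size in the real code
threshold = 0.5           # stand-in for CONF_THRESHOLD

processed = []
for idx, item in enumerate(qwen_output):
    x1, y1, x2, y2 = normalize_bbox(item["bbox_2d"], width, height)
    processed.append({
        "ID": idx,
        "type": item["label"].replace('_', ' '),   # -> "sports car"
        "dimensions": [x1, y1, x2, y2],
        "confidence": threshold,  # Qwen reports no per-object confidence
        "area": (x2 - x1) * (y2 - y1),             # area in normalized units
        "centroid": [(x1 + x2) / 2, (y1 + y2) / 2],
    })

print({"objects": processed})
# {'objects': [{'ID': 0, 'type': 'sports car', 'dimensions': [0.1875, ...], ...}]}
```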
obj["dimensions"] = normalize_bbox( - obj["dimensions"], width, height - ) - - # Filter objects by confidence threshold, add area and centroid, - # remove underscores from labels, and renumber IDs - object_json["objects"] = process_objects( - object_json["objects"], + processed_objects = process_objects( + qwen_output, + width, + height, CONF_THRESHOLD ) + # Wrap in "objects" for schema compliance + object_json = {"objects": processed_objects} + logging.pii(f"Normalized output: {object_json}") # Data schema validation diff --git a/preprocessors/object-detection-llm/object-detection.schema.json b/preprocessors/object-detection-llm/object-detection.schema.json index ed9a6d92..35133565 100644 --- a/preprocessors/object-detection-llm/object-detection.schema.json +++ b/preprocessors/object-detection-llm/object-detection.schema.json @@ -1,45 +1,23 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "type": "object", + "type": "array", "title": "Object Detection Data", - "description": "Detected object data with bounding boxes.", - "definitions": { - "object": { - "type": "object", - "title": "BoundingBoxItem", - "properties": { - "ID": { - "description": "A number identifying this object in the set.", - "type": "integer" - }, - "type": { - "description": "The type of object detected (e.g., 'person', 'car').", - "type": "string" - }, - "dimensions": { - "description": "Bounding box coordinates of this object [x1, y1, x2, y2].", - "type": "array", - "items": { "type": "number" }, - "minItems": 4, - "maxItems": 4, - "additionalItems": false - }, - "confidence": { - "description": "Confidence in the correctness of this object's data (0-1).", - "type": "number", - "minimum": 0, - "maximum": 1 - } + "description": "Detected object data with bounding boxes in Qwen format.", + "items": { + "type": "object", + "properties": { + "bbox_2d": { + "description": "Bounding box coordinates [x1, y1, x2, y2].", + "type": "array", + "items": { "type": "number" }, + "minItems": 4, + "maxItems": 4 }, - "required": ["ID", "type", "dimensions", "confidence"] - } - }, - "properties": { - "objects": { - "description": "The set of detected objects in the image.", - "type": "array", - "items": { "$ref": "#/definitions/object" } - } - }, - "required": ["objects"] + "label": { + "description": "The type of object detected (e.g., 'person', 'car').", + "type": "string" + } + }, + "required": ["bbox_2d", "label"] + } } \ No newline at end of file diff --git a/utils/llm/prompts.py b/utils/llm/prompts.py index ce686ce7..44f7be73 100644 --- a/utils/llm/prompts.py +++ b/utils/llm/prompts.py @@ -10,39 +10,35 @@ """ # Object detection OBJECT_DETECTION_PROMPT = """ -Give the bounding boxes for the objects found in this image. +Step 1: +Determine from 0 to 10 major and important objects in the image. +Focus ONLY on the objects that are clearly visible and identifiable. + +Step 2: +Give the bounding boxes for the objects determined in the first step. Output a only JSON list of bounding boxes where each entry contains: -- the unique numeric ID in the key "ID", -- the object label in the key "type", -- the pixel coordinates of a 2D bounding box in the key "dimensions", -- and the confidence score in the key "confidence". +- the pixel coordinates of a 2D bounding box in the key "bbox_2d", +- the object label in the key "label". 
diff --git a/utils/llm/prompts.py b/utils/llm/prompts.py
index ce686ce7..44f7be73 100644
--- a/utils/llm/prompts.py
+++ b/utils/llm/prompts.py
@@ -10,39 +10,35 @@
 """

 # Object detection
 OBJECT_DETECTION_PROMPT = """
-Give the bounding boxes for the objects found in this image.
+Step 1:
+Determine from 0 to 10 major and important objects in the image.
+Focus ONLY on the objects that are clearly visible and identifiable.
+
+Step 2:
+Give the bounding boxes for the objects determined in the first step.

-Output a only JSON list of bounding boxes where each entry contains:
-- the unique numeric ID in the key "ID",
-- the object label in the key "type",
-- the pixel coordinates of a 2D bounding box in the key "dimensions",
-- and the confidence score in the key "confidence".
+Output only a JSON list of bounding boxes where each entry contains:
+- the pixel coordinates of a 2D bounding box in the key "bbox_2d",
+- the object label in the key "label".

 Example:
 ```json
-{
-    "objects": [
+[
     {
-        "ID": 0,
-        "type": "car",
-        "dimensions": [120, 200, 300, 450],
-        "confidence": 0.92
+        "bbox_2d": [120, 200, 300, 450],
+        "label": "car"
     },
     {
-        "ID": 1,
-        "type": "person",
-        "dimensions": [50, 100, 120, 300],
-        "confidence": 0.95
+        "bbox_2d": [50, 100, 120, 300],
+        "label": "person"
     }
-    ]
-}
-
+]
 ```

 Ensure that the bounding boxes are in the format [x1, y1, x2, y2].

 Rules:
 1. Focus ONLY on the major and important objects in the image.
 2. The graphic can contain any number of objects, from zero to many.
-3. If no objects are detected, return an empty list: {"objects": []}.
+3. If no objects are detected, return an empty list: [].
 4. Use simple and common object labels (e.g., "car", "person", "tree").
 5. Include ONLY objects that are clearly visible and identifiable.
-6. Multiple objects can have the same confidence score.
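For reviewers unfamiliar with `stop` sequences: the backend cuts generation at the first occurrence of any listed string, so trailing chatter after the JSON list never reaches `parse_json`. A rough client-side illustration of the effect (the real truncation happens inside the inference server; how `llm_client` handles it is outside this diff):

```python
import json

STOP_TOKENS = ["<|im_end|>", "<|endoftext|>", "\n\n\n", "`" * 3]

def truncate_at_stop(text, stops=STOP_TOKENS):
    # Mimic server-side stopping: keep everything before the earliest stop.
    cut = len(text)
    for s in stops:
        i = text.find(s)
        if i != -1:
            cut = min(cut, i)
    return text[:cut]

raw = '[{"bbox_2d": [120, 200, 300, 450], "label": "car"}]\n\n\nHere I listed...'
print(json.loads(truncate_at_stop(raw)))
# [{'bbox_2d': [120, 200, 300, 450], 'label': 'car'}]
```

One consequence of stopping on the code-fence marker: a reply that opens with a fenced block is cut to an empty string, so the instruction to output only a JSON list (rather than echoing the fenced example) is what the model must actually follow.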