
Commit c3fb1b1

[SAM3 Video] Add support for multi prompts (#42293)
* add support for multi prompts + fix checkpoints in tests
* Make sure to apply heuristics per prompt group
* simplify NMS to probs
1 parent a1afeca commit c3fb1b1

7 files changed, +425 -87 lines changed

docs/source/en/model_doc/sam3_video.md

Lines changed: 33 additions & 0 deletions
@@ -97,6 +97,39 @@ Processed 51 frames
 >>> print(f"Masks shape: {frame_0_outputs['masks'].shape}")
 ```
 
+You can also track multiple object categories simultaneously by providing multiple prompts. The model efficiently reuses vision features across all prompts:
+
+```python
+>>> # Add multiple text prompts (or use a list in add_text_prompt)
+>>> multi_prompt_session = processor.init_video_session(
+...     video=video_frames,
+...     inference_device=device,
+...     processing_device="cpu",
+...     video_storage_device="cpu",
+...     dtype=torch.bfloat16,
+... )
+>>>
+>>> prompts = ["person", "bed", "lamp"]
+>>> processor.add_text_prompt(multi_prompt_session, prompts)
+>>>
+>>> # Process video - detects objects from ALL prompts in a single pass
+>>> multi_outputs_per_frame = {}
+>>> for model_outputs in model.propagate_in_video_iterator(
+...     inference_session=multi_prompt_session, max_frame_num_to_track=50
+... ):
+...     processed_outputs = processor.postprocess_outputs(multi_prompt_session, model_outputs)
+...     multi_outputs_per_frame[model_outputs.frame_idx] = processed_outputs
+>>>
+>>> # Check which objects were detected by each prompt
+>>> frame_0_outputs = multi_outputs_per_frame[0]
+>>> prompt_to_obj_ids = frame_0_outputs["prompt_to_obj_ids"]
+>>> for prompt, obj_ids in prompt_to_obj_ids.items():
+...     print(f"{prompt}: {len(obj_ids)} objects")
+person: 2 objects
+bed: 1 objects
+lamp: 1 objects
+```
+
 #### Streaming Video Inference
 
 <div class="warning">

src/transformers/models/sam3_video/modeling_sam3_video.py

Lines changed: 249 additions & 64 deletions
Large diffs are not rendered by default.
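
The modeling changes themselves are not rendered above. As a rough, hypothetical illustration of the "apply heuristics per prompt group" idea from the commit message (not the actual `modeling_sam3_video.py` implementation), a minimal sketch could partition detections by prompt ID and rank them within each group:

```python
# Hypothetical sketch only: rank detections by probability and keep the top-k
# independently within each prompt group. The real per-prompt heuristics in
# modeling_sam3_video.py are not shown in this view and may differ.
import torch


def keep_top_detections_per_prompt(probs, prompt_ids, max_per_prompt=5):
    """Return sorted indices of detections to keep, selected per prompt group."""
    keep = []
    prompt_ids_tensor = torch.tensor(prompt_ids, dtype=torch.long)
    for prompt_id in prompt_ids_tensor.unique(sorted=True):
        # Indices of detections that belong to this prompt
        indices = torch.nonzero(prompt_ids_tensor == prompt_id, as_tuple=True)[0]
        # Rank by probability within the group and keep the top-k
        order = torch.argsort(probs[indices], descending=True)
        keep.extend(indices[order[:max_per_prompt]].tolist())
    return sorted(keep)


probs = torch.tensor([0.9, 0.2, 0.8, 0.7])
print(keep_top_detections_per_prompt(probs, prompt_ids=[0, 0, 1, 1], max_per_prompt=1))  # [0, 2]
```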

src/transformers/models/sam3_video/processing_sam3_video.py

Lines changed: 78 additions & 17 deletions
@@ -36,11 +36,11 @@ class Sam3VideoProcessor(ProcessorMixin):
     [`~Sam3ImageProcessor.__call__`] and [`~Sam3VideoProcessor.__call__`] for more information.
 
     Args:
-        image_processor (`Sam2ImageProcessorFast`):
-            An instance of [`Sam2ImageProcessorFast`].
+        image_processor (`Sam3ImageProcessorFast`):
+            An instance of [`Sam3ImageProcessorFast`].
         video_processor (`Sam2VideoVideoProcessor`):
             An instance of [`Sam2VideoVideoProcessor`].
-        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
+        tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
             An instance of [`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]. The tokenizer is a required input.
         target_size (`int`, *optional*):
             The target size (target_size, target_size) to which the image will be resized.
@@ -109,16 +109,36 @@ def __call__(
 
         return encoding_image_processor
 
-    def add_text_prompt(self, inference_session, text):
+    def add_text_prompt(self, inference_session: Sam3VideoInferenceSession, text: Union[str, list[str]]):
         """
-        Add text prompt to the inference session.
+        Add text prompt(s) to the inference session.
+
+        Args:
+            inference_session (`Sam3VideoInferenceSession`): The inference session.
+            text (`str` or `list[str]`): The text prompt(s) to add.
+
+        Returns:
+            `Sam3VideoInferenceSession`: The inference session with the added text prompt(s).
         """
-        encoded_text = self.tokenizer(text, return_tensors="pt", padding="max_length", max_length=32).to(
-            inference_session.inference_device
-        )
-        inference_session.text_attention_mask = encoded_text.attention_mask
-        inference_session.text_input_ids = encoded_text.input_ids
-        inference_session.has_new_text_input = True
+        if isinstance(text, str):
+            text = [text]
+
+        prompt_ids = []
+        for prompt_text in text:
+            # Add prompt and get its ID (reuses existing if duplicate)
+            prompt_id = inference_session.add_prompt(prompt_text)
+
+            # Only encode if this is a new prompt (not already in prompt_input_ids)
+            if prompt_id not in inference_session.prompt_input_ids:
+                encoded_text = self.tokenizer(
+                    prompt_text, return_tensors="pt", padding="max_length", max_length=32
+                ).to(inference_session.inference_device)
+
+                inference_session.prompt_input_ids[prompt_id] = encoded_text.input_ids
+                inference_session.prompt_attention_masks[prompt_id] = encoded_text.attention_mask
+
+            prompt_ids.append(prompt_id)
+
         return inference_session
 
     def init_video_session(
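
A quick usage sketch based on the hunk above: `add_text_prompt` now accepts a single string or a list, and because `inference_session.add_prompt` returns the same ID for a repeated prompt, duplicates are tokenized only once. Assuming a fresh `inference_session` created as in the documentation example earlier:

```python
>>> # Sketch based on the diff above; inference_session comes from init_video_session
>>> session = processor.add_text_prompt(inference_session, ["person", "bed", "person"])
>>> # "person" maps to a single prompt ID, so only two encodings are stored
>>> len(session.prompt_input_ids)
2
```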
@@ -194,20 +214,46 @@ def _apply_non_overlapping_constraints(self, pred_masks):
         pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
         return pred_masks
 
-    def _apply_object_wise_non_overlapping_constraints(self, pred_masks, obj_scores, background_value=-10.0):
+    def _apply_object_wise_non_overlapping_constraints(
+        self,
+        pred_masks,
+        obj_scores,
+        background_value=-10.0,
+        prompt_ids=None,
+    ):
         """
-        Applies non-overlapping constraints object wise (i.e. only one object can claim the overlapping region)
+        Applies non-overlapping constraints object wise (i.e. only one object can claim the overlapping region).
+        Constraints are enforced independently for each prompt group when `prompt_ids` are provided.
         """
+        if prompt_ids is None:
+            return self._apply_object_wise_non_overlapping_constraints_impl(pred_masks, obj_scores, background_value)
+
+        if len(prompt_ids) != pred_masks.size(0):
+            raise ValueError("prompt_ids must have the same length as pred_masks")
+
+        pred_masks_grouped = pred_masks.clone()
+        prompt_ids_tensor = torch.tensor(prompt_ids, device=pred_masks.device, dtype=torch.long)
+        for prompt_id in prompt_ids_tensor.unique(sorted=True):
+            indices = torch.nonzero(prompt_ids_tensor == prompt_id, as_tuple=True)[0]
+            if indices.numel() == 0:
+                continue
+            prompt_masks = self._apply_object_wise_non_overlapping_constraints_impl(
+                pred_masks_grouped[indices],
+                obj_scores[indices],
+                background_value,
+            )
+            pred_masks_grouped[indices] = prompt_masks.to(pred_masks_grouped.dtype)
+        return pred_masks_grouped
+
+    def _apply_object_wise_non_overlapping_constraints_impl(self, pred_masks, obj_scores, background_value=-10.0):
         pred_masks_single_score = torch.where(pred_masks > 0, obj_scores[..., None, None], background_value)
-        # Apply pixel-wise non-overlapping constraint based on mask scores
         pixel_level_non_overlapping_masks = self._apply_non_overlapping_constraints(pred_masks_single_score)
-        # Replace object scores with pixel scores. Note, that now only one object can claim the overlapping region
         pred_masks = torch.where(
             pixel_level_non_overlapping_masks > 0,
             pred_masks,
             torch.clamp(pred_masks, max=background_value),
         )
-        return pred_masks
+        return pred_masks.to(pred_masks_single_score.dtype)
 
     def postprocess_outputs(
         self,
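
To make the grouping concrete, here is a standalone toy example of the same partitioning pattern (hypothetical tensors, not the processor API): with `prompt_ids = [0, 0, 1]`, a contested pixel is resolved only between the first two objects, while the object from the other prompt keeps its claim.

```python
# Toy illustration of per-prompt-group overlap resolution: within a group,
# the highest-scoring object keeps contested pixels; other groups are untouched.
import torch

obj_scores = torch.tensor([0.9, 0.4, 0.7])     # objects 0 and 1 share prompt 0, object 2 is prompt 1
prompt_ids = torch.tensor([0, 0, 1])
masks = torch.ones(3, 2, 2, dtype=torch.bool)  # all three masks claim every pixel

resolved = masks.clone()
for prompt_id in prompt_ids.unique(sorted=True):
    idx = torch.nonzero(prompt_ids == prompt_id, as_tuple=True)[0]
    winner = idx[obj_scores[idx].argmax()]     # winner-takes-all inside the group
    for i in idx:
        if i != winner:
            resolved[i] = False

print(resolved[:, 0, 0])  # tensor([ True, False,  True]): object 1 loses only to object 0
```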
@@ -235,6 +281,8 @@ def postprocess_outputs(
                 (top_left_x, top_left_y, bottom_right_x, bottom_right_y).
             - **masks** (`torch.Tensor` of shape `(num_objects, height, width)`): Binary segmentation masks
                 for each object at the original video resolution.
+            - **prompt_to_obj_ids** (`dict[str, list[int]]`): Mapping from prompt text to list of
+                object IDs detected by that prompt.
         """
         obj_id_to_mask = model_outputs["obj_id_to_mask"]  # low res masks (1, H_low, W_low)
         curr_obj_ids = sorted(obj_id_to_mask.keys())
@@ -301,22 +349,35 @@ def postprocess_outputs(
 
         out_boxes_xyxy = masks_to_boxes(out_binary_masks)
 
-        # apply non-overlapping constraints on the existing masklets
+        # Apply non-overlapping constraints on the existing masklets.
+        # Constraints are enforced independently per prompt group.
         if out_binary_masks.shape[0] > 1:
             assert len(out_binary_masks) == len(out_tracker_probs)
+            prompt_ids_filtered = [
+                inference_session.obj_id_to_prompt_id[int(obj_id)] for obj_id in out_obj_ids.tolist()
+            ]
             out_binary_masks = (
                 self._apply_object_wise_non_overlapping_constraints(
                     out_binary_masks.unsqueeze(1),
                     out_tracker_probs.unsqueeze(1).to(out_binary_masks.device),
                     background_value=0,
+                    prompt_ids=prompt_ids_filtered,
                 ).squeeze(1)
             ) > 0
 
+        # Build prompt_to_obj_ids mapping: group object IDs by their associated prompt text.
+        prompt_to_obj_ids = {}
+        for obj_id in out_obj_ids.tolist():
+            prompt_id = inference_session.obj_id_to_prompt_id[obj_id]
+            prompt_text = inference_session.prompts[prompt_id]
+            prompt_to_obj_ids.setdefault(prompt_text, []).append(obj_id)
+
         outputs = {
             "object_ids": out_obj_ids,
             "scores": out_probs,
             "boxes": out_boxes_xyxy,
             "masks": out_binary_masks,
+            "prompt_to_obj_ids": prompt_to_obj_ids,
         }
         return outputs

tests/models/sam3/test_modeling_sam3.py

Lines changed: 1 addition & 1 deletion
@@ -987,7 +987,7 @@ class Sam3ModelIntegrationTest(unittest.TestCase):
 
     def setUp(self):
         super().setUp()
-        model_name = "../sam3-hf-v4-video-full"
+        model_name = "facebook/sam3"
         self.model = Sam3Model.from_pretrained(model_name).to(torch.float32)
         self.processor = Sam3Processor.from_pretrained(model_name)
         self.model.to(torch_device)

tests/models/sam3_tracker/test_modeling_sam3_tracker.py

Lines changed: 2 additions & 2 deletions
@@ -510,7 +510,7 @@ def prepare_video():
 class Sam3TrackerModelIntegrationTest(unittest.TestCase):
     def setUp(self):
         super().setUp()
-        checkpoint_path = "../sam3-hf-v4-video-full"
+        checkpoint_path = "facebook/sam3"
         self.model = Sam3TrackerModel.from_pretrained(checkpoint_path).to(torch.float32)
         self.processor = Sam3TrackerProcessor.from_pretrained(checkpoint_path)
         self.model.to(torch_device)
@@ -817,7 +817,7 @@ def test_inference_mask_generation_from_existing_points_and_mask(self):
         )
 
     def test_dummy_pipeline_generation(self):
-        generator = pipeline("mask-generation", model="../sam3-hf-v4-video-full", device=torch_device)
+        generator = pipeline("mask-generation", model="facebook/sam3", device=torch_device)
         raw_image = prepare_image()
 
         _ = generator(raw_image, points_per_batch=64)

tests/models/sam3_tracker_video/test_modeling_sam3_tracker_video.py

Lines changed: 2 additions & 2 deletions
@@ -66,8 +66,8 @@ def prepare_video():
 class Sam3TrackerVideoModelIntegrationTest(unittest.TestCase):
     def setUp(self):
         super().setUp()
-        self.video_model = Sam3TrackerVideoModel.from_pretrained("../sam3-hf-v4-video-full").to(torch.float32)
-        self.processor = Sam3TrackerVideoProcessor.from_pretrained("../sam3-hf-v4-video-full")
+        self.video_model = Sam3TrackerVideoModel.from_pretrained("facebook/sam3").to(torch.float32)
+        self.processor = Sam3TrackerVideoProcessor.from_pretrained("facebook/sam3")
         self.video_model.to(torch_device)
         self.video_model.eval()

tests/models/sam3_video/test_modeling_sam3_video.py

Lines changed: 60 additions & 1 deletion
@@ -42,7 +42,7 @@ def prepare_video():
 class Sam3VideoModelIntegrationTest(unittest.TestCase):
     def setUp(self):
         super().setUp()
-        checkpoint_path = "../sam3-hf-v4-video-full"
+        checkpoint_path = "facebook/sam3"
         self.video_model = Sam3VideoModel.from_pretrained(checkpoint_path).to(torch.float32)
         self.processor = Sam3VideoProcessor.from_pretrained(checkpoint_path)
         self.video_model.to(torch_device)
@@ -473,3 +473,62 @@ def test_inference_video_streaming_with_text_prompt(self):
             atol=5e-3,  # Higher tolerance for raw logits
             rtol=5e-3,
         )
+
+    def test_inference_video_multi_prompt(self):
+        """Test multi-prompt tracking - detecting multiple object categories in one pass."""
+        raw_video = prepare_video()
+        inference_session = self.processor.init_video_session(
+            video=raw_video,
+            inference_device=torch_device,
+            processing_device="cpu",
+            video_storage_device="cpu",
+        )
+
+        # Add multiple text prompts
+        prompts = ["person", "bed"]
+        self.processor.add_text_prompt(
+            inference_session=inference_session,
+            text=prompts,
+        )
+
+        # Propagate through video frames
+        outputs_per_frame = {}
+        for model_outputs in self.video_model.propagate_in_video_iterator(
+            inference_session=inference_session,
+            max_frame_num_to_track=3,
+        ):
+            processed_outputs = self.processor.postprocess_outputs(inference_session, model_outputs)
+            outputs_per_frame[model_outputs.frame_idx] = processed_outputs
+
+        # Check we processed the expected number of frames
+        self.assertGreaterEqual(len(outputs_per_frame), 1)
+        self.assertLessEqual(len(outputs_per_frame), 4)
+
+        # Check output structure for each frame
+        for processed_outputs in outputs_per_frame.values():
+            self.assertIn("object_ids", processed_outputs)
+            self.assertIn("scores", processed_outputs)
+            self.assertIn("boxes", processed_outputs)
+            self.assertIn("masks", processed_outputs)
+            self.assertIn("prompt_to_obj_ids", processed_outputs)  # Multi-prompt specific
+
+            # Check prompt_to_obj_ids structure
+            prompt_to_obj_ids = processed_outputs["prompt_to_obj_ids"]
+            self.assertIsInstance(prompt_to_obj_ids, dict)
+            for prompt, obj_ids in prompt_to_obj_ids.items():
+                self.assertIsInstance(prompt, str)
+                self.assertIsInstance(obj_ids, list)
+                # Each object ID should be in the main object_ids list
+                for obj_id in obj_ids:
+                    self.assertIn(obj_id, processed_outputs["object_ids"].tolist())
+
+        # Check that we detected objects from multiple prompts
+        first_frame_outputs = outputs_per_frame[min(outputs_per_frame.keys())]
+        prompt_to_obj_ids = first_frame_outputs["prompt_to_obj_ids"]
+
+        # Should have at least one prompt with detections
+        self.assertGreater(len(prompt_to_obj_ids), 0)
+
+        # All prompts in prompt_to_obj_ids should be from our original prompts
+        for prompt in prompt_to_obj_ids.keys():
+            self.assertIn(prompt, prompts)
