28 changes: 24 additions & 4 deletions flashinfer/green_ctx.py
@@ -28,7 +28,7 @@
 ) from e

 from .cuda_utils import checkCudaErrors
-from .utils import get_compute_capability, round_up
+from .utils import get_compute_capability, get_device_sm_count, round_up


 def get_sm_count_constraint(major: int, minor: int) -> Tuple[int, int]:
@@ -170,6 +170,18 @@ def split_device_green_ctx(
         RuntimeError: when requested SM allocation exceeds device capacity:
             ``num_groups * rounded_min_count > total_device_sms``
     """
+    # Check if device has enough SMs
+    min_sm, alignment = get_sm_count_constraint(*get_compute_capability(dev))
+    rounded_min = round_up(max(min_count, min_sm), alignment)
+    required_sms = num_groups * rounded_min
+    available_sms = get_device_sm_count(dev)
+
+    if required_sms > available_sms:
+        raise RuntimeError(
+            f"Insufficient SMs: requested {num_groups} groups with {rounded_min} SMs each "
+            f"(total: {required_sms} SMs), but device only has {available_sms} SMs available"
+        )
+
     cu_dev = get_cudevice(dev)
     resource = get_device_resource(cu_dev)
     results, remaining = split_resource(resource, num_groups, min_count)
@@ -246,14 +258,22 @@ def split_device_green_ctx_by_sm_count(

     # Round sm counts to meet the alignment and granularity requirements
     rounded_sm_counts = []
+    min_sm_count, sm_alignment = get_sm_count_constraint(*get_compute_capability(dev))
     for sm_count in sm_counts:
-        min_sm_count, sm_alignment = get_sm_count_constraint(
-            *get_compute_capability(dev)
-        )
         if sm_count <= 0:
             raise ValueError(f"SM count must be positive, got {sm_count}")
         rounded_sm_counts.append(round_up(max(sm_count, min_sm_count), sm_alignment))

+    # Check if device has enough SMs
+    required_sms = sum(rounded_sm_counts)
+    available_sms = get_device_sm_count(dev)
+
+    if required_sms > available_sms:
+        raise RuntimeError(
+            f"Insufficient SMs: requested {rounded_sm_counts} SMs "
+            f"(total: {required_sms} SMs), but device only has {available_sms} SMs available"
+        )
+
     # Split the device into multiple green contexts
     results, remaining = split_resource_by_sm_count(cu_dev, resource, rounded_sm_counts)
     resources = results + [remaining]
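For reviewers, a minimal caller-side sketch of the same pre-check that the two functions above now perform internally. Import paths are assumed from this diff, and the device/group numbers are arbitrary illustrations, not values from the PR:

    import torch
    from flashinfer import green_ctx
    from flashinfer.green_ctx import get_sm_count_constraint
    from flashinfer.utils import get_compute_capability, get_device_sm_count, round_up

    dev = torch.device("cuda:0")
    num_groups, min_count = 4, 20  # arbitrary example request

    # Round the per-group request up to the device's minimum/alignment, then
    # compare the total against the device SM count, as the new check does.
    min_sm, alignment = get_sm_count_constraint(*get_compute_capability(dev))
    rounded_min = round_up(max(min_count, min_sm), alignment)
    if num_groups * rounded_min > get_device_sm_count(dev):
        print("Requested split exceeds device capacity; reduce num_groups or min_count")
    else:
        streams, resources = green_ctx.split_device_green_ctx(dev, num_groups, min_count)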
56 changes: 41 additions & 15 deletions tests/utils/test_green_ctx.py
@@ -12,9 +12,15 @@ def test_green_ctx_creation(
     num_groups: int,
     min_count: int,
 ):
-    streams, resources = green_ctx.split_device_green_ctx(
-        torch.device(device), num_groups, min_count
-    )
+    dev = torch.device(device)
+    try:
+        streams, resources = green_ctx.split_device_green_ctx(
+            dev, num_groups, min_count
+        )
+    except RuntimeError as e:
+        if "Insufficient SMs" in str(e):
+            pytest.skip(str(e))
+        raise

     assert len(resources) == num_groups + 1
     for resource in resources[:-1]:
@@ -30,9 +36,14 @@ def test_green_ctx_kernel_execution(
     num_groups: int,
     min_count: int,
 ):
-    streams, resources = green_ctx.split_device_green_ctx(
-        torch.device(device), num_groups, min_count
-    )
+    try:
+        streams, resources = green_ctx.split_device_green_ctx(
+            torch.device(device), num_groups, min_count
+        )
+    except RuntimeError as e:
+        if "Insufficient SMs" in str(e):
+            pytest.skip(str(e))
+        raise
     num_partitions = num_groups + 1
     assert len(streams) == num_partitions
     assert len(resources) == num_partitions
@@ -59,9 +70,14 @@ def test_split_device_green_ctx_by_sm_count_creation(
     device: str,
     sm_counts: list,
 ):
-    streams, resources = green_ctx.split_device_green_ctx_by_sm_count(
-        torch.device(device), sm_counts
-    )
+    try:
+        streams, resources = green_ctx.split_device_green_ctx_by_sm_count(
+            torch.device(device), sm_counts
+        )
+    except RuntimeError as e:
+        if "Insufficient SMs" in str(e):
+            pytest.skip(str(e))
+        raise
     num_partitions = len(sm_counts) + 1
     assert len(resources) == num_partitions
     assert len(streams) == num_partitions
@@ -85,9 +101,14 @@ def test_split_device_green_ctx_by_sm_count_kernel_execution(
     device: str,
     sm_counts: list,
 ):
-    streams, resources = green_ctx.split_device_green_ctx_by_sm_count(
-        torch.device(device), sm_counts
-    )
+    try:
+        streams, resources = green_ctx.split_device_green_ctx_by_sm_count(
+            torch.device(device), sm_counts
+        )
+    except RuntimeError as e:
+        if "Insufficient SMs" in str(e):
+            pytest.skip(str(e))
+        raise
     num_partitions = len(sm_counts) + 1
     assert len(streams) == num_partitions
     assert len(resources) == num_partitions
@@ -113,9 +134,14 @@ def test_split_device_green_ctx_by_sm_count_alignment(
     device: str,
     sm_counts: list,
 ):
-    _, resources = green_ctx.split_device_green_ctx_by_sm_count(
-        torch.device(device), sm_counts
-    )
+    try:
+        _, resources = green_ctx.split_device_green_ctx_by_sm_count(
+            torch.device(device), sm_counts
+        )
+    except RuntimeError as e:
+        if "Insufficient SMs" in str(e):
+            pytest.skip(str(e))
+        raise

     for resource in resources[:-1]:  # Exclude remaining SMs
         sm_count = resource.sm.smCount
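Not part of this diff, but since the same try/except/skip block now appears in all five tests above, a hypothetical shared helper could keep them in sync (the helper name is invented for illustration; it assumes pytest is already imported in the test module):

    def _split_or_skip(split_fn, *args):
        """Hypothetical helper: run a green-context split, skipping the test
        when the device does not have enough SMs for the requested layout."""
        try:
            return split_fn(*args)
        except RuntimeError as e:
            if "Insufficient SMs" in str(e):
                pytest.skip(str(e))
            raise

    # e.g. in test_green_ctx_creation:
    # streams, resources = _split_or_skip(
    #     green_ctx.split_device_green_ctx, torch.device(device), num_groups, min_count
    # )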
6 changes: 5 additions & 1 deletion tests/utils/test_jit_example.py
@@ -11,7 +11,7 @@
     gen_customize_single_prefill_module,
 )
 from flashinfer.prefill import single_prefill_with_kv_cache_with_jit_module
-from flashinfer.utils import MaskMode, is_sm90a_supported
+from flashinfer.utils import MaskMode, is_sm90a_supported, get_compute_capability


 def test_single_decode_mask():
@@ -166,6 +166,10 @@ def test_flash_sigmoid():
     torch.testing.assert_close(o, o_ref, rtol=2e-2, atol=2e-2)


+@pytest.mark.xfail(
+    get_compute_capability(torch.device("cuda:0")) == (12, 1),
+    reason="Numerical accuracy issue on SM 121 (Spark)",
+)
 def test_dump_logits():
     torch.manual_seed(42)
     variant_decl = r"""
2 changes: 1 addition & 1 deletion tests/utils/test_sampling.py
@@ -72,7 +72,7 @@ def test_softmax(

     probs_ref = torch.softmax(logits_scaled, dim=-1)

-    assert torch.allclose(probs, probs_ref, atol=1e-5)
+    assert torch.allclose(probs, probs_ref, rtol=1e-5, atol=1e-5)
bkryu (Collaborator) commented on Oct 25, 2025:
I cannot seem to repro the fix in Spark. It also seems like allclose has a default rtol=1e-5, so this may not even effectively make any change.

In fact, in my local env (cu130 container), when I change the tolerance and inject print statements as

    probs_ref = torch.softmax(logits_scaled, dim=-1)
    print(f"{torch.isnan(probs).sum().item() = }")
    print(f"{torch.isnan(probs_ref).sum().item() =}")
    assert torch.allclose(probs, probs_ref, rtol=100, atol=100)

I am seeing nans.

(py312) root@c661e6d696f6:/flashinfer# pytest tests/utils/test_sampling.py -x -s
=================================================================================================================================================== test session starts ===================================================================================================================================================
platform linux -- Python 3.12.11, pytest-8.4.2, pluggy-1.6.0
rootdir: /flashinfer
configfile: pytest.ini
collected 900 items                                                                                                                                                                                                                                                                                                       

tests/utils/test_sampling.py torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
.torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
.torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
.torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
.torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
.torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
.torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
.torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
torch.isnan(probs).sum().item() = 0
torch.isnan(probs_ref).sum().item() =0
.torch.isnan(probs).sum().item() = 4873728
torch.isnan(probs_ref).sum().item() =0
F

======================================================================================================================================================== FAILURES =========================================================================================================================================================
____________________________________________________________________________________________________________________________ test_softmax[True-True-1.0-normal_distribution(std=1)-128256-989] ____________________________________________________________________________________________________________________________
...
>       assert torch.allclose(probs, probs_ref, rtol=100, atol=100)
E       AssertionError: assert False
E        +  where False = <built-in method allclose of type object at 0x16bc850>(tensor([[0.0000e+00, 7.8481e-05, 0.0000e+00,  ..., 9.0452e-06, 8.5036e-06,\n         0.0000e+00],\n        [2.4505e-05, ...05],\n        [0.0000e+00, 0.0000e+00, 7.0366e-06,  ..., 0.0000e+00, 7.1824e-06,\n         2.0367e-06]], device='cuda:0'), tensor([[0.0000e+00, 7.8481e-05, 0.0000e+00,  ..., 9.0452e-06, 8.5036e-06,\n         0.0000e+00],\n        [2.4505e-05, ...05],\n        [0.0000e+00, 0.0000e+00, 7.0366e-06,  ..., 0.0000e+00, 7.1824e-06,\n         2.0367e-06]], device='cuda:0'), rtol=100, atol=100)
E        +    where <built-in method allclose of type object at 0x16bc850> = torch.allclose

tests/utils/test_sampling.py:76: AssertionError

...
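To illustrate the point in the comment above, a small standalone snippet (not from the PR): torch.allclose already defaults to rtol=1e-05, atol=1e-08, so spelling out rtol=1e-5 changes nothing, and NaNs fail the comparison at any tolerance unless equal_nan=True is passed.

    import torch

    a = torch.tensor([1.0, 2.0])
    b = a.clone()

    # Passing rtol=1e-5 explicitly just restates the default, so both calls agree.
    assert torch.allclose(a, b, atol=1e-5) == torch.allclose(a, b, rtol=1e-5, atol=1e-5)

    # NaNs compare unequal at any tolerance unless equal_nan=True.
    x = torch.tensor([float("nan")])
    assert not torch.allclose(x, x, rtol=100, atol=100)
    assert torch.allclose(x, x, equal_nan=True)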



@pytest.mark.parametrize("vocab_size", [111, 32000, 128256])