diff --git a/src/supervision/detection/core.py b/src/supervision/detection/core.py index 30baca78b..3949ec3fc 100644 --- a/src/supervision/detection/core.py +++ b/src/supervision/detection/core.py @@ -977,6 +977,7 @@ def from_lmm( | Qwen2.5-VL | `QWEN_2_5_VL` | detection | `resolution_wh`, `input_wh` | `classes` | | Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` | | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | + | Google Gemini 3.5 | `GOOGLE_GEMINI_3_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | | DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` | @@ -1425,6 +1426,7 @@ def from_lmm( LMM.DEEPSEEK_VL_2: VLM.DEEPSEEK_VL_2, LMM.GOOGLE_GEMINI_2_0: VLM.GOOGLE_GEMINI_2_0, LMM.GOOGLE_GEMINI_2_5: VLM.GOOGLE_GEMINI_2_5, + LMM.GOOGLE_GEMINI_3_5: VLM.GOOGLE_GEMINI_3_5, } if isinstance(lmm, LMM): @@ -1464,6 +1466,7 @@ def from_vlm( | Qwen3-VL | `QWEN_3_VL` | detection | `resolution_wh`, | `classes` | | Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` | | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | + | Google Gemini 3.5 | `GOOGLE_GEMINI_3_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | | DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` | @@ -1931,7 +1934,7 @@ def from_vlm( xyxy = from_moondream(result, **kwargs) return cls(xyxy=xyxy) - if vlm == VLM.GOOGLE_GEMINI_2_5: + if vlm in (VLM.GOOGLE_GEMINI_2_5, VLM.GOOGLE_GEMINI_3_5): assert isinstance(result, str) gemini_result = from_google_gemini_2_5(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: gemini_result[2]} diff --git a/src/supervision/detection/vlm.py b/src/supervision/detection/vlm.py index de53548d0..f655f73b4 100644 --- a/src/supervision/detection/vlm.py 
+++ b/src/supervision/detection/vlm.py @@ -33,6 +33,7 @@ class LMM(Enum): QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba. GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model. GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model. + GOOGLE_GEMINI_3_5: Google Gemini 3.5 vision-language model. MOONDREAM: The Moondream vision-language model. """ @@ -43,6 +44,7 @@ class LMM(Enum): DEEPSEEK_VL_2 = "deepseek_vl_2" GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" + GOOGLE_GEMINI_3_5 = "gemini_3_5" MOONDREAM = "moondream" @classmethod @@ -80,6 +82,7 @@ class VLM(Enum): QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba. GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model. GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model. + GOOGLE_GEMINI_3_5: Google Gemini 3.5 vision-language model. MOONDREAM: The Moondream vision-language model. """ @@ -90,6 +93,7 @@ class VLM(Enum): DEEPSEEK_VL_2 = "deepseek_vl_2" GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" + GOOGLE_GEMINI_3_5 = "gemini_3_5" MOONDREAM = "moondream" @classmethod @@ -120,6 +124,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.DEEPSEEK_VL_2: str, VLM.GOOGLE_GEMINI_2_0: str, VLM.GOOGLE_GEMINI_2_5: str, + VLM.GOOGLE_GEMINI_3_5: str, VLM.MOONDREAM: dict, } @@ -131,6 +136,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.DEEPSEEK_VL_2: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"], + VLM.GOOGLE_GEMINI_3_5: ["resolution_wh"], VLM.MOONDREAM: ["resolution_wh"], } @@ -142,6 +148,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"], + VLM.GOOGLE_GEMINI_3_5: ["resolution_wh", "classes"], VLM.MOONDREAM: ["resolution_wh"], } diff --git a/tests/detection/test_vlm.py b/tests/detection/test_vlm.py index b5d035b06..98497b7d4 100644 --- 
a/tests/detection/test_vlm.py
+++ b/tests/detection/test_vlm.py
@@ -1297,3 +1297,70 @@ def test_from_deepseek_vl_2(
         detections.data[CLASS_NAME_DATA_FIELD],
         expected_detections.data[CLASS_NAME_DATA_FIELD],
     )
+
+
+@pytest.mark.parametrize(
+    ("result", "resolution_wh", "classes"),
+    [
+        ("random text", (1000, 1000), None),
+        ("```json\n[]\n```", (1000, 1000), None),
+        (
+            """```json
+            [
+                {"box_2d": [100, 200, 300, 400], "label": "cat", "confidence": 0.8}
+            ]
+            ```""",
+            (1000, 500),
+            None,
+        ),
+        (
+            """```json
+            [
+                {"box_2d": [10, 20, 110, 120], "label": "cat", "confidence": 0.8},
+                {"box_2d": [50, 100, 150, 200], "label": "dog", "confidence": 0.9}
+            ]
+            ```""",
+            (640, 480),
+            ["cat", "dog"],
+        ),
+    ],
+)
+def test_from_google_gemini_3_5_matches_2_5(
+    result: str,
+    resolution_wh: tuple[int, int],
+    classes: list[str] | None,
+):
+    """Check that `VLM.GOOGLE_GEMINI_3_5` parses results exactly like 2.5."""
+    detections_2_5 = Detections.from_vlm(
+        vlm=VLM.GOOGLE_GEMINI_2_5,
+        result=result,
+        resolution_wh=resolution_wh,
+        classes=classes,
+    )
+    detections_3_5 = Detections.from_vlm(
+        vlm=VLM.GOOGLE_GEMINI_3_5,
+        result=result,
+        resolution_wh=resolution_wh,
+        classes=classes,
+    )
+
+    assert len(detections_2_5) == len(detections_3_5)
+
+    # Nothing was parsed, so there are no per-detection fields to compare.
+    if len(detections_2_5) == 0:
+        return
+
+    assert np.allclose(detections_2_5.xyxy, detections_3_5.xyxy)
+    assert np.array_equal(detections_2_5.class_id, detections_3_5.class_id)
+    assert np.array_equal(
+        detections_2_5.data[CLASS_NAME_DATA_FIELD],
+        detections_3_5.data[CLASS_NAME_DATA_FIELD],
+    )
+
+    confidence_2_5 = detections_2_5.confidence
+    confidence_3_5 = detections_3_5.confidence
+    # Confidence must be present on both results or on neither; checking only
+    # the 2.5 side would let a 3.5-only confidence array slip through.
+    assert (confidence_2_5 is None) == (confidence_3_5 is None)
+    if confidence_2_5 is not None:
+        assert np.allclose(confidence_2_5, confidence_3_5)