Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/supervision/detection/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,7 @@ def from_lmm(
| Qwen2.5-VL | `QWEN_2_5_VL` | detection | `resolution_wh`, `input_wh` | `classes` |
| Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` |
| Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` |
| Google Gemini 3.5 | `GOOGLE_GEMINI_3_5` | detection, segmentation | `resolution_wh` | `classes` |
| Moondream | `MOONDREAM` | detection | `resolution_wh` | |
| DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` |

Expand Down Expand Up @@ -1425,6 +1426,7 @@ def from_lmm(
LMM.DEEPSEEK_VL_2: VLM.DEEPSEEK_VL_2,
LMM.GOOGLE_GEMINI_2_0: VLM.GOOGLE_GEMINI_2_0,
LMM.GOOGLE_GEMINI_2_5: VLM.GOOGLE_GEMINI_2_5,
LMM.GOOGLE_GEMINI_3_5: VLM.GOOGLE_GEMINI_3_5,
}

if isinstance(lmm, LMM):
Expand Down Expand Up @@ -1464,6 +1466,7 @@ def from_vlm(
| Qwen3-VL | `QWEN_3_VL` | detection | `resolution_wh` | `classes` |
| Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` |
| Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` |
| Google Gemini 3.5 | `GOOGLE_GEMINI_3_5` | detection, segmentation | `resolution_wh` | `classes` |
| Moondream | `MOONDREAM` | detection | `resolution_wh` | |
| DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` |

Expand Down Expand Up @@ -1931,7 +1934,7 @@ def from_vlm(
xyxy = from_moondream(result, **kwargs)
return cls(xyxy=xyxy)

if vlm == VLM.GOOGLE_GEMINI_2_5:
if vlm in (VLM.GOOGLE_GEMINI_2_5, VLM.GOOGLE_GEMINI_3_5):
assert isinstance(result, str)
gemini_result = from_google_gemini_2_5(result, **kwargs)
data = {CLASS_NAME_DATA_FIELD: gemini_result[2]}
Expand Down
7 changes: 7 additions & 0 deletions src/supervision/detection/vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class LMM(Enum):
QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
GOOGLE_GEMINI_3_5: Google Gemini 3.5 vision-language model.
MOONDREAM: The Moondream vision-language model.
"""

Expand All @@ -43,6 +44,7 @@ class LMM(Enum):
DEEPSEEK_VL_2 = "deepseek_vl_2"
GOOGLE_GEMINI_2_0 = "gemini_2_0"
GOOGLE_GEMINI_2_5 = "gemini_2_5"
GOOGLE_GEMINI_3_5 = "gemini_3_5"
MOONDREAM = "moondream"

@classmethod
Expand Down Expand Up @@ -80,6 +82,7 @@ class VLM(Enum):
QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
GOOGLE_GEMINI_3_5: Google Gemini 3.5 vision-language model.
MOONDREAM: The Moondream vision-language model.
"""

Expand All @@ -90,6 +93,7 @@ class VLM(Enum):
DEEPSEEK_VL_2 = "deepseek_vl_2"
GOOGLE_GEMINI_2_0 = "gemini_2_0"
GOOGLE_GEMINI_2_5 = "gemini_2_5"
GOOGLE_GEMINI_3_5 = "gemini_3_5"
MOONDREAM = "moondream"

@classmethod
Expand Down Expand Up @@ -120,6 +124,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.DEEPSEEK_VL_2: str,
VLM.GOOGLE_GEMINI_2_0: str,
VLM.GOOGLE_GEMINI_2_5: str,
VLM.GOOGLE_GEMINI_3_5: str,
VLM.MOONDREAM: dict,
}

Expand All @@ -131,6 +136,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.DEEPSEEK_VL_2: ["resolution_wh"],
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"],
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"],
VLM.GOOGLE_GEMINI_3_5: ["resolution_wh"],
VLM.MOONDREAM: ["resolution_wh"],
}

Expand All @@ -142,6 +148,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_3_5: ["resolution_wh", "classes"],
VLM.MOONDREAM: ["resolution_wh"],
}

Expand Down
59 changes: 59 additions & 0 deletions tests/detection/test_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1297,3 +1297,62 @@ def test_from_deepseek_vl_2(
detections.data[CLASS_NAME_DATA_FIELD],
expected_detections.data[CLASS_NAME_DATA_FIELD],
)


@pytest.mark.parametrize(
    ("result", "resolution_wh", "classes"),
    [
        # Unparsable text and an empty JSON list.
        ("random text", (1000, 1000), None),
        ("```json\n[]\n```", (1000, 1000), None),
        # Single detection, no class filter.
        (
            "```json\n"
            "[\n"
            '{"box_2d": [100, 200, 300, 400], "label": "cat", "confidence": 0.8}\n'
            "]\n"
            "```",
            (1000, 500),
            None,
        ),
        # Two detections with an explicit class list.
        (
            "```json\n"
            "[\n"
            '{"box_2d": [10, 20, 110, 120], "label": "cat", "confidence": 0.8},\n'
            '{"box_2d": [50, 100, 150, 200], "label": "dog", "confidence": 0.9}\n'
            "]\n"
            "```",
            (640, 480),
            ["cat", "dog"],
        ),
    ],
)
def test_from_google_gemini_3_5_matches_2_5(
    result: str,
    resolution_wh: tuple[int, int],
    classes: list[str] | None,
) -> None:
    """Check that `VLM.GOOGLE_GEMINI_3_5` parses a result like `GOOGLE_GEMINI_2_5`.

    The same raw `result` is passed to `Detections.from_vlm` once per enum
    member; boxes, class ids, class names and confidences must agree.

    Args:
        result: Raw model response text to parse.
        resolution_wh: Image resolution forwarded to `from_vlm`.
        classes: Optional class list forwarded to `from_vlm`.
    """
    shared_kwargs = {
        "result": result,
        "resolution_wh": resolution_wh,
        "classes": classes,
    }
    detections_2_5 = Detections.from_vlm(vlm=VLM.GOOGLE_GEMINI_2_5, **shared_kwargs)
    detections_3_5 = Detections.from_vlm(vlm=VLM.GOOGLE_GEMINI_3_5, **shared_kwargs)

    assert len(detections_2_5) == len(detections_3_5)

    # Nothing further to compare when both results are empty.
    if len(detections_2_5) == 0:
        return

    assert np.allclose(detections_2_5.xyxy, detections_3_5.xyxy)
    assert np.array_equal(detections_2_5.class_id, detections_3_5.class_id)
    assert np.array_equal(
        detections_2_5.data[CLASS_NAME_DATA_FIELD],
        detections_3_5.data[CLASS_NAME_DATA_FIELD],
    )

    # Confidence must be present on both sides or absent on both sides; a
    # one-sided None is a mismatch and should fail as an assertion, not as a
    # TypeError inside np.allclose.
    if detections_2_5.confidence is None:
        assert detections_3_5.confidence is None
    else:
        assert detections_3_5.confidence is not None
        assert np.allclose(detections_2_5.confidence, detections_3_5.confidence)
Loading