Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/supervision/detection/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,7 @@ def from_lmm(
| Qwen2.5-VL | `QWEN_2_5_VL` | detection | `resolution_wh`, `input_wh` | `classes` |
| Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` |
| Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` |
| Google Gemini 3.5 | `GOOGLE_GEMINI_3_5` | detection, segmentation | `resolution_wh` | `classes` |
| Moondream | `MOONDREAM` | detection | `resolution_wh` | |
| DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` |

Expand Down Expand Up @@ -1425,6 +1426,7 @@ def from_lmm(
LMM.DEEPSEEK_VL_2: VLM.DEEPSEEK_VL_2,
LMM.GOOGLE_GEMINI_2_0: VLM.GOOGLE_GEMINI_2_0,
LMM.GOOGLE_GEMINI_2_5: VLM.GOOGLE_GEMINI_2_5,
LMM.GOOGLE_GEMINI_3_5: VLM.GOOGLE_GEMINI_3_5,
}

if isinstance(lmm, LMM):
Expand Down Expand Up @@ -1464,6 +1466,7 @@ def from_vlm(
| Qwen3-VL | `QWEN_3_VL` | detection | `resolution_wh` | `classes` |
| Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` |
| Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` |
| Google Gemini 3.5 | `GOOGLE_GEMINI_3_5` | detection, segmentation | `resolution_wh` | `classes` |
| Moondream | `MOONDREAM` | detection | `resolution_wh` | |
| DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` |

Expand Down Expand Up @@ -1931,7 +1934,7 @@ def from_vlm(
xyxy = from_moondream(result, **kwargs)
return cls(xyxy=xyxy)

if vlm == VLM.GOOGLE_GEMINI_2_5:
if vlm in (VLM.GOOGLE_GEMINI_2_5, VLM.GOOGLE_GEMINI_3_5):
assert isinstance(result, str)
gemini_result = from_google_gemini_2_5(result, **kwargs)
data = {CLASS_NAME_DATA_FIELD: gemini_result[2]}
Expand Down
7 changes: 7 additions & 0 deletions src/supervision/detection/vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class LMM(Enum):
QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
GOOGLE_GEMINI_3_5: Google Gemini 3.5 vision-language model.
MOONDREAM: The Moondream vision-language model.
Comment on lines 29 to 37
"""

Expand All @@ -43,6 +44,7 @@ class LMM(Enum):
DEEPSEEK_VL_2 = "deepseek_vl_2"
GOOGLE_GEMINI_2_0 = "gemini_2_0"
GOOGLE_GEMINI_2_5 = "gemini_2_5"
GOOGLE_GEMINI_3_5 = "gemini_3_5"
MOONDREAM = "moondream"

@classmethod
Expand Down Expand Up @@ -80,6 +82,7 @@ class VLM(Enum):
QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
GOOGLE_GEMINI_3_5: Google Gemini 3.5 vision-language model.
MOONDREAM: The Moondream vision-language model.
"""
Comment on lines 78 to 87

Expand All @@ -90,6 +93,7 @@ class VLM(Enum):
DEEPSEEK_VL_2 = "deepseek_vl_2"
GOOGLE_GEMINI_2_0 = "gemini_2_0"
GOOGLE_GEMINI_2_5 = "gemini_2_5"
GOOGLE_GEMINI_3_5 = "gemini_3_5"
MOONDREAM = "moondream"

@classmethod
Expand Down Expand Up @@ -120,6 +124,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.DEEPSEEK_VL_2: str,
VLM.GOOGLE_GEMINI_2_0: str,
VLM.GOOGLE_GEMINI_2_5: str,
VLM.GOOGLE_GEMINI_3_5: str,
VLM.MOONDREAM: dict,
}

Expand All @@ -131,6 +136,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.DEEPSEEK_VL_2: ["resolution_wh"],
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"],
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"],
VLM.GOOGLE_GEMINI_3_5: ["resolution_wh"],
VLM.MOONDREAM: ["resolution_wh"],
}

Expand All @@ -142,6 +148,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_3_5: ["resolution_wh", "classes"],
VLM.MOONDREAM: ["resolution_wh"],
}

Expand Down
44 changes: 44 additions & 0 deletions tests/detection/test_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1297,3 +1297,47 @@ def test_from_deepseek_vl_2(
detections.data[CLASS_NAME_DATA_FIELD],
expected_detections.data[CLASS_NAME_DATA_FIELD],
)


@pytest.mark.parametrize(
("result", "resolution_wh", "classes"),
[
(
"random text",
(1000, 1000),
None
),
(
"
Comment thread
Borda marked this conversation as resolved.
Outdated
)
def test_from_google_gemini_3_5_matches_2_5(
    result: str,
    resolution_wh: tuple[int, int],
    classes: list[str] | None,
):
    """Check that GOOGLE_GEMINI_3_5 output parses the same as GOOGLE_GEMINI_2_5.

    The same raw `result` is passed through `Detections.from_vlm` once per
    enum member, and the two resulting detections are compared field by field.
    """
    # Both calls share every argument except the `vlm` selector.
    shared_kwargs = {
        "result": result,
        "resolution_wh": resolution_wh,
        "classes": classes,
    }
    detections_2_5 = Detections.from_vlm(vlm=VLM.GOOGLE_GEMINI_2_5, **shared_kwargs)
    detections_3_5 = Detections.from_vlm(vlm=VLM.GOOGLE_GEMINI_3_5, **shared_kwargs)

    assert len(detections_2_5) == len(detections_3_5)

    # With nothing detected, only the matching length above is checked.
    if len(detections_2_5) != 0:
        assert np.allclose(detections_2_5.xyxy, detections_3_5.xyxy)
        assert np.array_equal(detections_2_5.class_id, detections_3_5.class_id)

        names_2_5 = detections_2_5.data[CLASS_NAME_DATA_FIELD]
        names_3_5 = detections_3_5.data[CLASS_NAME_DATA_FIELD]
        assert np.array_equal(names_2_5, names_3_5)

        # Confidence is compared only when the 2.5 result carries one.
        if detections_2_5.confidence is not None:
            assert np.allclose(detections_2_5.confidence, detections_3_5.confidence)
Loading