From ad3c5b72e2969a561fc6bf99e5d4071acfe2338b Mon Sep 17 00:00:00 2001
From: Jenkins <jenkins@ip-172-31-29-217.us-east-2.compute.internal>
Date: Tue, 16 Jun 2026 13:54:57 -0500
Subject: [PATCH] add clip_vitb32_marrenj to models

---
 .../models/clip_vitb32_marrenj/__init__.py    |  13 ++
 .../models/clip_vitb32_marrenj/clip_arch.py   | 145 ++++++++++++++++++
 .../models/clip_vitb32_marrenj/model.py       | 116 ++++++++++++++
 .../clip_vitb32_marrenj/requirements.txt      |   5 +
 .../models/clip_vitb32_marrenj/test.py        |   9 ++
 5 files changed, 288 insertions(+)
 create mode 100644 brainscore_vision/models/clip_vitb32_marrenj/__init__.py
 create mode 100644 brainscore_vision/models/clip_vitb32_marrenj/clip_arch.py
 create mode 100644 brainscore_vision/models/clip_vitb32_marrenj/model.py
 create mode 100644 brainscore_vision/models/clip_vitb32_marrenj/requirements.txt
 create mode 100644 brainscore_vision/models/clip_vitb32_marrenj/test.py

diff --git a/brainscore_vision/models/clip_vitb32_marrenj/__init__.py b/brainscore_vision/models/clip_vitb32_marrenj/__init__.py
new file mode 100644
index 000000000..7fcf556b4
--- /dev/null
+++ b/brainscore_vision/models/clip_vitb32_marrenj/__init__.py
@@ -0,0 +1,13 @@
+from brainscore_vision import model_registry
+from brainscore_vision.model_helpers.brain_transformation import ModelCommitment
+from .model import get_model, LAYERS, BEHAVIORAL_READOUT_LAYER
+
+# No region_layer_map — brain-score's search picks the best visual.transformer
+# block per region. behavioral_readout_layer is pinned to ln_post (the
+# pre-projection 768-d CLS feature, matching our own alignment metric).
+model_registry['clip_vitb32_marrenj'] = lambda: ModelCommitment(
+    identifier='clip_vitb32_marrenj',
+    activations_model=get_model(),
+    layers=LAYERS,
+    behavioral_readout_layer=BEHAVIORAL_READOUT_LAYER,
+)
diff --git a/brainscore_vision/models/clip_vitb32_marrenj/clip_arch.py b/brainscore_vision/models/clip_vitb32_marrenj/clip_arch.py
new file mode 100644
index 000000000..a2b958a34
--- /dev/null
+++ b/brainscore_vision/models/clip_vitb32_marrenj/clip_arch.py
@@ -0,0 +1,145 @@
+"""Self-contained vision tower for our DeCLIP-trained CLIP-ViT-B/32, bundled
+inside the brain-score submission plugin so the CI sandbox doesn't need to clone
+our research repo. Trimmed from src/model.py to JUST the VisionTransformer +
+direct dependencies (Attention, TransformerBlock, LayerNorm, QuickGELU). The
+text encoder and full CLIP wrapper are dropped — brain-score only needs visual
+feature extraction.
+
+State dict loading: our Lightning checkpoint has keys like
+`model.visual.conv1.weight`, `model.text.*`, `model.logit_scale`. The plugin's
+model.py strips the `model.visual.` prefix and filters to visual-only keys
+before calling `VisionTransformer.load_state_dict`.
+"""
+import math
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class QuickGELU(nn.Module):
+    def forward(self, x):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class LayerNorm(nn.LayerNorm):
+    """Standard nn.LayerNorm — kept under its original name so state_dict keys match."""
+    def forward(self, x):
+        orig_type = x.dtype
+        out = super().forward(x)
+        return out.to(orig_type)
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
+        if hasattr(F, 'scaled_dot_product_attention'):
+            x = F.scaled_dot_product_attention(q, k, v, attn_mask=None,
+                                                dropout_p=self.attn_drop.p if self.training else 0.0,
+                                                is_causal=False)
+            x = x.transpose(1, 2).reshape(B, N, C)
+        else:
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, dim, num_heads, mlp_ratio=4.0, qkv_bias=False, drop=0.,
+                 attn_drop=0., ls_init_value: Optional[float] = None,
+                 act_layer: Callable = nn.GELU, norm_layer: Callable = LayerNorm):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias,
+                              attn_drop=attn_drop, proj_drop=drop)
+        self.ls_init_value = ls_init_value
+        if ls_init_value is not None:
+            self.gamma_1 = nn.Parameter(ls_init_value * torch.ones(dim))
+            self.gamma_2 = nn.Parameter(ls_init_value * torch.ones(dim))
+        else:
+            self.gamma_1 = None
+            self.gamma_2 = None
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, mlp_hidden_dim),
+            act_layer(),
+            nn.Dropout(drop),
+            nn.Linear(mlp_hidden_dim, dim),
+            nn.Dropout(drop),
+        )
+
+    def forward(self, x):
+        if self.gamma_1 is None:
+            x = x + self.attn(self.norm1(x))
+            x = x + self.mlp(self.norm2(x))
+        else:
+            x = x + self.gamma_1 * self.attn(self.norm1(x))
+            x = x + self.gamma_2 * self.mlp(self.norm2(x))
+        return x
+
+
+class VisionTransformer(nn.Module):
+    """ViT-B/32 visual encoder for CLIP-style models. Architectural defaults are
+    hardcoded to match VIT_B_32_CONFIG from our codebase (image_size=224,
+    patch_size=32, width=768, layers=12, heads=12, mlp_ratio=4.0,
+    output_dim=512, no layer scale)."""
+    def __init__(self, image_size=224, patch_size=32, width=768, layers=12,
+                 heads=12, mlp_ratio=4.0, output_dim=512,
+                 ls_init_value: Optional[float] = None,
+                 act_layer: Callable = nn.GELU,
+                 norm_layer: Callable = LayerNorm):
+        super().__init__()
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.output_dim = output_dim
+        self.grid_size = (image_size // patch_size, image_size // patch_size)
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.conv1 = nn.Conv2d(3, width, kernel_size=patch_size, stride=patch_size, bias=False)
+        self.class_embedding = nn.Parameter(torch.randn(width))
+        self.positional_embedding = nn.Parameter(torch.randn(self.num_patches + 1, width) * 0.01)
+        self.ln_pre = norm_layer(width)
+        self.transformer = nn.Sequential(*[
+            TransformerBlock(dim=width, num_heads=heads, mlp_ratio=mlp_ratio,
+                             qkv_bias=True, ls_init_value=ls_init_value,
+                             act_layer=act_layer, norm_layer=norm_layer)
+            for _ in range(layers)
+        ])
+        self.ln_post = norm_layer(width)
+        self.proj = nn.Parameter(torch.randn(width, output_dim) * (1 / width ** 0.5))
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = x.reshape(x.shape[0], x.shape[1], -1).permute(0, 2, 1)
+        x = torch.cat([
+            self.class_embedding.to(x.dtype) + torch.zeros(
+                x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device,
+            ),
+            x,
+        ], dim=1)
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+        for block in self.transformer:
+            x = block(x)
+        x = x[:, 0]
+        x = self.ln_post(x)
+        x = x @ self.proj
+        return x
diff --git a/brainscore_vision/models/clip_vitb32_marrenj/model.py b/brainscore_vision/models/clip_vitb32_marrenj/model.py
new file mode 100644
index 000000000..4f615930f
--- /dev/null
+++ b/brainscore_vision/models/clip_vitb32_marrenj/model.py
@@ -0,0 +1,116 @@
+"""CLIP-ViT-B/32 Vanderbilt — final checkpoint of our DeCLIP/YFCC15M-trained
+contrastive baseline (epoch 31, 32-epoch budget). The model is our own custom
+class (not OpenCLIP); we bundle a trimmed visual-only version under
+`clip_arch.py` so the brain-score sandbox doesn't need our research repo.
+
+Brain-Score coverage NOTE: behavioral benchmarks that decode from a 1000-class
+ImageNet logits layer (e.g. Geirhos2021-*) will not work for this model because
+contrastive CLIP doesn't have a native ImageNet classifier. To run those, we'd
+need a zero-shot CLIP classifier head (cosine sim between visual features and
+text embeddings of the 1000 ImageNet class names). For this first submission
+we leave that out — neural V1/V2/V4/IT + Rajalingham2018-i2n behavioral all
+work directly off the visual encoder's features.
+
+We use CLIP-style preprocessing (Resize(224) → CenterCrop(224) → CLIP mean/std)
+to match how the model was trained.
+"""
+import functools
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms as T
+from huggingface_hub import hf_hub_download
+from brainscore_vision.model_helpers.activations.pytorch import PytorchWrapper
+
+from .clip_arch import VisionTransformer
+
+# === EDIT BEFORE SUBMITTING ===
+HF_REPO_ID = "marrenj/temporal-dynamics-baselines"
+HF_FILENAME = "clip_vitb32_baseline_ep031.ckpt"
+
+# OpenAI CLIP normalization — what the visual encoder was trained with.
+CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
+CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
+
+# Candidate visual-encoder layers for brain-score's region commitment search.
+# `ln_post` is the pre-projection 768-d CLS feature we already use as the
+# behavioral readout in our own alignment metric.
+LAYERS = [
+    "transformer.0",
+    "transformer.3",
+    "transformer.6",
+    "transformer.9",
+    "transformer.11",
+    "ln_post",
+]
+BEHAVIORAL_READOUT_LAYER = "ln_post"
+
+
+BIBTEX = """@misc{marrenj_temporal_dynamics_2026,
+  title={Temporal Dynamics of Human Behavioral Alignment in ImageNet-trained Models},
+  author={Wallace Lab},
+  year={2026},
+  note={CLIP-ViT-B/32 (custom DeCLIP-trained), YFCC15M, 32 epochs},
+}"""
+
+
+def _clip_preprocessing(image_filepaths):
+    """CLIP-style preprocessing pipeline that matches our model's training
+    preprocessing (src/dataset.py eval branch). Returns a (B, C, 224, 224)
+    numpy stack — brain-score's PytorchWrapper expects this exact shape."""
+    val_transform = T.Compose([
+        T.Resize(224, interpolation=T.InterpolationMode.BICUBIC),
+        T.CenterCrop(224),
+        T.ToTensor(),
+        T.Normalize(CLIP_MEAN, CLIP_STD),
+    ])
+    out = []
+    for p in image_filepaths:
+        img = Image.open(p).convert("RGB")
+        out.append(val_transform(img).numpy())
+    return np.stack(out)
+
+
+def get_model():
+    weights_path = hf_hub_download(repo_id=HF_REPO_ID, filename=HF_FILENAME)
+    raw = torch.load(weights_path, map_location="cpu", weights_only=True)
+    state_dict = raw.get("state_dict", raw)
+    # Lightning prefix: model.visual.<...> | model.text.* | model.logit_scale
+    # We need: <...> for just the visual encoder.
+    visual_sd = {
+        k[len("model.visual."):]: v
+        for k, v in state_dict.items()
+        if k.startswith("model.visual.")
+    }
+    if not visual_sd:
+        # In case the ckpt isn't Lightning-wrapped: try `visual.<...>` direct.
+        visual_sd = {
+            k[len("visual."):]: v for k, v in state_dict.items()
+            if k.startswith("visual.")
+        }
+    if not visual_sd:
+        raise RuntimeError(
+            f"could not find any visual-encoder keys in checkpoint {HF_FILENAME}. "
+            "Expected keys prefixed with 'model.visual.' or 'visual.'."
+        )
+
+    model = VisionTransformer()  # uses ViT-B/32 defaults
+    missing, unexpected = model.load_state_dict(visual_sd, strict=False)
+    if missing or unexpected:
+        print(f"  [clip_vitb32_marrenj] state_dict load: "
+              f"missing={len(missing)}, unexpected={len(unexpected)}")
+        if missing[:3]: print(f"    sample missing: {missing[:3]}")
+        if unexpected[:3]: print(f"    sample unexpected: {unexpected[:3]}")
+    model.eval()
+
+    wrapper = PytorchWrapper(
+        identifier="clip_vitb32_marrenj",
+        model=model,
+        preprocessing=_clip_preprocessing,
+    )
+    wrapper.image_size = 224
+    return wrapper
+
+
+def get_bibtex(model_identifier):
+    return BIBTEX
diff --git a/brainscore_vision/models/clip_vitb32_marrenj/requirements.txt b/brainscore_vision/models/clip_vitb32_marrenj/requirements.txt
new file mode 100644
index 000000000..4f392095f
--- /dev/null
+++ b/brainscore_vision/models/clip_vitb32_marrenj/requirements.txt
@@ -0,0 +1,5 @@
+torch==2.2.1
+torchvision==0.17.1
+huggingface_hub>=0.25
+numpy
+pillow
diff --git a/brainscore_vision/models/clip_vitb32_marrenj/test.py b/brainscore_vision/models/clip_vitb32_marrenj/test.py
new file mode 100644
index 000000000..ffe7aadde
--- /dev/null
+++ b/brainscore_vision/models/clip_vitb32_marrenj/test.py
@@ -0,0 +1,9 @@
+"""Minimum sanity test."""
+import pytest
+import brainscore_vision
+
+
+@pytest.mark.private_access
+def test_has_identifier():
+    model = brainscore_vision.load_model('clip_vitb32_marrenj')
+    assert model.identifier == 'clip_vitb32_marrenj'