diff --git a/brainscore_vision/data/berezutskaya2021/__init__.py b/brainscore_vision/data/berezutskaya2021/__init__.py
new file mode 100644
index 000000000..41c7a28f2
--- /dev/null
+++ b/brainscore_vision/data/berezutskaya2021/__init__.py
@@ -0,0 +1,35 @@
+from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly
+
+import brainscore_vision
+from brainscore_vision import data_registry, stimulus_set_registry
+from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3
+
+BIBTEX = """@article{berezutskaya2022open,
+  title={Open multimodal iEEG-fMRI dataset from naturalistic stimulation with a short audiovisual film},
+  author={Berezutskaya, Julia and Vansteensel, Mariska J and Aarnoutse, Erik J and Freudenburg, Zachary V and Piantoni, Giovanni and Branco, Mariana P and Ramsey, Nick F},
+  journal={Scientific Data},
+  volume={9},
+  number={1},
+  pages={91},
+  year={2022},
+  publisher={Nature Publishing Group UK London}
+}"""
+
+# assembly: Berezutskaya2021-fMRI
+data_registry['Berezutskaya2021-fMRI'] = lambda: load_assembly_from_s3(
+    identifier="Berezutskaya2021-fMRI",
+    version_id="AY03ldTEnzdocCBXmJPVnb9yVL7Slx_S",
+    sha1="936b2751a9d1eaa182a0646faa3ce1d974dd0d92",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Berezutskaya2021-fMRI",
+    cls=NeuronRecordingAssembly,
+    stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Berezutskaya2021'),
+)
+
+# stimulus set: Berezutskaya2021
+stimulus_set_registry['Berezutskaya2021'] = lambda: load_stimulus_set_from_s3(
+    identifier="Berezutskaya2021",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Berezutskaya2021-fMRI",
+    csv_sha1="42b06621fb5e3d4dadbc3975ddb31ed2df75aab4",
+    zip_sha1="3b05a9d055918da9536d615589bef6644e28d8a9",
+    csv_version_id="2BGwIC9sxZvNOW_bQr98Csk9hvbW.9lX",
+    zip_version_id="9QPVG.DYvWZcD_Q0wWrqUJrCsLDIvO8T")
diff --git a/brainscore_vision/data/keles2024/__init__.py b/brainscore_vision/data/keles2024/__init__.py
new file mode 100644
index 000000000..3e0d67b4c
--- /dev/null
+++ b/brainscore_vision/data/keles2024/__init__.py
@@ -0,0 +1,36 @@
+from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly
+
+import brainscore_vision
+from brainscore_vision import data_registry, stimulus_set_registry
+from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3
+
+BIBTEX = """@article{keles2024multimodal,
+  title={Multimodal single-neuron, intracranial EEG, and fMRI brain responses during movie watching in human patients},
+  author={Keles, Umit and Dubois, Julien and Le, Kevin JM and Tyszka, J Michael and Kahn, David A and Reed, Chrystal M and Chung, Jeffrey M and Mamelak, Adam N and Adolphs, Ralph and Rutishauser, Ueli},
+  journal={Scientific data},
+  volume={11},
+  number={1},
+  pages={214},
+  year={2024},
+  publisher={Nature Publishing Group UK London}
+}"""
+
+# assembly: Keles2024-fMRI
+data_registry['Keles2024-fMRI'] = lambda: load_assembly_from_s3(
+    identifier="Keles2024-fMRI",
+    version_id="qbmUcZGjShAjAifkTzF.cuAQcgvUy2Nl",
+    sha1="929288d0ec99a7991f157e7787a10452ee453021",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Keles2024-fMRI",
+    cls=NeuronRecordingAssembly,
+    stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Keles2024'),
+)
+
+# stimulus set: Keles2024
+stimulus_set_registry['Keles2024'] = lambda: load_stimulus_set_from_s3(
+    identifier="Keles2024",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Keles2024-fMRI",
+    csv_sha1="1fcf6d65a823b4e9ec4b02d6610190f92674b932",
+    zip_sha1="350536296800fc2f46172f231da2266a0fbac361",
+    csv_version_id="UKUxrOBp3eO4vbyGeqdhXGhfKhQ4Wrne",
+    zip_version_id="AB.1FCauYVAItaW3pYDYGq2AAd3VpacQ")
+
diff --git a/brainscore_vision/data/lahner2024/__init__.py b/brainscore_vision/data/lahner2024/__init__.py
new file mode 100644
index 000000000..4ed76ad52
--- /dev/null
+++ b/brainscore_vision/data/lahner2024/__init__.py
@@ -0,0 +1,30 @@
+from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly
+
+import brainscore_vision
+from brainscore_vision import data_registry, stimulus_set_registry
+from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3
+
+BIBTEX = """@misc{lahner2024modeling,
+  title={Modeling short visual events through the BOLD moments video fMRI dataset and metadata. Nat. Commun. 15, 6241},
+  author={Lahner, B and Dwivedi, K and Iamshchinina, P and Graumann, M and Lascelles, A and Roig, G and Gifford, AT and Pan, B and Jin, SY and Ratan Murty, NA and others},
+  year={2024}
+}"""
+
+# assembly: Lahner2024-fMRI
+data_registry['Lahner2024-fMRI'] = lambda: load_assembly_from_s3(
+    identifier="Lahner2024-fMRI",
+    version_id="zr_i3T9Saww44rPNJwLaxo0hgp8rYjPO",
+    sha1="2c7f1d2e5724b8cc3c5cf47986e956c4f13001e4",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Lahner2024-fMRI",
+    cls=NeuronRecordingAssembly,
+    stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('BOLDMoments'),
+)
+
+# stimulus set: BOLDMoments - Lahner2024-fMRI
+stimulus_set_registry['BOLDMoments'] = lambda: load_stimulus_set_from_s3(
+    identifier="BOLDMoments",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Lahner2024-fMRI",
+    csv_sha1="0b27388f5898c908f58cd1f21f8f5cb3eda8536e",
+    zip_sha1="dc9c3bf631632cd433d02f2f1847fd33c01ae0b3",
+    csv_version_id="WaGkWh59b1drhy1MmAVVSxh7_VT_eTay",
+    zip_version_id="OxpOYy_3bveay9NFFFxNCVyghyAbqyIt")
diff --git a/brainscore_vision/data/mcmahon2023/__init__.py b/brainscore_vision/data/mcmahon2023/__init__.py
new file mode 100644
index 000000000..70771b55f
--- /dev/null
+++ b/brainscore_vision/data/mcmahon2023/__init__.py
@@ -0,0 +1,35 @@
+from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly
+
+import brainscore_vision
+from brainscore_vision import data_registry, stimulus_set_registry
+from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3
+
+BIBTEX = """@article{mcmahon2023hierarchical,
+  title={Hierarchical organization of social action features along the lateral visual pathway},
+  author={McMahon, Emalie and Bonner, Michael F and Isik, Leyla},
+  journal={Current Biology},
+  volume={33},
+  number={23},
+  pages={5035--5047},
+  year={2023},
+  publisher={Elsevier}
+}"""
+
+# assembly: McMahon2023-fMRI
+data_registry['McMahon2023-fMRI'] = lambda: load_assembly_from_s3(
+    identifier="McMahon2023-fMRI",
+    version_id="0k8B44WRZaQyRc1eynbi_RBrjRT1mKL8",
+    sha1="c8f1ae9981c11f10135c20eca225b6aade039d23",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/McMahon2023-fMRI",
+    cls=NeuronRecordingAssembly,
+    stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('McMahon2023'),
+)
+
+# stimulus set: McMahon2023
+stimulus_set_registry['McMahon2023'] = lambda: load_stimulus_set_from_s3(
+    identifier="McMahon2023",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/McMahon2023-fMRI",
+    csv_sha1="fe617e7bce845c22eccf46242c95c0a74bcb501e",
+    zip_sha1="f136c855a457b6b40d0b91f621d9fbb68f1c6c48",
+    csv_version_id="zbknzf08aV.CLb1pUkt1ZMRk1nWheSPp",
+    zip_version_id="EVMJwIgUmE_NlLMCLTvMOpBQC__aiLXz")
diff --git a/brainscore_vision/data/savasegal2023/__init__.py b/brainscore_vision/data/savasegal2023/__init__.py
new file mode 100644
index 000000000..77b502bcd
--- /dev/null
+++ b/brainscore_vision/data/savasegal2023/__init__.py
@@ -0,0 +1,95 @@
+from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly
+
+import brainscore_vision
+from brainscore_vision import data_registry, stimulus_set_registry
+from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3
+
+BIBTEX = """@article{sava2023individual,
+  title={Individual differences in neural event segmentation of continuous experiences},
+  author={Sava-Segal, Clara and Richards, Chandler and Leung, Megan and Finn, Emily S},
+  journal={Cerebral Cortex},
+  volume={33},
+  number={13},
+  pages={8164--8178},
+  year={2023},
+  publisher={Oxford University Press}
+}"""
+
+# assembly: Savasegal2023-fMRI-Defeat
+data_registry['Savasegal2023-fMRI-Defeat'] = lambda: load_assembly_from_s3(
+    identifier="Savasegal2023-fMRI-Defeat",
+    version_id="eb..6dECXftcmMTZX2DXWPVKz2luoUn9",
+    sha1="b6ea38ad12718122f7fef78d2ec5e809cce7c977",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Defeat",
+    cls=NeuronRecordingAssembly,
+    stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Savasegal2023-Defeat'),
+)
+
+# stimulus set: Savasegal2023-Defeat
+stimulus_set_registry['Savasegal2023-Defeat'] = lambda: load_stimulus_set_from_s3(
+    identifier="Savasegal2023-Defeat",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Defeat",
+    csv_sha1="ef5de41cc775b12583cb2f41b25ebbcd157bd87a",
+    zip_sha1="5de346354a1fc2a74a7a013e74583a57b83dfe01",
+    csv_version_id="FKYjSSKOMZEpQo3h6x6MflH3NzNqP4Lg",
+    zip_version_id="Ll2ENeVR7E_.AUB9chchamR3MFNGqZgk")
+
+
+# assembly: Savasegal2023-fMRI-Growth
+data_registry['Savasegal2023-fMRI-Growth'] = lambda: load_assembly_from_s3(
+    identifier="Savasegal2023-fMRI-Growth",
+    version_id="fKUd3lqYkmVuhk7tQhWgEgjYUuYxzHr2",
+    sha1="93b6b0db811dce340e709dfb57156156e78d437d",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Growth",
+    cls=NeuronRecordingAssembly,
+    stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Savasegal2023-Growth'),
+)
+
+# stimulus set: Savasegal2023-Growth
+stimulus_set_registry['Savasegal2023-Growth'] = lambda: load_stimulus_set_from_s3(
+    identifier="Savasegal2023-Growth",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Growth",
+    csv_sha1="029e2d68ad952d2929aa8cfb836443f7d3714fea",
+    zip_sha1="7dbdbb36920017b39793c0cc51c96393e33d8a99",
+    csv_version_id="DSmsK1BISRn5j2lLjWINJdjmuTx1MuYk",
+    zip_version_id="E.EBCeeQYY8MM8cAJvL261n4ox7Z.huW")
+
+
+# assembly: Savasegal2023-fMRI-Iteration
+data_registry['Savasegal2023-fMRI-Iteration'] = lambda: load_assembly_from_s3(
+    identifier="Savasegal2023-fMRI-Iteration",
+    version_id="VI_dZOWapYT.C2i1o_BkmYJpjVAQQQB9",
+    sha1="1595605baf4b461f11164b768337a10986b77618",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Iteration",
+    cls=NeuronRecordingAssembly,
+    stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Savasegal2023-Iteration'),
+)
+
+# stimulus set: Savasegal2023-Iteration
+stimulus_set_registry['Savasegal2023-Iteration'] = lambda: load_stimulus_set_from_s3(
+    identifier="Savasegal2023-Iteration",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Iteration",
+    csv_sha1="5634119759f04886dc741f41243bc84a66d5d569",
+    zip_sha1="0f5cca320e6eaf814cad162cac4f40dd9dc9dcbe",
+    csv_version_id="jSz5.kOpWrwUTHjMhUGRCbHYke1o_NzW",
+    zip_version_id="qJoYn1uhNbWisP.K6ix4Npszh7sTYcn7")
+
+
+# assembly: Savasegal2023-fMRI-Lemonade
+data_registry['Savasegal2023-fMRI-Lemonade'] = lambda: load_assembly_from_s3(
+    identifier="Savasegal2023-fMRI-Lemonade",
+    version_id="BL3._yZ.5QnpmqfnZUQo8w6uJskg_uj7",
+    sha1="3c5849b7c9682a879059ff0e8c4b20656807bb4f",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Lemonade",
+    cls=NeuronRecordingAssembly,
+    stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Savasegal2023-Lemonade'),
+)
+
+# stimulus set: Savasegal2023-Lemonade
+stimulus_set_registry['Savasegal2023-Lemonade'] = lambda: load_stimulus_set_from_s3(
+    identifier="Savasegal2023-Lemonade",
+    bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Lemonade",
+    csv_sha1="4537ca9955473b6688cd9f990b9a16182cbbb50e",
+    zip_sha1="79edea7316c9f5dde7a8682dce7000ed506d2026",
+    csv_version_id="EeZlMuTe2gsAmqbyeSMAOXMqovDGDkrK",
+    zip_version_id="ZBnxvkOf8WZVAIoMOieNhVbEXTxfixnF")
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/__init__.py b/brainscore_vision/model_helpers/activations/temporal/core/__init__.py
index 90f0214ac..78f43a3e4 100644
--- a/brainscore_vision/model_helpers/activations/temporal/core/__init__.py
+++ b/brainscore_vision/model_helpers/activations/temporal/core/__init__.py
@@ -1,3 +1,21 @@
 from .extractor import ActivationsExtractor
 from .executor import BatchExecutor
-from .inferencer import *
\ No newline at end of file
+from .inferencer import (
+    Inferencer,
+    TemporalInferencer,
+    TemporalContextInferencerBase,
+    CausalInferencer,
+    BlockInferencer,
+    channel_name_mapping,
+)
+
+__all__ = [
+    "ActivationsExtractor",
+    "BatchExecutor",
+    "Inferencer",
+    "TemporalInferencer",
+    "TemporalContextInferencerBase",
+    "CausalInferencer",
+    "BlockInferencer",
+    "channel_name_mapping",
+]
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/executor.py b/brainscore_vision/model_helpers/activations/temporal/core/executor.py
index 9670a85ff..3283fa504 100644
--- a/brainscore_vision/model_helpers/activations/temporal/core/executor.py
+++ b/brainscore_vision/model_helpers/activations/temporal/core/executor.py
@@ -20,8 +20,8 @@ def _func(x, *others):
     return _func
 
     
-# a mapper that execute a function in parallel with joblib
-class JoblibMapper:
+# a mapper that execute a fxunction in parallel with joblib
+class JobsMapper:
     def __init__(self, num_threads: int):
         self._num_threads = num_threads
         self._pool = Parallel(n_jobs=num_threads, verbose=False, backend="loky")
@@ -102,7 +102,7 @@ def __init__(self,
         if self.max_workers is not None:
             num_threads = min(self.max_workers, num_threads)
         self._logger.info(f"Using {num_threads} threads for parallel processing.")
-        self._mapper = JoblibMapper(num_threads)
+        self._mapper = JobsMapper(num_threads)
 
         # processing hooks
         self.before_hooks = []
@@ -126,10 +126,11 @@ def _get_batches(
 
         Returns
         -------
-        indices : list
-            indices of the source data after sorting.
-        masks : list of list
+        all_indices : list of list
+            indices of the source data in each batch.
+        all_masks : list of list
             masks for each batch to indicate whether the datum is padding sample.
+            1 for not padding, 0 for padding.
         all_batches : list of list
             list of batches.
         """
@@ -138,21 +139,19 @@ def _get_batches(
 
         if grouper is None:
             sorted_data = np.array(data, dtype='object')
-            inverse_indices = np.arange(N)
+            sorted_indices = np.arange(N)
             sorted_properties = [0] * N
         else:
             properties = np.array([hash(grouper(d)) for d in data])
             sorted_indices = np.argsort(properties)
-            inverse_indices = np.argsort(sorted_indices)  # inverse transform
             sorted_properties = properties[sorted_indices]
             sorted_data = np.array(data, dtype='object')[sorted_indices]
-        inverse_indices = list(inverse_indices)
+        sorted_indices = list(sorted_indices)
 
         index = 0
         all_batches = []
         all_indices = []
-        indices = []
-        masks = []
+        all_masks = []
         while index < N:
             property_anchor = sorted_properties[index]
             batch = []
@@ -160,7 +159,7 @@ def _get_batches(
                 batch.append(sorted_data[index])
                 index += 1
             
-            batch_indices = inverse_indices[index-len(batch):index]
+            batch_indices = sorted_indices[index-len(batch):index]
 
             if padding:
                 num_padding = batch_size - len(batch)
@@ -169,23 +168,32 @@ def _get_batches(
                     batch += batch_padding
             else:
                 num_padding = 0
-            masks.append([True] * (len(batch)-num_padding) + [False] * num_padding) 
-
+            all_masks.append([True] * (len(batch)-num_padding) + [False] * num_padding) 
             all_batches.append(batch)
-            all_indices.append(batch_indices)
-            indices.extend(batch_indices)
-        return indices, masks, all_batches
+            all_indices.append(batch_indices + [-1] * num_padding)
+        return all_indices, all_masks, all_batches
+
+    def _register_hook(self, name, hook, hook_group, index=None):
+        if index is None: index = len(hook_group)
+        hook_group.insert(index, (name, hook))
+
+    def _remove_hook(self, name, hook_group):
+        hook_group[:] = [(n, h) for n, h in hook_group if n != name]
 
-    def register_before_hook(self, hook: Callable[[Stimulus], Stimulus], index=None):
+    def register_before_hook(self, name: str, hook: Callable[[Stimulus], Stimulus], index=None):
         # hook: Stimulus -> Stimulus
-        if index is None: index = len(self.before_hooks)
-        self.before_hooks.insert(index, hook)
+        self._register_hook(name, hook, self.before_hooks, index)
 
-    def register_after_hook(self, hook: Callable[[Any, str, Stimulus], Any], index=None):
+    def register_after_hook(self, name: str, hook: Callable[[Any, str, Stimulus], Any], index=None):
         # hook: value, layer_name, Stimulus -> value
-        if index is None: index = len(self.after_hooks)
-        self.after_hooks.insert(index, hook)
-    
+        self._register_hook(name, hook, self.after_hooks, index)
+
+    def remove_before_hook(self, name: str):
+        self._remove_hook(name, self.before_hooks)
+
+    def remove_after_hook(self, name: str):
+        self._remove_hook(name, self.after_hooks)
+
     def add_stimuli(self, stimuli):
         self.stimuli.extend(stimuli)
 
@@ -193,27 +201,45 @@ def clear_stimuli(self):
         self.stimuli = []
 
     def execute(self, layers):
-        indices, masks, batches = self._get_batches(self.stimuli, self.batch_size, 
+        full_indices = []
+        full_activations = OrderedDict()
+        for layer_activations, indices in self.execute_batch(layers):
+            full_indices.extend(indices)
+            for layer_activation in layer_activations:
+                for layer, activations in layer_activation.items():
+                    full_activations.setdefault(layer, []).append(activations)
+        
+        for layer, activations in full_activations.items():
+            full_activations[layer] = [activations[i] for i in full_indices]
+
+        return full_activations
+
+    def execute_batch(self, layers):
+        all_indices, all_masks, batches = self._get_batches(self.stimuli, self.batch_size, 
                                                     grouper=self.batch_grouper,
                                                     padding=self.batch_padding)
-        
-        before_pipe = _pipeline(*self.before_hooks)
-        after_pipe = _pipeline(*self.after_hooks)
-        
-        layer_activations = OrderedDict()
-        for mask, batch in tqdm(zip(masks, batches), desc="activations", total=len(batches)):
+
+        before_pipe = _pipeline(*[hook for name, hook in self.before_hooks])
+        after_pipe = _pipeline(*[hook for name, hook in self.after_hooks])
+
+        # avoid keeping the whole batch in memory
+        def run(batch, mask, indices):
             batch = [before_pipe(stimulus) for stimulus in batch]
             model_inputs = self._mapper.map(self.preprocess, batch)
             batch_activations = self.get_activations(model_inputs, layers)
             assert isinstance(batch_activations, OrderedDict)
-            for layer, activations in batch_activations.items():
+            for i, (layer, activations) in enumerate(batch_activations.items()):
                 results = [after_pipe(arr, layer, stimulus) 
-                               for not_pad, arr, stimulus in zip(mask, activations, batch) 
-                               if not_pad]
-                layer_activations.setdefault(layer, []).extend(results)
-
-        for layer, activations in layer_activations.items():
-            layer_activations[layer] = [activations[i] for i in indices]
-
-        self.clear_stimuli()
-        return layer_activations
+                            for not_pad, arr, stimulus in zip(mask, activations, batch) 
+                            if not_pad]
+                if i == 0:
+                    layer_activations = [OrderedDict() for _ in range(len(results))]
+                
+                for j, result in enumerate(results):
+                    layer_activations[j][layer] = result
+            return layer_activations, indices
+
+        for indices, mask, batch in tqdm(zip(all_indices, all_masks, batches), desc="activations", total=len(batches)):
+            yield run(batch, mask, indices)
+            
+        self.clear_stimuli()
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/extractor.py b/brainscore_vision/model_helpers/activations/temporal/core/extractor.py
index fe988e480..a66558db4 100644
--- a/brainscore_vision/model_helpers/activations/temporal/core/extractor.py
+++ b/brainscore_vision/model_helpers/activations/temporal/core/extractor.py
@@ -11,6 +11,7 @@
 from brainscore_core.supported_data_standards.brainio.stimuli import StimulusSet
 from brainscore_core.supported_data_standards.brainio.assemblies import DataAssembly
 from brainscore_vision.model_helpers.utils import fullname
+from brainscore_vision.model_helpers.activations.temporal.utils import data_assembly_mmap
 from result_caching import store_xarray
 from .inferencer import Inferencer
 from ..inputs import Stimulus
@@ -101,31 +102,26 @@ def from_paths(
         ):
         if layers is None:
             layers = ['logits']
-        if self.identifier and stimuli_identifier:
-            fnc = functools.partial(self._from_paths_stored,
-                                    identifier=self.identifier, stimuli_identifier=stimuli_identifier)
+
+        mmap_home = os.environ.get("MMAP_HOME", None)
+        if self.identifier and stimuli_identifier and mmap_home:
+            mmap_path = os.path.join(mmap_home, stimuli_identifier, self.identifier)
+            os.makedirs(mmap_path, exist_ok=True)
         else:
-            self._logger.debug(f"self.identifier `{self.identifier}` or stimuli_identifier {stimuli_identifier} "
-                               f"are not set, will not store")
-            fnc = self._from_paths
+            mmap_path = None
+
         # In case stimuli paths are duplicates (e.g. multiple trials), we first reduce them to only the paths that need
         # to be run individually, compute activations for those, and then expand the activations to all paths again.
         # This is done here, before storing, so that we only store the reduced activations.
         reduced_paths = self._reduce_paths(stimuli_paths)
-        activations = fnc(layers=layers, stimuli_paths=reduced_paths)
+        data = data_assembly_mmap.load(mmap_path)
+        if data:
+            activations = data.to_assembly()
+        else:
+            activations = self.inferencer(layers=layers, paths=reduced_paths, mmap_path=mmap_path)
         activations = self._expand_paths(activations, original_paths=stimuli_paths)
         return activations
 
-    @store_xarray(identifier_ignore=['stimuli_paths', 'layers'], combine_fields={'layers': 'layer'})
-    def _from_paths_stored(self, identifier, layers, stimuli_identifier, stimuli_paths):
-        stimuli_paths.sort()
-        return self._from_paths(layers=layers, stimuli_paths=stimuli_paths)
-
-    def _from_paths(self, layers, stimuli_paths):
-        if len(layers) == 0:
-            raise ValueError("No layers passed to retrieve activations from")
-        return self.inferencer(stimuli_paths, layers)
-
     def _reduce_paths(self, stimuli_paths):
         return list(set(stimuli_paths))
 
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/__init__.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/__init__.py
index edb58f1aa..84a91b26a 100644
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/__init__.py
+++ b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/__init__.py
@@ -1,2 +1,19 @@
-from .base import Inferencer
-from .video import *
\ No newline at end of file
+from .base import (
+    Inferencer,
+    channel_name_mapping,
+)
+from .temporal import (
+    TemporalInferencer,
+    TemporalContextInferencerBase,
+    CausalInferencer,
+    BlockInferencer,
+)
+
+__all__ = [
+    "Inferencer",
+    "TemporalInferencer",
+    "TemporalContextInferencerBase",
+    "CausalInferencer",
+    "BlockInferencer",
+    "channel_name_mapping",
+]
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/base.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/base.py
index ce40262d6..a1ef87d22 100644
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/base.py
+++ b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/base.py
@@ -1,19 +1,16 @@
-import functools
 import logging
-from collections import OrderedDict
-from typing import Callable, Hashable, List, Dict, Any, Union
 from pathlib import Path
+from typing import Any, Callable, Dict, Hashable, List, Union
 
 import numpy as np
 from tqdm.auto import tqdm
-import gc
 
-from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly, walk_coords
+from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly
 from brainscore_vision.model_helpers.utils import fullname
 
-from brainscore_vision.model_helpers.activations.temporal.core.executor import BatchExecutor
-from brainscore_vision.model_helpers.activations.temporal.utils import stack_with_nan_padding, batch_2d_resize
-from brainscore_vision.model_helpers.activations.temporal.inputs import Stimulus
+from ..executor import BatchExecutor
+from ...inputs import Stimulus
+from ...utils import batch_2d_resize, data_assembly_mmap
 
 
 channel_name_mapping = {
@@ -21,16 +18,75 @@
     "C": "channel",
     "H": "channel_y",
     "W": "channel_x",
-    "K": "channel_token"
+    "K": "channel_token",
 }
 
 
+def _make_tensor_to_numpy_hook():
+    import torch
+
+    def hook(val, layer, stimulus):
+        if isinstance(val, torch.Tensor):
+            return val.cpu().data.numpy()
+        return val
+
+    return hook
+
+
+def _make_dtype_hook(dtype):
+    return lambda val, layer, stimulus: val.astype(dtype)
+
+
+# downsample the activations with the largest spatial size (among width and height) to the specified size
+def _make_spatial_downsample_hook(max_spatial_size, layer_activation_format, mode="pool"):
+    def hook(val, layer, stimulus):
+        if max_spatial_size is None:
+            return val
+
+        dims = layer_activation_format[layer]
+
+        # require both H and W dimensions to do spatial downsampling
+        if "H" not in dims or "W" not in dims:
+            return val
+
+        H_dim, W_dim = dims.index("H"), dims.index("W")
+        val = val.swapaxes(H_dim, 0).swapaxes(W_dim, 1)
+        shape = val.shape[2:]
+        h, w = val.shape[:2]
+        val = val.reshape(h, w, -1)
+        new_size = _compute_new_size(w, h, max_spatial_size)
+        new_val = batch_2d_resize(val[None, :], new_size, mode=mode)[0]
+        new_val = new_val.reshape(*new_size, *shape)
+        new_val = new_val.swapaxes(0, H_dim).swapaxes(1, W_dim)
+        return new_val
+
+    return hook
+
+
+def _compute_new_size(w, h, max_spatial_size):
+    if isinstance(max_spatial_size, int):
+        if h > w:
+            new_h = max_spatial_size
+            new_w = int(w * new_h / h)
+        else:
+            new_w = max_spatial_size
+            new_h = int(h * new_w / w)
+    else:
+        new_h = int(h * max_spatial_size)
+        new_w = int(w * max_spatial_size)
+
+    new_h = max(1, new_h)
+    new_w = max(1, new_w)
+
+    return new_h, new_w
+
+
 class Inferencer:
     """Inferencer for batch processing of stimuli and packaging the activations.
-    
+
     Parameters
     ----------
-    get_activations : function 
+    get_activations : function
         function that takes a list of processed stimuli and a list of layers, and returns a dictionary of activations.
     preprocessing : function
         function that takes a stimulus and returns a processed stimulus.
@@ -43,8 +99,8 @@ class Inferencer:
         the visual degrees of the stimuli.
     max_spatial_size: int/float
         the maximum spatial size of the activations. If the spatial size of the activations is larger than this value,
-        the activations will be downsampled to this size. This is used to avoid the large memory consumption by the first layers of some model.
-        If float, resize the image based on this factor.
+        the activations will be downsampled to this size. This is used to avoid the large memory consumption by the
+        first layers of some model. If float, resize the image based on this factor.
     dtype: np.dtype
         data type of the activations.
     batch_size: int
@@ -57,13 +113,13 @@ class Inferencer:
     max_workers: int
         the maximum number of workers to use for parallel processing.
 
-        
     APIs
     ----
-    __call__(paths, layers)
+    __call__(paths, layers, mmap_path=None)
         process the stimuli and return a holistic assembly that compile activations of all specified layers.
         The returned assembly is a NeuroidAssembly with the dimensions [stimulus_path, neuroid].
         All dimensions of the activations will be stacked together to form the "neuroid" dimension.
+        If mmap_path is specified, the activations will be saved to the mmap file.
 
     Examples
     --------
@@ -72,22 +128,22 @@ class Inferencer:
     """
 
     def __init__(
-            self, 
-            get_activations : Callable[[List[Any]], Dict[str, np.array]], 
-            preprocessing : Callable[[List[Stimulus]], Any],
-            layer_activation_format : dict,
-            stimulus_type : Stimulus,
-            visual_degrees : float = 8.,
-            max_spatial_size : Union[int, float] = None,
-            dtype : np.dtype = np.float16,
-            batch_size : int = 64,
-            batch_grouper : Callable[[Stimulus], Hashable] = None,
-            batch_padding : bool = False,
-            max_workers : int = None,
-            *args,
-            **kwargs
-        ):
-
+        self,
+        get_activations: Callable[[List[Any]], Dict[str, np.array]],
+        preprocessing: Callable[[List[Stimulus]], Any],
+        layer_activation_format: dict,
+        stimulus_type: Stimulus,
+        visual_degrees: float = 8.0,
+        max_spatial_size: Union[int, float] = None,
+        dtype: np.dtype = np.float16,
+        batch_size: int = 64,
+        batch_grouper: Callable[[Stimulus], Hashable] = None,
+        batch_padding: bool = False,
+        max_workers: int = None,
+        simple_neuroid_id: bool = False,
+        *args,
+        **kwargs,
+    ):
         self.stimulus_type = stimulus_type
         self.layer_activation_format = layer_activation_format
         if isinstance(max_spatial_size, float):
@@ -95,14 +151,21 @@ def __init__(
         self.max_spatial_size = max_spatial_size
         self.visual_degrees = visual_degrees
         self.dtype = dtype
-        self._executor = BatchExecutor(get_activations, preprocessing, batch_size, batch_padding, batch_grouper, max_workers)
+        self._executor = BatchExecutor(
+            get_activations, preprocessing, batch_size, batch_padding, batch_grouper, max_workers
+        )
         self._stimulus_set_hooks = {}
         self._batch_activations_hooks = {}
         self._logger = logging.getLogger(fullname(self))
+        self.simple_neuroid_id = simple_neuroid_id
 
         # register hooks
-        self._executor.register_after_hook(self._make_spatial_downsample_hook(max_spatial_size))
-        self._executor.register_after_hook(self._make_dtype_hook(dtype))
+        self._executor.register_after_hook("tensor_to_numpy", _make_tensor_to_numpy_hook())
+        self._executor.register_after_hook(
+            "spatial_downsample",
+            _make_spatial_downsample_hook(max_spatial_size, self.layer_activation_format),
+        )
+        self._executor.register_after_hook("dtype", _make_dtype_hook(dtype))
 
     @property
     # identifier for the inferencer: including all the features that may affect the activations
@@ -115,160 +178,93 @@ def identifier(self) -> str:
             to_add.append(f".max_s={self.max_spatial_size}")
         to_add = "".join(to_add)
         return f"{self.__class__.__name__}{to_add}"
-    
+
     def set_visual_degrees(self, visual_degrees: float):
         self.visual_degrees = visual_degrees
+        print("Visual degrees not supported yet. Bypassing...")
 
-    # given the paths of the stimuli and the layers, return the model activations as a NeuroidAssembly
-    def __call__(self, paths: List[Union[str, Path]], layers: List[str]):
+    def __call__(self, paths: List[Union[str, Path]], layers: List[str], mmap_path: str = None) -> NeuroidAssembly:
         stimuli = self.load_stimuli(paths)
-        layer_activations = self.inference(stimuli, layers)
-        layer_assemblies = OrderedDict()
-        for layer in tqdm(layers, desc="Packaging layers"):
-            layer_assemblies[layer] = self.package_layer(layer_activations[layer], self.layer_activation_format[layer], stimuli)
-            del layer_activations[layer]
-            gc.collect()  # reduce memory usage
-        model_assembly = self.package(layer_assemblies, paths)
-        return model_assembly
-
-    def load_stimuli(self, paths : List[Union[str, Path]]) -> List[Stimulus]:
+        num_stimuli = len(paths)
+        stimulus_paths = paths
+
+        self._executor.add_stimuli(stimuli)
+        data = None
+
+        for layer_activations, indicies in self._executor.execute_batch(layers):
+            for layer_activation, i in zip(layer_activations, indicies):
+                if data is None:
+                    num_feats, neuroid_coords = self._get_neuroid_coords(
+                        layer_activation, self.layer_activation_format
+                    )
+                    data = data_assembly_mmap(
+                        mmap_path, shape=(num_stimuli, num_feats), dtype=self.dtype, fill_value=np.nan
+                    )
+                flatten_activation = self._flatten_activations(layer_activation)
+                if flatten_activation.size != num_feats:
+                    raise ValueError(
+                        f"The flattened activation size changed from {num_feats} to {flatten_activation.size}"
+                    )
+                data[i, :] = flatten_activation
+
+        data.register_meta(
+            dims=["stimulus_path", "neuroid"],
+            coords={
+                "stimulus_path": stimulus_paths,
+                **neuroid_coords,
+            },
+        )
+
+        return data.to_assembly()
+
+    def load_stimuli(self, paths: List[Union[str, Path]]) -> List[Stimulus]:
         ret = []
         for p in tqdm(paths, desc="Loading stimuli"):
             ret.append(self.load_stimulus(p))
         return ret
 
-    def load_stimulus(self, path : Union[str, Path]) -> Stimulus:
+    def load_stimulus(self, path: Union[str, Path]) -> Stimulus:
         return self.stimulus_type.from_path(path)
-    
-    # process the list of stimulus and return the activations (list of np.array, 
-    # whose length is the number of stimuli) of the specified layers
-    def inference(self, stimuli : List[Stimulus], layers : List[str]) -> Dict[str, List[np.array]]:
-        self._executor.add_stimuli(stimuli)
-        return self._executor.execute(layers)
-    
-    # Take the layer_activation (a list of np.array) and the layer specification,
-    # and package them into a NeuroidAssembly with all channels flattened into the "neuroid" dimension
-    def package_layer(
-            self, 
-            layer_activation : List[np.array],
-            layer_spec : str, 
-            stimuli : List[Stimulus]
-        ):
-        assert len(layer_activation) == len(stimuli)
-        layer_activation = stack_with_nan_padding(layer_activation, dtype=self.dtype)
-        channels = self._map_dims(layer_spec)
-        assembly = self._package(layer_activation, ["stimulus_path"] + channels)
-        assembly = self._stack_neuroid(assembly, channels)
-        return assembly
-    
-    # package the assemblies from different layers into a single one by concat along the neuroid dimension
-    def package(self, layer_assemblies : Dict[str, NeuroidAssembly], stimuli_paths : List[str]) -> NeuroidAssembly:
-        # merge manually instead of using merge_data_arrays since `xarray.merge` is very slow with these large arrays
-        # complication: (non)neuroid_coords are taken from the structure of layer_assemblies[0] i.e. the 1st assembly;
-        # using these names/keys for all assemblies results in KeyError if the first layer contains dims
-        # (see _package_layer) not present in later layers, e.g. first layer = conv, later layer = transformer layer
-        self._logger.debug(f"Merging {len(layer_assemblies)} layer assemblies")
-        layers = list(layer_assemblies.keys())
-        layer_assemblies = list(layer_assemblies.values())
-        layer_assemblies = [asm.transpose(*layer_assemblies[0].dims) for asm in layer_assemblies]
-        
-        nonneuroid_coords = {coord: (dims, values) for coord, dims, values in walk_coords(layer_assemblies[0])
-                             if set(dims) != {'neuroid'}}
-        neuroid_coords = [(coord, dims) for layer_assembly in layer_assemblies for coord, dims, values in walk_coords(layer_assembly)
-                             if set(dims) == {'neuroid'} and coord!='neuroid']
-        neuroid_coord_names = set(neuroid_coords)
+
+    def _flatten_activations(self, layer_activation):
+        arrs = [arr.flatten() for arr in layer_activation.values()]
+        return np.concatenate(arrs)
+
+    def _get_neuroid_coords(self, layer_activation, layer_specs):
+        feat_sizes = [arr.size for arr in layer_activation.values()]
+        feat_shapes = [arr.shape for arr in layer_activation.values()]
+        layers = list(layer_activation.keys())
+        num_feats = sum(feat_sizes)
+        count_neuroids = 0
         neuroid_coords = {}
+        neuroid_coords["neuroid_id"] = []
+        neuroid_coords["neuroid_num"] = []
+        neuroid_coords["layer"] = []
 
-        for layer_assembly in layer_assemblies:
-            for coord, _ in neuroid_coord_names:
-                try:
-                    coord_values = layer_assembly[coord].values
-                except KeyError:
-                    coord_values = np.full(layer_assembly.sizes['neuroid'], -1, dtype=int)
-                neuroid_coords.setdefault(coord, []).append(coord_values)
-
-            assert layer_assemblies[0].dims == layer_assembly.dims
-            for dim in set(layer_assembly.dims) - {'neuroid'}:
-                for coord, _, _ in walk_coords(layer_assembly[dim]):
-                    assert (layer_assembly[coord].values == layer_assemblies[0][coord].values).all()
-
-        for coord, dims in neuroid_coord_names:
-            neuroid_coords[coord] = (dims, np.concatenate(neuroid_coords[coord]))
-
-        # add stimulus_paths
-        nonneuroid_coords["stimulus_path"] = ('stimulus_path', stimuli_paths)
-
-        # add layer, neuroid_num, neuroid_id
-        layer_sizes = [asm.sizes['neuroid'] for asm in layer_assemblies]
-        neuroid_coords['layer'] = (('neuroid',), np.concatenate([[layer] * size for layer, size in zip(layers, layer_sizes)]))
-        neuroid_coords['neuroid_num'] = (('neuroid',), np.concatenate([np.arange(size) for size in layer_sizes]))
-        neuroid_coords['neuroid_id'] = (('neuroid',), np.concatenate([np.array([f"{layer}.{neuroid_num}" for neuroid_num in range(size)]) 
-                                                                   for layer, size in zip(layers, layer_sizes)]))
-
-        model_assembly = np.concatenate([a.values for a in layer_assemblies], axis=layer_assemblies[0].dims.index('neuroid'))
-        model_assembly = type(layer_assemblies[0])(model_assembly, coords={**nonneuroid_coords, **neuroid_coords},dims=layer_assemblies[0].dims)
-        return model_assembly
-    
-    # turn the activations into the specified dtype
-    def _make_dtype_hook(self, dtype):
-        return lambda val, layer, stimulus: val.astype(dtype)
-    
-    # downsample the activations with the largest spatial size (among width and height) to the specified size
-    def _make_spatial_downsample_hook(self, max_spatial_size, mode="pool"):
-        def hook(val, layer, stimulus):
-            if max_spatial_size is None:
-                return val
-            
-            dims = self.layer_activation_format[layer]
-
-            # require both H and W dimensions to do spatial downsampling
-            if "H" not in dims or "W" not in dims:
-                return val
-
-            H_dim, W_dim = dims.index("H"), dims.index("W")
-            val = val.swapaxes(H_dim, 0).swapaxes(W_dim, 1)
-            shape = val.shape[2:]
-            h, w = val.shape[:2]
-            val = val.reshape(h, w, -1)
-            new_size = _compute_new_size(w, h, self.max_spatial_size)
-            new_val = batch_2d_resize(val[None,:], new_size, mode=mode)[0]
-            new_val = new_val.reshape(*new_size, *shape)
-            new_val = new_val.swapaxes(0, H_dim).swapaxes(1, W_dim)
-            return new_val.astype(self.dtype)
-        return hook
-
-    # map dims to channel names
-    @staticmethod
-    def _map_dims(dims):
-        return [channel_name_mapping[dim] for dim in dims]
-
-    # stack the channel dimensions to form the "neuroid" dimension
-    @staticmethod
-    def _stack_neuroid(assembly, channels):
-        asm_cls = assembly.__class__
-        assembly = assembly.stack(neuroid=channels)
-        return asm_cls(assembly)
-
-    # package an activation numpy array into a NeuroidAssembly with specified dims
-    @staticmethod
-    def _package(activation: np.array, dims):
-        coords = {dim: range(activation.shape[i]) for i, dim in enumerate(dims)}
-        ret = NeuroidAssembly(activation, coords=coords, dims=dims)
-        return ret
+        def _grid(*ns):
+            from itertools import product
 
-def _compute_new_size(w, h, max_spatial_size):
-    if isinstance(max_spatial_size, int):
-        if h > w:
-            new_h = max_spatial_size
-            new_w = int(w * new_h / h)
-        else:
-            new_w = max_spatial_size
-            new_h = int(h * new_w / w)
-    else:
-        new_h = int(h * max_spatial_size)
-        new_w = int(w * max_spatial_size)
-    
-    new_h = max(1, new_h)
-    new_w = max(1, new_w)
+            ns = [range(n) for n in ns]
+            return product(*ns)
 
-    return new_h, new_w
+        def _expand_spec(spec, shape):
+            assert len(spec) == len(shape)
+            return [".".join([f"{dim}{i}" for dim, i in zip(spec, grid)]) for grid in _grid(*shape)]
+
+        # neuroid_id, neuroid_num, layer
+        for layer, size, shape in zip(layers, feat_sizes, feat_shapes):
+            layer_spec = layer_specs[layer]
+            layer_spec = [(dim, s) for dim, s in zip(layer_spec, shape)]
+            neuroid_nums = list(range(count_neuroids, count_neuroids + size))
+            neuroid_ids = [f"{layer}.{nid}" for nid in _expand_spec(*list(zip(*layer_spec)))] if not self.simple_neuroid_id else neuroid_nums
+            count_neuroids += size
+            layers = [layer] * size
+
+            neuroid_coords["neuroid_id"].extend(neuroid_ids)
+            neuroid_coords["neuroid_num"].extend(neuroid_nums)
+            neuroid_coords["layer"].extend(layers)
+
+        for key, val in neuroid_coords.items():
+            neuroid_coords[key] = ("neuroid", val)
+
+        return num_feats, neuroid_coords
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/temporal.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/temporal.py
new file mode 100644
index 000000000..63a5ba5c2
--- /dev/null
+++ b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/temporal.py
@@ -0,0 +1,459 @@
+from typing import Callable, Hashable, List, Tuple, Union
+from pathlib import Path
+
+import numpy as np
+
+from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly
+
+from .base import Inferencer
+from ...inputs import Stimulus, Video
+from ...utils import data_assembly_mmap
+
+
+MS_ROUNDING_DIGITS = 3
+
+
+class TemporalInferencer(Inferencer):
+    """Inferencer for video stimuli. The model takes a WHOLE video stimuli as input and generate the activations over time.
+    Then, the activations will be aligned to video time by EVENLY distributing the activations of multiple time steps
+    over time. The aligned activations will be again unified to the fps specified within the constructor (self.fps).
+    Finally, the activations will be packaged into a NeuroidAssembly.
+
+    Example:
+        temporal_inferencer = TemporalInferencer(..., fps=10)
+        model_assembly = temporal_inferencer(video_paths[1000ms], layers)
+        model_assembly.time_bins -> [(0, 100), (100, 200), ..., (900, 1000)]  # 1000ms, 10fps
+
+    Parameters
+    ----------
+    fps: float
+        frame rate of the model sampling.
+
+    num_frames: int, or (int, int)
+        - If None, the model accepts videos of any length.
+        - If a single int is passed, specify how many frames the model takes.
+        - If a tuple of two ints is passed, specify the range of the number of frames the model takes (inclusive). If
+          you need to specify infinite, use np.inf.
+
+    duration: float, or (float, float)
+        - If None, the model accepts videos of any length.
+        - If a single float is passed, specify the duration of the model takes, in ms.
+        - If a tuple of two floats is passed, specify the range of the duration the model takes (inclusive). If you need
+          to specify infinite, use np.inf.
+
+    convert_img_to_video: bool
+        whether to convert the input images to videos.
+    img_duration: float
+        specify the duration of the images, in ms. This will work only if convert_img_to_video is True.
+    batch_size: int
+        number of stimuli to process in each batch.
+    batch_grouper: function
+        function that takes a stimulus and return the property based on which the stimuli can be grouped.
+    """
+
+    def __init__(
+        self,
+        *args,
+        fps: float,
+        num_frames: Union[int, Tuple[int, int]] = None,
+        duration: Union[float, Tuple[float, float]] = None,
+        convert_img_to_video: bool = True,
+        img_duration: float = 1000.0,
+        batch_size: int = 32,
+        batch_grouper: Callable[[Video], Hashable] = lambda video: (round(video.duration, 6), video.fps),
+        **kwargs,
+    ):
+        super().__init__(*args, stimulus_type=Video, batch_size=batch_size, batch_grouper=batch_grouper, **kwargs)
+        self.fps = fps
+        self.num_frames = self._make_range(num_frames, type="num_frames")
+        self.duration = self._make_range(duration, type="duration")
+
+        if convert_img_to_video:
+            assert img_duration is not None, "img_duration should be specified if convert_img_to_video is True"
+        self.img_duration = img_duration
+        self.convert_to_video = convert_img_to_video
+
+    @property
+    def identifier(self) -> str:
+        id = f"{super().identifier}.fps={float(self.fps)}"
+        if self.convert_to_video:
+            id += f".img_dur={float(self.img_duration)}"
+        return id
+
+    def __call__(self, paths: List[Union[str, Path]], layers: List[str], mmap_path: str = None) -> NeuroidAssembly:
+        stimuli = self.load_stimuli(paths)
+        longest_stimulus = stimuli[np.argmax(np.array([stimulus.duration for stimulus in stimuli]))]
+        num_time_bins = longest_stimulus.num_frames
+        time_bin_coords = self._get_time_bin_coords(num_time_bins, self.fps)
+        num_stimuli = len(paths)
+        stimulus_paths = paths
+
+        self._executor.add_stimuli(stimuli)
+        data = None
+
+        for temporal_layer_activations, indicies in self._executor.execute_batch(layers):
+            for temporal_layer_activation, i in zip(temporal_layer_activations, indicies):
+                stimulus = stimuli[i]
+                # determine the time bin correspondence for each layer
+                for t, layer_activation in self._disect_time(temporal_layer_activation, stimulus.num_frames):
+                    if data is None:
+                        num_feats, neuroid_coords = self._get_neuroid_coords(
+                            layer_activation, self._remove_T(self.layer_activation_format)
+                        )
+                        data = data_assembly_mmap(
+                            mmap_path,
+                            shape=(num_stimuli, num_time_bins, num_feats),
+                            dtype=self.dtype,
+                            fill_value=np.nan,
+                        )
+                    flatten_activation = self._flatten_activations(layer_activation)
+                    data[i, t, :] = flatten_activation
+
+        data.register_meta(
+            dims=["stimulus_path", "time_bin", "neuroid"],
+            coords={
+                "stimulus_path": stimulus_paths,
+                **neuroid_coords,
+                **time_bin_coords,
+            },
+        )
+
+        return data.to_assembly()
+
+    def _disect_time(self, temporal_layer_activation, num_frames):
+        paces = {}
+        t_dims = {}
+        for layer in temporal_layer_activation:
+            activation = temporal_layer_activation[layer]
+            specs = self.layer_activation_format[layer]
+            t_dim = specs.index("T") if "T" in specs else None
+            num_t = 1 if t_dim is None else activation.shape[t_dim]
+            paces[layer] = num_t / num_frames  # evenly spaced activations over time
+            t_dims[layer] = t_dim
+
+        layer_ts = {layer: 0 for layer in temporal_layer_activation}
+        for t in range(num_frames):
+            ret = {}
+            for layer in temporal_layer_activation:
+                pace = paces[layer]
+                t_dim = t_dims[layer]
+                if t_dim is None:
+                    ret[layer] = temporal_layer_activation[layer]
+                else:
+                    ret[layer] = temporal_layer_activation[layer].take(int(layer_ts[layer]), axis=t_dim)
+                layer_ts[layer] += pace
+            yield t, ret
+
+    def _get_time_bin_coords(self, num_time_bins, fps):
+        interval = 1000 / fps
+        time_bin_starts = np.arange(0, num_time_bins) * interval
+        time_bin_ends = time_bin_starts + interval
+        return {
+            "time_bin_start": ("time_bin", time_bin_starts),
+            "time_bin_end": ("time_bin", time_bin_ends),
+        }
+
+    def _remove_T(self, layer_specs):
+        ret = {}
+        for layer, specs in layer_specs.items():
+            ret[layer] = specs.replace("T", "")
+        return ret
+
+    def load_stimulus(self, path: Union[str, Path]) -> Video:
+        # enable the conversion of images to videos
+        if self.convert_to_video and Stimulus.is_image_path(path):
+            video = Video.from_img_path(path, self.img_duration, self.fps)
+        else:
+            video = Video.from_path(path)
+        video = video.set_fps(self.fps)
+        self._check_video(video)
+        return video
+
+    def _make_range(self, num, type="num_frames"):
+        if num is None:
+            return (1 if type == "num_frames" else 0, np.inf)
+        if isinstance(num, (tuple, list)):
+            return num
+        else:
+            return (num, num)
+
+    def _check_video(self, video: Video):
+        if self.num_frames is not None:
+            estimated_num_frames = int(self.fps * video.duration / 1000)
+            assert (
+                self.num_frames[0] <= estimated_num_frames <= self.num_frames[1]
+            ), f"The number of frames must be within {self.num_frames}, but got {estimated_num_frames}"
+        if self.duration is not None:
+            assert (
+                self.duration[0] <= video.duration <= self.duration[1]
+            ), f"The duration must be within {self.duration}, but got {video.duration}"
+
+
+class TemporalContextInferencerBase(TemporalInferencer):
+    """Base class for context-aware inferencers (e.g., causal or block).
+
+    It computes the allowable temporal context from `num_frames` and `duration`,
+    then resolves an effective context length using `temporal_context_strategy`.
+
+    Parameters
+    ----------
+    temporal_context_strategy: str
+        How to pick the context length:
+        - "greedy": use the maximum allowed context by the model.
+        - "conservative": use the minimum allowed context by the model.
+        - "fix": use `fixed_temporal_context`.
+    fixed_temporal_context: float
+        Fixed context length in ms (used only when strategy is "fix").
+    out_of_bound_strategy: str
+        How to pad when context runs past video bounds. Currently:
+        - "repeat": repeat boundary frames.
+    """
+
+    def __init__(
+        self,
+        *args,
+        temporal_context_strategy: str = "greedy",
+        fixed_temporal_context: float = None,  # if None, default to greedy
+        out_of_bound_strategy: str = "repeat",
+        **kwargs,
+    ):
+        self.temporal_context_strategy = temporal_context_strategy
+        self.fixed_temporal_context = fixed_temporal_context
+        self.out_of_bound_strategy = out_of_bound_strategy
+        if self.temporal_context_strategy == "fix" and self.fixed_temporal_context is None:
+            raise ValueError("fixed_temporal_context must be specified if temporal_context_strategy is 'fix'.")
+        super().__init__(*args, **kwargs)
+
+    @property
+    def identifier(self) -> str:
+        lower, context = self._compute_temporal_context()
+        lower = round(lower, MS_ROUNDING_DIGITS)
+        context = round(context, MS_ROUNDING_DIGITS)
+        to_add = f".context={context}>{lower}.oob={self.out_of_bound_strategy}"
+        return f"{super().identifier}{to_add}"
+
+    def load_stimulus(self, path):
+        if self.convert_to_video and Stimulus.is_image_path(path):
+            video = Video.from_img_path(path, self.img_duration, self.fps)
+        else:
+            video = Video.from_path(path)
+        video = video.set_fps(self.fps)
+        # does no check here
+        return video
+
+    def _overlapped_range(self, start1, end1, start2, end2):
+        # compute the overlapped range of two ranges (start1, end1) and (start2, end2)
+        lower, upper = max(start1, start2), min(end1, end2)
+        if lower > upper:
+            raise ValueError(f"Ranges [{start1}, {end1}] and [{start2}, {end2}] do not overlap.")
+        return lower, upper
+
+    def _compute_temporal_context(self):
+        duration = self.duration
+        num_frames = self.num_frames
+        strategy = self.temporal_context_strategy
+
+        interval = 1000 / self.fps
+        num_frames_implied_ran = (num_frames[0] * interval, num_frames[1] * interval)
+        ran = self._overlapped_range(*num_frames_implied_ran, *duration)
+        lower = ran[0]
+
+        if strategy in ["greedy", "conservative"]:
+            if strategy == "greedy":
+                return lower, ran[1]
+            elif strategy == "conservative":
+                return lower, ran[0]
+
+        elif strategy == "fix":
+            context = self.fixed_temporal_context if self.fixed_temporal_context is not None else ran[1]
+            assert ran[0] <= context <= ran[1], f"Fixed temporal context {context} is not within the range {ran}"
+
+        else:
+            raise ValueError(f"Unknown temporal context strategy: {strategy}")
+
+        return lower, context
+
+
+class CausalInferencer(TemporalContextInferencerBase):
+    """Causal inference over time: each output time uses only past frames.
+
+    For each time bin, it feeds the model with a clip that ends at that time,
+    then keeps only the last activation step. This yields a time series where
+    each point is strictly causal.
+
+    `inference_fps` controls the output time-bin spacing (defaults to model FPS).
+    `fold_model_time` controls whether model time ("T") is folded into the neuroid axis
+    (True) or reduced to the last time step (False, default).
+    """
+
+    def __init__(self, *args, inference_fps: float = None, fold_model_time: bool = False, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.inference_fps = self.fps if inference_fps is None else inference_fps
+        self.fold_model_time = fold_model_time
+        self.temporal_context_strategy = 'fix' if fold_model_time else self.temporal_context_strategy
+
+    @property
+    def identifier(self) -> str:
+        base_id = super().identifier
+        if self.fold_model_time:
+            base_id = f"{base_id}.fold_model_time"
+        if self.inference_fps == self.fps:
+            return base_id
+        return f"{base_id}.ifps={float(self.inference_fps)}"
+
+    def __call__(self, paths, layers, mmap_path=None):
+        lower, context = self._compute_temporal_context()
+        interval = 1000 / self.inference_fps
+        stimuli = self.load_stimuli(paths)
+        longest_stimulus = stimuli[np.argmax(np.array([stimulus.duration for stimulus in stimuli]))]
+        # Match the number of time steps implied by the per-stimulus arange loop.
+        num_time_bins = int(np.ceil(longest_stimulus.duration / interval))
+        num_stimuli = len(paths)
+        time_bin_coords = self._get_time_bin_coords(num_time_bins, self.inference_fps)
+        stimulus_paths = paths
+
+        ts = []
+        stimulus_index = []
+        for s, stimulus in enumerate(stimuli):
+            duration = stimulus.duration
+            videos = []
+            # here we ensure that the covered time range at least include the whole duration
+            for t, time_end in enumerate(np.arange(interval, duration + interval, interval)):
+                # see if the model only receive limited context
+                time_start = self._get_time_start(time_end, context, lower)
+                clip = stimulus.set_window(time_start, time_end, padding=self.out_of_bound_strategy)
+                videos.append(clip)
+                ts.append(t)
+                stimulus_index.append(s)
+            self._executor.add_stimuli(videos)
+
+        data = None
+        for temporal_layer_activations, indicies in self._executor.execute_batch(layers):
+            for temporal_layer_activation, i in zip(temporal_layer_activations, indicies):
+                s = stimulus_index[i]
+                if self.fold_model_time:
+                    layer_activation = temporal_layer_activation
+                    layer_specs = self.layer_activation_format
+                else:
+                    layer_activation = self._get_last_time(temporal_layer_activation)
+                    layer_specs = self._remove_T(self.layer_activation_format)
+                if data is None:
+                    num_feats, neuroid_coords = self._get_neuroid_coords(
+                        layer_activation, layer_specs
+                    )
+                    data = data_assembly_mmap(
+                        mmap_path, shape=(num_stimuli, num_time_bins, num_feats), dtype=self.dtype, fill_value=np.nan
+                    )
+                flatten_activation = self._flatten_activations(layer_activation)
+                t = ts[i]
+                data[s, t, :] = flatten_activation
+
+        data.register_meta(
+            dims=["stimulus_path", "time_bin", "neuroid"],
+            coords={
+                "stimulus_path": stimulus_paths,
+                **neuroid_coords,
+                **time_bin_coords,
+            },
+        )
+
+        return data.to_assembly()
+
+    def _get_time_start(self, time_end, context, lower):
+        assert context >= lower, f"Temporal context {context} is not within the range {lower}"
+        if self.temporal_context_strategy == "fix":
+            return time_end - context
+        elif self.temporal_context_strategy == "greedy":
+            proposed_time_start = time_end - context
+            if proposed_time_start >= 0:
+                return proposed_time_start
+            else:
+                if time_end < lower:
+                    return time_end - lower
+                else:
+                    return 0
+        elif self.temporal_context_strategy == "conservative":
+            return time_end - context
+
+    def _get_last_time(self, temporal_layer_activation):
+        ret = {}
+        for layer, activation in temporal_layer_activation.items():
+            specs = self.layer_activation_format[layer]
+            t_dim = specs.index("T") if "T" in specs else None
+            ret[layer] = activation.take(-1, axis=t_dim) if t_dim is not None else activation
+        return ret
+
+
+class BlockInferencer(TemporalContextInferencerBase):
+    """Block-wise inference: split a video into context-sized chunks.
+
+    Each block is inferred separately and then stitched along time.
+    Block size follows the resolved temporal context (from strategy, num_frames, duration).
+    """
+
+    def __call__(self, paths, layers, mmap_path=None):
+        _, context = self._compute_temporal_context()
+
+        if np.isinf(context):
+            return super().__call__(paths, layers, mmap_path)
+
+        EPS = 1e-6
+        stimuli = self.load_stimuli(paths)
+        num_stimuli = len(paths)
+        stimulus_paths = paths
+
+        num_blocks = []
+        num_time_bins = []
+        num_frames_per_block = None
+        t_offsets = []
+        stimulus_index = []
+        for s, stimulus in enumerate(stimuli):
+            duration = stimulus.duration
+            video_blocks = []
+            # for each stimulus, divide it into block clips with the specified context
+            # EPS makes sure the last block is not out-of-bound
+            for block_id, time_start in enumerate(np.arange(0, duration + EPS, context)):
+                time_end = time_start + context
+                clip = stimulus.set_window(time_start, time_end, padding=self.out_of_bound_strategy)
+                video_blocks.append(clip)
+                stimulus_index.append(s)
+                clip_num_samples = clip.set_fps(self.fps).num_frames
+                if num_frames_per_block is None:
+                    num_frames_per_block = clip_num_samples
+                else:
+                    assert num_frames_per_block == clip_num_samples, "All blocks must have the same number of frames."
+                t_offsets.append(block_id * num_frames_per_block)
+            self._executor.add_stimuli(video_blocks)
+            num_blocks.append(len(video_blocks))
+            num_time_bins.append(len(video_blocks) * num_frames_per_block)
+
+        num_time_bins = max(num_time_bins)
+        time_bin_coords = self._get_time_bin_coords(num_time_bins, self.fps)
+
+        data = None
+        for temporal_layer_activations, indicies in self._executor.execute_batch(layers):
+            for temporal_layer_activation, i in zip(temporal_layer_activations, indicies):
+                s = stimulus_index[i]
+                # determine the time bin correspondence for each layer
+                for t, layer_activation in self._disect_time(temporal_layer_activation, num_frames_per_block):
+                    if data is None:
+                        num_feats, neuroid_coords = self._get_neuroid_coords(
+                            layer_activation, self._remove_T(self.layer_activation_format)
+                        )
+                        data = data_assembly_mmap(
+                            mmap_path, shape=(num_stimuli, num_time_bins, num_feats), dtype=self.dtype, fill_value=np.nan
+                        )
+                    flatten_activation = self._flatten_activations(layer_activation)
+                    t = t_offsets[i] + t
+                    data[s, t, :] = flatten_activation
+
+        data.register_meta(
+            dims=["stimulus_path", "time_bin", "neuroid"],
+            coords={
+                "stimulus_path": stimulus_paths,
+                **neuroid_coords,
+                **time_bin_coords,
+            },
+        )
+
+        return data.to_assembly()
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/__init__.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/__init__.py
deleted file mode 100644
index 5d99e466a..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .base import TemporalInferencer
-from .temporal_context import *
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/base.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/base.py
deleted file mode 100644
index 02c50afe1..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/base.py
+++ /dev/null
@@ -1,134 +0,0 @@
-import numpy as np
-from typing import Union, Tuple, Callable, Hashable, List, Dict
-from pathlib import Path
-
-from brainscore_vision.model_helpers.activations.temporal.inputs import Video, Stimulus
-from brainscore_vision.model_helpers.activations.temporal.utils import assembly_align_to_fps, stack_with_nan_padding
-from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly
-
-from ..base import Inferencer
-from . import time_aligner as time_aligners 
-
-
-class TemporalInferencer(Inferencer):
-    """Inferencer for video stimuli. The model takes video stimuli as input and generate the activations over time.
-    Then, the activations will be aligned to video time by the time_aligner specified in the constructor. The aligned
-    activations will be again unified to the fps specified within the constructor (self.fps). Finally, the activations
-    will be packaged into a NeuroidAssembly.
-    
-    NOTE: for all the time_alignment method, the inference of time bins will only be done with the longest video, but ignore all other input videos.
-
-    Example:
-        temporal_inferencer = TemporalInferenver(..., fps=10)
-        model_assembly = temporal_inferencer(video_paths[1000ms], layers)
-        model_assembly.time_bins -> [(0, 100), (100, 200), ..., (900, 1000)]  # 1000ms, 10fps
-
-    Parameters
-    ----------
-    fps: float
-        frame rate of the model sampling.
-
-    num_frames: int, or (int, int)
-        - If None, the model accepts videos of any length.
-        - If a single int is passed, specify how many frames the model takes. 
-        - If a tuple of two ints is passed, specify the range of the number of frames the model takes (inclusive). If you need to specify infinite, use np.inf.
-    
-    duration: float, or (float, float)
-        - If None, the model accepts videos of any length.
-        - If a single float is passed, specify the duration of the model takes, in ms.
-        - If a tuple of two floats is passed, specify the range of the duration the model takes (inclusive). If you need to specify infinite, use np.inf.
-    
-    time_alignment: str
-        specify the method to align the activations in time.
-        The options and specifications are in the time_aligners module. The current options are:
-        - evenly_spaced: align the activations to have evenly spaced time bins across the whole video time span.
-        - ignore_time: ignore the time information and make a single time bin of the entire video.
-        - estimate_layer_fps: estimate the fps of the layer based on the video fps.
-        - per_frame_aligned: align the activations to the video frames.
-
-    convert_img_to_video: bool
-        whether to convert the input images to videos.
-    img_duration: float
-        specify the duration of the images, in ms. This will work only if convert_img_to_video is True.
-    batch_size: int
-        number of stimuli to process in each batch.
-    batch_grouper: function
-        function that takes a stimulus and return the property based on which the stimuli can be grouped.
-    """
-    def __init__(
-            self,
-            *args,
-            fps : float,
-            num_frames : Union[int, Tuple[int, int]] = None,
-            duration : Union[float, Tuple[float, float]] = None,
-            time_alignment : str = "evenly_spaced",
-            convert_img_to_video : bool = True,
-            img_duration : float = 1000.0,
-            batch_size : int = 32,
-            batch_grouper : Callable[[Video], Hashable] = lambda video: (round(video.duration, 6), video.fps),  # not including video.frame_size because most preprocessors will change the frame size to be the same
-            **kwargs,
-    ):
-        super().__init__(*args, stimulus_type=Video, batch_size=batch_size, 
-                         batch_grouper=batch_grouper, **kwargs)
-        self.fps = fps
-        self.num_frames = self._make_range(num_frames, type="num_frames")
-        self.duration = self._make_range(duration, type="duration")
-        assert hasattr(time_aligners, time_alignment), f"Unknown time alignment method: {time_alignment}"
-        self.time_aligner = getattr(time_aligners, time_alignment)
-
-        if convert_img_to_video:
-            assert img_duration is not None, "img_duration should be specified if convert_img_to_video is True"
-        self.img_duration = img_duration
-        self.convert_to_video = convert_img_to_video
-
-    @property
-    def identifier(self) -> str:
-        id = f"{super().identifier}.{self.time_aligner.__name__}.fps={float(self.fps)}"
-        if self.convert_to_video:
-            id += f".img_dur={float(self.img_duration)}"
-        return id
-
-    def load_stimulus(self, path: Union[str, Path]) -> Video:
-        if self.convert_to_video and Stimulus.is_image_path(path):
-            video = Video.from_img_path(path, self.img_duration, self.fps)
-        else:
-            video = Video.from_path(path)
-        video = video.set_fps(self.fps)
-        self._check_video(video)
-        return video
-    
-    def package_layer(
-            self, 
-            layer_activations : List[np.array],
-            layer_spec : str, 
-            stimuli : List[Stimulus]
-        ):
-        assert len(layer_activations) == len(stimuli)
-        longest_stimulus = stimuli[np.argmax(np.array([stimulus.duration for stimulus in stimuli]))]
-        ignore_time = self.time_aligner is time_aligners.ignore_time
-        channels = self._map_dims(layer_spec)
-        layer_activations = stack_with_nan_padding(layer_activations)
-        assembly = self._package(layer_activations, ["stimulus_path"] + channels)
-        # align to the longest stimulus
-        assembly = self.time_aligner(assembly, longest_stimulus)
-        if "channel_temporal" in channels and not ignore_time: 
-            channels.remove("channel_temporal")
-        assembly = self._stack_neuroid(assembly, channels)
-        if not ignore_time:
-            assembly = assembly_align_to_fps(assembly, self.fps)
-        return assembly
-
-    def _make_range(self, num, type="num_frames"):
-        if num is None:
-            return (1 if type=='num_frames' else 0, np.inf)
-        if isinstance(num, (tuple, list)):
-            return num
-        else:
-            return (num, num)
-
-    def _check_video(self, video: Video):
-        if self.num_frames is not None:
-            estimated_num_frames = int(self.fps * video.duration / 1000)
-            assert self.num_frames[0] <= estimated_num_frames <= self.num_frames[1], f"The number of frames must be within {self.num_frames}, but got {estimated_num_frames}"
-        if self.duration is not None:
-            assert self.duration[0] <= video.duration <= self.duration[1], f"The duration must be within {self.duration}, but got {video.duration}"
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/__init__.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/__init__.py
deleted file mode 100644
index 4813c1fa9..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .causal import CausalInferencer
-from .block import BlockInferencer
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/base.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/base.py
deleted file mode 100644
index 683741004..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/base.py
+++ /dev/null
@@ -1,99 +0,0 @@
-from typing import Union, List
-from pathlib import Path
-
-from brainscore_vision.model_helpers.activations.temporal.inputs import Video, Stimulus
-from ..base import TemporalInferencer
-
-
-MS_ROUNDING_DIGITS = 3
-
-class TemporalContextInferencerBase(TemporalInferencer):
-    """Inferencer base that computes the temporal context for concrete context-based inferencers,
-    like CausalInferencer and BlockInferencer. 
-
-    The context range is first determined by the num_frames and duration. Then, temporal_context_strategy
-    is used to determine the lower bound of temporal context and the expected temporal context.
-    
-    Parameters
-    ----------
-    temporal_context_strategy: str
-        specify how the length of temporal context for causal inference is determined.
-        Options:
-        - "greedy": the length of the temporal context is determined by the maximum of num_frames and duration.
-        - "conservative": the length of the temporal context is determined by the minimum of num_frames and duration.
-        - "fix": the length of the temporal context is determined by the specified "fixed_temporal_context".
-    
-    fixed_temporal_context: float
-        specify the fixed length of the temporal context, in ms. It will be used only if temporal_context_strategy is "fix".
-    
-    out_of_bound_strategy: str
-        specify how to handle the out-of-bound temporal context.
-        Options:
-        - "repeat": the out-of-bound temporal context will be repeated.
-        - TODO: "black": the out-of-bound temporal context will be zero-padded.
-        - TODO: "gray": the out-of-bound temporal context will be 128-padded.
-    """
-    def __init__(
-            self, 
-            *args,
-            temporal_context_strategy : str = "greedy",
-            fixed_temporal_context : float = None,
-            out_of_bound_strategy : str = "repeat",
-            **kwargs
-        ):
-        self.temporal_context_strategy = temporal_context_strategy
-        self.fixed_temporal_context = fixed_temporal_context
-        self.out_of_bound_strategy = out_of_bound_strategy
-        if self.temporal_context_strategy == "fix" and self.fixed_temporal_context is None:
-            raise ValueError("fixed_temporal_context must be specified if temporal_context_strategy is 'fix'.")
-        super().__init__(*args, **kwargs)
-
-    @property
-    def identifier(self) -> str:
-        lower, context = self._compute_temporal_context()
-        lower = round(lower, MS_ROUNDING_DIGITS)
-        context = round(context, MS_ROUNDING_DIGITS)
-        to_add = f".context={context}>{lower}.oob={self.out_of_bound_strategy}"
-        return f"{super().identifier}{to_add}"
-        
-    def load_stimulus(self, path: Union[str, Path]) -> Video:
-        if self.convert_to_video and Stimulus.is_image_path(path):
-            video = Video.from_img_path(path, self.img_duration, self.fps)
-        else:
-            video = Video.from_path(path)
-        video = video.set_fps(self.fps)
-        # does no check here
-        return video
-
-    def _overlapped_range(self, start1, end1, start2, end2):
-        # compute the overlapped range of two ranges (start1, end1) and (start2, end2)
-        lower, upper = max(start1, start2), min(end1, end2)
-        if lower > upper:
-            raise ValueError(f"Ranges [{start1}, {end1}] and [{start2}, {end2}] do not overlap.")
-        return lower, upper
-        
-    def _compute_temporal_context(self):
-        duration = self.duration
-        num_frames = self.num_frames
-        strategy = self.temporal_context_strategy
-
-        interval = 1000 / self.fps
-        num_frames_implied_ran = (num_frames[0] * interval, num_frames[1] * interval)
-        ran = self._overlapped_range(*num_frames_implied_ran, *duration)
-        lower = ran[0]
-
-        if strategy in ["greedy", "conservative"]:
-            if strategy == "greedy":
-                return lower, ran[1]
-            elif strategy == "conservative":
-                return lower, ran[0]
-
-        elif strategy == "fix":
-            context = self.fixed_temporal_context
-            assert ran[0] <= context <= ran[1], f"Fixed temporal context {context} is not within the range {ran}"
-
-        else:
-            raise ValueError(f"Unknown temporal context strategy: {strategy}")
-
-        return lower, context
-    
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/block.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/block.py
deleted file mode 100644
index f6823a061..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/block.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import numpy as np
-from collections import OrderedDict
-from tqdm import tqdm
-
-from .base import TemporalContextInferencerBase
-from brainscore_vision.model_helpers.activations.temporal.utils import stack_with_nan_padding
-
-
-class BlockInferencer(TemporalContextInferencerBase):
-    """Inferencer that divides the original video into smaller blocks and does inference on the blocks separately.
-    Finally, the activations are joint along the temporal dimension for the final activations. 
-
-    Specifically, suppose the video lasts for 1000ms and the block size is 400ms.
-    Then, the video is segmented into [0~400ms], [400~800ms], [800~1200ms] (1000~1200ms padded).
-    The activations for each segment will be stacked together.
-
-    The block size is determined by the temporal parameters (num_frames & duration) and temporal_context_strategy.
-    If num_frames or duration is given, the model's temporal context will be set to match the two.
-    """
-    
-    def inference(self, stimuli, layers):
-        _, context = self._compute_temporal_context()
-        self._time_ends = []
-
-        if np.isinf(context):
-            # if the context is inf, pass the whole video directively
-            self._time_ends = [inp.duration for inp in stimuli]
-            layer_activations = super().inference(stimuli, layers)
-            layer_activations = OrderedDict([(layer, [[a] for a in activations]) 
-                                             for layer, activations in layer_activations.items()]) 
-                                             # make one-clip video activations
-        else:
-            num_clips = []
-            for stimulus in stimuli:
-                duration = stimulus.duration
-                videos = []
-                # for each stimulus, divide it into block clips with the specified context
-                for time_start in np.arange(0, duration, context):
-                    time_end = time_start + context
-                    clip = stimulus.set_window(time_start, time_end, padding=self.out_of_bound_strategy)
-                    videos.append(clip)
-                # record the actual time_end (which could be larger than the duration of the original video)
-                # so that we can align the time correctly when packaging the activations
-                self._time_ends.append(time_end)
-                self._executor.add_stimuli(videos)
-                num_clips.append(len(videos))  # record the number of clips for each video
-
-            activations = self._executor.execute(layers)
-            layer_activations = OrderedDict()
-            for layer in layers:
-                clip_start = 0
-                for num_clip in num_clips:
-                    # retrieve clips from a video by num_clip
-                    clips = activations[layer][clip_start:clip_start+num_clip]  # clips for this video
-                    layer_activations.setdefault(layer, []).append(clips)
-                    clip_start += num_clip
-    
-        # concat the activations from the clips of the same video
-        ret = OrderedDict()
-        for layer in layers:
-            activation_dims = self.layer_activation_format[layer]
-            for clips in layer_activations[layer]:
-                # make T the first dimension, as [T, ...] for easy concatenation
-                # the package_layer will change accordingly
-                if 'T' in activation_dims:
-                    time_index = activation_dims.index('T')
-                    clips = [np.moveaxis(a, time_index, 0) for a in clips]
-                    ret.setdefault(layer, []).append(np.concatenate(clips, axis=0))
-                else:
-                    ret.setdefault(layer, []).append(np.stack(clips, axis=0))
-
-        return ret
-    
-    def package_layer(self, activations, layer_spec, stimuli):
-        layer_spec = "T" + layer_spec.replace('T', '')  # T has been moved to the first dimension
-        stimuli = [stimulus.set_window(0, time_end) for stimulus, time_end in zip(stimuli, self._time_ends)]
-        return super().package_layer(activations, layer_spec, stimuli) 
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/causal.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/causal.py
deleted file mode 100644
index 778961660..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/causal.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import numpy as np
-from collections import OrderedDict
-
-from .base import TemporalContextInferencerBase
-from brainscore_vision.model_helpers.activations.temporal.inputs.video import Video
-from brainscore_vision.model_helpers.activations.temporal.utils import stack_with_nan_padding
-
-
-class CausalInferencer(TemporalContextInferencerBase):
-    """Inferencer that ensures the activations are causal by feeding in the video in a causal manner. 
-
-    Specifically, suppose the video lasts for 1000ms and the model samples every 100ms.
-    Then, the activations from the last time step of the activations from the following videos:
-    [0 ~ 100ms], [0 ~ 200ms], ..., [0 ~ 1000ms].
-    will be stacked together to form the final activations for the video. In this way, the 
-    activations are always "causal", which means that the temporal contexts of the model are 
-    always from the past, not the future.
-
-    If num_frames or duration is given, the model's temporal context will be set to match the two.
-    """
-    def __init__(
-            self, 
-            *args,
-            **kwargs
-        ):
-        if "time_alignment" in kwargs:
-            if kwargs["time_alignment"] != "per_frame_aligned":
-                raise ValueError("CausalInferencer enforces time_alignment='per_frame_aligned'.")
-            else:
-                del kwargs["time_alignment"]
-        super().__init__(*args, **kwargs, time_alignment="per_frame_aligned")
-
-    def inference(self, stimuli, layers):
-        interval = 1000 / self.fps
-        lower, context = self._compute_temporal_context()
-        num_clips = []
-        latest_time_end = 0
-        for inp in stimuli:
-            duration = inp.duration
-            videos = []
-            # here we ensure that the covered time range at least include the whole duration
-            for time_end in np.arange(interval, duration+interval, interval):
-                # see if the model only receive limited context
-                time_start = self._get_time_start(time_end, context, lower)
-                clip = inp.set_window(time_start, time_end, padding=self.out_of_bound_strategy)
-                latest_time_end = max(time_end, latest_time_end)
-                videos.append(clip)
-
-            self._executor.add_stimuli(videos)
-            num_clips.append(len(videos))
-
-        activations = self._executor.execute(layers)
-        layer_activations = OrderedDict()
-        for layer in layers:
-            activation_dims = self.layer_activation_format[layer]
-            clip_start = 0
-            for num_clip in num_clips:
-                video_activations = activations[layer][clip_start:clip_start+num_clip]  # clips for this video
-                # make T the first dimension, as [T, ...]
-                if 'T' in activation_dims:
-                    time_index = activation_dims.index('T')
-                    video_activations = [a.take(-1, axis=time_index) for a in video_activations]
-                layer_activations.setdefault(layer, []).append(np.stack(video_activations, axis=0))
-                clip_start += num_clip
-
-        return layer_activations
-    
-    def package_layer(self, activations, layer_spec, stimuli):
-        layer_spec = "T" + layer_spec.replace('T', '')  # T has been moved to the first dimension
-        return super().package_layer(activations, layer_spec, stimuli) 
-    
-    def _get_time_start(self, time_end, context, lower):
-        assert context >= lower, f"Temporal context {context} is not within the range {lower}"
-        if self.temporal_context_strategy == "fix":
-            return time_end - context
-        elif self.temporal_context_strategy == "greedy":
-            proposed_time_start = time_end - context
-            if proposed_time_start >= 0:
-                return proposed_time_start
-            else:
-                if time_end < lower:
-                    return time_end - lower
-                else:
-                    return 0 
-        elif self.temporal_context_strategy == "conservative":
-            return time_end - context
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/time_aligner.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/time_aligner.py
deleted file mode 100644
index b656cb767..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/time_aligner.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import numpy as np
-from brainscore_core.supported_data_standards.brainio.assemblies import DataAssembly
-from brainscore_vision.model_helpers.activations.temporal.inputs.video import Video
-
-
-"""This module includes different time alignment strategies for the activations of a temporal neural network.
-
-    A time alignment strategy is a function that takes the DataAssembly (with channel_temporal) and the video stimuli,
-    and aligns the activations to the video time. The channel_temporal dimension will be changed into time_bin dimension,
-    and the time_bin_start and time_bin_end will be added as coordinates of it.
-"""
-
-
-def _convert(assembly, time_bin_starts, time_bin_ends):
-    # this function converts the "channel_temporal" dimension to "time_bin" dimension
-    # if "channel_temporal" is not present, it adds a "time_bin" dimension
-    asm_type = assembly.__class__
-    if "channel_temporal" in assembly.dims:
-        assembly = assembly.drop_vars("channel_temporal")
-        assembly = assembly.rename({"channel_temporal": "time_bin"})
-    else:
-        assembly = assembly.expand_dims("time_bin")
-    assembly = assembly.assign_coords({
-        "time_bin_start": ("time_bin", time_bin_starts),
-        "time_bin_end": ("time_bin", time_bin_ends)
-    })
-    return asm_type(assembly)
-
-def estimate_layer_fps(assembly : DataAssembly, video : Video) -> DataAssembly:
-    # in a temporal neural net, different layers may have different temporal resolutions
-    # this function estimates the temporal resolution of a layer, based on the video fps
-    fps = video.fps
-    num_t = assembly.sizes['channel_temporal'] if "channel_temporal" in assembly.dims else 1
-    duration = video.duration
-    model_frame_interval = 1000 / fps
-    estimated_frame_interval = duration / num_t
-    estimated_multiplier = estimated_frame_interval / model_frame_interval
-    estimated_multiplier = int(round(estimated_multiplier))  # round to nearest integer
-    estimated_interval = model_frame_interval * estimated_multiplier
-    time_bin_starts = np.arange(0, num_t) * estimated_interval
-    time_bin_ends = time_bin_starts + estimated_interval
-    return _convert(assembly, time_bin_starts, time_bin_ends)
-
-def evenly_spaced(assembly : DataAssembly, video : Video) -> DataAssembly:
-    # this function assumes that the activation of different time steps is evenly spaced
-    num_t = assembly.sizes['channel_temporal'] if "channel_temporal" in assembly.dims else 1
-    interval = video.duration / num_t
-    time_bin_starts = np.linspace(0, video.duration, num_t+1)[:-1]
-    time_bin_ends = time_bin_starts + interval
-    time_bin_ends[-1] = video.duration
-    return _convert(assembly, time_bin_starts, time_bin_ends)
-
-def per_frame_aligned(assembly : DataAssembly, video : Video) -> DataAssembly:
-    # this function assumes that the activation of different time steps is aligned with the video frames
-    num_t = assembly.sizes['channel_temporal'] if "channel_temporal" in assembly.dims else 1
-    assert video.num_frames <= num_t
-    interval = 1000 / video.fps
-    time_bin_starts = np.arange(0, num_t) * interval
-    time_bin_ends = time_bin_starts + interval
-    return _convert(assembly, time_bin_starts, time_bin_ends)
-
-def ignore_time(assembly : DataAssembly, video : Video) -> DataAssembly:
-    # this function treats the activations from the entire video as from a single time bin,
-    # and treat the "channel_temporal" as a regular channel dimension and does no conversion 
-    asm_type = assembly.__class__
-    assembly = assembly.expand_dims("time_bin")
-    time_bin_starts = [0]
-    time_bin_ends = [video.duration]
-    assembly = assembly.assign_coords({
-        "time_bin_start": ("time_bin", time_bin_starts),
-        "time_bin_end": ("time_bin", time_bin_ends)
-    })
-    return asm_type(assembly)
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/inputs/__init__.py b/brainscore_vision/model_helpers/activations/temporal/inputs/__init__.py
index 1e2d95604..db3fbec97 100644
--- a/brainscore_vision/model_helpers/activations/temporal/inputs/__init__.py
+++ b/brainscore_vision/model_helpers/activations/temporal/inputs/__init__.py
@@ -1,3 +1,302 @@
-from .video import Video
-from .image import Image
-from .base import Stimulus
\ No newline at end of file
+import os
+from pathlib import Path
+from typing import Tuple, Union
+
+import cv2
+import numpy as np
+from PIL import Image as PILImage
+
+from ..utils import batch_2d_resize
+
+
+class Stimulus:
+    def from_path(self, path):
+        raise NotImplementedError("Choose a concrete Stimulus type to use.")
+
+    @staticmethod
+    def is_video_path(path: Union[str, Path]) -> bool:
+        extension = path.split('.')[-1].lower()
+        return extension in ['mp4', 'avi', 'mov', 'flv', 'wmv', 'webm', 'mkv', 'gif']
+
+    @staticmethod
+    def is_image_path(path: Union[str, Path]) -> bool:
+        extension = path.split('.')[-1].lower()
+        return extension in ['jpg', 'jpeg', 'png', 'bmp', 'tiff']
+
+
+class Image(Stimulus):
+    def __init__(self, path: str, size: int):
+        self._path = path
+        self._size = size
+
+    def copy(self):
+        return Image(self._path, self._size)
+
+    @property
+    def size(self):
+        return self._size
+
+    def set_size(self, size):
+        img = self.copy()
+        img._size = size
+        return img
+
+    def from_path(path):
+        return Image(path, get_image_size(path))
+
+    def to_pil_img(self):
+        return PILImage.fromarray(self.to_numpy())
+
+    def get_frame(self):
+        return np.array(PILImage.open(self._path).convert('RGB'))
+
+    # return (H, W, C[RGB])
+    def to_numpy(self):
+        arr = self.get_frame()
+
+        if arr.shape[:2][::-1] != self._size:
+            arr = batch_2d_resize(arr[None, :], self._size, "bilinear")[0]
+
+        return arr
+
+    def store_to_path(self, path):
+        self.to_img().save(path)
+        return path
+
+
+def get_image_size(path):
+    with PILImage.open(path) as img:
+        size = img.size
+    return size
+
+
+EPS = 1e-9
+
+
+def get_video_stats(video_path):
+    assert os.path.exists(video_path), f"Video file {video_path} does not exist."
+    cap = cv2.VideoCapture(video_path)
+    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    size = (width, height)
+    duration = length / fps * 1000
+    cap.release()
+    return fps, duration, size
+
+
+def get_image_stats(image_path):
+    with PILImage.open(image_path) as img:
+        return img.size
+    return size
+
+
+class Video(Stimulus):
+    """Video object that represents a video clip."""
+
+    def __init__(
+        self,
+        path: Union[str, Path],
+        fps: float,
+        start: float,
+        end: float,
+        size: Tuple[int, int],
+    ):
+        self._path = path
+        self._fps = fps
+        self._size = size
+        self._start = start
+        self._end = end
+        self._original_fps = None
+        self._original_duration = None
+        self._original_size = None
+
+    def __getattribute__(self, key):
+        if key.startswith("_original_"):
+            if super().__getattribute__(key) is None:
+                self._original_fps, self._original_duration, self._original_size = get_video_stats(self._path)
+        return super().__getattribute__(key)
+
+    def copy(self):
+        # return view
+        video = self.__class__(self._path, self._fps, self._start, self._end, self._size)
+        video._original_fps = self._original_fps
+        video._original_duration = self._original_duration
+        video._original_size = self._original_size
+        return video
+
+    @property
+    def duration(self):
+        # in ms
+        return self._end - self._start
+
+    @property
+    def fps(self):
+        return self._fps
+
+    @property
+    def num_frames(self):
+        return int(self.duration * self.fps / 1000 + EPS)
+
+    @property
+    def original_num_frames(self):
+        return int(self._original_duration * self._original_fps / 1000 + EPS)
+
+    @property
+    def frame_size(self):
+        return self._size
+
+    ### Transformations: return copy
+
+    def set_fps(self, fps):
+        assert 1000 / fps <= self.duration, f"fps {fps} is too low for duration {self.duration}ms."
+        video = self.copy()
+        video._fps = fps
+        return video
+
+    def set_size(self, size):
+        # size: (width, height)
+        video = self.copy()
+        video._size = size
+        return video
+
+    def set_window(self, start, end, padding="repeat"):
+        # use ms as the time scale
+        if end < start:
+            raise ValueError("end time is earlier than start time")
+
+        if padding != "repeat":
+            raise NotImplementedError()
+
+        video = self.copy()
+        video._start = self._start + start
+        video._end = self._start + end
+        return video
+
+    def _check_indices_ascending(self, indices):
+        if len(indices) == 0:
+            return False
+        if len(indices) == 1:
+            return True
+        for i in range(1, len(indices)):
+            if indices[i] < indices[i - 1]:
+                return False
+        return True
+
+    def _sanitize_frames(self, frames, tol=0.01):
+        # check if the read frames are valid
+        # if some last frames are invalid, just copy the last valid frame
+        # default tolerance: 0.01 of the total duration
+        num_invalid = sum([f is None for f in frames])
+        if num_invalid == len(frames):
+            raise ValueError("No valid frames.")
+        for i in range(num_invalid):
+            assert frames[-1 - i] is None, "Invalid frames are not at the end."
+        if num_invalid > int(self._original_duration / 1000 * self._fps * tol):
+            raise ValueError("Too many invalid frames.")
+        if num_invalid > 0:
+            for i in range(num_invalid):
+                frames[-1 - i] = frames[-1 - num_invalid]
+            print(f"Warning: last {num_invalid} frames are invalid.")
+        return frames
+
+    def get_frames(self, indices):
+        cap = cv2.VideoCapture(self._path)
+        if not cap.isOpened():
+            raise ValueError(f"Cannot open video file: {self._path}")
+
+        def _read(cap):
+            ret, frame = cap.read()
+            if not ret:
+                return None
+            return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+        # ascending read optimization
+        frames = []
+        if self._check_indices_ascending(indices):
+            cap.set(cv2.CAP_PROP_POS_FRAMES, indices[0])  # Move to the first frame index
+            frame_index = indices[0] - 1
+            for target_index in indices:
+                to_move = target_index - frame_index
+                for _ in range(to_move):
+                    frame = _read(cap)
+                frames.append(frame)
+                frame_index += to_move
+        else:
+            # random access
+            for i, index in enumerate(indices):
+                cap.set(cv2.CAP_PROP_POS_FRAMES, index)  # Move to the frame index
+                frames.append(_read(cap))
+
+        cap.release()
+        frames = self._sanitize_frames(frames)
+
+        return np.array(frames)
+
+    ### I/O
+    def from_path(path):
+        fps, end, size = get_video_stats(path)
+        start = 0
+        return Video(path, fps, start, end, size)
+
+    def from_img_path(img_path, duration, fps):
+        # duration in ms
+        size = get_image_stats(img_path)
+        return VideoFromImage(img_path, fps, 0, duration, size)
+
+    def to_numpy(self):
+        # get the time stamps of frame samples
+        start_frame = self._start * self._original_fps / 1000
+        end_frame = self._end * self._original_fps / 1000
+        # avoid taking the last extra frame
+        samples = np.arange(start_frame, end_frame - EPS, self._original_fps / self.fps)
+        sample_indices = samples.astype(int)
+
+        # padding: repeat the first/last frame
+        original_num_frames = int(self._original_duration * self._original_fps / 1000 - EPS)  # EPS to avoid last frame OOB error
+        sample_indices = np.clip(sample_indices, 0, original_num_frames - 1)
+
+        # actual sampling
+        frames = self.get_frames(sample_indices)
+
+        # resizing
+        if self._size != (frames.shape[2], frames.shape[1]):
+            frames = batch_2d_resize(frames, self._size, "bilinear")
+
+        return frames
+
+    def to_frames(self):
+        return [f for f in self.to_numpy()]
+
+    def to_pil_imgs(self):
+        return [PILImage.fromarray(frame) for frame in self.to_numpy()]
+
+    def to_path(self):
+        # use context manager ?
+        path = None  # make a temporal file
+        raise NotImplementedError()
+        return path
+
+    def store_to_path(self, path):
+        # pick format based on path filename
+        if path.endswith(".avi"):
+            fourcc = cv2.VideoWriter_fourcc(*'XVID')
+        elif path.endswith(".mp4"):
+            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        else:
+            raise ValueError("Unsupported video format.")
+
+        out = cv2.VideoWriter(path, fourcc, self._fps, self._size)
+        for frame in self.to_frames():
+            out.write(frame[..., ::-1])  # to RGB
+        out.release()
+        return path
+
+
+class VideoFromImage(Video):
+    def get_frames(self, indices):
+        data = Image.from_path(self._path).to_numpy()
+        N = len(indices)
+        ret = np.repeat(data[np.newaxis, ...], N, axis=0)
+        return ret
diff --git a/brainscore_vision/model_helpers/activations/temporal/inputs/base.py b/brainscore_vision/model_helpers/activations/temporal/inputs/base.py
deleted file mode 100644
index c94ccd3d7..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/inputs/base.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from typing import Union
-from pathlib import Path
-
-
-class Stimulus:
-    def from_path(self, path):
-        raise NotImplementedError("Choose a concrete Stimulus type to use.")
-    
-    @staticmethod
-    def is_video_path(path: Union[str, Path]) -> bool:
-        extension = path.split('.')[-1].lower()
-        return extension in ['mp4', 'avi', 'mov', 'flv', 'wmv', 'webm', 'mkv', 'gif']
-    
-    @staticmethod
-    def is_image_path(path: Union[str, Path]) -> bool:
-        extension = path.split('.')[-1].lower()
-        return extension in ['jpg', 'jpeg', 'png', 'bmp', 'tiff']
diff --git a/brainscore_vision/model_helpers/activations/temporal/inputs/image.py b/brainscore_vision/model_helpers/activations/temporal/inputs/image.py
deleted file mode 100644
index 6d7790ac1..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/inputs/image.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import numpy as np
-from PIL import Image as PILImage
-
-from .base import Stimulus
-from brainscore_vision.model_helpers.activations.temporal.utils import batch_2d_resize
-
-
-class Image(Stimulus):
-    def __init__(self, path: str, size: int):
-        self._path = path
-        self._size = size
-
-    def copy(self):
-        return Image(self._path, self._size)
-    
-    @property
-    def size(self):
-        return self._size
-    
-    def set_size(self, size):
-        img = self.copy()
-        img._size = size
-        return img
-    
-    def from_path(path):
-        return Image(path, get_image_size(path))
-    
-    def to_pil_img(self):
-        return PILImage.fromarray(self.to_numpy())
-
-    def get_frame(self):
-        return np.array(PILImage.open(self._path).convert('RGB'))
-    
-    # return (H, W, C[RGB])
-    def to_numpy(self):
-        arr = self.get_frame()
-
-        if arr.shape[:2][::-1] != self._size:
-            arr = batch_2d_resize(arr[None,:], self._size, "bilinear")[0]
-
-        return arr
-
-    def store_to_path(self, path):
-        self.to_img().save(path)
-        return path
-
-def get_image_size(path):
-    with PILImage.open(path) as img:
-        size = img.size
-    return size
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/inputs/video.py b/brainscore_vision/model_helpers/activations/temporal/inputs/video.py
deleted file mode 100644
index d8287a4a7..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/inputs/video.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import cv2
-from decord import VideoReader
-import numpy as np
-from PIL import Image as PILImage
-from typing import Tuple, Union
-from pathlib import Path
-
-from .base import Stimulus
-from .image import Image
-from brainscore_vision.model_helpers.activations.temporal.utils import batch_2d_resize
-
-
-EPS = 1e-9  
-
-def get_video_stats(video_path):
-    cap = cv2.VideoCapture(video_path)
-    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps    = cap.get(cv2.CAP_PROP_FPS)
-    size = (width, height)
-    duration = length / fps * 1000
-    cap.release()
-    return fps, duration, size
-
-
-def get_image_stats(image_path):
-    with PILImage.open(image_path) as img:
-        return img.size
-    return size
-
-
-class Video(Stimulus):
-    """Video object that represents a video clip."""
-
-    def __init__(
-            self, 
-            path: Union[str, Path], 
-            fps: float, 
-            start: float, 
-            end: float, 
-            size: Tuple[int, int]
-        ):
-        self._path = path
-        self._fps = fps
-        self._size = size
-        self._start = start
-        self._end = end
-        self._original_fps = None
-        self._original_duration = None
-        self._original_size = None
-
-    def __getattribute__(self, key):
-        if key.startswith("_original_"):
-            if super().__getattribute__(key) is None:
-                self._original_fps, self._original_duration, self._original_size = get_video_stats(self._path)
-        return super().__getattribute__(key)
-
-    def copy(self):
-        # return view
-        video = self.__class__(self._path, self._fps, self._start, self._end, self._size)
-        video._original_fps = self._original_fps
-        video._original_duration = self._original_duration
-        video._original_size = self._original_size
-        return video
-    
-    @property
-    def duration(self):
-        # in ms
-        return self._end - self._start
-
-    @property
-    def fps(self):
-        return self._fps
-
-    @property
-    def num_frames(self):
-        return int(self.duration * self.fps/1000 + EPS)
-    
-    @property
-    def original_num_frames(self):
-        return int(self._original_duration * self._original_fps/1000 + EPS)
-    
-    @property
-    def frame_size(self):
-        return self._size
-    
-    ### Transformations: return copy
-    
-    def set_fps(self, fps):
-        assert 1000/fps <= self.duration, f"fps {fps} is too low for duration {self.duration}ms."
-        video = self.copy()
-        video._fps = fps
-        return video
-    
-    def set_size(self, size):
-        # size: (width, height)
-        video = self.copy()
-        video._size = size
-        return video
-
-    def set_window(self, start, end, padding="repeat"):
-        # use ms as the time scale
-        if end < start:
-            raise ValueError("end time is earlier than start time")
-        
-        if padding != "repeat":
-            raise NotImplementedError()
-        
-        video = self.copy()
-        video._start = self._start + start
-        video._end = self._start + end
-        return video
-    
-    def get_frames(self, indices):
-        reader = VideoReader(self._path)
-        frames = reader.get_batch(indices).asnumpy()
-        del reader
-        return frames
-
-    ### I/O
-    def from_path(path):
-        fps, end, size = get_video_stats(path)
-        start = 0
-        return Video(path, fps, start, end, size)
-    
-    def from_img_path(img_path, duration, fps):
-        # duration in ms
-        size = get_image_stats(img_path)
-        return VideoFromImage(img_path, fps, 0, duration, size)
-    
-    def to_numpy(self):
-        # get the time stamps of frame samples
-        start_frame = self._start * self._original_fps / 1000
-        end_frame = self._end * self._original_fps / 1000
-        # avoid taking the last extra frame
-        samples = np.arange(start_frame, end_frame - EPS, self._original_fps/self.fps)
-        sample_indices = samples.astype(int)
-
-        # padding: repeat the first/last frame
-        original_num_frames = int(self._original_duration * self._original_fps/1000 - EPS)  # EPS to avoid last frame OOB error
-        sample_indices = np.clip(sample_indices, 0, original_num_frames-1)
-
-        # actual sampling
-        frames = self.get_frames(sample_indices)
-
-        # resizing
-        if self._size != (frames.shape[2], frames.shape[1]):
-            frames = batch_2d_resize(frames, self._size, "bilinear")
-
-        return frames
-    
-    def to_frames(self):
-        return [f for f in self.to_numpy()]
-
-    def to_pil_imgs(self):
-        return [PILImage.fromarray(frame) for frame in self.to_numpy()]
-
-    def to_path(self):
-        # use context manager ?
-        path = None  # make a temporal file
-        raise NotImplementedError()
-        return path
-
-    def store_to_path(self, path):
-        # pick format based on path filename
-        if path.endswith(".avi"):
-            fourcc = cv2.VideoWriter_fourcc(*'XVID')
-        elif path.endswith(".mp4"):
-            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-        else:
-            raise ValueError("Unsupported video format.")
-
-        out = cv2.VideoWriter(path, fourcc, self._fps, self._size)
-        for frame in self.to_frames():
-            out.write(frame[...,::-1])  # to RGB
-        out.release()
-        return path
-    
-
-class VideoFromImage(Video):
-    def get_frames(self, indices):
-        data = Image.from_path(self._path).to_numpy()
-        N = len(indices)
-        ret = np.repeat(data[np.newaxis, ...], N, axis=0)
-        return ret
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/model/__init__.py b/brainscore_vision/model_helpers/activations/temporal/model/__init__.py
index cf53c04df..2fca6d571 100644
--- a/brainscore_vision/model_helpers/activations/temporal/model/__init__.py
+++ b/brainscore_vision/model_helpers/activations/temporal/model/__init__.py
@@ -1,2 +1,146 @@
-from .base import ActivationWrapper
-from .pytorch import PytorchWrapper
\ No newline at end of file
+from collections import OrderedDict
+import logging
+import typing
+from typing import Any, Callable, Dict, List
+
+import numpy as np
+
+from brainscore_vision.model_helpers.utils import fullname
+
+from ..core import ActivationsExtractor, Inferencer, TemporalInferencer
+from ..inputs import Stimulus
+
+
+SUBMODULE_SEPARATOR = '.'
+
+
+def default_process_activation(layer, layer_name, inputs, output):
+    # (torch.nn.Module, str, torch.Tensor, torch.Tensor) -> torch.Tensor
+    return output
+
+
+class ActivationWrapper:
+    def __init__(
+        self,
+        identifier: str,
+        preprocessing: Callable[[List[Stimulus]], Any],
+        inferencer_cls: Inferencer = TemporalInferencer,
+        **extractor_kwargs,
+    ):
+        self.identifier = identifier
+        self.preprocessing = preprocessing
+        self.build_extractor(inferencer_cls, **extractor_kwargs)
+
+    # List[preprocessed_input] -> Dict[layer -> np.array]
+    def get_activations(self, inputs: List[Stimulus], layers: List[str]) -> Dict[str, np.array]:
+        raise NotImplementedError()
+
+    def __call__(self, *args, **kwargs):
+        return self._extractor(*args, **kwargs)
+
+    def build_extractor(self, inferencer_cls, *args, **kwargs):
+        extractor = ActivationsExtractor(
+            identifier=self.identifier,
+            inferencer=inferencer_cls(get_activations=self.get_activations, preprocessing=self.preprocessing, *args, **kwargs),
+        )
+        extractor.insert_attrs(self)
+        self._extractor = extractor
+
+
+class PytorchWrapper(ActivationWrapper):
+    def __init__(
+        self,
+        identifier: str,
+        model,
+        preprocessing: Callable[[List[Stimulus]], Any],
+        process_output=None,
+        *args,
+        **kwargs,
+    ):
+        import torch
+
+        logger = logging.getLogger(fullname(self))
+        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        logger.debug(f"Using device {self._device}")
+        self._model = model.to(self._device)
+        # preprocessing: input.Stimulus -> actual model inputs
+        self._preprocess = preprocessing
+        self._process_activation = default_process_activation if process_output is None else process_output
+        super().__init__(identifier, preprocessing, *args, **kwargs)
+
+    def forward(self, inputs: List[Any]) -> Any:
+        # this function gets a list of preprocessed inputs and does the forward pass
+        import torch
+
+        tensor = torch.stack(inputs)
+        tensor = tensor.to(self._device)
+        return self._model(tensor)
+
+    def get_activations(self, inputs: List[Any], layer_names: List[str]) -> typing.OrderedDict[str, Any]:
+        import torch
+
+        self._model.eval()
+        layer_results = OrderedDict()
+        hooks = []
+
+        for layer_name in layer_names:
+            layer = self.get_layer(layer_name)
+            hook = self._register_hook(layer, layer_name, target_dict=layer_results)
+            hooks.append(hook)
+
+        with torch.no_grad():
+            self.forward(inputs)
+
+        for hook in hooks:
+            hook.remove()
+        return layer_results
+
+    def get_layer(self, layer_name: str):
+        # the layer_name is a string that represents the module hierarchy up to the target layer,
+        # seperated by ".", e.g., "module1.submodule2.relu".
+        if layer_name == 'logits':
+            return self._output_layer()
+        module = self._model
+        for part in layer_name.split(SUBMODULE_SEPARATOR):
+            module = module._modules.get(part)
+            assert module is not None, f"No submodule found for layer {layer_name}, at part {part}"
+        return module
+
+    def _output_layer(self):
+        module = self._model
+        while module._modules:
+            module = module._modules[next(reversed(module._modules))]
+        return module
+
+    def _register_hook(self, layer, layer_name, target_dict):
+        def hook_function(_layer, _input, output, name=layer_name, target_dict=target_dict):
+            output = self._process_activation(_layer, name, _input, output)
+            target_dict[name] = output
+
+        hook = layer.register_forward_hook(hook_function)
+        return hook
+
+    def __repr__(self):
+        return repr(self._model)
+
+    def layers(self):
+        for name, module in self._model.named_modules():
+            if len(list(module.children())) > 0:  # this module only holds other modules
+                continue
+            yield name, module
+
+    def graph(self):
+        import networkx as nx
+
+        g = nx.DiGraph()
+        for layer_name, layer in self.layers():
+            g.add_node(layer_name, object=layer, type=type(layer))
+        return g
+
+
+__all__ = [
+    "ActivationWrapper",
+    "PytorchWrapper",
+    "default_process_activation",
+    "SUBMODULE_SEPARATOR",
+]
diff --git a/brainscore_vision/model_helpers/activations/temporal/model/base.py b/brainscore_vision/model_helpers/activations/temporal/model/base.py
deleted file mode 100644
index 3b0b55bd6..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/model/base.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import numpy as np
-from ..inputs.base import Stimulus
-from ..core import ActivationsExtractor
-from ..core import TemporalInferencer, Inferencer
-
-from typing import List, Callable, Any, Dict
-
-
-class ActivationWrapper:
-    def __init__(
-            self, 
-            identifier : str, 
-            preprocessing : Callable[[List[Stimulus]], Any],
-            inferencer_cls : Inferencer = TemporalInferencer, 
-            **extractor_kwargs
-        ):
-        self.identifier = identifier
-        self.preprocessing = preprocessing
-        self.build_extractor(inferencer_cls, **extractor_kwargs)
-
-    # List[preprocessed_input] -> Dict[layer -> np.array]
-    def get_activations(self, inputs : List[Stimulus], layers : List[str]) -> Dict[str, np.array]:
-        raise NotImplementedError()
-
-    def __call__(self, *args, **kwargs):
-        return self._extractor(*args, **kwargs)
-    
-    def build_extractor(self, inferencer_cls, *args, **kwargs):
-        extractor = ActivationsExtractor(identifier=self.identifier, 
-                                         inferencer=inferencer_cls(get_activations=self.get_activations, 
-                                                        preprocessing=self.preprocessing, *args, **kwargs))
-        extractor.insert_attrs(self)
-        self._extractor = extractor
\ No newline at end of file
diff --git a/brainscore_vision/model_helpers/activations/temporal/model/pytorch.py b/brainscore_vision/model_helpers/activations/temporal/model/pytorch.py
deleted file mode 100644
index 00197874e..000000000
--- a/brainscore_vision/model_helpers/activations/temporal/model/pytorch.py
+++ /dev/null
@@ -1,107 +0,0 @@
-from collections import OrderedDict
-import typing
-from typing import Callable, List, Any, Dict
-
-import logging
-
-from brainscore_vision.model_helpers.utils import fullname
-from .base import ActivationWrapper
-from ..inputs import Stimulus
-
-
-SUBMODULE_SEPARATOR = '.'
-
-
-def default_process_activation(layer, layer_name, inputs, output):
-    # (torch.nn.Module, str, torch.Tensor, torch.Tensor) -> torch.Tensor
-    return output
-
-class PytorchWrapper(ActivationWrapper):
-    def __init__(
-            self, 
-            identifier : str, 
-            model, 
-            preprocessing : Callable[[List[Stimulus]], Any], 
-            process_output = None, 
-            *args, 
-            **kwargs
-        ):
-        import torch
-        logger = logging.getLogger(fullname(self))
-        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        logger.debug(f"Using device {self._device}")
-        self._model = model.to(self._device)
-        # preprocessing: input.Stimulus -> actual model inputs
-        self._preprocess = preprocessing
-        self._process_activation = default_process_activation if process_output is None else process_output
-        super().__init__(identifier, preprocessing, *args, **kwargs)
-
-    def forward(self, inputs : List[Any]) -> Any: 
-        # this function gets a list of preprocessed inputs and does the forward pass
-        import torch
-        tensor = torch.stack(inputs)
-        tensor = tensor.to(self._device)
-        return self._model(tensor)
-
-    def get_activations(self, inputs : List[Any], layer_names : List[str]) -> typing.OrderedDict[str, Any]:
-        import torch
-        self._model.eval()
-        layer_results = OrderedDict()
-        hooks = []
-
-        for layer_name in layer_names:
-            layer = self.get_layer(layer_name)
-            hook = self._register_hook(layer, layer_name, target_dict=layer_results)
-            hooks.append(hook)
-
-        with torch.no_grad():
-            self.forward(inputs)
-
-        for hook in hooks:
-            hook.remove()
-        return layer_results
-    
-    def get_layer(self, layer_name : str):
-        # the layer_name is a string that represents the module hierarchy up to the target layer,
-        # seperated by ".", e.g., "module1.submodule2.relu".
-        if layer_name == 'logits':
-            return self._output_layer()
-        module = self._model
-        for part in layer_name.split(SUBMODULE_SEPARATOR):
-            module = module._modules.get(part)
-            assert module is not None, f"No submodule found for layer {layer_name}, at part {part}"
-        return module
-
-    def _output_layer(self):
-        module = self._model
-        while module._modules:
-            module = module._modules[next(reversed(module._modules))]
-        return module
-
-    @classmethod
-    def _tensor_to_numpy(cls, output):
-        return output.cpu().data.numpy()
-
-    def _register_hook(self, layer, layer_name, target_dict):
-        def hook_function(_layer, _input, output, name=layer_name, target_dict=target_dict):
-            output = self._process_activation(_layer, name, _input, output)
-            target_dict[name] = PytorchWrapper._tensor_to_numpy(output)
-
-        hook = layer.register_forward_hook(hook_function)
-        return hook
-
-    def __repr__(self):
-        return repr(self._model)
-
-    def layers(self):
-        for name, module in self._model.named_modules():
-            if len(list(module.children())) > 0:  # this module only holds other modules
-                continue
-            yield name, module
-
-    def graph(self):
-        import networkx as nx
-        g = nx.DiGraph()
-        for layer_name, layer in self.layers():
-            g.add_node(layer_name, object=layer, type=type(layer))
-        return g
diff --git a/brainscore_vision/model_helpers/activations/temporal/utils.py b/brainscore_vision/model_helpers/activations/temporal/utils.py
index d1f7a264c..188368e2a 100644
--- a/brainscore_vision/model_helpers/activations/temporal/utils.py
+++ b/brainscore_vision/model_helpers/activations/temporal/utils.py
@@ -1,8 +1,210 @@
 import os
 import numpy as np
+import pickle
 
 from brainscore_vision.model_helpers.brain_transformation.temporal import assembly_time_align
-
+from brainio.assemblies import DataAssembly
+
+
+# allow efficient fill_value for memmap
+class custom_memmap(np.memmap):
+    def __new__(subtype, filename, dtype=np.uint8, mode='r+', offset=0,
+                shape=None, order='C', fill_value=None):
+        # Import here to minimize 'import numpy' overhead
+        import mmap
+        import os.path
+        import struct
+        from numpy import ndarray
+
+        mode_equivalents = {
+            "readonly":"r",
+            "copyonwrite":"c",
+            "readwrite":"r+",
+            "write":"w+"
+        }
+
+        dtypedescr = np.dtype
+        valid_filemodes = ["r", "c", "r+", "w+"]
+        writeable_filemodes = ["r+", "w+"]
+
+        try:
+            mode = mode_equivalents[mode]
+        except KeyError as e:
+            if mode not in valid_filemodes:
+                raise ValueError(
+                    "mode must be one of {!r} (got {!r})"
+                    .format(valid_filemodes + list(mode_equivalents.keys()), mode)
+                ) from None
+
+        if mode == 'w+' and shape is None:
+            raise ValueError("shape must be given if mode == 'w+'")
+
+        def get_ctx(mode):
+            if hasattr(filename, 'read'):
+                f_ctx = nullcontext(filename)
+            else:
+                f_ctx = open(
+                    os.fspath(filename),
+                    ('r' if mode == 'c' else mode)+'b'
+                )
+            return f_ctx
+
+        with get_ctx(mode) as fid:
+            fid.seek(0, 2)
+            flen = fid.tell()
+            descr = dtypedescr(dtype)
+            _dbytes = descr.itemsize
+
+            if shape is None:
+                bytes = flen - offset
+                if bytes % _dbytes:
+                    raise ValueError("Size of available data is not a "
+                            "multiple of the data-type size.")
+                size = bytes // _dbytes
+                shape = (size,)
+            else:
+                if type(shape) not in (tuple, list):
+                    try:
+                        shape = [operator.index(shape)]
+                    except TypeError:
+                        pass
+                shape = tuple(shape)
+                size = np.intp(1)  # avoid default choice of np.int_, which might overflow
+                for k in shape:
+                    size *= k
+
+            bytes = int(offset + size*_dbytes)
+
+            if mode in ('w+', 'r+') and flen < bytes:
+                fid.seek(bytes - 1, 0)
+                fid.write(b'\0')
+                fid.flush()
+
+            if mode == 'w+' and fill_value is not None:
+                val = np.array(fill_value).astype(dtype).tobytes()
+                TARGET_BLOCK_SIZE = 1024 * 1024 * 10
+                # chunk writing
+                for i in range(0, bytes, TARGET_BLOCK_SIZE):
+                    with get_ctx(mode='r+') as _fid:
+                        _fid.seek(i)
+                        _fid.write(val * min(TARGET_BLOCK_SIZE, bytes - i))
+                        _fid.flush()
+
+            if mode == 'c':
+                acc = mmap.ACCESS_COPY
+            elif mode == 'r':
+                acc = mmap.ACCESS_READ
+            else:
+                acc = mmap.ACCESS_WRITE
+
+            start = offset - offset % mmap.ALLOCATIONGRANULARITY
+            bytes -= start
+            array_offset = offset - start
+            mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)
+
+            self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
+                                   offset=array_offset, order=order)
+            self._mmap = mm
+            self.offset = offset
+            self.mode = mode
+
+            if isinstance(filename, os.PathLike):
+                # special case - if we were constructed with a pathlib.path,
+                # then filename is a path object, not a string
+                self.filename = filename.resolve()
+            elif hasattr(fid, "name") and isinstance(fid.name, str):
+                # py3 returns int for TemporaryFile().name
+                self.filename = os.path.abspath(fid.name)
+            # same as memmap copies (e.g. memmap + 1)
+            else:
+                self.filename = None
+
+        return self
+
+
+# a map that write directly to the disk without loading into memory
+class data_assembly_mmap:
+    def __init__(self, filepath=None, **kwargs):
+        self.filepath = filepath
+        self.kwargs = kwargs
+        self._in_memory = filepath is None
+
+        if self._in_memory:
+            self._data = np.full(**kwargs)
+            self._created = True
+        else:
+            self._data = None
+            self._created = False
+            self.data_file = os.path.join(self.filepath, "data.npy")
+            self.meta_file = os.path.join(self.filepath, "meta.pkl")
+
+    def _open(self):
+        if self._in_memory:
+            return
+
+        if self._data is None:
+            kwargs = self.kwargs.copy()
+            fill_value = self.kwargs.get("fill_value", None)
+            if not self._created:
+                self._data = custom_memmap(self.data_file, mode='w+', **kwargs)
+                self._created = True
+            else:
+                self._data = custom_memmap(self.data_file, mode='r+', **kwargs)
+
+    def _close(self):
+        if self._data is not None:
+            self._data.flush()
+            del self._data
+            self._data = None
+
+    def __setitem__(self, key, value):
+        self._open()
+        self._data[key] = value
+        if not self._in_memory:
+            self._close()
+
+    def __getitem__(self, key):
+        self._open()
+        return self._data[key]
+
+    def register_meta(self, dims, coords):
+        self.coords = coords
+        self.dims = dims
+
+        if not self._in_memory:
+            with open(self.meta_file, 'wb') as f:
+                pickle.dump((dims, coords, self.kwargs), f)
+
+    def to_assembly(self):
+        self._open()
+        return DataAssembly(self._data, coords=self.coords, dims=self.dims)
+
+    @staticmethod
+    def is_saved(filepath):
+        data_file = os.path.join(filepath, "data.npy")
+        meta_file = os.path.join(filepath, "meta.pkl")
+        return os.path.exists(data_file) and os.path.exists(meta_file)
+
+    @staticmethod
+    def load(filepath):
+        if filepath is None:
+            return None
+
+        if not data_assembly_mmap.is_saved(filepath):
+            return None
+
+        meta_file = os.path.join(filepath, "meta.pkl")
+        with open(meta_file, 'rb') as f:
+            dims, coords, kwargs = pickle.load(f)
+
+        data = data_assembly_mmap(filepath, **kwargs)
+        data._created = True
+        data._open()
+        data.dims = dims
+        data.coords = coords
+
+        return data
+        
 
 def concat_with_nan_padding(arr_list, axis=0, dtype=np.float16):
     # Get shapes of all arrays
@@ -161,7 +363,7 @@ def assembly_align_to_fps(output_assembly, fps, mode="portion"):
 # get the inferencer from any model
 def get_inferencer(any_model):
     from brainscore_vision.model_helpers.activations.temporal.core import Inferencer, ActivationsExtractor
-    from brainscore_vision.model_helpers.activations.temporal.model.base import ActivationWrapper
+    from brainscore_vision.model_helpers.activations.temporal.model import ActivationWrapper
     from brainscore_vision.model_helpers.brain_transformation import ModelCommitment
 
     if isinstance(any_model, Inferencer): return any_model
@@ -171,7 +373,7 @@ def get_inferencer(any_model):
     raise ValueError(f"Cannot find inferencer from the model {any_model}")
 
 def get_base_model(any_model):
-    from brainscore_vision.model_helpers.activations.temporal.model.base import ActivationWrapper
+    from brainscore_vision.model_helpers.activations.temporal.model import ActivationWrapper
     from brainscore_vision.model_helpers.brain_transformation import ModelCommitment
 
     if isinstance(any_model, ActivationWrapper): return any_model
@@ -225,4 +427,4 @@ def download_weight_file(url, folder=None):
     if total_size != 0 and progress_bar.n != total_size:
         raise RuntimeError("Could not download file")
 
-    return weight_path
\ No newline at end of file
+    return weight_path
diff --git a/brainscore_vision/models/temporal_model_AVID_CMA/model.py b/brainscore_vision/models/temporal_model_AVID_CMA/model.py
index 0a9374c92..41aa147aa 100644
--- a/brainscore_vision/models/temporal_model_AVID_CMA/model.py
+++ b/brainscore_vision/models/temporal_model_AVID_CMA/model.py
@@ -21,7 +21,7 @@ def get_model(identifier):
         weight_path = load_weight_file(
             bucket="brainscore-vision",
             relative_path="temporal_model_AVID-CMA/AVID-CMA_Kinetics_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar",
-            version_id="yx9Pbq3SuNOOd4sX7csTolaHD1iTCx8y",
+            version_id="ZR41eKh6GYyJ61Kgwf1V4LShzse7A0yC",
             sha1="6efe4464ca654a56affff766acf24e89e6f3ffbf"
         )
 
@@ -29,8 +29,8 @@ def get_model(identifier):
         cfg_path = os.path.join(HOME, "configs/main/avid-cma/audioset/InstX-N1024-PosW-N64-Top32.yaml")
         weight_path = load_weight_file(
             bucket="brainscore-vision",
-            relative_path="temporal_model_AVID_CMA/AVID-CMA_Audioset_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar",
-            version_id="jSaZgbUohM0ZeoEUUKZiLBo6iz_v8VvQ",
+            relative_path="temporal_model_AVID-CMA/AVID-CMA_Audioset_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar",
+            version_id="8r37ZPc0oD3N0ff4R8Y_S_eV1DNDFs8d",
             sha1="9db5eba9aab6bdbb74025be57ab532df808fe3f6"
         )
 
@@ -38,8 +38,8 @@ def get_model(identifier):
         cfg_path = os.path.join(HOME, "configs/main/avid/kinetics/Cross-N1024.yaml")
         weight_path = load_weight_file(
             bucket="brainscore-vision",
-            relative_path="temporal_model_AVID_CMA/AVID_Kinetics_Cross-N1024_checkpoint.pth.tar",
-            version_id="XyKt0UOUFsuuyrl6ZREivK8FadRPx34u",
+            relative_path="temporal_model_AVID-CMA/AVID_Kinetics_Cross-N1024_checkpoint.pth.tar",
+            version_id="OjapKttPf.6dvZLO8L89GZddpcizDu_J",
             sha1="d3a04f856d29421ba8de37808593a3fad4d4794f"
         )
 
@@ -47,8 +47,8 @@ def get_model(identifier):
         cfg_path = os.path.join(HOME, "configs/main/avid/audioset/Cross-N1024.yaml")
         weight_path = load_weight_file(
             bucket="brainscore-vision",
-            relative_path="temporal_model_AVID_CMA/AVID_Audioset_Cross-N1024_checkpoint.pth.tar",
-            version_id="0Sxuhn8LsYXQC4FnPfJ7rw7uU6kDlKgc",
+            relative_path="temporal_model_AVID-CMA/AVID_Audioset_Cross-N1024_checkpoint.pth.tar",
+            version_id="VoWZzJllTVFu8oVk7ocf_K7pk11Gbh19",
             sha1="b48d8428a1a2526ccca070f810333df18bfce5fd"
         )
 
diff --git a/brainscore_vision/models/temporal_model_AVID_CMA/requirements.txt b/brainscore_vision/models/temporal_model_AVID_CMA/requirements.txt
index 47cc15207..d613d4ae4 100644
--- a/brainscore_vision/models/temporal_model_AVID_CMA/requirements.txt
+++ b/brainscore_vision/models/temporal_model_AVID_CMA/requirements.txt
@@ -1,3 +1,4 @@
 avid_cma @ git+https://github.com/YingtianDt/AVID-CMA.git
 torch
-torchvision
\ No newline at end of file
+torchvision
+pyav
\ No newline at end of file
diff --git a/brainscore_vision/models/temporal_model_GDT/model.py b/brainscore_vision/models/temporal_model_GDT/model.py
index 1afd9712d..7b90e8d43 100644
--- a/brainscore_vision/models/temporal_model_GDT/model.py
+++ b/brainscore_vision/models/temporal_model_GDT/model.py
@@ -22,21 +22,21 @@ def get_model(identifier):
         pth = load_weight_file(
             bucket="brainscore-vision",
             relative_path="temporal_model_GDT/gdt_K400.pth",
-            version_id="JpU_tnCzrbTejn6sOrQMk8eRsJ97yFgt",
+            version_id="xdpDRV0gnRhULZGL7lwjGxGKqp2hnP1_",
             sha1="7f12c60670346b1aab15194eb44c341906e1bca6"
         )
     elif dataset == "IG65M":
         pth = load_weight_file(
             bucket="brainscore-vision",
             relative_path="temporal_model_GDT/gdt_IG65M.pth",
-            version_id="R.NoD6VAbFbJdf8tg5jnXIWB3hQ8GlSD",
+            version_id="S0_ZVFA2K96ZoVLx26edAxILPm7S.Gf6",
             sha1="3dcee3af61691e1e7e47e4b115be6808f4ea8172"
         )
     elif dataset == "HowTo100M":
         pth = load_weight_file(
             bucket="brainscore-vision",
             relative_path="temporal_model_GDT/gdt_HT100M.pth",
-            version_id="BVRl9t_134PoKZCn9W54cyfkImCW2ioq",
+            version_id=".fli2qSf6pWqbLyY413ZID578sJrC.L.",
             sha1="a9a979c82e83b955794814923af736eb34e6f080"
         )
     else:
diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/model.py b/brainscore_vision/models/temporal_model_S3D_text_video/model.py
index bfc11882c..99d8db3a5 100644
--- a/brainscore_vision/models/temporal_model_S3D_text_video/model.py
+++ b/brainscore_vision/models/temporal_model_S3D_text_video/model.py
@@ -3,7 +3,7 @@
 from torchvision import transforms
 from s3dg_howto100m import S3D
 
-from brainscore_vision.model_helpers.activations.temporal.model.pytorch import PytorchWrapper
+from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper
 from brainscore_core.supported_data_standards.brainio.s3 import load_weight_file
 
 
@@ -41,14 +41,14 @@ def get_model(identifier="s3d-HowTo100M"):
     model_pth = load_weight_file(
         bucket="brainscore-vision",
         relative_path="temporal_model_S3D_text_video/s3d_howto100m.pth",
-        version_id="hRp6I8bpwreIMUVL0H.zCdK0hqRggL7n",
+        version_id="3k_iwjSqYUwMRdri9IH2JZn5yMGCnayc",
         sha1="31e99d2a1cd48f2259ca75e719ac82c8b751ea75"
     )
 
     dict_pth = load_weight_file(
         bucket="brainscore-vision",
         relative_path="temporal_model_S3D_text_video/s3d_dict.npy",
-        version_id="4NxVLe8DSL6Uue0F7e2rz8HZuOk.tkBI",
+        version_id="FWHuvz5CDjNJyG1IuzEqkHU9zzmPv62p",
         sha1="d368ff7d397ec8240f1f963b5efe8ff245bac35f"
     )
 
@@ -62,4 +62,4 @@ def get_model(identifier="s3d-HowTo100M"):
                              process_output=process_output,
                              **inferencer_kwargs)
     
-    return wrapper
\ No newline at end of file
+    return wrapper
diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt b/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt
index 73f27f3b6..19f34cf56 100644
--- a/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt
+++ b/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt
@@ -1 +1,2 @@
+torchvision
 S3D_HowTo100M @ git+https://github.com/YingtianDt/S3D_HowTo100M
\ No newline at end of file
diff --git a/brainscore_vision/models/temporal_model_VideoMAE/model.py b/brainscore_vision/models/temporal_model_VideoMAE/model.py
index 0dc135329..7e313e10d 100644
--- a/brainscore_vision/models/temporal_model_VideoMAE/model.py
+++ b/brainscore_vision/models/temporal_model_VideoMAE/model.py
@@ -39,7 +39,7 @@ def get_model(identifier, num_frames=16):
         pth = load_weight_file(
             bucket="brainscore-vision",
             relative_path='temporal_model_VideoMAE/vit_b_k400_pt_1200e.pth',
-            version_id="Oi3VboRZujNyZAcwf5q7XZ2M8q1cPO6o",
+            version_id="YkdCqz8ywKAqAPMLv6iE9SfihuqExfcJ",
             sha1="8faf42df13f619a8970d653695e98f0643760b54"
         )
         num_blocks = 12
@@ -48,7 +48,7 @@ def get_model(identifier, num_frames=16):
         pth = load_weight_file(
             bucket="brainscore-vision",
             relative_path='temporal_model_VideoMAE/vit_l_k400_pt_1200e.pth',
-            version_id="MiPfczDaVponDGuUBrEPqmT.BiVvh_I1",
+            version_id="v8VsUKnAJQ23dyLiYOSYyjSrHI_zB60o",
             sha1="7ff6acbba221f85d7148223ec932ad7c57f2f40c"
         )
         num_blocks = 24
diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/model.py b/brainscore_vision/models/temporal_model_VideoMAEv2/model.py
index 6b9b8ab5a..b4b7568e4 100644
--- a/brainscore_vision/models/temporal_model_VideoMAEv2/model.py
+++ b/brainscore_vision/models/temporal_model_VideoMAEv2/model.py
@@ -53,7 +53,7 @@ def get_model(identifier):
         pth = load_weight_file(
             bucket="brainscore-vision",
             relative_path="temporal_model_VideoMAEv2/vit_g_hybrid_pt_1200e.pth",
-            version_id="TxtkfbeMV105dzpzTwi0Kn5glnvQvIrq",
+            version_id="0SPWwIlsWFhY98bKgo7ZRRNTXkcT.dBO",
             sha1="9048f2bc0b0c7ba4d0e5228f3a7c0bef4dbaca69"
         )
         num_blocks = 40
@@ -63,7 +63,7 @@ def get_model(identifier):
         pth = load_weight_file(
             bucket="brainscore-vision",
             relative_path="temporal_model_VideoMAEv2/vit_b_hybrid_pt_800e.pth",
-            version_id="rRjpYq21dAQ5KaCLbEHK.YaLZ_fbMPKw",
+            version_id="VLgKVXOX2XYJQvaIOqAAVDvd5xvjx3.d",
             sha1="1e3602691964b1eb6f7c33529119243a5b235635"
         )
         num_blocks = 12
diff --git a/brainscore_vision/models/temporal_model_mae_st/model.py b/brainscore_vision/models/temporal_model_mae_st/model.py
index 9f5cf9299..6e818cea7 100644
--- a/brainscore_vision/models/temporal_model_mae_st/model.py
+++ b/brainscore_vision/models/temporal_model_mae_st/model.py
@@ -53,7 +53,7 @@ def get_model(identifier):
         load_path = load_weight_file(
             bucket="brainscore-vision", 
             relative_path="temporal_model_mae_st/mae_pretrain_vit_large_k400.pth", 
-            version_id="cPcP4AzpG95CimQ5Pn.CHKnGUJlLXM3m",
+            version_id="bHQ1XRM.a4NfgbP53TR5AsgvUsN40R_j",
             sha1="c7fb91864a4ddf8b99309440121a3abe66b846bb"
         )
 
@@ -64,7 +64,7 @@ def get_model(identifier):
         load_path = load_weight_file(
             bucket="brainscore-vision", 
             relative_path="temporal_model_mae_st/mae_pretrain_vit_huge_k400.pth", 
-            version_id="IYKa8QiocgBzo3EhsBouS62HboK6iqYT",
+            version_id="ARS9kz2wQQ29iRsssyc7u_kfA4wLDAji",
             sha1="177e48577142ca01949c08254834ffa1198b9eb4"
         )
 
diff --git a/brainscore_vision/models/temporal_model_mae_st/requirements.txt b/brainscore_vision/models/temporal_model_mae_st/requirements.txt
index 0d1858c8e..85be22792 100644
--- a/brainscore_vision/models/temporal_model_mae_st/requirements.txt
+++ b/brainscore_vision/models/temporal_model_mae_st/requirements.txt
@@ -1,3 +1,6 @@
 mae_st @ git+https://github.com/YingtianDt/mae_st.git
 torch
-torchvision
\ No newline at end of file
+torchvision
+iopath
+simplejson
+timm
\ No newline at end of file
diff --git a/brainscore_vision/models/temporal_model_mmaction2/requirements.txt b/brainscore_vision/models/temporal_model_mmaction2/requirements.txt
index 1caa728dd..4428fdb7b 100644
--- a/brainscore_vision/models/temporal_model_mmaction2/requirements.txt
+++ b/brainscore_vision/models/temporal_model_mmaction2/requirements.txt
@@ -1,5 +1,6 @@
 importlib-metadata<5
-mmaction2 @ git+https://github.com/YingtianDt/mmaction2.git@533edc3
-mmengine
+mmaction2 @ git+https://github.com/YingtianDt/mmaction2.git
+mmengine==0.10.4
+mmcv==2.1.0
 torch
-torchvision
\ No newline at end of file
+torchvision
diff --git a/brainscore_vision/models/temporal_model_torchvision/model.py b/brainscore_vision/models/temporal_model_torchvision/model.py
index 30d96aba8..6c4e1150c 100644
--- a/brainscore_vision/models/temporal_model_torchvision/model.py
+++ b/brainscore_vision/models/temporal_model_torchvision/model.py
@@ -3,7 +3,7 @@
 from torchvision import transforms
 from torchvision.models import video as vid
 
-from brainscore_vision.model_helpers.activations.temporal.model.pytorch import PytorchWrapper
+from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper
 
 
 LARGE_MODEL_LAYER_STEP = 2
@@ -89,4 +89,4 @@ def process_output(layer, layer_name, input, output):
                              process_output=process_output,
                              **inferencer_kwargs)
     
-    return wrapper
\ No newline at end of file
+    return wrapper
diff --git a/brainscore_vision/models/temporal_model_vjepa/__init__.py b/brainscore_vision/models/temporal_model_vjepa/__init__.py
new file mode 100644
index 000000000..6e2a2ff30
--- /dev/null
+++ b/brainscore_vision/models/temporal_model_vjepa/__init__.py
@@ -0,0 +1,16 @@
+from brainscore_vision import model_registry
+from brainscore_vision.model_helpers.brain_transformation import ModelCommitment
+from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers
+from brainscore_vision.model_interface import BrainModel
+from . import model
+
+
+def commit_model(identifier):
+    activations_model=model.get_model(identifier)
+    layers=get_specified_layers(activations_model)
+    return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers)
+
+
+model_registry["VJEPA-L16"] = lambda: commit_model("VJEPA-L16")
+model_registry["VJEPA-H16"] = lambda: commit_model("VJEPA-H16")
+model_registry["VJEPA-H16-384"] = lambda: commit_model("VJEPA-H16-384")
diff --git a/brainscore_vision/models/temporal_model_vjepa/model.py b/brainscore_vision/models/temporal_model_vjepa/model.py
new file mode 100644
index 000000000..e05fb2fcd
--- /dev/null
+++ b/brainscore_vision/models/temporal_model_vjepa/model.py
@@ -0,0 +1,86 @@
+from .vjepa_model import VJEPA
+import torch as th
+
+from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper
+from brainscore_vision.model_helpers.activations.temporal.utils import download_weight_file
+
+from torchvision import transforms
+
+class VJEPAWrapper(PytorchWrapper):
+    def forward(self, inputs):
+        tensor = th.stack(inputs)
+        tensor = tensor.to(self._device)
+        return self._model(tensor)  # encoder only
+
+# Define the ImageNet mean and std
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+transform_img = transforms.Compose([
+    transforms.Resize((224, 224)),  # Example size for ViT
+    transforms.ToTensor(),
+    transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+])
+
+def transform_video(video):
+    frames = []
+    for img in video.to_pil_imgs():
+        frames += [transform_img(img)]
+    frames = th.stack(frames)
+    return frames
+
+def get_model(identifier, num_frames=16):
+    assert identifier.startswith("VJEPA")
+
+    if identifier == "VJEPA-L16":
+        url = "https://dl.fbaipublicfiles.com/jepa/vitl16/vitl16.pth.tar"
+        H = W = 14
+        vit_type = "vit_large"
+        num_blocks = 24
+    elif identifier == "VJEPA-H16":
+        url = "https://dl.fbaipublicfiles.com/jepa/vith16/vith16.pth.tar"
+        H = W = 14
+        vit_type = "vit_huge"
+        num_blocks = 32
+    elif identifier == "VJEPA-H16-384":
+        url = "https://dl.fbaipublicfiles.com/jepa/vith16-384/vith16-384.pth.tar"
+        H = W = 24
+        vit_type = "vit_huge"
+        num_blocks = 32
+    else:
+        raise ValueError(f"Unknown VJEPA identifier: {identifier}")
+
+    T = None
+    weight_path = download_weight_file(url, folder="temporal_model_vjepa")
+
+    # Instantiate the model
+    net = VJEPA(weight_path, vit_type)
+
+    def process_output(layer, layer_name, inputs, output):
+        if layer_name == "encoder.encoder.patch_embed":
+            global T
+            T = inputs[0].shape[2]
+
+        B, L, C = output.shape
+        assert L == T//2 * H * W
+        output = output.view(B, T//2, H, W, C)
+
+        return output
+
+    inferencer_kwargs = {
+        "fps": 10,
+        "layer_activation_format": {
+            "encoder.encoder.patch_embed": "THWC",
+            **{f"encoder.encoder.blocks.{i}": "THWC" for i in range(0, num_blocks, 2)},
+        },
+        "duration": None,
+        "time_alignment": "evenly_spaced",
+        "process_output": process_output,
+    }
+
+    for layer in inferencer_kwargs["layer_activation_format"].keys():
+        assert "decoder" not in layer, "Decoder layers are not supported."
+
+    wrapper = VJEPAWrapper(identifier, net, transform_video, 
+                                **inferencer_kwargs)
+    return wrapper
diff --git a/brainscore_vision/models/temporal_model_vjepa/requirements.txt b/brainscore_vision/models/temporal_model_vjepa/requirements.txt
new file mode 100644
index 000000000..abe4858a6
--- /dev/null
+++ b/brainscore_vision/models/temporal_model_vjepa/requirements.txt
@@ -0,0 +1,14 @@
+torch>=2
+torchvision
+pyyaml
+numpy
+opencv-python
+submitit
+braceexpand
+webdataset
+timm
+decord
+pandas
+einops
+beartype
+phys_readouts@ git+https://github.com/thekej/phys_readouts.git#egg=phys_readouts
\ No newline at end of file
diff --git a/brainscore_vision/models/temporal_model_vjepa/test.py b/brainscore_vision/models/temporal_model_vjepa/test.py
new file mode 100644
index 000000000..6a20ca8dc
--- /dev/null
+++ b/brainscore_vision/models/temporal_model_vjepa/test.py
@@ -0,0 +1,17 @@
+import pytest
+
+from brainscore_vision import load_model
+
+
+model_list = [
+    "VJEPA-Temporal",
+]
+
+@pytest.mark.private_access
+@pytest.mark.memory_intense
+@pytest.mark.parametrize("model_identifier", model_list)
+def test_load(model_identifier):
+    model = load_model(model_identifier)
+    assert model is not None
+
+    
\ No newline at end of file
diff --git a/brainscore_vision/models/temporal_model_vjepa/vjepa_model.py b/brainscore_vision/models/temporal_model_vjepa/vjepa_model.py
new file mode 100644
index 000000000..1fc333012
--- /dev/null
+++ b/brainscore_vision/models/temporal_model_vjepa/vjepa_model.py
@@ -0,0 +1,87 @@
+from torch import nn
+
+import torch
+
+import phys_extractors.models.jepa_physics.jepa.src.models.vision_transformer as vit
+
+
+class VJEPA_encoder(nn.Module):
+    def __init__(
+            self, 
+            weights_path, 
+            vit_type="vit_large",
+            embed_dim=256, 
+            crop_size=224, 
+            patch_size=16,
+            num_frames=16,
+            tubelet_size=2,
+        ):
+
+        super().__init__()
+
+        self.embed_dim = embed_dim
+
+        # download the model and put it in the folder.
+        state_dict = torch.load(weights_path, map_location='cpu')
+
+        # following the config for the model
+
+        uniform_power = True
+        use_sdpa = True
+        use_SiLU = False
+        tight_SiLU = False
+
+        self.encoder = vit.__dict__[vit_type](
+            img_size=crop_size,
+            patch_size=patch_size,
+            num_frames=num_frames,
+            tubelet_size=tubelet_size,
+            uniform_power=uniform_power,
+            use_sdpa=use_sdpa,
+            use_SiLU=use_SiLU,
+            tight_SiLU=tight_SiLU,
+        )
+
+        self.encoder.load_state_dict({k.replace('module.backbone.', ''): v for k, v in state_dict['encoder'].items()})
+        self.encoder.eval()
+
+        # set requires_grad false
+        for param in self.encoder.parameters():
+            param.requires_grad = False
+
+    def forward(self, videos):
+        B, T, C, H, W = videos.shape
+        embeddings = self.encoder(videos.transpose(1,2))
+        embeddings = self.patchify(embeddings)
+        return embeddings
+
+    def patchify(self, input_array, patch_height=32, patch_width=32):
+        # Step 2: Pad the matrix to size 392x1036
+        padding_size = 12  # Number of columns to add
+        padded_matrix = torch.nn.functional.pad(input_array, (0, padding_size))
+        # Step 3: Define patch size
+        N, _, _ = padded_matrix.shape
+        H_prime = 56
+        W_prime = 148
+
+        # Step 4: Patchify the padded matrix into 49 contiguous patches of size 56x148
+        # Reshape the matrix into shape (7, 56, 7, 148) and then swap axes to obtain patches
+        patches = padded_matrix.unfold(1, H_prime, H_prime)
+        patches = patches.unfold(2, W_prime, W_prime)
+        patches = patches.contiguous().view(N, -1, H_prime, W_prime)  # 49 patches of size 56x148
+
+        return patches
+
+# Given sequence of images, predicts next latent
+class VJEPA(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+        self.encoder = VJEPA_encoder(*args, **kwargs)
+
+    def forward(self, x):
+        # set frozen pretrained encoder to eval mode
+        self.encoder.eval()
+        # x is (Bs, T, 3, H, W)
+        assert len(x.shape) == 5
+        return self.encoder(x)
diff --git a/tests/test_model_helpers/temporal/activations/test_extractor.py b/tests/test_model_helpers/temporal/activations/test_extractor.py
index 9f809bf5c..a30353b4a 100644
--- a/tests/test_model_helpers/temporal/activations/test_extractor.py
+++ b/tests/test_model_helpers/temporal/activations/test_extractor.py
@@ -3,7 +3,7 @@
 import pytest
 
 from brainscore_core.supported_data_standards.brainio.stimuli import StimulusSet
-from brainscore_vision.model_helpers.activations.temporal.inputs.base import Stimulus
+from brainscore_vision.model_helpers.activations.temporal.inputs import Stimulus
 from brainscore_vision.model_helpers.activations.temporal.model import ActivationWrapper
 from brainscore_vision.model_helpers.activations.temporal.core import TemporalInferencer, CausalInferencer 
 from collections import OrderedDict
@@ -93,4 +93,4 @@ def test_from_stimulus_set(causal, padding):
     assert len(np.unique(activations['layer'])) == len(layers)
 
     import gc
-    gc.collect()  # free some memory, we're piling up a lot of activations at this point
\ No newline at end of file
+    gc.collect()  # free some memory, we're piling up a lot of activations at this point
diff --git a/tests/test_model_helpers/temporal/activations/test_inferencer.py b/tests/test_model_helpers/temporal/activations/test_inferencer.py
index d9df0b5d4..db6249760 100644
--- a/tests/test_model_helpers/temporal/activations/test_inferencer.py
+++ b/tests/test_model_helpers/temporal/activations/test_inferencer.py
@@ -57,7 +57,7 @@ def time_down_sample_preprocess(video):
 def test_inferencer(max_spatial_size):
     inferencer = Inferencer(dummy_get_features, dummy_preprocess, dummy_layer_activation_format, 
                             Video, max_workers=1, max_spatial_size=max_spatial_size, batch_grouper=lambda s: s.duration)
-    model_assembly = inferencer(video_paths, layers=dummy_layers)
+    model_assembly = inferencer(video_paths[1:], layers=dummy_layers)
     if max_spatial_size is None:
         # 6 second video with fps 60 has 360 frames
         # the model simply return the same number of frames as the temporal size of activations
@@ -65,27 +65,22 @@ def test_inferencer(max_spatial_size):
         assert model_assembly.sizes["neuroid"] == 360*6*3*2 + 2
     else:
         assert model_assembly.sizes["neuroid"] == 360*max_spatial_size*(max_spatial_size//2) * 2 + 2
-    assert model_assembly.sizes["stimulus_path"] == 2 
+    assert model_assembly.sizes["stimulus_path"] == 1
 
 
-@pytest.mark.parametrize("time_alignment", ["evenly_spaced", "ignore_time"])
 @pytest.mark.parametrize("fps", [10, 30, 45])
-def test_temporal_inferencer(time_alignment, fps):
+def test_temporal_inferencer(fps):
     inferencer = TemporalInferencer(dummy_get_features, dummy_preprocess, 
                                     dummy_layer_activation_format, max_workers=1, 
-                                    fps=fps, time_alignment=time_alignment)
+                                    fps=fps)
     model_assembly = inferencer(video_paths, layers=dummy_layers)
     assert model_assembly['time_bin_start'].values[0] == 0
     assert model_assembly['time_bin_end'].values[-1] == max(video_durations)
 
-    if time_alignment != "ignore_time":
-        # since the longer video lasts for 6 seconds, and the temporal inferencer align all output assembly to have fps
-        # specified when constructing the inferencer, the number of time bins should be 6*fps
-        assert model_assembly.sizes["time_bin"] == 6 * fps
-        assert np.isclose(model_assembly['time_bin_end'].values[0] - model_assembly['time_bin_start'].values[0], 1000/fps)
-    else:
-        assert model_assembly.sizes["time_bin"] == 1
-        assert model_assembly['time_bin_end'].values[0] - model_assembly['time_bin_start'].values[0] == max(video_durations)
+    # since the longer video lasts for 6 seconds, and the temporal inferencer align all output assembly to have fps
+    # specified when constructing the inferencer, the number of time bins should be 6*fps
+    assert model_assembly.sizes["time_bin"] == 6 * fps
+    assert np.isclose(model_assembly['time_bin_end'].values[0] - model_assembly['time_bin_start'].values[0], 1000/fps)
 
     # manual computation check
     output_values = model_assembly.sel(stimulus_path=video_paths[1])\