diff --git a/brainscore_vision/data/berezutskaya2021/__init__.py b/brainscore_vision/data/berezutskaya2021/__init__.py new file mode 100644 index 000000000..41c7a28f2 --- /dev/null +++ b/brainscore_vision/data/berezutskaya2021/__init__.py @@ -0,0 +1,35 @@ +from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly + +import brainscore_vision +from brainscore_vision import data_registry, stimulus_set_registry +from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3 + +BIBTEX = """@article{berezutskaya2022open, + title={Open multimodal iEEG-fMRI dataset from naturalistic stimulation with a short audiovisual film}, + author={Berezutskaya, Julia and Vansteensel, Mariska J and Aarnoutse, Erik J and Freudenburg, Zachary V and Piantoni, Giovanni and Branco, Mariana P and Ramsey, Nick F}, + journal={Scientific Data}, + volume={9}, + number={1}, + pages={91}, + year={2022}, + publisher={Nature Publishing Group UK London} +}""" + +# assembly: Berezutskaya2021-fMRI +data_registry['Berezutskaya2021-fMRI'] = lambda: load_assembly_from_s3( + identifier="Berezutskaya2021-fMRI", + version_id="AY03ldTEnzdocCBXmJPVnb9yVL7Slx_S", + sha1="936b2751a9d1eaa182a0646faa3ce1d974dd0d92", + bucket="brainscore-storage/brainscore-vision/benchmarks/Berezutskaya2021-fMRI", + cls=NeuronRecordingAssembly, + stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Berezutskaya2021'), +) + +# stimulus set: Berezutskaya2021 +stimulus_set_registry['Berezutskaya2021'] = lambda: load_stimulus_set_from_s3( + identifier="Berezutskaya2021", + bucket="brainscore-storage/brainscore-vision/benchmarks/Berezutskaya2021-fMRI", + csv_sha1="42b06621fb5e3d4dadbc3975ddb31ed2df75aab4", + zip_sha1="3b05a9d055918da9536d615589bef6644e28d8a9", + csv_version_id="2BGwIC9sxZvNOW_bQr98Csk9hvbW.9lX", + zip_version_id="9QPVG.DYvWZcD_Q0wWrqUJrCsLDIvO8T") diff --git a/brainscore_vision/data/keles2024/__init__.py b/brainscore_vision/data/keles2024/__init__.py new file mode 100644 index 000000000..3e0d67b4c --- /dev/null +++ b/brainscore_vision/data/keles2024/__init__.py @@ -0,0 +1,36 @@ +from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly + +import brainscore_vision +from brainscore_vision import data_registry, stimulus_set_registry +from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3 + +BIBTEX = """@article{keles2024multimodal, + title={Multimodal single-neuron, intracranial EEG, and fMRI brain responses during movie watching in human patients}, + author={Keles, Umit and Dubois, Julien and Le, Kevin JM and Tyszka, J Michael and Kahn, David A and Reed, Chrystal M and Chung, Jeffrey M and Mamelak, Adam N and Adolphs, Ralph and Rutishauser, Ueli}, + journal={Scientific data}, + volume={11}, + number={1}, + pages={214}, + year={2024}, + publisher={Nature Publishing Group UK London} +}""" + +# assembly: Keles2024-fMRI +data_registry['Keles2024-fMRI'] = lambda: load_assembly_from_s3( + identifier="Keles2024-fMRI", + version_id="qbmUcZGjShAjAifkTzF.cuAQcgvUy2Nl", + sha1="929288d0ec99a7991f157e7787a10452ee453021", + bucket="brainscore-storage/brainscore-vision/benchmarks/Keles2024-fMRI", + cls=NeuronRecordingAssembly, + stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Keles2024'), +) + +# stimulus set: Keles2024 +stimulus_set_registry['Keles2024'] = lambda: load_stimulus_set_from_s3( + identifier="Keles2024", + bucket="brainscore-storage/brainscore-vision/benchmarks/Keles2024-fMRI", + csv_sha1="1fcf6d65a823b4e9ec4b02d6610190f92674b932", + zip_sha1="350536296800fc2f46172f231da2266a0fbac361", + csv_version_id="UKUxrOBp3eO4vbyGeqdhXGhfKhQ4Wrne", + zip_version_id="AB.1FCauYVAItaW3pYDYGq2AAd3VpacQ") + diff --git a/brainscore_vision/data/lahner2024/__init__.py b/brainscore_vision/data/lahner2024/__init__.py new file mode 100644 index 000000000..4ed76ad52 --- /dev/null +++ b/brainscore_vision/data/lahner2024/__init__.py @@ -0,0 +1,30 @@ +from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly + +import brainscore_vision +from brainscore_vision import data_registry, stimulus_set_registry +from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3 + +BIBTEX = """@misc{lahner2024modeling, + title={Modeling short visual events through the BOLD moments video fMRI dataset and metadata. Nat. Commun. 15, 6241}, + author={Lahner, B and Dwivedi, K and Iamshchinina, P and Graumann, M and Lascelles, A and Roig, G and Gifford, AT and Pan, B and Jin, SY and Ratan Murty, NA and others}, + year={2024} +}""" + +# assembly: Lahner2024-fMRI +data_registry['Lahner2024-fMRI'] = lambda: load_assembly_from_s3( + identifier="Lahner2024-fMRI", + version_id="zr_i3T9Saww44rPNJwLaxo0hgp8rYjPO", + sha1="2c7f1d2e5724b8cc3c5cf47986e956c4f13001e4", + bucket="brainscore-storage/brainscore-vision/benchmarks/Lahner2024-fMRI", + cls=NeuronRecordingAssembly, + stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('BOLDMoments'), +) + +# stimulus set: BOLDMoments - Lahner2024-fMRI +stimulus_set_registry['BOLDMoments'] = lambda: load_stimulus_set_from_s3( + identifier="BOLDMoments", + bucket="brainscore-storage/brainscore-vision/benchmarks/Lahner2024-fMRI", + csv_sha1="0b27388f5898c908f58cd1f21f8f5cb3eda8536e", + zip_sha1="dc9c3bf631632cd433d02f2f1847fd33c01ae0b3", + csv_version_id="WaGkWh59b1drhy1MmAVVSxh7_VT_eTay", + zip_version_id="OxpOYy_3bveay9NFFFxNCVyghyAbqyIt") diff --git a/brainscore_vision/data/mcmahon2023/__init__.py b/brainscore_vision/data/mcmahon2023/__init__.py new file mode 100644 index 000000000..70771b55f --- /dev/null +++ b/brainscore_vision/data/mcmahon2023/__init__.py @@ -0,0 +1,35 @@ +from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly + +import brainscore_vision +from brainscore_vision import data_registry, stimulus_set_registry +from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3 + +BIBTEX = """@article{mcmahon2023hierarchical, + title={Hierarchical organization of social action features along the lateral visual pathway}, + author={McMahon, Emalie and Bonner, Michael F and Isik, Leyla}, + journal={Current Biology}, + volume={33}, + number={23}, + pages={5035--5047}, + year={2023}, + publisher={Elsevier} +}""" + +# assembly: McMahon2023-fMRI +data_registry['McMahon2023-fMRI'] = lambda: load_assembly_from_s3( + identifier="McMahon2023-fMRI", + version_id="0k8B44WRZaQyRc1eynbi_RBrjRT1mKL8", + sha1="c8f1ae9981c11f10135c20eca225b6aade039d23", + bucket="brainscore-storage/brainscore-vision/benchmarks/McMahon2023-fMRI", + cls=NeuronRecordingAssembly, + stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('McMahon2023'), +) + +# stimulus set: McMahon2023 +stimulus_set_registry['McMahon2023'] = lambda: load_stimulus_set_from_s3( + identifier="McMahon2023", + bucket="brainscore-storage/brainscore-vision/benchmarks/McMahon2023-fMRI", + csv_sha1="fe617e7bce845c22eccf46242c95c0a74bcb501e", + zip_sha1="f136c855a457b6b40d0b91f621d9fbb68f1c6c48", + csv_version_id="zbknzf08aV.CLb1pUkt1ZMRk1nWheSPp", + zip_version_id="EVMJwIgUmE_NlLMCLTvMOpBQC__aiLXz") diff --git a/brainscore_vision/data/savasegal2023/__init__.py b/brainscore_vision/data/savasegal2023/__init__.py new file mode 100644 index 000000000..77b502bcd --- /dev/null +++ b/brainscore_vision/data/savasegal2023/__init__.py @@ -0,0 +1,95 @@ +from brainscore_core.supported_data_standards.brainio.assemblies import NeuronRecordingAssembly + +import brainscore_vision +from brainscore_vision import data_registry, stimulus_set_registry +from brainscore_core.supported_data_standards.brainio.s3 import load_stimulus_set_from_s3, load_assembly_from_s3 + +BIBTEX = """@article{sava2023individual, + title={Individual differences in neural event segmentation of continuous experiences}, + author={Sava-Segal, Clara and Richards, Chandler and Leung, Megan and Finn, Emily S}, + journal={Cerebral Cortex}, + volume={33}, + number={13}, + pages={8164--8178}, + year={2023}, + publisher={Oxford University Press} +}""" + +# assembly: Savasegal2023-fMRI-Defeat +data_registry['Savasegal2023-fMRI-Defeat'] = lambda: load_assembly_from_s3( + identifier="Savasegal2023-fMRI-Defeat", + version_id="eb..6dECXftcmMTZX2DXWPVKz2luoUn9", + sha1="b6ea38ad12718122f7fef78d2ec5e809cce7c977", + bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Defeat", + cls=NeuronRecordingAssembly, + stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Savasegal2023-Defeat'), +) + +# stimulus set: Savasegal2023-Defeat +stimulus_set_registry['Savasegal2023-Defeat'] = lambda: load_stimulus_set_from_s3( + identifier="Savasegal2023-Defeat", + bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Defeat", + csv_sha1="ef5de41cc775b12583cb2f41b25ebbcd157bd87a", + zip_sha1="5de346354a1fc2a74a7a013e74583a57b83dfe01", + csv_version_id="FKYjSSKOMZEpQo3h6x6MflH3NzNqP4Lg", + zip_version_id="Ll2ENeVR7E_.AUB9chchamR3MFNGqZgk") + + +# assembly: Savasegal2023-fMRI-Growth +data_registry['Savasegal2023-fMRI-Growth'] = lambda: load_assembly_from_s3( + identifier="Savasegal2023-fMRI-Growth", + version_id="fKUd3lqYkmVuhk7tQhWgEgjYUuYxzHr2", + sha1="93b6b0db811dce340e709dfb57156156e78d437d", + bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Growth", + cls=NeuronRecordingAssembly, + stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Savasegal2023-Growth'), +) + +# stimulus set: Savasegal2023-Growth +stimulus_set_registry['Savasegal2023-Growth'] = lambda: load_stimulus_set_from_s3( + identifier="Savasegal2023-Growth", + bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Growth", + csv_sha1="029e2d68ad952d2929aa8cfb836443f7d3714fea", + zip_sha1="7dbdbb36920017b39793c0cc51c96393e33d8a99", + csv_version_id="DSmsK1BISRn5j2lLjWINJdjmuTx1MuYk", + zip_version_id="E.EBCeeQYY8MM8cAJvL261n4ox7Z.huW") + + +# assembly: Savasegal2023-fMRI-Iteration +data_registry['Savasegal2023-fMRI-Iteration'] = lambda: load_assembly_from_s3( + identifier="Savasegal2023-fMRI-Iteration", + version_id="VI_dZOWapYT.C2i1o_BkmYJpjVAQQQB9", + sha1="1595605baf4b461f11164b768337a10986b77618", + bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Iteration", + cls=NeuronRecordingAssembly, + stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Savasegal2023-Iteration'), +) + +# stimulus set: Savasegal2023-Iteration +stimulus_set_registry['Savasegal2023-Iteration'] = lambda: load_stimulus_set_from_s3( + identifier="Savasegal2023-Iteration", + bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Iteration", + csv_sha1="5634119759f04886dc741f41243bc84a66d5d569", + zip_sha1="0f5cca320e6eaf814cad162cac4f40dd9dc9dcbe", + csv_version_id="jSz5.kOpWrwUTHjMhUGRCbHYke1o_NzW", + zip_version_id="qJoYn1uhNbWisP.K6ix4Npszh7sTYcn7") + + +# assembly: Savasegal2023-fMRI-Lemonade +data_registry['Savasegal2023-fMRI-Lemonade'] = lambda: load_assembly_from_s3( + identifier="Savasegal2023-fMRI-Lemonade", + version_id="BL3._yZ.5QnpmqfnZUQo8w6uJskg_uj7", + sha1="3c5849b7c9682a879059ff0e8c4b20656807bb4f", + bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Lemonade", + cls=NeuronRecordingAssembly, + stimulus_set_loader=lambda: brainscore_vision.load_stimulus_set('Savasegal2023-Lemonade'), +) + +# stimulus set: Savasegal2023-Lemonade +stimulus_set_registry['Savasegal2023-Lemonade'] = lambda: load_stimulus_set_from_s3( + identifier="Savasegal2023-Lemonade", + bucket="brainscore-storage/brainscore-vision/benchmarks/Savasegal2023-fMRI-Lemonade", + csv_sha1="4537ca9955473b6688cd9f990b9a16182cbbb50e", + zip_sha1="79edea7316c9f5dde7a8682dce7000ed506d2026", + csv_version_id="EeZlMuTe2gsAmqbyeSMAOXMqovDGDkrK", + zip_version_id="ZBnxvkOf8WZVAIoMOieNhVbEXTxfixnF") \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/core/__init__.py b/brainscore_vision/model_helpers/activations/temporal/core/__init__.py index 90f0214ac..78f43a3e4 100644 --- a/brainscore_vision/model_helpers/activations/temporal/core/__init__.py +++ b/brainscore_vision/model_helpers/activations/temporal/core/__init__.py @@ -1,3 +1,21 @@ from .extractor import ActivationsExtractor from .executor import BatchExecutor -from .inferencer import * \ No newline at end of file +from .inferencer import ( + Inferencer, + TemporalInferencer, + TemporalContextInferencerBase, + CausalInferencer, + BlockInferencer, + channel_name_mapping, +) + +__all__ = [ + "ActivationsExtractor", + "BatchExecutor", + "Inferencer", + "TemporalInferencer", + "TemporalContextInferencerBase", + "CausalInferencer", + "BlockInferencer", + "channel_name_mapping", +] diff --git a/brainscore_vision/model_helpers/activations/temporal/core/executor.py b/brainscore_vision/model_helpers/activations/temporal/core/executor.py index 9670a85ff..3283fa504 100644 --- a/brainscore_vision/model_helpers/activations/temporal/core/executor.py +++ b/brainscore_vision/model_helpers/activations/temporal/core/executor.py @@ -20,8 +20,8 @@ def _func(x, *others): return _func -# a mapper that execute a function in parallel with joblib -class JoblibMapper: +# a mapper that execute a fxunction in parallel with joblib +class JobsMapper: def __init__(self, num_threads: int): self._num_threads = num_threads self._pool = Parallel(n_jobs=num_threads, verbose=False, backend="loky") @@ -102,7 +102,7 @@ def __init__(self, if self.max_workers is not None: num_threads = min(self.max_workers, num_threads) self._logger.info(f"Using {num_threads} threads for parallel processing.") - self._mapper = JoblibMapper(num_threads) + self._mapper = JobsMapper(num_threads) # processing hooks self.before_hooks = [] @@ -126,10 +126,11 @@ def _get_batches( Returns ------- - indices : list - indices of the source data after sorting. - masks : list of list + all_indices : list of list + indices of the source data in each batch. + all_masks : list of list masks for each batch to indicate whether the datum is padding sample. + 1 for not padding, 0 for padding. all_batches : list of list list of batches. """ @@ -138,21 +139,19 @@ def _get_batches( if grouper is None: sorted_data = np.array(data, dtype='object') - inverse_indices = np.arange(N) + sorted_indices = np.arange(N) sorted_properties = [0] * N else: properties = np.array([hash(grouper(d)) for d in data]) sorted_indices = np.argsort(properties) - inverse_indices = np.argsort(sorted_indices) # inverse transform sorted_properties = properties[sorted_indices] sorted_data = np.array(data, dtype='object')[sorted_indices] - inverse_indices = list(inverse_indices) + sorted_indices = list(sorted_indices) index = 0 all_batches = [] all_indices = [] - indices = [] - masks = [] + all_masks = [] while index < N: property_anchor = sorted_properties[index] batch = [] @@ -160,7 +159,7 @@ def _get_batches( batch.append(sorted_data[index]) index += 1 - batch_indices = inverse_indices[index-len(batch):index] + batch_indices = sorted_indices[index-len(batch):index] if padding: num_padding = batch_size - len(batch) @@ -169,23 +168,32 @@ def _get_batches( batch += batch_padding else: num_padding = 0 - masks.append([True] * (len(batch)-num_padding) + [False] * num_padding) - + all_masks.append([True] * (len(batch)-num_padding) + [False] * num_padding) all_batches.append(batch) - all_indices.append(batch_indices) - indices.extend(batch_indices) - return indices, masks, all_batches + all_indices.append(batch_indices + [-1] * num_padding) + return all_indices, all_masks, all_batches + + def _register_hook(self, name, hook, hook_group, index=None): + if index is None: index = len(hook_group) + hook_group.insert(index, (name, hook)) + + def _remove_hook(self, name, hook_group): + hook_group[:] = [(n, h) for n, h in hook_group if n != name] - def register_before_hook(self, hook: Callable[[Stimulus], Stimulus], index=None): + def register_before_hook(self, name: str, hook: Callable[[Stimulus], Stimulus], index=None): # hook: Stimulus -> Stimulus - if index is None: index = len(self.before_hooks) - self.before_hooks.insert(index, hook) + self._register_hook(name, hook, self.before_hooks, index) - def register_after_hook(self, hook: Callable[[Any, str, Stimulus], Any], index=None): + def register_after_hook(self, name: str, hook: Callable[[Any, str, Stimulus], Any], index=None): # hook: value, layer_name, Stimulus -> value - if index is None: index = len(self.after_hooks) - self.after_hooks.insert(index, hook) - + self._register_hook(name, hook, self.after_hooks, index) + + def remove_before_hook(self, name: str): + self._remove_hook(name, self.before_hooks) + + def remove_after_hook(self, name: str): + self._remove_hook(name, self.after_hooks) + def add_stimuli(self, stimuli): self.stimuli.extend(stimuli) @@ -193,27 +201,45 @@ def clear_stimuli(self): self.stimuli = [] def execute(self, layers): - indices, masks, batches = self._get_batches(self.stimuli, self.batch_size, + full_indices = [] + full_activations = OrderedDict() + for layer_activations, indices in self.execute_batch(layers): + full_indices.extend(indices) + for layer_activation in layer_activations: + for layer, activations in layer_activation.items(): + full_activations.setdefault(layer, []).append(activations) + + for layer, activations in full_activations.items(): + full_activations[layer] = [activations[i] for i in full_indices] + + return full_activations + + def execute_batch(self, layers): + all_indices, all_masks, batches = self._get_batches(self.stimuli, self.batch_size, grouper=self.batch_grouper, padding=self.batch_padding) - - before_pipe = _pipeline(*self.before_hooks) - after_pipe = _pipeline(*self.after_hooks) - - layer_activations = OrderedDict() - for mask, batch in tqdm(zip(masks, batches), desc="activations", total=len(batches)): + + before_pipe = _pipeline(*[hook for name, hook in self.before_hooks]) + after_pipe = _pipeline(*[hook for name, hook in self.after_hooks]) + + # avoid keeping the whole batch in memory + def run(batch, mask, indices): batch = [before_pipe(stimulus) for stimulus in batch] model_inputs = self._mapper.map(self.preprocess, batch) batch_activations = self.get_activations(model_inputs, layers) assert isinstance(batch_activations, OrderedDict) - for layer, activations in batch_activations.items(): + for i, (layer, activations) in enumerate(batch_activations.items()): results = [after_pipe(arr, layer, stimulus) - for not_pad, arr, stimulus in zip(mask, activations, batch) - if not_pad] - layer_activations.setdefault(layer, []).extend(results) - - for layer, activations in layer_activations.items(): - layer_activations[layer] = [activations[i] for i in indices] - - self.clear_stimuli() - return layer_activations + for not_pad, arr, stimulus in zip(mask, activations, batch) + if not_pad] + if i == 0: + layer_activations = [OrderedDict() for _ in range(len(results))] + + for j, result in enumerate(results): + layer_activations[j][layer] = result + return layer_activations, indices + + for indices, mask, batch in tqdm(zip(all_indices, all_masks, batches), desc="activations", total=len(batches)): + yield run(batch, mask, indices) + + self.clear_stimuli() \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/core/extractor.py b/brainscore_vision/model_helpers/activations/temporal/core/extractor.py index fe988e480..a66558db4 100644 --- a/brainscore_vision/model_helpers/activations/temporal/core/extractor.py +++ b/brainscore_vision/model_helpers/activations/temporal/core/extractor.py @@ -11,6 +11,7 @@ from brainscore_core.supported_data_standards.brainio.stimuli import StimulusSet from brainscore_core.supported_data_standards.brainio.assemblies import DataAssembly from brainscore_vision.model_helpers.utils import fullname +from brainscore_vision.model_helpers.activations.temporal.utils import data_assembly_mmap from result_caching import store_xarray from .inferencer import Inferencer from ..inputs import Stimulus @@ -101,31 +102,26 @@ def from_paths( ): if layers is None: layers = ['logits'] - if self.identifier and stimuli_identifier: - fnc = functools.partial(self._from_paths_stored, - identifier=self.identifier, stimuli_identifier=stimuli_identifier) + + mmap_home = os.environ.get("MMAP_HOME", None) + if self.identifier and stimuli_identifier and mmap_home: + mmap_path = os.path.join(mmap_home, stimuli_identifier, self.identifier) + os.makedirs(mmap_path, exist_ok=True) else: - self._logger.debug(f"self.identifier `{self.identifier}` or stimuli_identifier {stimuli_identifier} " - f"are not set, will not store") - fnc = self._from_paths + mmap_path = None + # In case stimuli paths are duplicates (e.g. multiple trials), we first reduce them to only the paths that need # to be run individually, compute activations for those, and then expand the activations to all paths again. # This is done here, before storing, so that we only store the reduced activations. reduced_paths = self._reduce_paths(stimuli_paths) - activations = fnc(layers=layers, stimuli_paths=reduced_paths) + data = data_assembly_mmap.load(mmap_path) + if data: + activations = data.to_assembly() + else: + activations = self.inferencer(layers=layers, paths=reduced_paths, mmap_path=mmap_path) activations = self._expand_paths(activations, original_paths=stimuli_paths) return activations - @store_xarray(identifier_ignore=['stimuli_paths', 'layers'], combine_fields={'layers': 'layer'}) - def _from_paths_stored(self, identifier, layers, stimuli_identifier, stimuli_paths): - stimuli_paths.sort() - return self._from_paths(layers=layers, stimuli_paths=stimuli_paths) - - def _from_paths(self, layers, stimuli_paths): - if len(layers) == 0: - raise ValueError("No layers passed to retrieve activations from") - return self.inferencer(stimuli_paths, layers) - def _reduce_paths(self, stimuli_paths): return list(set(stimuli_paths)) diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/__init__.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/__init__.py index edb58f1aa..84a91b26a 100644 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/__init__.py +++ b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/__init__.py @@ -1,2 +1,19 @@ -from .base import Inferencer -from .video import * \ No newline at end of file +from .base import ( + Inferencer, + channel_name_mapping, +) +from .temporal import ( + TemporalInferencer, + TemporalContextInferencerBase, + CausalInferencer, + BlockInferencer, +) + +__all__ = [ + "Inferencer", + "TemporalInferencer", + "TemporalContextInferencerBase", + "CausalInferencer", + "BlockInferencer", + "channel_name_mapping", +] diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/base.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/base.py index ce40262d6..a1ef87d22 100644 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/base.py +++ b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/base.py @@ -1,19 +1,16 @@ -import functools import logging -from collections import OrderedDict -from typing import Callable, Hashable, List, Dict, Any, Union from pathlib import Path +from typing import Any, Callable, Dict, Hashable, List, Union import numpy as np from tqdm.auto import tqdm -import gc -from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly, walk_coords +from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly from brainscore_vision.model_helpers.utils import fullname -from brainscore_vision.model_helpers.activations.temporal.core.executor import BatchExecutor -from brainscore_vision.model_helpers.activations.temporal.utils import stack_with_nan_padding, batch_2d_resize -from brainscore_vision.model_helpers.activations.temporal.inputs import Stimulus +from ..executor import BatchExecutor +from ...inputs import Stimulus +from ...utils import batch_2d_resize, data_assembly_mmap channel_name_mapping = { @@ -21,16 +18,75 @@ "C": "channel", "H": "channel_y", "W": "channel_x", - "K": "channel_token" + "K": "channel_token", } +def _make_tensor_to_numpy_hook(): + import torch + + def hook(val, layer, stimulus): + if isinstance(val, torch.Tensor): + return val.cpu().data.numpy() + return val + + return hook + + +def _make_dtype_hook(dtype): + return lambda val, layer, stimulus: val.astype(dtype) + + +# downsample the activations with the largest spatial size (among width and height) to the specified size +def _make_spatial_downsample_hook(max_spatial_size, layer_activation_format, mode="pool"): + def hook(val, layer, stimulus): + if max_spatial_size is None: + return val + + dims = layer_activation_format[layer] + + # require both H and W dimensions to do spatial downsampling + if "H" not in dims or "W" not in dims: + return val + + H_dim, W_dim = dims.index("H"), dims.index("W") + val = val.swapaxes(H_dim, 0).swapaxes(W_dim, 1) + shape = val.shape[2:] + h, w = val.shape[:2] + val = val.reshape(h, w, -1) + new_size = _compute_new_size(w, h, max_spatial_size) + new_val = batch_2d_resize(val[None, :], new_size, mode=mode)[0] + new_val = new_val.reshape(*new_size, *shape) + new_val = new_val.swapaxes(0, H_dim).swapaxes(1, W_dim) + return new_val + + return hook + + +def _compute_new_size(w, h, max_spatial_size): + if isinstance(max_spatial_size, int): + if h > w: + new_h = max_spatial_size + new_w = int(w * new_h / h) + else: + new_w = max_spatial_size + new_h = int(h * new_w / w) + else: + new_h = int(h * max_spatial_size) + new_w = int(w * max_spatial_size) + + new_h = max(1, new_h) + new_w = max(1, new_w) + + return new_h, new_w + + class Inferencer: """Inferencer for batch processing of stimuli and packaging the activations. - + Parameters ---------- - get_activations : function + get_activations : function function that takes a list of processed stimuli and a list of layers, and returns a dictionary of activations. preprocessing : function function that takes a stimulus and returns a processed stimulus. @@ -43,8 +99,8 @@ class Inferencer: the visual degrees of the stimuli. max_spatial_size: int/float the maximum spatial size of the activations. If the spatial size of the activations is larger than this value, - the activations will be downsampled to this size. This is used to avoid the large memory consumption by the first layers of some model. - If float, resize the image based on this factor. + the activations will be downsampled to this size. This is used to avoid the large memory consumption by the + first layers of some model. If float, resize the image based on this factor. dtype: np.dtype data type of the activations. batch_size: int @@ -57,13 +113,13 @@ class Inferencer: max_workers: int the maximum number of workers to use for parallel processing. - APIs ---- - __call__(paths, layers) + __call__(paths, layers, mmap_path=None) process the stimuli and return a holistic assembly that compile activations of all specified layers. The returned assembly is a NeuroidAssembly with the dimensions [stimulus_path, neuroid]. All dimensions of the activations will be stacked together to form the "neuroid" dimension. + If mmap_path is specified, the activations will be saved to the mmap file. Examples -------- @@ -72,22 +128,22 @@ class Inferencer: """ def __init__( - self, - get_activations : Callable[[List[Any]], Dict[str, np.array]], - preprocessing : Callable[[List[Stimulus]], Any], - layer_activation_format : dict, - stimulus_type : Stimulus, - visual_degrees : float = 8., - max_spatial_size : Union[int, float] = None, - dtype : np.dtype = np.float16, - batch_size : int = 64, - batch_grouper : Callable[[Stimulus], Hashable] = None, - batch_padding : bool = False, - max_workers : int = None, - *args, - **kwargs - ): - + self, + get_activations: Callable[[List[Any]], Dict[str, np.array]], + preprocessing: Callable[[List[Stimulus]], Any], + layer_activation_format: dict, + stimulus_type: Stimulus, + visual_degrees: float = 8.0, + max_spatial_size: Union[int, float] = None, + dtype: np.dtype = np.float16, + batch_size: int = 64, + batch_grouper: Callable[[Stimulus], Hashable] = None, + batch_padding: bool = False, + max_workers: int = None, + simple_neuroid_id: bool = False, + *args, + **kwargs, + ): self.stimulus_type = stimulus_type self.layer_activation_format = layer_activation_format if isinstance(max_spatial_size, float): @@ -95,14 +151,21 @@ def __init__( self.max_spatial_size = max_spatial_size self.visual_degrees = visual_degrees self.dtype = dtype - self._executor = BatchExecutor(get_activations, preprocessing, batch_size, batch_padding, batch_grouper, max_workers) + self._executor = BatchExecutor( + get_activations, preprocessing, batch_size, batch_padding, batch_grouper, max_workers + ) self._stimulus_set_hooks = {} self._batch_activations_hooks = {} self._logger = logging.getLogger(fullname(self)) + self.simple_neuroid_id = simple_neuroid_id # register hooks - self._executor.register_after_hook(self._make_spatial_downsample_hook(max_spatial_size)) - self._executor.register_after_hook(self._make_dtype_hook(dtype)) + self._executor.register_after_hook("tensor_to_numpy", _make_tensor_to_numpy_hook()) + self._executor.register_after_hook( + "spatial_downsample", + _make_spatial_downsample_hook(max_spatial_size, self.layer_activation_format), + ) + self._executor.register_after_hook("dtype", _make_dtype_hook(dtype)) @property # identifier for the inferencer: including all the features that may affect the activations @@ -115,160 +178,93 @@ def identifier(self) -> str: to_add.append(f".max_s={self.max_spatial_size}") to_add = "".join(to_add) return f"{self.__class__.__name__}{to_add}" - + def set_visual_degrees(self, visual_degrees: float): self.visual_degrees = visual_degrees + print("Visual degrees not supported yet. Bypassing...") - # given the paths of the stimuli and the layers, return the model activations as a NeuroidAssembly - def __call__(self, paths: List[Union[str, Path]], layers: List[str]): + def __call__(self, paths: List[Union[str, Path]], layers: List[str], mmap_path: str = None) -> NeuroidAssembly: stimuli = self.load_stimuli(paths) - layer_activations = self.inference(stimuli, layers) - layer_assemblies = OrderedDict() - for layer in tqdm(layers, desc="Packaging layers"): - layer_assemblies[layer] = self.package_layer(layer_activations[layer], self.layer_activation_format[layer], stimuli) - del layer_activations[layer] - gc.collect() # reduce memory usage - model_assembly = self.package(layer_assemblies, paths) - return model_assembly - - def load_stimuli(self, paths : List[Union[str, Path]]) -> List[Stimulus]: + num_stimuli = len(paths) + stimulus_paths = paths + + self._executor.add_stimuli(stimuli) + data = None + + for layer_activations, indicies in self._executor.execute_batch(layers): + for layer_activation, i in zip(layer_activations, indicies): + if data is None: + num_feats, neuroid_coords = self._get_neuroid_coords( + layer_activation, self.layer_activation_format + ) + data = data_assembly_mmap( + mmap_path, shape=(num_stimuli, num_feats), dtype=self.dtype, fill_value=np.nan + ) + flatten_activation = self._flatten_activations(layer_activation) + if flatten_activation.size != num_feats: + raise ValueError( + f"The flattened activation size changed from {num_feats} to {flatten_activation.size}" + ) + data[i, :] = flatten_activation + + data.register_meta( + dims=["stimulus_path", "neuroid"], + coords={ + "stimulus_path": stimulus_paths, + **neuroid_coords, + }, + ) + + return data.to_assembly() + + def load_stimuli(self, paths: List[Union[str, Path]]) -> List[Stimulus]: ret = [] for p in tqdm(paths, desc="Loading stimuli"): ret.append(self.load_stimulus(p)) return ret - def load_stimulus(self, path : Union[str, Path]) -> Stimulus: + def load_stimulus(self, path: Union[str, Path]) -> Stimulus: return self.stimulus_type.from_path(path) - - # process the list of stimulus and return the activations (list of np.array, - # whose length is the number of stimuli) of the specified layers - def inference(self, stimuli : List[Stimulus], layers : List[str]) -> Dict[str, List[np.array]]: - self._executor.add_stimuli(stimuli) - return self._executor.execute(layers) - - # Take the layer_activation (a list of np.array) and the layer specification, - # and package them into a NeuroidAssembly with all channels flattened into the "neuroid" dimension - def package_layer( - self, - layer_activation : List[np.array], - layer_spec : str, - stimuli : List[Stimulus] - ): - assert len(layer_activation) == len(stimuli) - layer_activation = stack_with_nan_padding(layer_activation, dtype=self.dtype) - channels = self._map_dims(layer_spec) - assembly = self._package(layer_activation, ["stimulus_path"] + channels) - assembly = self._stack_neuroid(assembly, channels) - return assembly - - # package the assemblies from different layers into a single one by concat along the neuroid dimension - def package(self, layer_assemblies : Dict[str, NeuroidAssembly], stimuli_paths : List[str]) -> NeuroidAssembly: - # merge manually instead of using merge_data_arrays since `xarray.merge` is very slow with these large arrays - # complication: (non)neuroid_coords are taken from the structure of layer_assemblies[0] i.e. the 1st assembly; - # using these names/keys for all assemblies results in KeyError if the first layer contains dims - # (see _package_layer) not present in later layers, e.g. first layer = conv, later layer = transformer layer - self._logger.debug(f"Merging {len(layer_assemblies)} layer assemblies") - layers = list(layer_assemblies.keys()) - layer_assemblies = list(layer_assemblies.values()) - layer_assemblies = [asm.transpose(*layer_assemblies[0].dims) for asm in layer_assemblies] - - nonneuroid_coords = {coord: (dims, values) for coord, dims, values in walk_coords(layer_assemblies[0]) - if set(dims) != {'neuroid'}} - neuroid_coords = [(coord, dims) for layer_assembly in layer_assemblies for coord, dims, values in walk_coords(layer_assembly) - if set(dims) == {'neuroid'} and coord!='neuroid'] - neuroid_coord_names = set(neuroid_coords) + + def _flatten_activations(self, layer_activation): + arrs = [arr.flatten() for arr in layer_activation.values()] + return np.concatenate(arrs) + + def _get_neuroid_coords(self, layer_activation, layer_specs): + feat_sizes = [arr.size for arr in layer_activation.values()] + feat_shapes = [arr.shape for arr in layer_activation.values()] + layers = list(layer_activation.keys()) + num_feats = sum(feat_sizes) + count_neuroids = 0 neuroid_coords = {} + neuroid_coords["neuroid_id"] = [] + neuroid_coords["neuroid_num"] = [] + neuroid_coords["layer"] = [] - for layer_assembly in layer_assemblies: - for coord, _ in neuroid_coord_names: - try: - coord_values = layer_assembly[coord].values - except KeyError: - coord_values = np.full(layer_assembly.sizes['neuroid'], -1, dtype=int) - neuroid_coords.setdefault(coord, []).append(coord_values) - - assert layer_assemblies[0].dims == layer_assembly.dims - for dim in set(layer_assembly.dims) - {'neuroid'}: - for coord, _, _ in walk_coords(layer_assembly[dim]): - assert (layer_assembly[coord].values == layer_assemblies[0][coord].values).all() - - for coord, dims in neuroid_coord_names: - neuroid_coords[coord] = (dims, np.concatenate(neuroid_coords[coord])) - - # add stimulus_paths - nonneuroid_coords["stimulus_path"] = ('stimulus_path', stimuli_paths) - - # add layer, neuroid_num, neuroid_id - layer_sizes = [asm.sizes['neuroid'] for asm in layer_assemblies] - neuroid_coords['layer'] = (('neuroid',), np.concatenate([[layer] * size for layer, size in zip(layers, layer_sizes)])) - neuroid_coords['neuroid_num'] = (('neuroid',), np.concatenate([np.arange(size) for size in layer_sizes])) - neuroid_coords['neuroid_id'] = (('neuroid',), np.concatenate([np.array([f"{layer}.{neuroid_num}" for neuroid_num in range(size)]) - for layer, size in zip(layers, layer_sizes)])) - - model_assembly = np.concatenate([a.values for a in layer_assemblies], axis=layer_assemblies[0].dims.index('neuroid')) - model_assembly = type(layer_assemblies[0])(model_assembly, coords={**nonneuroid_coords, **neuroid_coords},dims=layer_assemblies[0].dims) - return model_assembly - - # turn the activations into the specified dtype - def _make_dtype_hook(self, dtype): - return lambda val, layer, stimulus: val.astype(dtype) - - # downsample the activations with the largest spatial size (among width and height) to the specified size - def _make_spatial_downsample_hook(self, max_spatial_size, mode="pool"): - def hook(val, layer, stimulus): - if max_spatial_size is None: - return val - - dims = self.layer_activation_format[layer] - - # require both H and W dimensions to do spatial downsampling - if "H" not in dims or "W" not in dims: - return val - - H_dim, W_dim = dims.index("H"), dims.index("W") - val = val.swapaxes(H_dim, 0).swapaxes(W_dim, 1) - shape = val.shape[2:] - h, w = val.shape[:2] - val = val.reshape(h, w, -1) - new_size = _compute_new_size(w, h, self.max_spatial_size) - new_val = batch_2d_resize(val[None,:], new_size, mode=mode)[0] - new_val = new_val.reshape(*new_size, *shape) - new_val = new_val.swapaxes(0, H_dim).swapaxes(1, W_dim) - return new_val.astype(self.dtype) - return hook - - # map dims to channel names - @staticmethod - def _map_dims(dims): - return [channel_name_mapping[dim] for dim in dims] - - # stack the channel dimensions to form the "neuroid" dimension - @staticmethod - def _stack_neuroid(assembly, channels): - asm_cls = assembly.__class__ - assembly = assembly.stack(neuroid=channels) - return asm_cls(assembly) - - # package an activation numpy array into a NeuroidAssembly with specified dims - @staticmethod - def _package(activation: np.array, dims): - coords = {dim: range(activation.shape[i]) for i, dim in enumerate(dims)} - ret = NeuroidAssembly(activation, coords=coords, dims=dims) - return ret + def _grid(*ns): + from itertools import product -def _compute_new_size(w, h, max_spatial_size): - if isinstance(max_spatial_size, int): - if h > w: - new_h = max_spatial_size - new_w = int(w * new_h / h) - else: - new_w = max_spatial_size - new_h = int(h * new_w / w) - else: - new_h = int(h * max_spatial_size) - new_w = int(w * max_spatial_size) - - new_h = max(1, new_h) - new_w = max(1, new_w) + ns = [range(n) for n in ns] + return product(*ns) - return new_h, new_w + def _expand_spec(spec, shape): + assert len(spec) == len(shape) + return [".".join([f"{dim}{i}" for dim, i in zip(spec, grid)]) for grid in _grid(*shape)] + + # neuroid_id, neuroid_num, layer + for layer, size, shape in zip(layers, feat_sizes, feat_shapes): + layer_spec = layer_specs[layer] + layer_spec = [(dim, s) for dim, s in zip(layer_spec, shape)] + neuroid_nums = list(range(count_neuroids, count_neuroids + size)) + neuroid_ids = [f"{layer}.{nid}" for nid in _expand_spec(*list(zip(*layer_spec)))] if not self.simple_neuroid_id else neuroid_nums + count_neuroids += size + layers = [layer] * size + + neuroid_coords["neuroid_id"].extend(neuroid_ids) + neuroid_coords["neuroid_num"].extend(neuroid_nums) + neuroid_coords["layer"].extend(layers) + + for key, val in neuroid_coords.items(): + neuroid_coords[key] = ("neuroid", val) + + return num_feats, neuroid_coords diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/temporal.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/temporal.py new file mode 100644 index 000000000..63a5ba5c2 --- /dev/null +++ b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/temporal.py @@ -0,0 +1,459 @@ +from typing import Callable, Hashable, List, Tuple, Union +from pathlib import Path + +import numpy as np + +from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly + +from .base import Inferencer +from ...inputs import Stimulus, Video +from ...utils import data_assembly_mmap + + +MS_ROUNDING_DIGITS = 3 + + +class TemporalInferencer(Inferencer): + """Inferencer for video stimuli. The model takes a WHOLE video stimuli as input and generate the activations over time. + Then, the activations will be aligned to video time by EVENLY distributing the activations of multiple time steps + over time. The aligned activations will be again unified to the fps specified within the constructor (self.fps). + Finally, the activations will be packaged into a NeuroidAssembly. + + Example: + temporal_inferencer = TemporalInferencer(..., fps=10) + model_assembly = temporal_inferencer(video_paths[1000ms], layers) + model_assembly.time_bins -> [(0, 100), (100, 200), ..., (900, 1000)] # 1000ms, 10fps + + Parameters + ---------- + fps: float + frame rate of the model sampling. + + num_frames: int, or (int, int) + - If None, the model accepts videos of any length. + - If a single int is passed, specify how many frames the model takes. + - If a tuple of two ints is passed, specify the range of the number of frames the model takes (inclusive). If + you need to specify infinite, use np.inf. + + duration: float, or (float, float) + - If None, the model accepts videos of any length. + - If a single float is passed, specify the duration of the model takes, in ms. + - If a tuple of two floats is passed, specify the range of the duration the model takes (inclusive). If you need + to specify infinite, use np.inf. + + convert_img_to_video: bool + whether to convert the input images to videos. + img_duration: float + specify the duration of the images, in ms. This will work only if convert_img_to_video is True. + batch_size: int + number of stimuli to process in each batch. + batch_grouper: function + function that takes a stimulus and return the property based on which the stimuli can be grouped. + """ + + def __init__( + self, + *args, + fps: float, + num_frames: Union[int, Tuple[int, int]] = None, + duration: Union[float, Tuple[float, float]] = None, + convert_img_to_video: bool = True, + img_duration: float = 1000.0, + batch_size: int = 32, + batch_grouper: Callable[[Video], Hashable] = lambda video: (round(video.duration, 6), video.fps), + **kwargs, + ): + super().__init__(*args, stimulus_type=Video, batch_size=batch_size, batch_grouper=batch_grouper, **kwargs) + self.fps = fps + self.num_frames = self._make_range(num_frames, type="num_frames") + self.duration = self._make_range(duration, type="duration") + + if convert_img_to_video: + assert img_duration is not None, "img_duration should be specified if convert_img_to_video is True" + self.img_duration = img_duration + self.convert_to_video = convert_img_to_video + + @property + def identifier(self) -> str: + id = f"{super().identifier}.fps={float(self.fps)}" + if self.convert_to_video: + id += f".img_dur={float(self.img_duration)}" + return id + + def __call__(self, paths: List[Union[str, Path]], layers: List[str], mmap_path: str = None) -> NeuroidAssembly: + stimuli = self.load_stimuli(paths) + longest_stimulus = stimuli[np.argmax(np.array([stimulus.duration for stimulus in stimuli]))] + num_time_bins = longest_stimulus.num_frames + time_bin_coords = self._get_time_bin_coords(num_time_bins, self.fps) + num_stimuli = len(paths) + stimulus_paths = paths + + self._executor.add_stimuli(stimuli) + data = None + + for temporal_layer_activations, indicies in self._executor.execute_batch(layers): + for temporal_layer_activation, i in zip(temporal_layer_activations, indicies): + stimulus = stimuli[i] + # determine the time bin correspondence for each layer + for t, layer_activation in self._disect_time(temporal_layer_activation, stimulus.num_frames): + if data is None: + num_feats, neuroid_coords = self._get_neuroid_coords( + layer_activation, self._remove_T(self.layer_activation_format) + ) + data = data_assembly_mmap( + mmap_path, + shape=(num_stimuli, num_time_bins, num_feats), + dtype=self.dtype, + fill_value=np.nan, + ) + flatten_activation = self._flatten_activations(layer_activation) + data[i, t, :] = flatten_activation + + data.register_meta( + dims=["stimulus_path", "time_bin", "neuroid"], + coords={ + "stimulus_path": stimulus_paths, + **neuroid_coords, + **time_bin_coords, + }, + ) + + return data.to_assembly() + + def _disect_time(self, temporal_layer_activation, num_frames): + paces = {} + t_dims = {} + for layer in temporal_layer_activation: + activation = temporal_layer_activation[layer] + specs = self.layer_activation_format[layer] + t_dim = specs.index("T") if "T" in specs else None + num_t = 1 if t_dim is None else activation.shape[t_dim] + paces[layer] = num_t / num_frames # evenly spaced activations over time + t_dims[layer] = t_dim + + layer_ts = {layer: 0 for layer in temporal_layer_activation} + for t in range(num_frames): + ret = {} + for layer in temporal_layer_activation: + pace = paces[layer] + t_dim = t_dims[layer] + if t_dim is None: + ret[layer] = temporal_layer_activation[layer] + else: + ret[layer] = temporal_layer_activation[layer].take(int(layer_ts[layer]), axis=t_dim) + layer_ts[layer] += pace + yield t, ret + + def _get_time_bin_coords(self, num_time_bins, fps): + interval = 1000 / fps + time_bin_starts = np.arange(0, num_time_bins) * interval + time_bin_ends = time_bin_starts + interval + return { + "time_bin_start": ("time_bin", time_bin_starts), + "time_bin_end": ("time_bin", time_bin_ends), + } + + def _remove_T(self, layer_specs): + ret = {} + for layer, specs in layer_specs.items(): + ret[layer] = specs.replace("T", "") + return ret + + def load_stimulus(self, path: Union[str, Path]) -> Video: + # enable the conversion of images to videos + if self.convert_to_video and Stimulus.is_image_path(path): + video = Video.from_img_path(path, self.img_duration, self.fps) + else: + video = Video.from_path(path) + video = video.set_fps(self.fps) + self._check_video(video) + return video + + def _make_range(self, num, type="num_frames"): + if num is None: + return (1 if type == "num_frames" else 0, np.inf) + if isinstance(num, (tuple, list)): + return num + else: + return (num, num) + + def _check_video(self, video: Video): + if self.num_frames is not None: + estimated_num_frames = int(self.fps * video.duration / 1000) + assert ( + self.num_frames[0] <= estimated_num_frames <= self.num_frames[1] + ), f"The number of frames must be within {self.num_frames}, but got {estimated_num_frames}" + if self.duration is not None: + assert ( + self.duration[0] <= video.duration <= self.duration[1] + ), f"The duration must be within {self.duration}, but got {video.duration}" + + +class TemporalContextInferencerBase(TemporalInferencer): + """Base class for context-aware inferencers (e.g., causal or block). + + It computes the allowable temporal context from `num_frames` and `duration`, + then resolves an effective context length using `temporal_context_strategy`. + + Parameters + ---------- + temporal_context_strategy: str + How to pick the context length: + - "greedy": use the maximum allowed context by the model. + - "conservative": use the minimum allowed context by the model. + - "fix": use `fixed_temporal_context`. + fixed_temporal_context: float + Fixed context length in ms (used only when strategy is "fix"). + out_of_bound_strategy: str + How to pad when context runs past video bounds. Currently: + - "repeat": repeat boundary frames. + """ + + def __init__( + self, + *args, + temporal_context_strategy: str = "greedy", + fixed_temporal_context: float = None, # if None, default to greedy + out_of_bound_strategy: str = "repeat", + **kwargs, + ): + self.temporal_context_strategy = temporal_context_strategy + self.fixed_temporal_context = fixed_temporal_context + self.out_of_bound_strategy = out_of_bound_strategy + if self.temporal_context_strategy == "fix" and self.fixed_temporal_context is None: + raise ValueError("fixed_temporal_context must be specified if temporal_context_strategy is 'fix'.") + super().__init__(*args, **kwargs) + + @property + def identifier(self) -> str: + lower, context = self._compute_temporal_context() + lower = round(lower, MS_ROUNDING_DIGITS) + context = round(context, MS_ROUNDING_DIGITS) + to_add = f".context={context}>{lower}.oob={self.out_of_bound_strategy}" + return f"{super().identifier}{to_add}" + + def load_stimulus(self, path): + if self.convert_to_video and Stimulus.is_image_path(path): + video = Video.from_img_path(path, self.img_duration, self.fps) + else: + video = Video.from_path(path) + video = video.set_fps(self.fps) + # does no check here + return video + + def _overlapped_range(self, start1, end1, start2, end2): + # compute the overlapped range of two ranges (start1, end1) and (start2, end2) + lower, upper = max(start1, start2), min(end1, end2) + if lower > upper: + raise ValueError(f"Ranges [{start1}, {end1}] and [{start2}, {end2}] do not overlap.") + return lower, upper + + def _compute_temporal_context(self): + duration = self.duration + num_frames = self.num_frames + strategy = self.temporal_context_strategy + + interval = 1000 / self.fps + num_frames_implied_ran = (num_frames[0] * interval, num_frames[1] * interval) + ran = self._overlapped_range(*num_frames_implied_ran, *duration) + lower = ran[0] + + if strategy in ["greedy", "conservative"]: + if strategy == "greedy": + return lower, ran[1] + elif strategy == "conservative": + return lower, ran[0] + + elif strategy == "fix": + context = self.fixed_temporal_context if self.fixed_temporal_context is not None else ran[1] + assert ran[0] <= context <= ran[1], f"Fixed temporal context {context} is not within the range {ran}" + + else: + raise ValueError(f"Unknown temporal context strategy: {strategy}") + + return lower, context + + +class CausalInferencer(TemporalContextInferencerBase): + """Causal inference over time: each output time uses only past frames. + + For each time bin, it feeds the model with a clip that ends at that time, + then keeps only the last activation step. This yields a time series where + each point is strictly causal. + + `inference_fps` controls the output time-bin spacing (defaults to model FPS). + `fold_model_time` controls whether model time ("T") is folded into the neuroid axis + (True) or reduced to the last time step (False, default). + """ + + def __init__(self, *args, inference_fps: float = None, fold_model_time: bool = False, **kwargs): + super().__init__(*args, **kwargs) + self.inference_fps = self.fps if inference_fps is None else inference_fps + self.fold_model_time = fold_model_time + self.temporal_context_strategy = 'fix' if fold_model_time else self.temporal_context_strategy + + @property + def identifier(self) -> str: + base_id = super().identifier + if self.fold_model_time: + base_id = f"{base_id}.fold_model_time" + if self.inference_fps == self.fps: + return base_id + return f"{base_id}.ifps={float(self.inference_fps)}" + + def __call__(self, paths, layers, mmap_path=None): + lower, context = self._compute_temporal_context() + interval = 1000 / self.inference_fps + stimuli = self.load_stimuli(paths) + longest_stimulus = stimuli[np.argmax(np.array([stimulus.duration for stimulus in stimuli]))] + # Match the number of time steps implied by the per-stimulus arange loop. + num_time_bins = int(np.ceil(longest_stimulus.duration / interval)) + num_stimuli = len(paths) + time_bin_coords = self._get_time_bin_coords(num_time_bins, self.inference_fps) + stimulus_paths = paths + + ts = [] + stimulus_index = [] + for s, stimulus in enumerate(stimuli): + duration = stimulus.duration + videos = [] + # here we ensure that the covered time range at least include the whole duration + for t, time_end in enumerate(np.arange(interval, duration + interval, interval)): + # see if the model only receive limited context + time_start = self._get_time_start(time_end, context, lower) + clip = stimulus.set_window(time_start, time_end, padding=self.out_of_bound_strategy) + videos.append(clip) + ts.append(t) + stimulus_index.append(s) + self._executor.add_stimuli(videos) + + data = None + for temporal_layer_activations, indicies in self._executor.execute_batch(layers): + for temporal_layer_activation, i in zip(temporal_layer_activations, indicies): + s = stimulus_index[i] + if self.fold_model_time: + layer_activation = temporal_layer_activation + layer_specs = self.layer_activation_format + else: + layer_activation = self._get_last_time(temporal_layer_activation) + layer_specs = self._remove_T(self.layer_activation_format) + if data is None: + num_feats, neuroid_coords = self._get_neuroid_coords( + layer_activation, layer_specs + ) + data = data_assembly_mmap( + mmap_path, shape=(num_stimuli, num_time_bins, num_feats), dtype=self.dtype, fill_value=np.nan + ) + flatten_activation = self._flatten_activations(layer_activation) + t = ts[i] + data[s, t, :] = flatten_activation + + data.register_meta( + dims=["stimulus_path", "time_bin", "neuroid"], + coords={ + "stimulus_path": stimulus_paths, + **neuroid_coords, + **time_bin_coords, + }, + ) + + return data.to_assembly() + + def _get_time_start(self, time_end, context, lower): + assert context >= lower, f"Temporal context {context} is not within the range {lower}" + if self.temporal_context_strategy == "fix": + return time_end - context + elif self.temporal_context_strategy == "greedy": + proposed_time_start = time_end - context + if proposed_time_start >= 0: + return proposed_time_start + else: + if time_end < lower: + return time_end - lower + else: + return 0 + elif self.temporal_context_strategy == "conservative": + return time_end - context + + def _get_last_time(self, temporal_layer_activation): + ret = {} + for layer, activation in temporal_layer_activation.items(): + specs = self.layer_activation_format[layer] + t_dim = specs.index("T") if "T" in specs else None + ret[layer] = activation.take(-1, axis=t_dim) if t_dim is not None else activation + return ret + + +class BlockInferencer(TemporalContextInferencerBase): + """Block-wise inference: split a video into context-sized chunks. + + Each block is inferred separately and then stitched along time. + Block size follows the resolved temporal context (from strategy, num_frames, duration). + """ + + def __call__(self, paths, layers, mmap_path=None): + _, context = self._compute_temporal_context() + + if np.isinf(context): + return super().__call__(paths, layers, mmap_path) + + EPS = 1e-6 + stimuli = self.load_stimuli(paths) + num_stimuli = len(paths) + stimulus_paths = paths + + num_blocks = [] + num_time_bins = [] + num_frames_per_block = None + t_offsets = [] + stimulus_index = [] + for s, stimulus in enumerate(stimuli): + duration = stimulus.duration + video_blocks = [] + # for each stimulus, divide it into block clips with the specified context + # EPS makes sure the last block is not out-of-bound + for block_id, time_start in enumerate(np.arange(0, duration + EPS, context)): + time_end = time_start + context + clip = stimulus.set_window(time_start, time_end, padding=self.out_of_bound_strategy) + video_blocks.append(clip) + stimulus_index.append(s) + clip_num_samples = clip.set_fps(self.fps).num_frames + if num_frames_per_block is None: + num_frames_per_block = clip_num_samples + else: + assert num_frames_per_block == clip_num_samples, "All blocks must have the same number of frames." + t_offsets.append(block_id * num_frames_per_block) + self._executor.add_stimuli(video_blocks) + num_blocks.append(len(video_blocks)) + num_time_bins.append(len(video_blocks) * num_frames_per_block) + + num_time_bins = max(num_time_bins) + time_bin_coords = self._get_time_bin_coords(num_time_bins, self.fps) + + data = None + for temporal_layer_activations, indicies in self._executor.execute_batch(layers): + for temporal_layer_activation, i in zip(temporal_layer_activations, indicies): + s = stimulus_index[i] + # determine the time bin correspondence for each layer + for t, layer_activation in self._disect_time(temporal_layer_activation, num_frames_per_block): + if data is None: + num_feats, neuroid_coords = self._get_neuroid_coords( + layer_activation, self._remove_T(self.layer_activation_format) + ) + data = data_assembly_mmap( + mmap_path, shape=(num_stimuli, num_time_bins, num_feats), dtype=self.dtype, fill_value=np.nan + ) + flatten_activation = self._flatten_activations(layer_activation) + t = t_offsets[i] + t + data[s, t, :] = flatten_activation + + data.register_meta( + dims=["stimulus_path", "time_bin", "neuroid"], + coords={ + "stimulus_path": stimulus_paths, + **neuroid_coords, + **time_bin_coords, + }, + ) + + return data.to_assembly() diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/__init__.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/__init__.py deleted file mode 100644 index 5d99e466a..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .base import TemporalInferencer -from .temporal_context import * \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/base.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/base.py deleted file mode 100644 index 02c50afe1..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/base.py +++ /dev/null @@ -1,134 +0,0 @@ -import numpy as np -from typing import Union, Tuple, Callable, Hashable, List, Dict -from pathlib import Path - -from brainscore_vision.model_helpers.activations.temporal.inputs import Video, Stimulus -from brainscore_vision.model_helpers.activations.temporal.utils import assembly_align_to_fps, stack_with_nan_padding -from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly - -from ..base import Inferencer -from . import time_aligner as time_aligners - - -class TemporalInferencer(Inferencer): - """Inferencer for video stimuli. The model takes video stimuli as input and generate the activations over time. - Then, the activations will be aligned to video time by the time_aligner specified in the constructor. The aligned - activations will be again unified to the fps specified within the constructor (self.fps). Finally, the activations - will be packaged into a NeuroidAssembly. - - NOTE: for all the time_alignment method, the inference of time bins will only be done with the longest video, but ignore all other input videos. - - Example: - temporal_inferencer = TemporalInferenver(..., fps=10) - model_assembly = temporal_inferencer(video_paths[1000ms], layers) - model_assembly.time_bins -> [(0, 100), (100, 200), ..., (900, 1000)] # 1000ms, 10fps - - Parameters - ---------- - fps: float - frame rate of the model sampling. - - num_frames: int, or (int, int) - - If None, the model accepts videos of any length. - - If a single int is passed, specify how many frames the model takes. - - If a tuple of two ints is passed, specify the range of the number of frames the model takes (inclusive). If you need to specify infinite, use np.inf. - - duration: float, or (float, float) - - If None, the model accepts videos of any length. - - If a single float is passed, specify the duration of the model takes, in ms. - - If a tuple of two floats is passed, specify the range of the duration the model takes (inclusive). If you need to specify infinite, use np.inf. - - time_alignment: str - specify the method to align the activations in time. - The options and specifications are in the time_aligners module. The current options are: - - evenly_spaced: align the activations to have evenly spaced time bins across the whole video time span. - - ignore_time: ignore the time information and make a single time bin of the entire video. - - estimate_layer_fps: estimate the fps of the layer based on the video fps. - - per_frame_aligned: align the activations to the video frames. - - convert_img_to_video: bool - whether to convert the input images to videos. - img_duration: float - specify the duration of the images, in ms. This will work only if convert_img_to_video is True. - batch_size: int - number of stimuli to process in each batch. - batch_grouper: function - function that takes a stimulus and return the property based on which the stimuli can be grouped. - """ - def __init__( - self, - *args, - fps : float, - num_frames : Union[int, Tuple[int, int]] = None, - duration : Union[float, Tuple[float, float]] = None, - time_alignment : str = "evenly_spaced", - convert_img_to_video : bool = True, - img_duration : float = 1000.0, - batch_size : int = 32, - batch_grouper : Callable[[Video], Hashable] = lambda video: (round(video.duration, 6), video.fps), # not including video.frame_size because most preprocessors will change the frame size to be the same - **kwargs, - ): - super().__init__(*args, stimulus_type=Video, batch_size=batch_size, - batch_grouper=batch_grouper, **kwargs) - self.fps = fps - self.num_frames = self._make_range(num_frames, type="num_frames") - self.duration = self._make_range(duration, type="duration") - assert hasattr(time_aligners, time_alignment), f"Unknown time alignment method: {time_alignment}" - self.time_aligner = getattr(time_aligners, time_alignment) - - if convert_img_to_video: - assert img_duration is not None, "img_duration should be specified if convert_img_to_video is True" - self.img_duration = img_duration - self.convert_to_video = convert_img_to_video - - @property - def identifier(self) -> str: - id = f"{super().identifier}.{self.time_aligner.__name__}.fps={float(self.fps)}" - if self.convert_to_video: - id += f".img_dur={float(self.img_duration)}" - return id - - def load_stimulus(self, path: Union[str, Path]) -> Video: - if self.convert_to_video and Stimulus.is_image_path(path): - video = Video.from_img_path(path, self.img_duration, self.fps) - else: - video = Video.from_path(path) - video = video.set_fps(self.fps) - self._check_video(video) - return video - - def package_layer( - self, - layer_activations : List[np.array], - layer_spec : str, - stimuli : List[Stimulus] - ): - assert len(layer_activations) == len(stimuli) - longest_stimulus = stimuli[np.argmax(np.array([stimulus.duration for stimulus in stimuli]))] - ignore_time = self.time_aligner is time_aligners.ignore_time - channels = self._map_dims(layer_spec) - layer_activations = stack_with_nan_padding(layer_activations) - assembly = self._package(layer_activations, ["stimulus_path"] + channels) - # align to the longest stimulus - assembly = self.time_aligner(assembly, longest_stimulus) - if "channel_temporal" in channels and not ignore_time: - channels.remove("channel_temporal") - assembly = self._stack_neuroid(assembly, channels) - if not ignore_time: - assembly = assembly_align_to_fps(assembly, self.fps) - return assembly - - def _make_range(self, num, type="num_frames"): - if num is None: - return (1 if type=='num_frames' else 0, np.inf) - if isinstance(num, (tuple, list)): - return num - else: - return (num, num) - - def _check_video(self, video: Video): - if self.num_frames is not None: - estimated_num_frames = int(self.fps * video.duration / 1000) - assert self.num_frames[0] <= estimated_num_frames <= self.num_frames[1], f"The number of frames must be within {self.num_frames}, but got {estimated_num_frames}" - if self.duration is not None: - assert self.duration[0] <= video.duration <= self.duration[1], f"The duration must be within {self.duration}, but got {video.duration}" diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/__init__.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/__init__.py deleted file mode 100644 index 4813c1fa9..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .causal import CausalInferencer -from .block import BlockInferencer \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/base.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/base.py deleted file mode 100644 index 683741004..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/base.py +++ /dev/null @@ -1,99 +0,0 @@ -from typing import Union, List -from pathlib import Path - -from brainscore_vision.model_helpers.activations.temporal.inputs import Video, Stimulus -from ..base import TemporalInferencer - - -MS_ROUNDING_DIGITS = 3 - -class TemporalContextInferencerBase(TemporalInferencer): - """Inferencer base that computes the temporal context for concrete context-based inferencers, - like CausalInferencer and BlockInferencer. - - The context range is first determined by the num_frames and duration. Then, temporal_context_strategy - is used to determine the lower bound of temporal context and the expected temporal context. - - Parameters - ---------- - temporal_context_strategy: str - specify how the length of temporal context for causal inference is determined. - Options: - - "greedy": the length of the temporal context is determined by the maximum of num_frames and duration. - - "conservative": the length of the temporal context is determined by the minimum of num_frames and duration. - - "fix": the length of the temporal context is determined by the specified "fixed_temporal_context". - - fixed_temporal_context: float - specify the fixed length of the temporal context, in ms. It will be used only if temporal_context_strategy is "fix". - - out_of_bound_strategy: str - specify how to handle the out-of-bound temporal context. - Options: - - "repeat": the out-of-bound temporal context will be repeated. - - TODO: "black": the out-of-bound temporal context will be zero-padded. - - TODO: "gray": the out-of-bound temporal context will be 128-padded. - """ - def __init__( - self, - *args, - temporal_context_strategy : str = "greedy", - fixed_temporal_context : float = None, - out_of_bound_strategy : str = "repeat", - **kwargs - ): - self.temporal_context_strategy = temporal_context_strategy - self.fixed_temporal_context = fixed_temporal_context - self.out_of_bound_strategy = out_of_bound_strategy - if self.temporal_context_strategy == "fix" and self.fixed_temporal_context is None: - raise ValueError("fixed_temporal_context must be specified if temporal_context_strategy is 'fix'.") - super().__init__(*args, **kwargs) - - @property - def identifier(self) -> str: - lower, context = self._compute_temporal_context() - lower = round(lower, MS_ROUNDING_DIGITS) - context = round(context, MS_ROUNDING_DIGITS) - to_add = f".context={context}>{lower}.oob={self.out_of_bound_strategy}" - return f"{super().identifier}{to_add}" - - def load_stimulus(self, path: Union[str, Path]) -> Video: - if self.convert_to_video and Stimulus.is_image_path(path): - video = Video.from_img_path(path, self.img_duration, self.fps) - else: - video = Video.from_path(path) - video = video.set_fps(self.fps) - # does no check here - return video - - def _overlapped_range(self, start1, end1, start2, end2): - # compute the overlapped range of two ranges (start1, end1) and (start2, end2) - lower, upper = max(start1, start2), min(end1, end2) - if lower > upper: - raise ValueError(f"Ranges [{start1}, {end1}] and [{start2}, {end2}] do not overlap.") - return lower, upper - - def _compute_temporal_context(self): - duration = self.duration - num_frames = self.num_frames - strategy = self.temporal_context_strategy - - interval = 1000 / self.fps - num_frames_implied_ran = (num_frames[0] * interval, num_frames[1] * interval) - ran = self._overlapped_range(*num_frames_implied_ran, *duration) - lower = ran[0] - - if strategy in ["greedy", "conservative"]: - if strategy == "greedy": - return lower, ran[1] - elif strategy == "conservative": - return lower, ran[0] - - elif strategy == "fix": - context = self.fixed_temporal_context - assert ran[0] <= context <= ran[1], f"Fixed temporal context {context} is not within the range {ran}" - - else: - raise ValueError(f"Unknown temporal context strategy: {strategy}") - - return lower, context - diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/block.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/block.py deleted file mode 100644 index f6823a061..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/block.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np -from collections import OrderedDict -from tqdm import tqdm - -from .base import TemporalContextInferencerBase -from brainscore_vision.model_helpers.activations.temporal.utils import stack_with_nan_padding - - -class BlockInferencer(TemporalContextInferencerBase): - """Inferencer that divides the original video into smaller blocks and does inference on the blocks separately. - Finally, the activations are joint along the temporal dimension for the final activations. - - Specifically, suppose the video lasts for 1000ms and the block size is 400ms. - Then, the video is segmented into [0~400ms], [400~800ms], [800~1200ms] (1000~1200ms padded). - The activations for each segment will be stacked together. - - The block size is determined by the temporal parameters (num_frames & duration) and temporal_context_strategy. - If num_frames or duration is given, the model's temporal context will be set to match the two. - """ - - def inference(self, stimuli, layers): - _, context = self._compute_temporal_context() - self._time_ends = [] - - if np.isinf(context): - # if the context is inf, pass the whole video directively - self._time_ends = [inp.duration for inp in stimuli] - layer_activations = super().inference(stimuli, layers) - layer_activations = OrderedDict([(layer, [[a] for a in activations]) - for layer, activations in layer_activations.items()]) - # make one-clip video activations - else: - num_clips = [] - for stimulus in stimuli: - duration = stimulus.duration - videos = [] - # for each stimulus, divide it into block clips with the specified context - for time_start in np.arange(0, duration, context): - time_end = time_start + context - clip = stimulus.set_window(time_start, time_end, padding=self.out_of_bound_strategy) - videos.append(clip) - # record the actual time_end (which could be larger than the duration of the original video) - # so that we can align the time correctly when packaging the activations - self._time_ends.append(time_end) - self._executor.add_stimuli(videos) - num_clips.append(len(videos)) # record the number of clips for each video - - activations = self._executor.execute(layers) - layer_activations = OrderedDict() - for layer in layers: - clip_start = 0 - for num_clip in num_clips: - # retrieve clips from a video by num_clip - clips = activations[layer][clip_start:clip_start+num_clip] # clips for this video - layer_activations.setdefault(layer, []).append(clips) - clip_start += num_clip - - # concat the activations from the clips of the same video - ret = OrderedDict() - for layer in layers: - activation_dims = self.layer_activation_format[layer] - for clips in layer_activations[layer]: - # make T the first dimension, as [T, ...] for easy concatenation - # the package_layer will change accordingly - if 'T' in activation_dims: - time_index = activation_dims.index('T') - clips = [np.moveaxis(a, time_index, 0) for a in clips] - ret.setdefault(layer, []).append(np.concatenate(clips, axis=0)) - else: - ret.setdefault(layer, []).append(np.stack(clips, axis=0)) - - return ret - - def package_layer(self, activations, layer_spec, stimuli): - layer_spec = "T" + layer_spec.replace('T', '') # T has been moved to the first dimension - stimuli = [stimulus.set_window(0, time_end) for stimulus, time_end in zip(stimuli, self._time_ends)] - return super().package_layer(activations, layer_spec, stimuli) \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/causal.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/causal.py deleted file mode 100644 index 778961660..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/temporal_context/causal.py +++ /dev/null @@ -1,86 +0,0 @@ -import numpy as np -from collections import OrderedDict - -from .base import TemporalContextInferencerBase -from brainscore_vision.model_helpers.activations.temporal.inputs.video import Video -from brainscore_vision.model_helpers.activations.temporal.utils import stack_with_nan_padding - - -class CausalInferencer(TemporalContextInferencerBase): - """Inferencer that ensures the activations are causal by feeding in the video in a causal manner. - - Specifically, suppose the video lasts for 1000ms and the model samples every 100ms. - Then, the activations from the last time step of the activations from the following videos: - [0 ~ 100ms], [0 ~ 200ms], ..., [0 ~ 1000ms]. - will be stacked together to form the final activations for the video. In this way, the - activations are always "causal", which means that the temporal contexts of the model are - always from the past, not the future. - - If num_frames or duration is given, the model's temporal context will be set to match the two. - """ - def __init__( - self, - *args, - **kwargs - ): - if "time_alignment" in kwargs: - if kwargs["time_alignment"] != "per_frame_aligned": - raise ValueError("CausalInferencer enforces time_alignment='per_frame_aligned'.") - else: - del kwargs["time_alignment"] - super().__init__(*args, **kwargs, time_alignment="per_frame_aligned") - - def inference(self, stimuli, layers): - interval = 1000 / self.fps - lower, context = self._compute_temporal_context() - num_clips = [] - latest_time_end = 0 - for inp in stimuli: - duration = inp.duration - videos = [] - # here we ensure that the covered time range at least include the whole duration - for time_end in np.arange(interval, duration+interval, interval): - # see if the model only receive limited context - time_start = self._get_time_start(time_end, context, lower) - clip = inp.set_window(time_start, time_end, padding=self.out_of_bound_strategy) - latest_time_end = max(time_end, latest_time_end) - videos.append(clip) - - self._executor.add_stimuli(videos) - num_clips.append(len(videos)) - - activations = self._executor.execute(layers) - layer_activations = OrderedDict() - for layer in layers: - activation_dims = self.layer_activation_format[layer] - clip_start = 0 - for num_clip in num_clips: - video_activations = activations[layer][clip_start:clip_start+num_clip] # clips for this video - # make T the first dimension, as [T, ...] - if 'T' in activation_dims: - time_index = activation_dims.index('T') - video_activations = [a.take(-1, axis=time_index) for a in video_activations] - layer_activations.setdefault(layer, []).append(np.stack(video_activations, axis=0)) - clip_start += num_clip - - return layer_activations - - def package_layer(self, activations, layer_spec, stimuli): - layer_spec = "T" + layer_spec.replace('T', '') # T has been moved to the first dimension - return super().package_layer(activations, layer_spec, stimuli) - - def _get_time_start(self, time_end, context, lower): - assert context >= lower, f"Temporal context {context} is not within the range {lower}" - if self.temporal_context_strategy == "fix": - return time_end - context - elif self.temporal_context_strategy == "greedy": - proposed_time_start = time_end - context - if proposed_time_start >= 0: - return proposed_time_start - else: - if time_end < lower: - return time_end - lower - else: - return 0 - elif self.temporal_context_strategy == "conservative": - return time_end - context \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/time_aligner.py b/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/time_aligner.py deleted file mode 100644 index b656cb767..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/core/inferencer/video/time_aligner.py +++ /dev/null @@ -1,73 +0,0 @@ -import numpy as np -from brainscore_core.supported_data_standards.brainio.assemblies import DataAssembly -from brainscore_vision.model_helpers.activations.temporal.inputs.video import Video - - -"""This module includes different time alignment strategies for the activations of a temporal neural network. - - A time alignment strategy is a function that takes the DataAssembly (with channel_temporal) and the video stimuli, - and aligns the activations to the video time. The channel_temporal dimension will be changed into time_bin dimension, - and the time_bin_start and time_bin_end will be added as coordinates of it. -""" - - -def _convert(assembly, time_bin_starts, time_bin_ends): - # this function converts the "channel_temporal" dimension to "time_bin" dimension - # if "channel_temporal" is not present, it adds a "time_bin" dimension - asm_type = assembly.__class__ - if "channel_temporal" in assembly.dims: - assembly = assembly.drop_vars("channel_temporal") - assembly = assembly.rename({"channel_temporal": "time_bin"}) - else: - assembly = assembly.expand_dims("time_bin") - assembly = assembly.assign_coords({ - "time_bin_start": ("time_bin", time_bin_starts), - "time_bin_end": ("time_bin", time_bin_ends) - }) - return asm_type(assembly) - -def estimate_layer_fps(assembly : DataAssembly, video : Video) -> DataAssembly: - # in a temporal neural net, different layers may have different temporal resolutions - # this function estimates the temporal resolution of a layer, based on the video fps - fps = video.fps - num_t = assembly.sizes['channel_temporal'] if "channel_temporal" in assembly.dims else 1 - duration = video.duration - model_frame_interval = 1000 / fps - estimated_frame_interval = duration / num_t - estimated_multiplier = estimated_frame_interval / model_frame_interval - estimated_multiplier = int(round(estimated_multiplier)) # round to nearest integer - estimated_interval = model_frame_interval * estimated_multiplier - time_bin_starts = np.arange(0, num_t) * estimated_interval - time_bin_ends = time_bin_starts + estimated_interval - return _convert(assembly, time_bin_starts, time_bin_ends) - -def evenly_spaced(assembly : DataAssembly, video : Video) -> DataAssembly: - # this function assumes that the activation of different time steps is evenly spaced - num_t = assembly.sizes['channel_temporal'] if "channel_temporal" in assembly.dims else 1 - interval = video.duration / num_t - time_bin_starts = np.linspace(0, video.duration, num_t+1)[:-1] - time_bin_ends = time_bin_starts + interval - time_bin_ends[-1] = video.duration - return _convert(assembly, time_bin_starts, time_bin_ends) - -def per_frame_aligned(assembly : DataAssembly, video : Video) -> DataAssembly: - # this function assumes that the activation of different time steps is aligned with the video frames - num_t = assembly.sizes['channel_temporal'] if "channel_temporal" in assembly.dims else 1 - assert video.num_frames <= num_t - interval = 1000 / video.fps - time_bin_starts = np.arange(0, num_t) * interval - time_bin_ends = time_bin_starts + interval - return _convert(assembly, time_bin_starts, time_bin_ends) - -def ignore_time(assembly : DataAssembly, video : Video) -> DataAssembly: - # this function treats the activations from the entire video as from a single time bin, - # and treat the "channel_temporal" as a regular channel dimension and does no conversion - asm_type = assembly.__class__ - assembly = assembly.expand_dims("time_bin") - time_bin_starts = [0] - time_bin_ends = [video.duration] - assembly = assembly.assign_coords({ - "time_bin_start": ("time_bin", time_bin_starts), - "time_bin_end": ("time_bin", time_bin_ends) - }) - return asm_type(assembly) \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/inputs/__init__.py b/brainscore_vision/model_helpers/activations/temporal/inputs/__init__.py index 1e2d95604..db3fbec97 100644 --- a/brainscore_vision/model_helpers/activations/temporal/inputs/__init__.py +++ b/brainscore_vision/model_helpers/activations/temporal/inputs/__init__.py @@ -1,3 +1,302 @@ -from .video import Video -from .image import Image -from .base import Stimulus \ No newline at end of file +import os +from pathlib import Path +from typing import Tuple, Union + +import cv2 +import numpy as np +from PIL import Image as PILImage + +from ..utils import batch_2d_resize + + +class Stimulus: + def from_path(self, path): + raise NotImplementedError("Choose a concrete Stimulus type to use.") + + @staticmethod + def is_video_path(path: Union[str, Path]) -> bool: + extension = path.split('.')[-1].lower() + return extension in ['mp4', 'avi', 'mov', 'flv', 'wmv', 'webm', 'mkv', 'gif'] + + @staticmethod + def is_image_path(path: Union[str, Path]) -> bool: + extension = path.split('.')[-1].lower() + return extension in ['jpg', 'jpeg', 'png', 'bmp', 'tiff'] + + +class Image(Stimulus): + def __init__(self, path: str, size: int): + self._path = path + self._size = size + + def copy(self): + return Image(self._path, self._size) + + @property + def size(self): + return self._size + + def set_size(self, size): + img = self.copy() + img._size = size + return img + + def from_path(path): + return Image(path, get_image_size(path)) + + def to_pil_img(self): + return PILImage.fromarray(self.to_numpy()) + + def get_frame(self): + return np.array(PILImage.open(self._path).convert('RGB')) + + # return (H, W, C[RGB]) + def to_numpy(self): + arr = self.get_frame() + + if arr.shape[:2][::-1] != self._size: + arr = batch_2d_resize(arr[None, :], self._size, "bilinear")[0] + + return arr + + def store_to_path(self, path): + self.to_img().save(path) + return path + + +def get_image_size(path): + with PILImage.open(path) as img: + size = img.size + return size + + +EPS = 1e-9 + + +def get_video_stats(video_path): + assert os.path.exists(video_path), f"Video file {video_path} does not exist." + cap = cv2.VideoCapture(video_path) + length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) + size = (width, height) + duration = length / fps * 1000 + cap.release() + return fps, duration, size + + +def get_image_stats(image_path): + with PILImage.open(image_path) as img: + return img.size + return size + + +class Video(Stimulus): + """Video object that represents a video clip.""" + + def __init__( + self, + path: Union[str, Path], + fps: float, + start: float, + end: float, + size: Tuple[int, int], + ): + self._path = path + self._fps = fps + self._size = size + self._start = start + self._end = end + self._original_fps = None + self._original_duration = None + self._original_size = None + + def __getattribute__(self, key): + if key.startswith("_original_"): + if super().__getattribute__(key) is None: + self._original_fps, self._original_duration, self._original_size = get_video_stats(self._path) + return super().__getattribute__(key) + + def copy(self): + # return view + video = self.__class__(self._path, self._fps, self._start, self._end, self._size) + video._original_fps = self._original_fps + video._original_duration = self._original_duration + video._original_size = self._original_size + return video + + @property + def duration(self): + # in ms + return self._end - self._start + + @property + def fps(self): + return self._fps + + @property + def num_frames(self): + return int(self.duration * self.fps / 1000 + EPS) + + @property + def original_num_frames(self): + return int(self._original_duration * self._original_fps / 1000 + EPS) + + @property + def frame_size(self): + return self._size + + ### Transformations: return copy + + def set_fps(self, fps): + assert 1000 / fps <= self.duration, f"fps {fps} is too low for duration {self.duration}ms." + video = self.copy() + video._fps = fps + return video + + def set_size(self, size): + # size: (width, height) + video = self.copy() + video._size = size + return video + + def set_window(self, start, end, padding="repeat"): + # use ms as the time scale + if end < start: + raise ValueError("end time is earlier than start time") + + if padding != "repeat": + raise NotImplementedError() + + video = self.copy() + video._start = self._start + start + video._end = self._start + end + return video + + def _check_indices_ascending(self, indices): + if len(indices) == 0: + return False + if len(indices) == 1: + return True + for i in range(1, len(indices)): + if indices[i] < indices[i - 1]: + return False + return True + + def _sanitize_frames(self, frames, tol=0.01): + # check if the read frames are valid + # if some last frames are invalid, just copy the last valid frame + # default tolerance: 0.01 of the total duration + num_invalid = sum([f is None for f in frames]) + if num_invalid == len(frames): + raise ValueError("No valid frames.") + for i in range(num_invalid): + assert frames[-1 - i] is None, "Invalid frames are not at the end." + if num_invalid > int(self._original_duration / 1000 * self._fps * tol): + raise ValueError("Too many invalid frames.") + if num_invalid > 0: + for i in range(num_invalid): + frames[-1 - i] = frames[-1 - num_invalid] + print(f"Warning: last {num_invalid} frames are invalid.") + return frames + + def get_frames(self, indices): + cap = cv2.VideoCapture(self._path) + if not cap.isOpened(): + raise ValueError(f"Cannot open video file: {self._path}") + + def _read(cap): + ret, frame = cap.read() + if not ret: + return None + return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + + # ascending read optimization + frames = [] + if self._check_indices_ascending(indices): + cap.set(cv2.CAP_PROP_POS_FRAMES, indices[0]) # Move to the first frame index + frame_index = indices[0] - 1 + for target_index in indices: + to_move = target_index - frame_index + for _ in range(to_move): + frame = _read(cap) + frames.append(frame) + frame_index += to_move + else: + # random access + for i, index in enumerate(indices): + cap.set(cv2.CAP_PROP_POS_FRAMES, index) # Move to the frame index + frames.append(_read(cap)) + + cap.release() + frames = self._sanitize_frames(frames) + + return np.array(frames) + + ### I/O + def from_path(path): + fps, end, size = get_video_stats(path) + start = 0 + return Video(path, fps, start, end, size) + + def from_img_path(img_path, duration, fps): + # duration in ms + size = get_image_stats(img_path) + return VideoFromImage(img_path, fps, 0, duration, size) + + def to_numpy(self): + # get the time stamps of frame samples + start_frame = self._start * self._original_fps / 1000 + end_frame = self._end * self._original_fps / 1000 + # avoid taking the last extra frame + samples = np.arange(start_frame, end_frame - EPS, self._original_fps / self.fps) + sample_indices = samples.astype(int) + + # padding: repeat the first/last frame + original_num_frames = int(self._original_duration * self._original_fps / 1000 - EPS) # EPS to avoid last frame OOB error + sample_indices = np.clip(sample_indices, 0, original_num_frames - 1) + + # actual sampling + frames = self.get_frames(sample_indices) + + # resizing + if self._size != (frames.shape[2], frames.shape[1]): + frames = batch_2d_resize(frames, self._size, "bilinear") + + return frames + + def to_frames(self): + return [f for f in self.to_numpy()] + + def to_pil_imgs(self): + return [PILImage.fromarray(frame) for frame in self.to_numpy()] + + def to_path(self): + # use context manager ? + path = None # make a temporal file + raise NotImplementedError() + return path + + def store_to_path(self, path): + # pick format based on path filename + if path.endswith(".avi"): + fourcc = cv2.VideoWriter_fourcc(*'XVID') + elif path.endswith(".mp4"): + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + else: + raise ValueError("Unsupported video format.") + + out = cv2.VideoWriter(path, fourcc, self._fps, self._size) + for frame in self.to_frames(): + out.write(frame[..., ::-1]) # to RGB + out.release() + return path + + +class VideoFromImage(Video): + def get_frames(self, indices): + data = Image.from_path(self._path).to_numpy() + N = len(indices) + ret = np.repeat(data[np.newaxis, ...], N, axis=0) + return ret diff --git a/brainscore_vision/model_helpers/activations/temporal/inputs/base.py b/brainscore_vision/model_helpers/activations/temporal/inputs/base.py deleted file mode 100644 index c94ccd3d7..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/inputs/base.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Union -from pathlib import Path - - -class Stimulus: - def from_path(self, path): - raise NotImplementedError("Choose a concrete Stimulus type to use.") - - @staticmethod - def is_video_path(path: Union[str, Path]) -> bool: - extension = path.split('.')[-1].lower() - return extension in ['mp4', 'avi', 'mov', 'flv', 'wmv', 'webm', 'mkv', 'gif'] - - @staticmethod - def is_image_path(path: Union[str, Path]) -> bool: - extension = path.split('.')[-1].lower() - return extension in ['jpg', 'jpeg', 'png', 'bmp', 'tiff'] diff --git a/brainscore_vision/model_helpers/activations/temporal/inputs/image.py b/brainscore_vision/model_helpers/activations/temporal/inputs/image.py deleted file mode 100644 index 6d7790ac1..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/inputs/image.py +++ /dev/null @@ -1,50 +0,0 @@ -import numpy as np -from PIL import Image as PILImage - -from .base import Stimulus -from brainscore_vision.model_helpers.activations.temporal.utils import batch_2d_resize - - -class Image(Stimulus): - def __init__(self, path: str, size: int): - self._path = path - self._size = size - - def copy(self): - return Image(self._path, self._size) - - @property - def size(self): - return self._size - - def set_size(self, size): - img = self.copy() - img._size = size - return img - - def from_path(path): - return Image(path, get_image_size(path)) - - def to_pil_img(self): - return PILImage.fromarray(self.to_numpy()) - - def get_frame(self): - return np.array(PILImage.open(self._path).convert('RGB')) - - # return (H, W, C[RGB]) - def to_numpy(self): - arr = self.get_frame() - - if arr.shape[:2][::-1] != self._size: - arr = batch_2d_resize(arr[None,:], self._size, "bilinear")[0] - - return arr - - def store_to_path(self, path): - self.to_img().save(path) - return path - -def get_image_size(path): - with PILImage.open(path) as img: - size = img.size - return size \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/inputs/video.py b/brainscore_vision/model_helpers/activations/temporal/inputs/video.py deleted file mode 100644 index d8287a4a7..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/inputs/video.py +++ /dev/null @@ -1,186 +0,0 @@ -import cv2 -from decord import VideoReader -import numpy as np -from PIL import Image as PILImage -from typing import Tuple, Union -from pathlib import Path - -from .base import Stimulus -from .image import Image -from brainscore_vision.model_helpers.activations.temporal.utils import batch_2d_resize - - -EPS = 1e-9 - -def get_video_stats(video_path): - cap = cv2.VideoCapture(video_path) - length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = cap.get(cv2.CAP_PROP_FPS) - size = (width, height) - duration = length / fps * 1000 - cap.release() - return fps, duration, size - - -def get_image_stats(image_path): - with PILImage.open(image_path) as img: - return img.size - return size - - -class Video(Stimulus): - """Video object that represents a video clip.""" - - def __init__( - self, - path: Union[str, Path], - fps: float, - start: float, - end: float, - size: Tuple[int, int] - ): - self._path = path - self._fps = fps - self._size = size - self._start = start - self._end = end - self._original_fps = None - self._original_duration = None - self._original_size = None - - def __getattribute__(self, key): - if key.startswith("_original_"): - if super().__getattribute__(key) is None: - self._original_fps, self._original_duration, self._original_size = get_video_stats(self._path) - return super().__getattribute__(key) - - def copy(self): - # return view - video = self.__class__(self._path, self._fps, self._start, self._end, self._size) - video._original_fps = self._original_fps - video._original_duration = self._original_duration - video._original_size = self._original_size - return video - - @property - def duration(self): - # in ms - return self._end - self._start - - @property - def fps(self): - return self._fps - - @property - def num_frames(self): - return int(self.duration * self.fps/1000 + EPS) - - @property - def original_num_frames(self): - return int(self._original_duration * self._original_fps/1000 + EPS) - - @property - def frame_size(self): - return self._size - - ### Transformations: return copy - - def set_fps(self, fps): - assert 1000/fps <= self.duration, f"fps {fps} is too low for duration {self.duration}ms." - video = self.copy() - video._fps = fps - return video - - def set_size(self, size): - # size: (width, height) - video = self.copy() - video._size = size - return video - - def set_window(self, start, end, padding="repeat"): - # use ms as the time scale - if end < start: - raise ValueError("end time is earlier than start time") - - if padding != "repeat": - raise NotImplementedError() - - video = self.copy() - video._start = self._start + start - video._end = self._start + end - return video - - def get_frames(self, indices): - reader = VideoReader(self._path) - frames = reader.get_batch(indices).asnumpy() - del reader - return frames - - ### I/O - def from_path(path): - fps, end, size = get_video_stats(path) - start = 0 - return Video(path, fps, start, end, size) - - def from_img_path(img_path, duration, fps): - # duration in ms - size = get_image_stats(img_path) - return VideoFromImage(img_path, fps, 0, duration, size) - - def to_numpy(self): - # get the time stamps of frame samples - start_frame = self._start * self._original_fps / 1000 - end_frame = self._end * self._original_fps / 1000 - # avoid taking the last extra frame - samples = np.arange(start_frame, end_frame - EPS, self._original_fps/self.fps) - sample_indices = samples.astype(int) - - # padding: repeat the first/last frame - original_num_frames = int(self._original_duration * self._original_fps/1000 - EPS) # EPS to avoid last frame OOB error - sample_indices = np.clip(sample_indices, 0, original_num_frames-1) - - # actual sampling - frames = self.get_frames(sample_indices) - - # resizing - if self._size != (frames.shape[2], frames.shape[1]): - frames = batch_2d_resize(frames, self._size, "bilinear") - - return frames - - def to_frames(self): - return [f for f in self.to_numpy()] - - def to_pil_imgs(self): - return [PILImage.fromarray(frame) for frame in self.to_numpy()] - - def to_path(self): - # use context manager ? - path = None # make a temporal file - raise NotImplementedError() - return path - - def store_to_path(self, path): - # pick format based on path filename - if path.endswith(".avi"): - fourcc = cv2.VideoWriter_fourcc(*'XVID') - elif path.endswith(".mp4"): - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - else: - raise ValueError("Unsupported video format.") - - out = cv2.VideoWriter(path, fourcc, self._fps, self._size) - for frame in self.to_frames(): - out.write(frame[...,::-1]) # to RGB - out.release() - return path - - -class VideoFromImage(Video): - def get_frames(self, indices): - data = Image.from_path(self._path).to_numpy() - N = len(indices) - ret = np.repeat(data[np.newaxis, ...], N, axis=0) - return ret \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/model/__init__.py b/brainscore_vision/model_helpers/activations/temporal/model/__init__.py index cf53c04df..2fca6d571 100644 --- a/brainscore_vision/model_helpers/activations/temporal/model/__init__.py +++ b/brainscore_vision/model_helpers/activations/temporal/model/__init__.py @@ -1,2 +1,146 @@ -from .base import ActivationWrapper -from .pytorch import PytorchWrapper \ No newline at end of file +from collections import OrderedDict +import logging +import typing +from typing import Any, Callable, Dict, List + +import numpy as np + +from brainscore_vision.model_helpers.utils import fullname + +from ..core import ActivationsExtractor, Inferencer, TemporalInferencer +from ..inputs import Stimulus + + +SUBMODULE_SEPARATOR = '.' + + +def default_process_activation(layer, layer_name, inputs, output): + # (torch.nn.Module, str, torch.Tensor, torch.Tensor) -> torch.Tensor + return output + + +class ActivationWrapper: + def __init__( + self, + identifier: str, + preprocessing: Callable[[List[Stimulus]], Any], + inferencer_cls: Inferencer = TemporalInferencer, + **extractor_kwargs, + ): + self.identifier = identifier + self.preprocessing = preprocessing + self.build_extractor(inferencer_cls, **extractor_kwargs) + + # List[preprocessed_input] -> Dict[layer -> np.array] + def get_activations(self, inputs: List[Stimulus], layers: List[str]) -> Dict[str, np.array]: + raise NotImplementedError() + + def __call__(self, *args, **kwargs): + return self._extractor(*args, **kwargs) + + def build_extractor(self, inferencer_cls, *args, **kwargs): + extractor = ActivationsExtractor( + identifier=self.identifier, + inferencer=inferencer_cls(get_activations=self.get_activations, preprocessing=self.preprocessing, *args, **kwargs), + ) + extractor.insert_attrs(self) + self._extractor = extractor + + +class PytorchWrapper(ActivationWrapper): + def __init__( + self, + identifier: str, + model, + preprocessing: Callable[[List[Stimulus]], Any], + process_output=None, + *args, + **kwargs, + ): + import torch + + logger = logging.getLogger(fullname(self)) + self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + logger.debug(f"Using device {self._device}") + self._model = model.to(self._device) + # preprocessing: input.Stimulus -> actual model inputs + self._preprocess = preprocessing + self._process_activation = default_process_activation if process_output is None else process_output + super().__init__(identifier, preprocessing, *args, **kwargs) + + def forward(self, inputs: List[Any]) -> Any: + # this function gets a list of preprocessed inputs and does the forward pass + import torch + + tensor = torch.stack(inputs) + tensor = tensor.to(self._device) + return self._model(tensor) + + def get_activations(self, inputs: List[Any], layer_names: List[str]) -> typing.OrderedDict[str, Any]: + import torch + + self._model.eval() + layer_results = OrderedDict() + hooks = [] + + for layer_name in layer_names: + layer = self.get_layer(layer_name) + hook = self._register_hook(layer, layer_name, target_dict=layer_results) + hooks.append(hook) + + with torch.no_grad(): + self.forward(inputs) + + for hook in hooks: + hook.remove() + return layer_results + + def get_layer(self, layer_name: str): + # the layer_name is a string that represents the module hierarchy up to the target layer, + # seperated by ".", e.g., "module1.submodule2.relu". + if layer_name == 'logits': + return self._output_layer() + module = self._model + for part in layer_name.split(SUBMODULE_SEPARATOR): + module = module._modules.get(part) + assert module is not None, f"No submodule found for layer {layer_name}, at part {part}" + return module + + def _output_layer(self): + module = self._model + while module._modules: + module = module._modules[next(reversed(module._modules))] + return module + + def _register_hook(self, layer, layer_name, target_dict): + def hook_function(_layer, _input, output, name=layer_name, target_dict=target_dict): + output = self._process_activation(_layer, name, _input, output) + target_dict[name] = output + + hook = layer.register_forward_hook(hook_function) + return hook + + def __repr__(self): + return repr(self._model) + + def layers(self): + for name, module in self._model.named_modules(): + if len(list(module.children())) > 0: # this module only holds other modules + continue + yield name, module + + def graph(self): + import networkx as nx + + g = nx.DiGraph() + for layer_name, layer in self.layers(): + g.add_node(layer_name, object=layer, type=type(layer)) + return g + + +__all__ = [ + "ActivationWrapper", + "PytorchWrapper", + "default_process_activation", + "SUBMODULE_SEPARATOR", +] diff --git a/brainscore_vision/model_helpers/activations/temporal/model/base.py b/brainscore_vision/model_helpers/activations/temporal/model/base.py deleted file mode 100644 index 3b0b55bd6..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/model/base.py +++ /dev/null @@ -1,33 +0,0 @@ -import numpy as np -from ..inputs.base import Stimulus -from ..core import ActivationsExtractor -from ..core import TemporalInferencer, Inferencer - -from typing import List, Callable, Any, Dict - - -class ActivationWrapper: - def __init__( - self, - identifier : str, - preprocessing : Callable[[List[Stimulus]], Any], - inferencer_cls : Inferencer = TemporalInferencer, - **extractor_kwargs - ): - self.identifier = identifier - self.preprocessing = preprocessing - self.build_extractor(inferencer_cls, **extractor_kwargs) - - # List[preprocessed_input] -> Dict[layer -> np.array] - def get_activations(self, inputs : List[Stimulus], layers : List[str]) -> Dict[str, np.array]: - raise NotImplementedError() - - def __call__(self, *args, **kwargs): - return self._extractor(*args, **kwargs) - - def build_extractor(self, inferencer_cls, *args, **kwargs): - extractor = ActivationsExtractor(identifier=self.identifier, - inferencer=inferencer_cls(get_activations=self.get_activations, - preprocessing=self.preprocessing, *args, **kwargs)) - extractor.insert_attrs(self) - self._extractor = extractor \ No newline at end of file diff --git a/brainscore_vision/model_helpers/activations/temporal/model/pytorch.py b/brainscore_vision/model_helpers/activations/temporal/model/pytorch.py deleted file mode 100644 index 00197874e..000000000 --- a/brainscore_vision/model_helpers/activations/temporal/model/pytorch.py +++ /dev/null @@ -1,107 +0,0 @@ -from collections import OrderedDict -import typing -from typing import Callable, List, Any, Dict - -import logging - -from brainscore_vision.model_helpers.utils import fullname -from .base import ActivationWrapper -from ..inputs import Stimulus - - -SUBMODULE_SEPARATOR = '.' - - -def default_process_activation(layer, layer_name, inputs, output): - # (torch.nn.Module, str, torch.Tensor, torch.Tensor) -> torch.Tensor - return output - -class PytorchWrapper(ActivationWrapper): - def __init__( - self, - identifier : str, - model, - preprocessing : Callable[[List[Stimulus]], Any], - process_output = None, - *args, - **kwargs - ): - import torch - logger = logging.getLogger(fullname(self)) - self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - logger.debug(f"Using device {self._device}") - self._model = model.to(self._device) - # preprocessing: input.Stimulus -> actual model inputs - self._preprocess = preprocessing - self._process_activation = default_process_activation if process_output is None else process_output - super().__init__(identifier, preprocessing, *args, **kwargs) - - def forward(self, inputs : List[Any]) -> Any: - # this function gets a list of preprocessed inputs and does the forward pass - import torch - tensor = torch.stack(inputs) - tensor = tensor.to(self._device) - return self._model(tensor) - - def get_activations(self, inputs : List[Any], layer_names : List[str]) -> typing.OrderedDict[str, Any]: - import torch - self._model.eval() - layer_results = OrderedDict() - hooks = [] - - for layer_name in layer_names: - layer = self.get_layer(layer_name) - hook = self._register_hook(layer, layer_name, target_dict=layer_results) - hooks.append(hook) - - with torch.no_grad(): - self.forward(inputs) - - for hook in hooks: - hook.remove() - return layer_results - - def get_layer(self, layer_name : str): - # the layer_name is a string that represents the module hierarchy up to the target layer, - # seperated by ".", e.g., "module1.submodule2.relu". - if layer_name == 'logits': - return self._output_layer() - module = self._model - for part in layer_name.split(SUBMODULE_SEPARATOR): - module = module._modules.get(part) - assert module is not None, f"No submodule found for layer {layer_name}, at part {part}" - return module - - def _output_layer(self): - module = self._model - while module._modules: - module = module._modules[next(reversed(module._modules))] - return module - - @classmethod - def _tensor_to_numpy(cls, output): - return output.cpu().data.numpy() - - def _register_hook(self, layer, layer_name, target_dict): - def hook_function(_layer, _input, output, name=layer_name, target_dict=target_dict): - output = self._process_activation(_layer, name, _input, output) - target_dict[name] = PytorchWrapper._tensor_to_numpy(output) - - hook = layer.register_forward_hook(hook_function) - return hook - - def __repr__(self): - return repr(self._model) - - def layers(self): - for name, module in self._model.named_modules(): - if len(list(module.children())) > 0: # this module only holds other modules - continue - yield name, module - - def graph(self): - import networkx as nx - g = nx.DiGraph() - for layer_name, layer in self.layers(): - g.add_node(layer_name, object=layer, type=type(layer)) - return g diff --git a/brainscore_vision/model_helpers/activations/temporal/utils.py b/brainscore_vision/model_helpers/activations/temporal/utils.py index d1f7a264c..188368e2a 100644 --- a/brainscore_vision/model_helpers/activations/temporal/utils.py +++ b/brainscore_vision/model_helpers/activations/temporal/utils.py @@ -1,8 +1,210 @@ import os import numpy as np +import pickle from brainscore_vision.model_helpers.brain_transformation.temporal import assembly_time_align - +from brainio.assemblies import DataAssembly + + +# allow efficient fill_value for memmap +class custom_memmap(np.memmap): + def __new__(subtype, filename, dtype=np.uint8, mode='r+', offset=0, + shape=None, order='C', fill_value=None): + # Import here to minimize 'import numpy' overhead + import mmap + import os.path + import struct + from numpy import ndarray + + mode_equivalents = { + "readonly":"r", + "copyonwrite":"c", + "readwrite":"r+", + "write":"w+" + } + + dtypedescr = np.dtype + valid_filemodes = ["r", "c", "r+", "w+"] + writeable_filemodes = ["r+", "w+"] + + try: + mode = mode_equivalents[mode] + except KeyError as e: + if mode not in valid_filemodes: + raise ValueError( + "mode must be one of {!r} (got {!r})" + .format(valid_filemodes + list(mode_equivalents.keys()), mode) + ) from None + + if mode == 'w+' and shape is None: + raise ValueError("shape must be given if mode == 'w+'") + + def get_ctx(mode): + if hasattr(filename, 'read'): + f_ctx = nullcontext(filename) + else: + f_ctx = open( + os.fspath(filename), + ('r' if mode == 'c' else mode)+'b' + ) + return f_ctx + + with get_ctx(mode) as fid: + fid.seek(0, 2) + flen = fid.tell() + descr = dtypedescr(dtype) + _dbytes = descr.itemsize + + if shape is None: + bytes = flen - offset + if bytes % _dbytes: + raise ValueError("Size of available data is not a " + "multiple of the data-type size.") + size = bytes // _dbytes + shape = (size,) + else: + if type(shape) not in (tuple, list): + try: + shape = [operator.index(shape)] + except TypeError: + pass + shape = tuple(shape) + size = np.intp(1) # avoid default choice of np.int_, which might overflow + for k in shape: + size *= k + + bytes = int(offset + size*_dbytes) + + if mode in ('w+', 'r+') and flen < bytes: + fid.seek(bytes - 1, 0) + fid.write(b'\0') + fid.flush() + + if mode == 'w+' and fill_value is not None: + val = np.array(fill_value).astype(dtype).tobytes() + TARGET_BLOCK_SIZE = 1024 * 1024 * 10 + # chunk writing + for i in range(0, bytes, TARGET_BLOCK_SIZE): + with get_ctx(mode='r+') as _fid: + _fid.seek(i) + _fid.write(val * min(TARGET_BLOCK_SIZE, bytes - i)) + _fid.flush() + + if mode == 'c': + acc = mmap.ACCESS_COPY + elif mode == 'r': + acc = mmap.ACCESS_READ + else: + acc = mmap.ACCESS_WRITE + + start = offset - offset % mmap.ALLOCATIONGRANULARITY + bytes -= start + array_offset = offset - start + mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start) + + self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm, + offset=array_offset, order=order) + self._mmap = mm + self.offset = offset + self.mode = mode + + if isinstance(filename, os.PathLike): + # special case - if we were constructed with a pathlib.path, + # then filename is a path object, not a string + self.filename = filename.resolve() + elif hasattr(fid, "name") and isinstance(fid.name, str): + # py3 returns int for TemporaryFile().name + self.filename = os.path.abspath(fid.name) + # same as memmap copies (e.g. memmap + 1) + else: + self.filename = None + + return self + + +# a map that write directly to the disk without loading into memory +class data_assembly_mmap: + def __init__(self, filepath=None, **kwargs): + self.filepath = filepath + self.kwargs = kwargs + self._in_memory = filepath is None + + if self._in_memory: + self._data = np.full(**kwargs) + self._created = True + else: + self._data = None + self._created = False + self.data_file = os.path.join(self.filepath, "data.npy") + self.meta_file = os.path.join(self.filepath, "meta.pkl") + + def _open(self): + if self._in_memory: + return + + if self._data is None: + kwargs = self.kwargs.copy() + fill_value = self.kwargs.get("fill_value", None) + if not self._created: + self._data = custom_memmap(self.data_file, mode='w+', **kwargs) + self._created = True + else: + self._data = custom_memmap(self.data_file, mode='r+', **kwargs) + + def _close(self): + if self._data is not None: + self._data.flush() + del self._data + self._data = None + + def __setitem__(self, key, value): + self._open() + self._data[key] = value + if not self._in_memory: + self._close() + + def __getitem__(self, key): + self._open() + return self._data[key] + + def register_meta(self, dims, coords): + self.coords = coords + self.dims = dims + + if not self._in_memory: + with open(self.meta_file, 'wb') as f: + pickle.dump((dims, coords, self.kwargs), f) + + def to_assembly(self): + self._open() + return DataAssembly(self._data, coords=self.coords, dims=self.dims) + + @staticmethod + def is_saved(filepath): + data_file = os.path.join(filepath, "data.npy") + meta_file = os.path.join(filepath, "meta.pkl") + return os.path.exists(data_file) and os.path.exists(meta_file) + + @staticmethod + def load(filepath): + if filepath is None: + return None + + if not data_assembly_mmap.is_saved(filepath): + return None + + meta_file = os.path.join(filepath, "meta.pkl") + with open(meta_file, 'rb') as f: + dims, coords, kwargs = pickle.load(f) + + data = data_assembly_mmap(filepath, **kwargs) + data._created = True + data._open() + data.dims = dims + data.coords = coords + + return data + def concat_with_nan_padding(arr_list, axis=0, dtype=np.float16): # Get shapes of all arrays @@ -161,7 +363,7 @@ def assembly_align_to_fps(output_assembly, fps, mode="portion"): # get the inferencer from any model def get_inferencer(any_model): from brainscore_vision.model_helpers.activations.temporal.core import Inferencer, ActivationsExtractor - from brainscore_vision.model_helpers.activations.temporal.model.base import ActivationWrapper + from brainscore_vision.model_helpers.activations.temporal.model import ActivationWrapper from brainscore_vision.model_helpers.brain_transformation import ModelCommitment if isinstance(any_model, Inferencer): return any_model @@ -171,7 +373,7 @@ def get_inferencer(any_model): raise ValueError(f"Cannot find inferencer from the model {any_model}") def get_base_model(any_model): - from brainscore_vision.model_helpers.activations.temporal.model.base import ActivationWrapper + from brainscore_vision.model_helpers.activations.temporal.model import ActivationWrapper from brainscore_vision.model_helpers.brain_transformation import ModelCommitment if isinstance(any_model, ActivationWrapper): return any_model @@ -225,4 +427,4 @@ def download_weight_file(url, folder=None): if total_size != 0 and progress_bar.n != total_size: raise RuntimeError("Could not download file") - return weight_path \ No newline at end of file + return weight_path diff --git a/brainscore_vision/models/temporal_model_AVID_CMA/model.py b/brainscore_vision/models/temporal_model_AVID_CMA/model.py index 0a9374c92..41aa147aa 100644 --- a/brainscore_vision/models/temporal_model_AVID_CMA/model.py +++ b/brainscore_vision/models/temporal_model_AVID_CMA/model.py @@ -21,7 +21,7 @@ def get_model(identifier): weight_path = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_AVID-CMA/AVID-CMA_Kinetics_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar", - version_id="yx9Pbq3SuNOOd4sX7csTolaHD1iTCx8y", + version_id="ZR41eKh6GYyJ61Kgwf1V4LShzse7A0yC", sha1="6efe4464ca654a56affff766acf24e89e6f3ffbf" ) @@ -29,8 +29,8 @@ def get_model(identifier): cfg_path = os.path.join(HOME, "configs/main/avid-cma/audioset/InstX-N1024-PosW-N64-Top32.yaml") weight_path = load_weight_file( bucket="brainscore-vision", - relative_path="temporal_model_AVID_CMA/AVID-CMA_Audioset_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar", - version_id="jSaZgbUohM0ZeoEUUKZiLBo6iz_v8VvQ", + relative_path="temporal_model_AVID-CMA/AVID-CMA_Audioset_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar", + version_id="8r37ZPc0oD3N0ff4R8Y_S_eV1DNDFs8d", sha1="9db5eba9aab6bdbb74025be57ab532df808fe3f6" ) @@ -38,8 +38,8 @@ def get_model(identifier): cfg_path = os.path.join(HOME, "configs/main/avid/kinetics/Cross-N1024.yaml") weight_path = load_weight_file( bucket="brainscore-vision", - relative_path="temporal_model_AVID_CMA/AVID_Kinetics_Cross-N1024_checkpoint.pth.tar", - version_id="XyKt0UOUFsuuyrl6ZREivK8FadRPx34u", + relative_path="temporal_model_AVID-CMA/AVID_Kinetics_Cross-N1024_checkpoint.pth.tar", + version_id="OjapKttPf.6dvZLO8L89GZddpcizDu_J", sha1="d3a04f856d29421ba8de37808593a3fad4d4794f" ) @@ -47,8 +47,8 @@ def get_model(identifier): cfg_path = os.path.join(HOME, "configs/main/avid/audioset/Cross-N1024.yaml") weight_path = load_weight_file( bucket="brainscore-vision", - relative_path="temporal_model_AVID_CMA/AVID_Audioset_Cross-N1024_checkpoint.pth.tar", - version_id="0Sxuhn8LsYXQC4FnPfJ7rw7uU6kDlKgc", + relative_path="temporal_model_AVID-CMA/AVID_Audioset_Cross-N1024_checkpoint.pth.tar", + version_id="VoWZzJllTVFu8oVk7ocf_K7pk11Gbh19", sha1="b48d8428a1a2526ccca070f810333df18bfce5fd" ) diff --git a/brainscore_vision/models/temporal_model_AVID_CMA/requirements.txt b/brainscore_vision/models/temporal_model_AVID_CMA/requirements.txt index 47cc15207..d613d4ae4 100644 --- a/brainscore_vision/models/temporal_model_AVID_CMA/requirements.txt +++ b/brainscore_vision/models/temporal_model_AVID_CMA/requirements.txt @@ -1,3 +1,4 @@ avid_cma @ git+https://github.com/YingtianDt/AVID-CMA.git torch -torchvision \ No newline at end of file +torchvision +pyav \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_GDT/model.py b/brainscore_vision/models/temporal_model_GDT/model.py index 1afd9712d..7b90e8d43 100644 --- a/brainscore_vision/models/temporal_model_GDT/model.py +++ b/brainscore_vision/models/temporal_model_GDT/model.py @@ -22,21 +22,21 @@ def get_model(identifier): pth = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_GDT/gdt_K400.pth", - version_id="JpU_tnCzrbTejn6sOrQMk8eRsJ97yFgt", + version_id="xdpDRV0gnRhULZGL7lwjGxGKqp2hnP1_", sha1="7f12c60670346b1aab15194eb44c341906e1bca6" ) elif dataset == "IG65M": pth = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_GDT/gdt_IG65M.pth", - version_id="R.NoD6VAbFbJdf8tg5jnXIWB3hQ8GlSD", + version_id="S0_ZVFA2K96ZoVLx26edAxILPm7S.Gf6", sha1="3dcee3af61691e1e7e47e4b115be6808f4ea8172" ) elif dataset == "HowTo100M": pth = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_GDT/gdt_HT100M.pth", - version_id="BVRl9t_134PoKZCn9W54cyfkImCW2ioq", + version_id=".fli2qSf6pWqbLyY413ZID578sJrC.L.", sha1="a9a979c82e83b955794814923af736eb34e6f080" ) else: diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/model.py b/brainscore_vision/models/temporal_model_S3D_text_video/model.py index bfc11882c..99d8db3a5 100644 --- a/brainscore_vision/models/temporal_model_S3D_text_video/model.py +++ b/brainscore_vision/models/temporal_model_S3D_text_video/model.py @@ -3,7 +3,7 @@ from torchvision import transforms from s3dg_howto100m import S3D -from brainscore_vision.model_helpers.activations.temporal.model.pytorch import PytorchWrapper +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper from brainscore_core.supported_data_standards.brainio.s3 import load_weight_file @@ -41,14 +41,14 @@ def get_model(identifier="s3d-HowTo100M"): model_pth = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_S3D_text_video/s3d_howto100m.pth", - version_id="hRp6I8bpwreIMUVL0H.zCdK0hqRggL7n", + version_id="3k_iwjSqYUwMRdri9IH2JZn5yMGCnayc", sha1="31e99d2a1cd48f2259ca75e719ac82c8b751ea75" ) dict_pth = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_S3D_text_video/s3d_dict.npy", - version_id="4NxVLe8DSL6Uue0F7e2rz8HZuOk.tkBI", + version_id="FWHuvz5CDjNJyG1IuzEqkHU9zzmPv62p", sha1="d368ff7d397ec8240f1f963b5efe8ff245bac35f" ) @@ -62,4 +62,4 @@ def get_model(identifier="s3d-HowTo100M"): process_output=process_output, **inferencer_kwargs) - return wrapper \ No newline at end of file + return wrapper diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt b/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt index 73f27f3b6..19f34cf56 100644 --- a/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt +++ b/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt @@ -1 +1,2 @@ +torchvision S3D_HowTo100M @ git+https://github.com/YingtianDt/S3D_HowTo100M \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_VideoMAE/model.py b/brainscore_vision/models/temporal_model_VideoMAE/model.py index 0dc135329..7e313e10d 100644 --- a/brainscore_vision/models/temporal_model_VideoMAE/model.py +++ b/brainscore_vision/models/temporal_model_VideoMAE/model.py @@ -39,7 +39,7 @@ def get_model(identifier, num_frames=16): pth = load_weight_file( bucket="brainscore-vision", relative_path='temporal_model_VideoMAE/vit_b_k400_pt_1200e.pth', - version_id="Oi3VboRZujNyZAcwf5q7XZ2M8q1cPO6o", + version_id="YkdCqz8ywKAqAPMLv6iE9SfihuqExfcJ", sha1="8faf42df13f619a8970d653695e98f0643760b54" ) num_blocks = 12 @@ -48,7 +48,7 @@ def get_model(identifier, num_frames=16): pth = load_weight_file( bucket="brainscore-vision", relative_path='temporal_model_VideoMAE/vit_l_k400_pt_1200e.pth', - version_id="MiPfczDaVponDGuUBrEPqmT.BiVvh_I1", + version_id="v8VsUKnAJQ23dyLiYOSYyjSrHI_zB60o", sha1="7ff6acbba221f85d7148223ec932ad7c57f2f40c" ) num_blocks = 24 diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/model.py b/brainscore_vision/models/temporal_model_VideoMAEv2/model.py index 6b9b8ab5a..b4b7568e4 100644 --- a/brainscore_vision/models/temporal_model_VideoMAEv2/model.py +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/model.py @@ -53,7 +53,7 @@ def get_model(identifier): pth = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_VideoMAEv2/vit_g_hybrid_pt_1200e.pth", - version_id="TxtkfbeMV105dzpzTwi0Kn5glnvQvIrq", + version_id="0SPWwIlsWFhY98bKgo7ZRRNTXkcT.dBO", sha1="9048f2bc0b0c7ba4d0e5228f3a7c0bef4dbaca69" ) num_blocks = 40 @@ -63,7 +63,7 @@ def get_model(identifier): pth = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_VideoMAEv2/vit_b_hybrid_pt_800e.pth", - version_id="rRjpYq21dAQ5KaCLbEHK.YaLZ_fbMPKw", + version_id="VLgKVXOX2XYJQvaIOqAAVDvd5xvjx3.d", sha1="1e3602691964b1eb6f7c33529119243a5b235635" ) num_blocks = 12 diff --git a/brainscore_vision/models/temporal_model_mae_st/model.py b/brainscore_vision/models/temporal_model_mae_st/model.py index 9f5cf9299..6e818cea7 100644 --- a/brainscore_vision/models/temporal_model_mae_st/model.py +++ b/brainscore_vision/models/temporal_model_mae_st/model.py @@ -53,7 +53,7 @@ def get_model(identifier): load_path = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_mae_st/mae_pretrain_vit_large_k400.pth", - version_id="cPcP4AzpG95CimQ5Pn.CHKnGUJlLXM3m", + version_id="bHQ1XRM.a4NfgbP53TR5AsgvUsN40R_j", sha1="c7fb91864a4ddf8b99309440121a3abe66b846bb" ) @@ -64,7 +64,7 @@ def get_model(identifier): load_path = load_weight_file( bucket="brainscore-vision", relative_path="temporal_model_mae_st/mae_pretrain_vit_huge_k400.pth", - version_id="IYKa8QiocgBzo3EhsBouS62HboK6iqYT", + version_id="ARS9kz2wQQ29iRsssyc7u_kfA4wLDAji", sha1="177e48577142ca01949c08254834ffa1198b9eb4" ) diff --git a/brainscore_vision/models/temporal_model_mae_st/requirements.txt b/brainscore_vision/models/temporal_model_mae_st/requirements.txt index 0d1858c8e..85be22792 100644 --- a/brainscore_vision/models/temporal_model_mae_st/requirements.txt +++ b/brainscore_vision/models/temporal_model_mae_st/requirements.txt @@ -1,3 +1,6 @@ mae_st @ git+https://github.com/YingtianDt/mae_st.git torch -torchvision \ No newline at end of file +torchvision +iopath +simplejson +timm \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/requirements.txt b/brainscore_vision/models/temporal_model_mmaction2/requirements.txt index 1caa728dd..4428fdb7b 100644 --- a/brainscore_vision/models/temporal_model_mmaction2/requirements.txt +++ b/brainscore_vision/models/temporal_model_mmaction2/requirements.txt @@ -1,5 +1,6 @@ importlib-metadata<5 -mmaction2 @ git+https://github.com/YingtianDt/mmaction2.git@533edc3 -mmengine +mmaction2 @ git+https://github.com/YingtianDt/mmaction2.git +mmengine==0.10.4 +mmcv==2.1.0 torch -torchvision \ No newline at end of file +torchvision diff --git a/brainscore_vision/models/temporal_model_torchvision/model.py b/brainscore_vision/models/temporal_model_torchvision/model.py index 30d96aba8..6c4e1150c 100644 --- a/brainscore_vision/models/temporal_model_torchvision/model.py +++ b/brainscore_vision/models/temporal_model_torchvision/model.py @@ -3,7 +3,7 @@ from torchvision import transforms from torchvision.models import video as vid -from brainscore_vision.model_helpers.activations.temporal.model.pytorch import PytorchWrapper +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper LARGE_MODEL_LAYER_STEP = 2 @@ -89,4 +89,4 @@ def process_output(layer, layer_name, input, output): process_output=process_output, **inferencer_kwargs) - return wrapper \ No newline at end of file + return wrapper diff --git a/brainscore_vision/models/temporal_model_vjepa/__init__.py b/brainscore_vision/models/temporal_model_vjepa/__init__.py new file mode 100644 index 000000000..6e2a2ff30 --- /dev/null +++ b/brainscore_vision/models/temporal_model_vjepa/__init__.py @@ -0,0 +1,16 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["VJEPA-L16"] = lambda: commit_model("VJEPA-L16") +model_registry["VJEPA-H16"] = lambda: commit_model("VJEPA-H16") +model_registry["VJEPA-H16-384"] = lambda: commit_model("VJEPA-H16-384") diff --git a/brainscore_vision/models/temporal_model_vjepa/model.py b/brainscore_vision/models/temporal_model_vjepa/model.py new file mode 100644 index 000000000..e05fb2fcd --- /dev/null +++ b/brainscore_vision/models/temporal_model_vjepa/model.py @@ -0,0 +1,86 @@ +from .vjepa_model import VJEPA +import torch as th + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.activations.temporal.utils import download_weight_file + +from torchvision import transforms + +class VJEPAWrapper(PytorchWrapper): + def forward(self, inputs): + tensor = th.stack(inputs) + tensor = tensor.to(self._device) + return self._model(tensor) # encoder only + +# Define the ImageNet mean and std +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) + +transform_img = transforms.Compose([ + transforms.Resize((224, 224)), # Example size for ViT + transforms.ToTensor(), + transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), +]) + +def transform_video(video): + frames = [] + for img in video.to_pil_imgs(): + frames += [transform_img(img)] + frames = th.stack(frames) + return frames + +def get_model(identifier, num_frames=16): + assert identifier.startswith("VJEPA") + + if identifier == "VJEPA-L16": + url = "https://dl.fbaipublicfiles.com/jepa/vitl16/vitl16.pth.tar" + H = W = 14 + vit_type = "vit_large" + num_blocks = 24 + elif identifier == "VJEPA-H16": + url = "https://dl.fbaipublicfiles.com/jepa/vith16/vith16.pth.tar" + H = W = 14 + vit_type = "vit_huge" + num_blocks = 32 + elif identifier == "VJEPA-H16-384": + url = "https://dl.fbaipublicfiles.com/jepa/vith16-384/vith16-384.pth.tar" + H = W = 24 + vit_type = "vit_huge" + num_blocks = 32 + else: + raise ValueError(f"Unknown VJEPA identifier: {identifier}") + + T = None + weight_path = download_weight_file(url, folder="temporal_model_vjepa") + + # Instantiate the model + net = VJEPA(weight_path, vit_type) + + def process_output(layer, layer_name, inputs, output): + if layer_name == "encoder.encoder.patch_embed": + global T + T = inputs[0].shape[2] + + B, L, C = output.shape + assert L == T//2 * H * W + output = output.view(B, T//2, H, W, C) + + return output + + inferencer_kwargs = { + "fps": 10, + "layer_activation_format": { + "encoder.encoder.patch_embed": "THWC", + **{f"encoder.encoder.blocks.{i}": "THWC" for i in range(0, num_blocks, 2)}, + }, + "duration": None, + "time_alignment": "evenly_spaced", + "process_output": process_output, + } + + for layer in inferencer_kwargs["layer_activation_format"].keys(): + assert "decoder" not in layer, "Decoder layers are not supported." + + wrapper = VJEPAWrapper(identifier, net, transform_video, + **inferencer_kwargs) + return wrapper diff --git a/brainscore_vision/models/temporal_model_vjepa/requirements.txt b/brainscore_vision/models/temporal_model_vjepa/requirements.txt new file mode 100644 index 000000000..abe4858a6 --- /dev/null +++ b/brainscore_vision/models/temporal_model_vjepa/requirements.txt @@ -0,0 +1,14 @@ +torch>=2 +torchvision +pyyaml +numpy +opencv-python +submitit +braceexpand +webdataset +timm +decord +pandas +einops +beartype +phys_readouts@ git+https://github.com/thekej/phys_readouts.git#egg=phys_readouts \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_vjepa/test.py b/brainscore_vision/models/temporal_model_vjepa/test.py new file mode 100644 index 000000000..6a20ca8dc --- /dev/null +++ b/brainscore_vision/models/temporal_model_vjepa/test.py @@ -0,0 +1,17 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "VJEPA-Temporal", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None + + \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_vjepa/vjepa_model.py b/brainscore_vision/models/temporal_model_vjepa/vjepa_model.py new file mode 100644 index 000000000..1fc333012 --- /dev/null +++ b/brainscore_vision/models/temporal_model_vjepa/vjepa_model.py @@ -0,0 +1,87 @@ +from torch import nn + +import torch + +import phys_extractors.models.jepa_physics.jepa.src.models.vision_transformer as vit + + +class VJEPA_encoder(nn.Module): + def __init__( + self, + weights_path, + vit_type="vit_large", + embed_dim=256, + crop_size=224, + patch_size=16, + num_frames=16, + tubelet_size=2, + ): + + super().__init__() + + self.embed_dim = embed_dim + + # download the model and put it in the folder. + state_dict = torch.load(weights_path, map_location='cpu') + + # following the config for the model + + uniform_power = True + use_sdpa = True + use_SiLU = False + tight_SiLU = False + + self.encoder = vit.__dict__[vit_type]( + img_size=crop_size, + patch_size=patch_size, + num_frames=num_frames, + tubelet_size=tubelet_size, + uniform_power=uniform_power, + use_sdpa=use_sdpa, + use_SiLU=use_SiLU, + tight_SiLU=tight_SiLU, + ) + + self.encoder.load_state_dict({k.replace('module.backbone.', ''): v for k, v in state_dict['encoder'].items()}) + self.encoder.eval() + + # set requires_grad false + for param in self.encoder.parameters(): + param.requires_grad = False + + def forward(self, videos): + B, T, C, H, W = videos.shape + embeddings = self.encoder(videos.transpose(1,2)) + embeddings = self.patchify(embeddings) + return embeddings + + def patchify(self, input_array, patch_height=32, patch_width=32): + # Step 2: Pad the matrix to size 392x1036 + padding_size = 12 # Number of columns to add + padded_matrix = torch.nn.functional.pad(input_array, (0, padding_size)) + # Step 3: Define patch size + N, _, _ = padded_matrix.shape + H_prime = 56 + W_prime = 148 + + # Step 4: Patchify the padded matrix into 49 contiguous patches of size 56x148 + # Reshape the matrix into shape (7, 56, 7, 148) and then swap axes to obtain patches + patches = padded_matrix.unfold(1, H_prime, H_prime) + patches = patches.unfold(2, W_prime, W_prime) + patches = patches.contiguous().view(N, -1, H_prime, W_prime) # 49 patches of size 56x148 + + return patches + +# Given sequence of images, predicts next latent +class VJEPA(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + + self.encoder = VJEPA_encoder(*args, **kwargs) + + def forward(self, x): + # set frozen pretrained encoder to eval mode + self.encoder.eval() + # x is (Bs, T, 3, H, W) + assert len(x.shape) == 5 + return self.encoder(x) diff --git a/tests/test_model_helpers/temporal/activations/test_extractor.py b/tests/test_model_helpers/temporal/activations/test_extractor.py index 9f809bf5c..a30353b4a 100644 --- a/tests/test_model_helpers/temporal/activations/test_extractor.py +++ b/tests/test_model_helpers/temporal/activations/test_extractor.py @@ -3,7 +3,7 @@ import pytest from brainscore_core.supported_data_standards.brainio.stimuli import StimulusSet -from brainscore_vision.model_helpers.activations.temporal.inputs.base import Stimulus +from brainscore_vision.model_helpers.activations.temporal.inputs import Stimulus from brainscore_vision.model_helpers.activations.temporal.model import ActivationWrapper from brainscore_vision.model_helpers.activations.temporal.core import TemporalInferencer, CausalInferencer from collections import OrderedDict @@ -93,4 +93,4 @@ def test_from_stimulus_set(causal, padding): assert len(np.unique(activations['layer'])) == len(layers) import gc - gc.collect() # free some memory, we're piling up a lot of activations at this point \ No newline at end of file + gc.collect() # free some memory, we're piling up a lot of activations at this point diff --git a/tests/test_model_helpers/temporal/activations/test_inferencer.py b/tests/test_model_helpers/temporal/activations/test_inferencer.py index d9df0b5d4..db6249760 100644 --- a/tests/test_model_helpers/temporal/activations/test_inferencer.py +++ b/tests/test_model_helpers/temporal/activations/test_inferencer.py @@ -57,7 +57,7 @@ def time_down_sample_preprocess(video): def test_inferencer(max_spatial_size): inferencer = Inferencer(dummy_get_features, dummy_preprocess, dummy_layer_activation_format, Video, max_workers=1, max_spatial_size=max_spatial_size, batch_grouper=lambda s: s.duration) - model_assembly = inferencer(video_paths, layers=dummy_layers) + model_assembly = inferencer(video_paths[1:], layers=dummy_layers) if max_spatial_size is None: # 6 second video with fps 60 has 360 frames # the model simply return the same number of frames as the temporal size of activations @@ -65,27 +65,22 @@ def test_inferencer(max_spatial_size): assert model_assembly.sizes["neuroid"] == 360*6*3*2 + 2 else: assert model_assembly.sizes["neuroid"] == 360*max_spatial_size*(max_spatial_size//2) * 2 + 2 - assert model_assembly.sizes["stimulus_path"] == 2 + assert model_assembly.sizes["stimulus_path"] == 1 -@pytest.mark.parametrize("time_alignment", ["evenly_spaced", "ignore_time"]) @pytest.mark.parametrize("fps", [10, 30, 45]) -def test_temporal_inferencer(time_alignment, fps): +def test_temporal_inferencer(fps): inferencer = TemporalInferencer(dummy_get_features, dummy_preprocess, dummy_layer_activation_format, max_workers=1, - fps=fps, time_alignment=time_alignment) + fps=fps) model_assembly = inferencer(video_paths, layers=dummy_layers) assert model_assembly['time_bin_start'].values[0] == 0 assert model_assembly['time_bin_end'].values[-1] == max(video_durations) - if time_alignment != "ignore_time": - # since the longer video lasts for 6 seconds, and the temporal inferencer align all output assembly to have fps - # specified when constructing the inferencer, the number of time bins should be 6*fps - assert model_assembly.sizes["time_bin"] == 6 * fps - assert np.isclose(model_assembly['time_bin_end'].values[0] - model_assembly['time_bin_start'].values[0], 1000/fps) - else: - assert model_assembly.sizes["time_bin"] == 1 - assert model_assembly['time_bin_end'].values[0] - model_assembly['time_bin_start'].values[0] == max(video_durations) + # since the longer video lasts for 6 seconds, and the temporal inferencer align all output assembly to have fps + # specified when constructing the inferencer, the number of time bins should be 6*fps + assert model_assembly.sizes["time_bin"] == 6 * fps + assert np.isclose(model_assembly['time_bin_end'].values[0] - model_assembly['time_bin_start'].values[0], 1000/fps) # manual computation check output_values = model_assembly.sel(stimulus_path=video_paths[1])\