From 144d054f1346807b72f41e26aec944360b18f08f Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Wed, 13 May 2026 20:14:42 -0400 Subject: [PATCH 1/2] feat: add numberOfSessions to AssetsSummary Aggregator counts unique (subject, session) pairs parsed from BIDS-style sub-* and ses-* tokens in asset paths (filename first, then directory components when the session token is omitted from the filename). Co-Authored-By: Claude Opus 4.7 (1M context) --- dandischema/metadata.py | 24 +++++++++++++++- dandischema/models.py | 3 ++ dandischema/tests/test_metadata.py | 46 ++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/dandischema/metadata.py b/dandischema/metadata.py index 19064262..dbd5aefe 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -527,15 +527,36 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None: stats = _get_samples(value, stats, hierarchy) break - for part in Path(assetmeta["path"]).name.split(".")[0].split("_"): + stats["sessions"] = stats.get("sessions", []) + path_subject = None + path_session = None + asset_path = Path(assetmeta["path"]) + # Scan filename first (existing behavior for sub-/sample-), then directory + # components, so ses- tokens that appear only in the directory (e.g. + # `sub-X/ses-Y/foo_acq-Z_bold.nii.gz`) are still counted. 
+ for part in asset_path.name.split(".")[0].split("_"): if part.startswith("sub-"): subject = part.replace("sub-", "") + path_subject = subject if subject not in stats["subjects"]: stats["subjects"].append(subject) if part.startswith("sample-"): sample = part.replace("sample-", "") if sample not in stats["tissuesample"]: stats["tissuesample"].append(sample) + if part.startswith("ses-"): + path_session = part.replace("ses-", "") + if path_session is None or path_subject is None: + for directory in asset_path.parts[:-1]: + for part in directory.split("_"): + if path_subject is None and part.startswith("sub-"): + path_subject = part.replace("sub-", "") + if path_session is None and part.startswith("ses-"): + path_session = part.replace("ses-", "") + if path_subject is not None and path_session is not None: + pair = (path_subject, path_session) + if pair not in stats["sessions"]: + stats["sessions"].append(pair) stats["dataStandard"] = stats.get("dataStandard", []) @@ -567,4 +588,5 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict: len(stats.pop("tissuesample", [])) + len(stats.pop("slice", [])) ) or None stats["numberOfCells"] = len(stats.pop("cell", [])) or None + stats["numberOfSessions"] = len(stats.pop("sessions", [])) or None return models.AssetsSummary(**stats).model_dump(mode="json", exclude_none=True) diff --git a/dandischema/models.py b/dandischema/models.py index b35275fe..856d7fc6 100644 --- a/dandischema/models.py +++ b/dandischema/models.py @@ -1200,6 +1200,9 @@ class AssetsSummary(DandiBaseModel): None, json_schema_extra={"readOnly": True} ) # more of NWB numberOfCells: Optional[int] = Field(None, json_schema_extra={"readOnly": True}) + numberOfSessions: Optional[int] = Field( + None, json_schema_extra={"readOnly": True} + ) # BIDS ses-* tokens, counted as unique (subject, session) pairs dataStandard: Optional[List[StandardsType]] = Field( None, json_schema_extra={"readOnly": True} diff --git a/dandischema/tests/test_metadata.py 
b/dandischema/tests/test_metadata.py index 6da7fff3..b848f3e6 100644 --- a/dandischema/tests/test_metadata.py +++ b/dandischema/tests/test_metadata.py @@ -521,6 +521,7 @@ def test_migrate_schemaversion_update() -> None: "numberOfSubjects": 1, "numberOfSamples": 1, "numberOfCells": 1, + "numberOfSessions": 1, "dataStandard": [ { "schemaKey": "StandardsType", @@ -541,6 +542,7 @@ def test_migrate_schemaversion_update() -> None: "numberOfBytes": 608720, "numberOfFiles": 2, "numberOfSubjects": 1, + "numberOfSessions": 2, "dataStandard": [ { "schemaKey": "StandardsType", @@ -588,6 +590,7 @@ def test_migrate_schemaversion_update() -> None: "numberOfSubjects": 2, "numberOfSamples": 1, "numberOfCells": 1, + "numberOfSessions": 2, "dataStandard": [ { "schemaKey": "StandardsType", @@ -751,6 +754,7 @@ def test_aggregation_bids() -> None: assert summary["numberOfFiles"] == 3 assert summary["numberOfSamples"] == 2 assert summary["numberOfSubjects"] == 1 + assert summary["numberOfSessions"] == 2 assert sum("BIDS" in _.get("name", "") for _ in summary["dataStandard"]) == 1 assert ( sum(_.get("name", "").startswith("OME/NGFF") for _ in summary["dataStandard"]) @@ -758,6 +762,48 @@ def test_aggregation_bids() -> None: ) # only a single entry so we do not duplicate them +def _bids_asset(path: str, size: int = 1) -> Dict[str, Any]: + return { + "schemaKey": "Asset", + "schemaVersion": DANDI_SCHEMA_VERSION, + "path": path, + "contentSize": size, + "encodingFormat": "application/x-nwb", + } + + +def test_aggregate_number_of_sessions() -> None: + # Same subject, two sessions (session token in both directory and filename) + data = [ + _bids_asset("sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf"), + _bids_asset("sub-01/ses-B/eeg/sub-01_ses-B_task-rest_eeg.edf"), + ] + summary = aggregate_assets_summary(data) + assert summary["numberOfSubjects"] == 1 + assert summary["numberOfSessions"] == 2 + + # Two subjects sharing a session id "A" -> two distinct (sub, ses) pairs + data = [ + 
_bids_asset("sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf"), + _bids_asset("sub-02/ses-A/eeg/sub-02_ses-A_task-rest_eeg.edf"), + ] + summary = aggregate_assets_summary(data) + assert summary["numberOfSubjects"] == 2 + assert summary["numberOfSessions"] == 2 + + # ses- only in directory portion (filename omits it) still counts + data = [ + _bids_asset("sub-01/ses-A/anat/sub-01_T1w.nii.gz"), + ] + summary = aggregate_assets_summary(data) + assert summary["numberOfSessions"] == 1 + + # No ses- anywhere -> field is absent + data = [_bids_asset("sub-01/anat/sub-01_T1w.nii.gz")] + summary = aggregate_assets_summary(data) + assert "numberOfSessions" not in summary + + class TestValidateObjJson: """ Tests for `_validate_obj_json()` From cdb37eeb26bc0c26a47d72884033001f35634177 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Thu, 14 May 2026 09:16:09 -0400 Subject: [PATCH 2/2] refactor: collapse BIDS-token parsing into a loop Per review feedback: generalize the per-prefix blocks (sub-, sample-, ses-) into a single loop over an entity list, and simplify the session fallback to a direct scan of path parts. Note: the fallback no longer infers a subject from directory components, so a (subject, session) pair is recorded only when the filename itself carries a sub- token. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dandischema/metadata.py | 44 ++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/dandischema/metadata.py b/dandischema/metadata.py index e3f876c9..801f492e 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -531,35 +531,25 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None: # once in some incorrectly named datasets found: Dict[str, str] = {} asset_path = Path(assetmeta["path"]) + # (entity key, BIDS prefix, unique-values bucket in stats; None if not aggregated) + entities = [ + ("subject", "sub-", "subjects"), + ("sample", "sample-", "tissuesample"), + ("session", "ses-", None), + ] for part in asset_path.name.split(".")[0].split("_"): - if not found.get("subject") and part.startswith("sub-"): - found["subject"] = subject = part.split("sub-", 1)[1] - if subject not in stats["subjects"]: - stats["subjects"].append(subject) - if not found.get("sample") and part.startswith("sample-"): - found["sample"] = sample = part.replace("sample-", "") - if sample not in stats["tissuesample"]: - stats["tissuesample"].append(sample) - if not found.get("session") and part.startswith("ses-"): - found["session"] = part.split("ses-", 1)[1] - # Fallback: ses- tokens that appear only in directory components (e.g. - # `sub-X/ses-Y/foo_acq-Z_bold.nii.gz`) should still be counted. To form - # the (subject, session) pair we also accept a directory-only subject, - # but we do not add such a subject to stats["subjects"] — subject - # counting remains driven by the filename and wasAttributedTo, matching - # prior behavior. 
+ for key, prefix, bucket in entities: + if not found.get(key) and part.startswith(prefix): + found[key] = value = part.split(prefix, 1)[1] + if bucket is not None and value not in stats[bucket]: + stats[bucket].append(value) + # If ses- is absent from the filename, fall back to scanning the path + # parts (BIDS keeps `ses-X` as its own directory). if not found.get("session"): - dir_subject = found.get("subject") - dir_session: Optional[str] = None - for directory in asset_path.parts[:-1]: - for part in directory.split("_"): - if dir_subject is None and part.startswith("sub-"): - dir_subject = part.split("sub-", 1)[1] - if dir_session is None and part.startswith("ses-"): - dir_session = part.split("ses-", 1)[1] - if dir_subject is not None and dir_session is not None: - found.setdefault("subject", dir_subject) - found["session"] = dir_session + for part in asset_path.parts[:-1]: + if part.startswith("ses-"): + found["session"] = part.split("ses-", 1)[1] + break stats["sessions"] = stats.get("sessions", []) if found.get("subject") and found.get("session"): pair = (found["subject"], found["session"])