diff --git a/dandischema/metadata.py b/dandischema/metadata.py index 64383a41..801f492e 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -530,15 +530,31 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None: # which components already found, so we do not count more than # once in some incorrectly named datasets found: Dict[str, str] = {} - for part in Path(assetmeta["path"]).name.split(".")[0].split("_"): - if not found.get("subject") and part.startswith("sub-"): - found["subject"] = subject = part.split("sub-", 1)[1] - if subject not in stats["subjects"]: - stats["subjects"].append(subject) - if not found.get("sample") and part.startswith("sample-"): - found["sample"] = sample = part.replace("sample-", "") - if sample not in stats["tissuesample"]: - stats["tissuesample"].append(sample) + asset_path = Path(assetmeta["path"]) + # (entity key, BIDS prefix, unique-values bucket in stats; None if not aggregated) + entities = [ + ("subject", "sub-", "subjects"), + ("sample", "sample-", "tissuesample"), + ("session", "ses-", None), + ] + for part in asset_path.name.split(".")[0].split("_"): + for key, prefix, bucket in entities: + if not found.get(key) and part.startswith(prefix): + found[key] = value = part.split(prefix, 1)[1] + if bucket is not None and value not in stats[bucket]: + stats[bucket].append(value) + # If ses- is absent from the filename, fall back to scanning the path + # parts (BIDS keeps `ses-X` as its own directory). 
+ if not found.get("session"): + for part in asset_path.parts[:-1]: + if part.startswith("ses-"): + found["session"] = part.split("ses-", 1)[1] + break + stats["sessions"] = stats.get("sessions", []) + if found.get("subject") and found.get("session"): + pair = (found["subject"], found["session"]) + if pair not in stats["sessions"]: + stats["sessions"].append(pair) stats["dataStandard"] = stats.get("dataStandard", []) @@ -573,4 +589,5 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict: len(stats.pop("tissuesample", [])) + len(stats.pop("slice", [])) ) or None stats["numberOfCells"] = len(stats.pop("cell", [])) or None + stats["numberOfSessions"] = len(stats.pop("sessions", [])) or None return models.AssetsSummary(**stats).model_dump(mode="json", exclude_none=True) diff --git a/dandischema/models.py b/dandischema/models.py index 58e57355..44149980 100644 --- a/dandischema/models.py +++ b/dandischema/models.py @@ -1201,6 +1201,9 @@ class AssetsSummary(DandiBaseModel): None, json_schema_extra={"readOnly": True} ) # more of NWB numberOfCells: Optional[int] = Field(None, json_schema_extra={"readOnly": True}) + numberOfSessions: Optional[int] = Field( + None, json_schema_extra={"readOnly": True} + ) # BIDS ses-* tokens, counted as unique (subject, session) pairs dataStandard: Optional[List[StandardsType]] = Field( None, json_schema_extra={"readOnly": True} diff --git a/dandischema/tests/test_metadata.py b/dandischema/tests/test_metadata.py index 2327271d..1e7dc819 100644 --- a/dandischema/tests/test_metadata.py +++ b/dandischema/tests/test_metadata.py @@ -521,6 +521,7 @@ def test_migrate_schemaversion_update() -> None: "numberOfSubjects": 1, "numberOfSamples": 1, "numberOfCells": 1, + "numberOfSessions": 1, "dataStandard": [ { "schemaKey": "StandardsType", @@ -541,6 +542,7 @@ def test_migrate_schemaversion_update() -> None: "numberOfBytes": 608720, "numberOfFiles": 2, "numberOfSubjects": 1, + "numberOfSessions": 2, "dataStandard": [ { "schemaKey": 
"StandardsType", @@ -588,6 +590,7 @@ def test_migrate_schemaversion_update() -> None: "numberOfSubjects": 2, "numberOfSamples": 1, "numberOfCells": 1, + "numberOfSessions": 2, "dataStandard": [ { "schemaKey": "StandardsType", @@ -751,6 +754,7 @@ def test_aggregation_bids() -> None: assert summary["numberOfFiles"] == 3 assert summary["numberOfSamples"] == 2 assert summary["numberOfSubjects"] == 1 + assert summary["numberOfSessions"] == 2 assert sum("BIDS" in _.get("name", "") for _ in summary["dataStandard"]) == 1 assert ( sum(_.get("name", "").startswith("OME/NGFF") for _ in summary["dataStandard"]) @@ -758,6 +762,48 @@ def test_aggregation_bids() -> None: ) # only a single entry so we do not duplicate them +def _bids_asset(path: str, size: int = 1) -> Dict[str, Any]: + return { + "schemaKey": "Asset", + "schemaVersion": DANDI_SCHEMA_VERSION, + "path": path, + "contentSize": size, + "encodingFormat": "application/x-nwb", + } + + +def test_aggregate_number_of_sessions() -> None: + # Same subject, two sessions (ses- token present in both the directory and the filename) + data = [ + _bids_asset("sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf"), + _bids_asset("sub-01/ses-B/eeg/sub-01_ses-B_task-rest_eeg.edf"), + ] + summary = aggregate_assets_summary(data) + assert summary["numberOfSubjects"] == 1 + assert summary["numberOfSessions"] == 2 + + # Two subjects sharing a session id "A" -> two distinct (sub, ses) pairs + data = [ + _bids_asset("sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf"), + _bids_asset("sub-02/ses-A/eeg/sub-02_ses-A_task-rest_eeg.edf"), + ] + summary = aggregate_assets_summary(data) + assert summary["numberOfSubjects"] == 2 + assert summary["numberOfSessions"] == 2 + + # ses- only in directory portion (filename omits it) still counts + data = [ + _bids_asset("sub-01/ses-A/anat/sub-01_T1w.nii.gz"), + ] + summary = aggregate_assets_summary(data) + assert summary["numberOfSessions"] == 1 + + # No ses- anywhere -> field is absent + data = 
[_bids_asset("sub-01/anat/sub-01_T1w.nii.gz")] + summary = aggregate_assets_summary(data) + assert "numberOfSessions" not in summary + + class TestValidateObjJson: """ Tests for `_validate_obj_json()`