Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 26 additions & 9 deletions dandischema/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,15 +530,31 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
# which components already found, so we do not count more than
# once in some incorrectly named datasets
found: Dict[str, str] = {}
for part in Path(assetmeta["path"]).name.split(".")[0].split("_"):
if not found.get("subject") and part.startswith("sub-"):
found["subject"] = subject = part.split("sub-", 1)[1]
if subject not in stats["subjects"]:
stats["subjects"].append(subject)
if not found.get("sample") and part.startswith("sample-"):
found["sample"] = sample = part.replace("sample-", "")
if sample not in stats["tissuesample"]:
stats["tissuesample"].append(sample)
asset_path = Path(assetmeta["path"])
# (entity key, BIDS prefix, unique-values bucket in stats; None if not aggregated)
entities = [
("subject", "sub-", "subjects"),
("sample", "sample-", "tissuesample"),
("session", "ses-", None),
]
for part in asset_path.name.split(".")[0].split("_"):
for key, prefix, bucket in entities:
if not found.get(key) and part.startswith(prefix):
found[key] = value = part.split(prefix, 1)[1]
if bucket is not None and value not in stats[bucket]:
stats[bucket].append(value)
# If ses- is absent from the filename, fall back to scanning the path
# parts (BIDS keeps `ses-X` as its own directory).
if not found.get("session"):
for part in asset_path.parts[:-1]:
if part.startswith("ses-"):
found["session"] = part.split("ses-", 1)[1]
break
stats["sessions"] = stats.get("sessions", [])
if found.get("subject") and found.get("session"):
pair = (found["subject"], found["session"])
if pair not in stats["sessions"]:
stats["sessions"].append(pair)

stats["dataStandard"] = stats.get("dataStandard", [])

Expand Down Expand Up @@ -573,4 +589,5 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict:
len(stats.pop("tissuesample", [])) + len(stats.pop("slice", []))
) or None
stats["numberOfCells"] = len(stats.pop("cell", [])) or None
stats["numberOfSessions"] = len(stats.pop("sessions", [])) or None
return models.AssetsSummary(**stats).model_dump(mode="json", exclude_none=True)
3 changes: 3 additions & 0 deletions dandischema/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,6 +1201,9 @@ class AssetsSummary(DandiBaseModel):
None, json_schema_extra={"readOnly": True}
) # more of NWB
numberOfCells: Optional[int] = Field(None, json_schema_extra={"readOnly": True})
numberOfSessions: Optional[int] = Field(
None, json_schema_extra={"readOnly": True}
) # BIDS ses-* tokens, counted as unique (subject, session) pairs

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly add to the comment that this count assumes the dataset is a valid BIDS dataset. (For example, a session without a subject will not be counted.)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly move some of the info in the comment into the `description` argument of `Field`. That way, the info will be part of the model. However, other fields, e.g. `numberOfCells`, are not currently doing that.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to keep this information as comments rather than as a description. If we were to add a new data structure, then we could add a new way to compute `numberOfSessions`, and it would still be a valid field. If the schema itself does not record how the data is derived, we can support a new data structure without making any changes to the schema. That's why I think the field should contain information about what the data is about, and comments should be used to give hints about how it was derived.


dataStandard: Optional[List[StandardsType]] = Field(
None, json_schema_extra={"readOnly": True}
Expand Down
46 changes: 46 additions & 0 deletions dandischema/tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ def test_migrate_schemaversion_update() -> None:
"numberOfSubjects": 1,
"numberOfSamples": 1,
"numberOfCells": 1,
"numberOfSessions": 1,
"dataStandard": [
{
"schemaKey": "StandardsType",
Expand All @@ -541,6 +542,7 @@ def test_migrate_schemaversion_update() -> None:
"numberOfBytes": 608720,
"numberOfFiles": 2,
"numberOfSubjects": 1,
"numberOfSessions": 2,
"dataStandard": [
{
"schemaKey": "StandardsType",
Expand Down Expand Up @@ -588,6 +590,7 @@ def test_migrate_schemaversion_update() -> None:
"numberOfSubjects": 2,
"numberOfSamples": 1,
"numberOfCells": 1,
"numberOfSessions": 2,
"dataStandard": [
{
"schemaKey": "StandardsType",
Expand Down Expand Up @@ -751,13 +754,56 @@ def test_aggregation_bids() -> None:
assert summary["numberOfFiles"] == 3
assert summary["numberOfSamples"] == 2
assert summary["numberOfSubjects"] == 1
assert summary["numberOfSessions"] == 2
assert sum("BIDS" in _.get("name", "") for _ in summary["dataStandard"]) == 1
assert (
sum(_.get("name", "").startswith("OME/NGFF") for _ in summary["dataStandard"])
== 1
) # only a single entry so we do not duplicate them


def _bids_asset(path: str, size: int = 1) -> Dict[str, Any]:
    """Build a minimal Asset metadata record for aggregation tests.

    :param path: value stored under the record's ``"path"`` key
    :param size: value stored under the record's ``"contentSize"`` key
    :return: a new dict with the fixed ``schemaKey``, ``schemaVersion`` and
        ``encodingFormat`` entries plus the given ``path`` and ``size``
    """
    asset: Dict[str, Any] = dict(
        schemaKey="Asset",
        schemaVersion=DANDI_SCHEMA_VERSION,
        path=path,
        contentSize=size,
        encodingFormat="application/x-nwb",
    )
    return asset


def test_aggregate_number_of_sessions() -> None:
    """Check `numberOfSessions` for several subject/session path layouts."""

    def summarize(*paths: str) -> Dict[str, Any]:
        # One minimal asset per path, aggregated in the order given
        return aggregate_assets_summary([_bids_asset(p) for p in paths])

    # One subject with two sessions; the ses- token is present both as a
    # directory and inside the filename
    single_subject = summarize(
        "sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf",
        "sub-01/ses-B/eeg/sub-01_ses-B_task-rest_eeg.edf",
    )
    assert single_subject["numberOfSubjects"] == 1
    assert single_subject["numberOfSessions"] == 2

    # Session id "A" reused by two subjects -> two distinct (sub, ses) pairs
    shared_session_id = summarize(
        "sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf",
        "sub-02/ses-A/eeg/sub-02_ses-A_task-rest_eeg.edf",
    )
    assert shared_session_id["numberOfSubjects"] == 2
    assert shared_session_id["numberOfSessions"] == 2

    # Filename has no ses- token; the ses-A directory alone still counts
    directory_only = summarize("sub-01/ses-A/anat/sub-01_T1w.nii.gz")
    assert directory_only["numberOfSessions"] == 1

    # No ses- token in the directories or the filename -> key is omitted
    no_session = summarize("sub-01/anat/sub-01_T1w.nii.gz")
    assert "numberOfSessions" not in no_session


class TestValidateObjJson:
"""
Tests for `_validate_obj_json()`
Expand Down
Loading