Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion dandischema/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,8 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
# which components already found, so we do not count more than
# once in some incorrectly named datasets
found: Dict[str, str] = {}
for part in Path(assetmeta["path"]).name.split(".")[0].split("_"):
asset_path = Path(assetmeta["path"])
for part in asset_path.name.split(".")[0].split("_"):
if not found.get("subject") and part.startswith("sub-"):
found["subject"] = subject = part.split("sub-", 1)[1]
if subject not in stats["subjects"]:
Expand All @@ -539,6 +540,31 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
found["sample"] = sample = part.replace("sample-", "")
if sample not in stats["tissuesample"]:
stats["tissuesample"].append(sample)
if not found.get("session") and part.startswith("ses-"):
found["session"] = part.split("ses-", 1)[1]
# Fallback: ses- tokens that appear only in directory components (e.g.
# `sub-X/ses-Y/foo_acq-Z_bold.nii.gz`) should still be counted. To form
# the (subject, session) pair we also accept a directory-only subject,
# but we do not add such a subject to stats["subjects"] — subject
# counting remains driven by the filename and wasAttributedTo, matching
# prior behavior.
if not found.get("session"):
dir_subject = found.get("subject")
dir_session: Optional[str] = None
for directory in asset_path.parts[:-1]:
for part in directory.split("_"):
if dir_subject is None and part.startswith("sub-"):
dir_subject = part.split("sub-", 1)[1]
if dir_session is None and part.startswith("ses-"):
dir_session = part.split("ses-", 1)[1]
if dir_subject is not None and dir_session is not None:
found.setdefault("subject", dir_subject)
found["session"] = dir_session
stats["sessions"] = stats.get("sessions", [])
if found.get("subject") and found.get("session"):
pair = (found["subject"], found["session"])
if pair not in stats["sessions"]:
stats["sessions"].append(pair)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Too AI-sloppy -- it would be easier to HI-compose the logic: first, potentially even generalize the 2 prior blocks into a 'for' loop; then, if no session was found in the filename, go through the path's parts, split on ses-, and be done if any matches.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in cdb37ee — collapsed the sub-/sample-/ses- filename parsing into a single loop over an entities list, and simplified the session fallback to scanning asset_path.parts[:-1] for a ses-X directory and stopping at the first match. Dropped the directory-only subject fallback since BIDS files always carry sub- in the filename.


stats["dataStandard"] = stats.get("dataStandard", [])

Expand Down Expand Up @@ -573,4 +599,5 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict:
len(stats.pop("tissuesample", [])) + len(stats.pop("slice", []))
) or None
stats["numberOfCells"] = len(stats.pop("cell", [])) or None
stats["numberOfSessions"] = len(stats.pop("sessions", [])) or None
return models.AssetsSummary(**stats).model_dump(mode="json", exclude_none=True)
3 changes: 3 additions & 0 deletions dandischema/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,6 +1201,9 @@ class AssetsSummary(DandiBaseModel):
None, json_schema_extra={"readOnly": True}
) # more of NWB
numberOfCells: Optional[int] = Field(None, json_schema_extra={"readOnly": True})
numberOfSessions: Optional[int] = Field(
None, json_schema_extra={"readOnly": True}
) # BIDS ses-* tokens, counted as unique (subject, session) pairs

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly add to the comment that this count assumes the dataset is a valid BIDS dataset. (For example, a session without a subject will not be counted.)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly move some of the info in the comment into the description argument of Field. That way, the info will be part of the model. However, other fields, e.g. numberOfCells, are not currently doing that.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to keep this information as comments rather than description. If we were to add a new data structure, then we could add a new way to compute numberOfSessions, which could still be a valid field. If we don't make notes about how the data is derived, we can support new data structure without making any changes to the schema. That's why I think the field should contain information about what the data is about, and comments should be used to give hints about how it was derived.


dataStandard: Optional[List[StandardsType]] = Field(
None, json_schema_extra={"readOnly": True}
Expand Down
46 changes: 46 additions & 0 deletions dandischema/tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ def test_migrate_schemaversion_update() -> None:
"numberOfSubjects": 1,
"numberOfSamples": 1,
"numberOfCells": 1,
"numberOfSessions": 1,
"dataStandard": [
{
"schemaKey": "StandardsType",
Expand All @@ -541,6 +542,7 @@ def test_migrate_schemaversion_update() -> None:
"numberOfBytes": 608720,
"numberOfFiles": 2,
"numberOfSubjects": 1,
"numberOfSessions": 2,
"dataStandard": [
{
"schemaKey": "StandardsType",
Expand Down Expand Up @@ -588,6 +590,7 @@ def test_migrate_schemaversion_update() -> None:
"numberOfSubjects": 2,
"numberOfSamples": 1,
"numberOfCells": 1,
"numberOfSessions": 2,
"dataStandard": [
{
"schemaKey": "StandardsType",
Expand Down Expand Up @@ -751,13 +754,56 @@ def test_aggregation_bids() -> None:
assert summary["numberOfFiles"] == 3
assert summary["numberOfSamples"] == 2
assert summary["numberOfSubjects"] == 1
assert summary["numberOfSessions"] == 2
assert sum("BIDS" in _.get("name", "") for _ in summary["dataStandard"]) == 1
assert (
sum(_.get("name", "").startswith("OME/NGFF") for _ in summary["dataStandard"])
== 1
) # only a single entry so we do not duplicate them


def _bids_asset(path: str, size: int = 1) -> Dict[str, Any]:
    """
    Build a minimal Asset metadata record for the aggregation tests

    :param path: value stored under the "path" key
    :param size: value stored under the "contentSize" key (defaults to 1)
    :return: a new dict with the schema key/version, path, size and a fixed
        encoding format
    """
    asset: Dict[str, Any] = dict(
        schemaKey="Asset",
        schemaVersion=DANDI_SCHEMA_VERSION,
        path=path,
        contentSize=size,
        encodingFormat="application/x-nwb",
    )
    return asset


def test_aggregate_number_of_sessions() -> None:
    """
    Check how `aggregate_assets_summary()` reports "numberOfSessions" for
    several BIDS-like path layouts
    """

    def summarize(*paths: str) -> dict:
        # Wrap every path into a minimal asset record and aggregate them
        return aggregate_assets_summary([_bids_asset(p) for p in paths])

    # One subject, two different sessions; ses- token is in the filename
    result = summarize(
        "sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf",
        "sub-01/ses-B/eeg/sub-01_ses-B_task-rest_eeg.edf",
    )
    assert result["numberOfSubjects"] == 1
    assert result["numberOfSessions"] == 2

    # Session id "A" reused by two subjects -> two separate (sub, ses) pairs
    result = summarize(
        "sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf",
        "sub-02/ses-A/eeg/sub-02_ses-A_task-rest_eeg.edf",
    )
    assert result["numberOfSubjects"] == 2
    assert result["numberOfSessions"] == 2

    # Filename lacks ses-, but a directory component carries it: still counted
    result = summarize("sub-01/ses-A/anat/sub-01_T1w.nii.gz")
    assert result["numberOfSessions"] == 1

    # Neither directories nor filename carry ses- -> key is omitted entirely
    result = summarize("sub-01/anat/sub-01_T1w.nii.gz")
    assert "numberOfSessions" not in result


class TestValidateObjJson:
"""
Tests for `_validate_obj_json()`
Expand Down
Loading