From 144d054f1346807b72f41e26aec944360b18f08f Mon Sep 17 00:00:00 2001
From: Ben Dichter <ben.dichter@gmail.com>
Date: Wed, 13 May 2026 20:14:42 -0400
Subject: [PATCH 1/2] feat: add numberOfSessions to AssetsSummary

Aggregator counts unique (subject, session) pairs parsed from BIDS-style
sub-* and ses-* tokens in asset paths (filename first, then directory
components when the session token is omitted from the filename).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dandischema/metadata.py            | 24 +++++++++++++++-
 dandischema/models.py              |  3 ++
 dandischema/tests/test_metadata.py | 46 ++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/dandischema/metadata.py b/dandischema/metadata.py
index 19064262..dbd5aefe 100644
--- a/dandischema/metadata.py
+++ b/dandischema/metadata.py
@@ -527,15 +527,36 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
             stats = _get_samples(value, stats, hierarchy)
             break
 
-    for part in Path(assetmeta["path"]).name.split(".")[0].split("_"):
+    stats["sessions"] = stats.get("sessions", [])
+    path_subject = None
+    path_session = None
+    asset_path = Path(assetmeta["path"])
+    # Scan filename first (existing behavior for sub-/sample-), then directory
+    # components, so ses- tokens that appear only in the directory (e.g.
+    # `sub-X/ses-Y/foo_acq-Z_bold.nii.gz`) are still counted.
+    for part in asset_path.name.split(".")[0].split("_"):
         if part.startswith("sub-"):
             subject = part.replace("sub-", "")
+            path_subject = subject
             if subject not in stats["subjects"]:
                 stats["subjects"].append(subject)
         if part.startswith("sample-"):
             sample = part.replace("sample-", "")
             if sample not in stats["tissuesample"]:
                 stats["tissuesample"].append(sample)
+        if part.startswith("ses-"):
+            path_session = part.replace("ses-", "")
+    if path_session is None or path_subject is None:
+        for directory in asset_path.parts[:-1]:
+            for part in directory.split("_"):
+                if path_subject is None and part.startswith("sub-"):
+                    path_subject = part.replace("sub-", "")
+                if path_session is None and part.startswith("ses-"):
+                    path_session = part.replace("ses-", "")
+    if path_subject is not None and path_session is not None:
+        pair = (path_subject, path_session)
+        if pair not in stats["sessions"]:
+            stats["sessions"].append(pair)
 
     stats["dataStandard"] = stats.get("dataStandard", [])
 
@@ -567,4 +588,5 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict:
         len(stats.pop("tissuesample", [])) + len(stats.pop("slice", []))
     ) or None
     stats["numberOfCells"] = len(stats.pop("cell", [])) or None
+    stats["numberOfSessions"] = len(stats.pop("sessions", [])) or None
     return models.AssetsSummary(**stats).model_dump(mode="json", exclude_none=True)
diff --git a/dandischema/models.py b/dandischema/models.py
index b35275fe..856d7fc6 100644
--- a/dandischema/models.py
+++ b/dandischema/models.py
@@ -1200,6 +1200,9 @@ class AssetsSummary(DandiBaseModel):
         None, json_schema_extra={"readOnly": True}
     )  # more of NWB
     numberOfCells: Optional[int] = Field(None, json_schema_extra={"readOnly": True})
+    numberOfSessions: Optional[int] = Field(
+        None, json_schema_extra={"readOnly": True}
+    )  # BIDS ses-* tokens, counted as unique (subject, session) pairs
 
     dataStandard: Optional[List[StandardsType]] = Field(
         None, json_schema_extra={"readOnly": True}
diff --git a/dandischema/tests/test_metadata.py b/dandischema/tests/test_metadata.py
index 6da7fff3..b848f3e6 100644
--- a/dandischema/tests/test_metadata.py
+++ b/dandischema/tests/test_metadata.py
@@ -521,6 +521,7 @@ def test_migrate_schemaversion_update() -> None:
                 "numberOfSubjects": 1,
                 "numberOfSamples": 1,
                 "numberOfCells": 1,
+                "numberOfSessions": 1,
                 "dataStandard": [
                     {
                         "schemaKey": "StandardsType",
@@ -541,6 +542,7 @@ def test_migrate_schemaversion_update() -> None:
                 "numberOfBytes": 608720,
                 "numberOfFiles": 2,
                 "numberOfSubjects": 1,
+                "numberOfSessions": 2,
                 "dataStandard": [
                     {
                         "schemaKey": "StandardsType",
@@ -588,6 +590,7 @@ def test_migrate_schemaversion_update() -> None:
                 "numberOfSubjects": 2,
                 "numberOfSamples": 1,
                 "numberOfCells": 1,
+                "numberOfSessions": 2,
                 "dataStandard": [
                     {
                         "schemaKey": "StandardsType",
@@ -751,6 +754,7 @@ def test_aggregation_bids() -> None:
     assert summary["numberOfFiles"] == 3
     assert summary["numberOfSamples"] == 2
     assert summary["numberOfSubjects"] == 1
+    assert summary["numberOfSessions"] == 2
     assert sum("BIDS" in _.get("name", "") for _ in summary["dataStandard"]) == 1
     assert (
         sum(_.get("name", "").startswith("OME/NGFF") for _ in summary["dataStandard"])
@@ -758,6 +762,48 @@ def test_aggregation_bids() -> None:
     )  # only a single entry so we do not duplicate them
 
 
+def _bids_asset(path: str, size: int = 1) -> Dict[str, Any]:
+    return {
+        "schemaKey": "Asset",
+        "schemaVersion": DANDI_SCHEMA_VERSION,
+        "path": path,
+        "contentSize": size,
+        "encodingFormat": "application/x-nwb",
+    }
+
+
+def test_aggregate_number_of_sessions() -> None:
+    # Same subject, two sessions (ses- token in directory and filename)
+    data = [
+        _bids_asset("sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf"),
+        _bids_asset("sub-01/ses-B/eeg/sub-01_ses-B_task-rest_eeg.edf"),
+    ]
+    summary = aggregate_assets_summary(data)
+    assert summary["numberOfSubjects"] == 1
+    assert summary["numberOfSessions"] == 2
+
+    # Two subjects sharing a session id "A" -> two distinct (sub, ses) pairs
+    data = [
+        _bids_asset("sub-01/ses-A/eeg/sub-01_ses-A_task-rest_eeg.edf"),
+        _bids_asset("sub-02/ses-A/eeg/sub-02_ses-A_task-rest_eeg.edf"),
+    ]
+    summary = aggregate_assets_summary(data)
+    assert summary["numberOfSubjects"] == 2
+    assert summary["numberOfSessions"] == 2
+
+    # ses- only in directory portion (filename omits it) still counts
+    data = [
+        _bids_asset("sub-01/ses-A/anat/sub-01_T1w.nii.gz"),
+    ]
+    summary = aggregate_assets_summary(data)
+    assert summary["numberOfSessions"] == 1
+
+    # No ses- anywhere -> field is absent
+    data = [_bids_asset("sub-01/anat/sub-01_T1w.nii.gz")]
+    summary = aggregate_assets_summary(data)
+    assert "numberOfSessions" not in summary
+
+
 class TestValidateObjJson:
     """
     Tests for `_validate_obj_json()`

From cdb37eeb26bc0c26a47d72884033001f35634177 Mon Sep 17 00:00:00 2001
From: Ben Dichter <ben.dichter@gmail.com>
Date: Thu, 14 May 2026 09:16:09 -0400
Subject: [PATCH 2/2] refactor: collapse BIDS-token parsing into a loop

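Per review feedback: generalize the per-prefix blocks (sub-, sample-,
ses-) into a single loop over an entity list, and simplify the session
fallback to a direct scan of path parts.

Behavior note: the fallback now only looks for a directory component
that itself starts with `ses-`. A subject that appears only in a
directory name no longer contributes to the (subject, session) pair,
and directory names are no longer split on `_` when searching.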

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dandischema/metadata.py | 44 ++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/dandischema/metadata.py b/dandischema/metadata.py
index e3f876c9..801f492e 100644
--- a/dandischema/metadata.py
+++ b/dandischema/metadata.py
@@ -531,35 +531,25 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
     # once in some incorrectly named datasets
     found: Dict[str, str] = {}
     asset_path = Path(assetmeta["path"])
+    # (entity key, BIDS prefix, unique-values bucket in stats; None if not aggregated)
+    entities = [
+        ("subject", "sub-", "subjects"),
+        ("sample", "sample-", "tissuesample"),
+        ("session", "ses-", None),
+    ]
     for part in asset_path.name.split(".")[0].split("_"):
-        if not found.get("subject") and part.startswith("sub-"):
-            found["subject"] = subject = part.split("sub-", 1)[1]
-            if subject not in stats["subjects"]:
-                stats["subjects"].append(subject)
-        if not found.get("sample") and part.startswith("sample-"):
-            found["sample"] = sample = part.replace("sample-", "")
-            if sample not in stats["tissuesample"]:
-                stats["tissuesample"].append(sample)
-        if not found.get("session") and part.startswith("ses-"):
-            found["session"] = part.split("ses-", 1)[1]
-    # Fallback: ses- tokens that appear only in directory components (e.g.
-    # `sub-X/ses-Y/foo_acq-Z_bold.nii.gz`) should still be counted. To form
-    # the (subject, session) pair we also accept a directory-only subject,
-    # but we do not add such a subject to stats["subjects"] — subject
-    # counting remains driven by the filename and wasAttributedTo, matching
-    # prior behavior.
+        for key, prefix, bucket in entities:
+            if not found.get(key) and part.startswith(prefix):
+                found[key] = value = part.split(prefix, 1)[1]
+                if bucket is not None and value not in stats[bucket]:
+                    stats[bucket].append(value)
+    # If ses- is absent from the filename, fall back to scanning the path
+    # parts (BIDS keeps `ses-X` as its own directory).
     if not found.get("session"):
-        dir_subject = found.get("subject")
-        dir_session: Optional[str] = None
-        for directory in asset_path.parts[:-1]:
-            for part in directory.split("_"):
-                if dir_subject is None and part.startswith("sub-"):
-                    dir_subject = part.split("sub-", 1)[1]
-                if dir_session is None and part.startswith("ses-"):
-                    dir_session = part.split("ses-", 1)[1]
-        if dir_subject is not None and dir_session is not None:
-            found.setdefault("subject", dir_subject)
-            found["session"] = dir_session
+        for part in asset_path.parts[:-1]:
+            if part.startswith("ses-"):
+                found["session"] = part.split("ses-", 1)[1]
+                break
     stats["sessions"] = stats.get("sessions", [])
     if found.get("subject") and found.get("session"):
         pair = (found["subject"], found["session"])