Skip to content
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ sandbox/
venv/
venvs/
dandischema/_version.py
uv.lock
17 changes: 16 additions & 1 deletion dandischema/datacite/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import json
from pathlib import Path
import re
from typing import Any, Dict, Union
from typing import Any, Dict, Optional, Union

from jsonschema import Draft7Validator

Expand Down Expand Up @@ -117,6 +117,7 @@ def to_datacite(
meta: Union[dict, PublishedDandiset],
validate: bool = False,
publish: bool = False,
concept_doi: Optional[str] = None,
) -> dict:
"""Convert published Dandiset metadata to Datacite"""

Expand Down Expand Up @@ -163,6 +164,10 @@ def to_datacite(
)

attributes["publicationYear"] = str(meta.datePublished.year)
# T002: Add dates field with Issued dateType

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is done with spec-kit, it might be worth reviewing the spec itself.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 -- we would need the "sources"! ;)

attributes["dates"] = [
{"date": str(meta.datePublished.date()), "dateType": "Issued"}
]
# not sure about it dandi-api had "resourceTypeGeneral": "NWB"
attributes["types"] = {
"resourceType": "Neural Data",
Expand Down Expand Up @@ -298,6 +303,16 @@ def to_datacite(
if hasattr(meta, "keywords") and meta.keywords is not None:
attributes["subjects"] = [{"subject": el} for el in meta.keywords]

# T003: Add IsVersionOf relation when concept_doi is provided
if concept_doi is not None:
attributes.setdefault("relatedIdentifiers", []).append(
{
"relatedIdentifier": concept_doi,
"relatedIdentifierType": "DOI",
"relationType": "IsVersionOf",
}
)

datacite_dict = {"data": {"id": meta.doi, "type": "dois", "attributes": attributes}}

if validate:
Expand Down
1 change: 1 addition & 0 deletions dandischema/datacite/tests/test_datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,7 @@ def test_datacite_publish(metadata_basic: Dict[str, Any]) -> None:
},
],
"publicationYear": "1970",
"dates": [{"date": "1970-01-01", "dateType": "Issued"}],
"publisher": {
"name": f"{_INSTANCE_CONFIG.instance_name} Archive",
"publisherIdentifier": f"https://scicrunch.org/resolver/"
Expand Down
253 changes: 253 additions & 0 deletions dandischema/datacite/tests/test_doi_improvements.py

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_doi_improvements.py won't make sense as a file name after the merge. Instead, I think it makes more sense to incorporate these tests into test_datacite, or to split them into a file with a more expressive name.

Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
"""
Tests for DataCite DOI improvements.

T001: Optional doi field on Dandiset model for concept DOIs
T002: dates field in to_datacite() output
T003: IsVersionOf/HasVersion relation support in to_datacite()

Note: These tests were AI-generated (Claude Code) using TDD methodology.
"""

import random
from typing import Any, Dict

from pydantic import ValidationError
import pytest

from dandischema.conf import get_instance_config
from dandischema.models import (
Dandiset,
LicenseType,
RoleType,
)
from dandischema.tests.utils import (
DOI_PREFIX,
INSTANCE_NAME,

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, setting the instance name is one possible strategy for "safe testing" against a real DataCite instance, i.e. instead of DANDI, we might use DANDI_TEST_2026_05_07

So if we set up some e2e test where real findable DOIs get created, they would be namespaced and not interfere with future tests.

basic_publishmeta,
skipif_no_doi_prefix,
)

from .. import to_datacite

_INSTANCE_CONFIG = get_instance_config()


@pytest.fixture(scope="function")
def metadata_with_publish() -> Dict[str, Any]:
    """Build metadata for a randomly numbered test Dandiset.

    The returned dict always carries the base Dandiset fields. When
    ``DOI_PREFIX`` is set, the extra fields returned by
    ``basic_publishmeta`` are merged in as well.
    """
    bare_id = f"000{random.randrange(100, 999)}"
    full_id = f"{INSTANCE_NAME}:{bare_id}"
    version = "0.0.0"

    contact_person = {
        "name": "Test_last, Test_first",
        "email": "test@example.com",
        "roleName": [RoleType("dcite:ContactPerson")],
        "schemaKey": "Person",
    }
    technique = {
        "schemaKey": "MeasurementTechniqueType",
        "name": "two-photon microscopy technique",
    }
    assets_summary = {
        "schemaKey": "AssetsSummary",
        "numberOfBytes": 100,
        "numberOfFiles": 2,
        "dataStandard": [{"schemaKey": "StandardsType", "name": "NWB"}],
        "approach": [{"schemaKey": "ApproachType", "name": "electrophysiology"}],
        "measurementTechnique": [technique],
        "species": [{"schemaKey": "SpeciesType", "name": "Human"}],
    }
    manifest_url = (
        f"https://api.dandiarchive.org/api/dandisets/{bare_id}"
        "/versions/draft/assets/"
    )

    metadata: Dict[str, Any] = {
        "identifier": full_id,
        "id": f"{full_id}/{version}",
        "name": "Test DOI Improvements Dataset",
        "description": "Testing DOI metadata improvements",
        "contributor": [contact_person],
        "license": [LicenseType("spdx:CC-BY-4.0")],
        "url": f"https://dandiarchive.org/dandiset/{bare_id}/{version}",
        "version": version,
        "citation": "Test_last, Test_first 2026",
        "manifestLocation": [manifest_url],
        "assetsSummary": assets_summary,
    }

    # Publish-related fields can only be generated when a DOI prefix is set.
    if DOI_PREFIX is not None:
        publish_fields = basic_publishmeta(
            INSTANCE_NAME, dandi_id=bare_id, prefix=DOI_PREFIX
        )
        metadata.update(publish_fields)
    return metadata


# =============================================================================
# T001: Optional doi field on Dandiset model for concept DOIs
# =============================================================================


class TestDandisetConceptDoi:
    """T001: The Dandiset model should accept an optional doi field."""

    # Keys removed before building a plain Dandiset (not a PublishedDandiset).
    # "doi" is removed too, so that each test sets or omits it explicitly.
    _PUBLISH_ONLY_KEYS = ("datePublished", "publishedBy", "doi")

    @classmethod
    def _draft_metadata(cls, meta: Dict[str, Any]) -> Dict[str, Any]:
        """Return a new dict holding *meta* minus the publish-only keys.

        The input dict is left untouched; the comprehension builds a fresh
        dict, so no separate copy is needed.
        """
        return {k: v for k, v in meta.items() if k not in cls._PUBLISH_ONLY_KEYS}

    @skipif_no_doi_prefix
    def test_dandiset_accepts_doi_field(
        self, metadata_with_publish: Dict[str, Any]
    ) -> None:
        """Dandiset model should accept a doi field without error."""
        dandi_id_noprefix = metadata_with_publish["identifier"].split(":")[1]
        # Construct a concept DOI (no version suffix)
        concept_doi = f"{DOI_PREFIX}/{INSTANCE_NAME.lower()}.{dandi_id_noprefix}"

        # Build a Dandiset (not PublishedDandiset) with a doi
        dandiset_meta = self._draft_metadata(metadata_with_publish)
        dandiset_meta["doi"] = concept_doi

        # This should not raise — Dandiset should accept an optional doi
        dandiset = Dandiset(**dandiset_meta)
        assert dandiset.doi == concept_doi

    def test_dandiset_doi_is_optional(
        self, metadata_with_publish: Dict[str, Any]
    ) -> None:
        """Dandiset model should work without a doi field (default None)."""
        # No doi field — should still work
        dandiset = Dandiset(**self._draft_metadata(metadata_with_publish))
        assert dandiset.doi is None

    @skipif_no_doi_prefix
    @pytest.mark.parametrize(
        "bad_doi",
        [
            "not-a-doi",
            "10.1234/foo",  # wrong prefix and format
            f"{DOI_PREFIX}/{INSTANCE_NAME.lower()}.000123/0.0.0",  # version DOI, not concept
            "https://doi.org/10.80507/dandi.000123",  # URL, not bare DOI
            "",
        ],
    )
    def test_dandiset_doi_rejects_malformed(
        self, metadata_with_publish: Dict[str, Any], bad_doi: str
    ) -> None:
        """Dandiset.doi should reject values not matching DANDI_CONCEPT_DOI_PATTERN."""
        dandiset_meta = self._draft_metadata(metadata_with_publish)
        dandiset_meta["doi"] = bad_doi
        with pytest.raises(ValidationError):
            Dandiset(**dandiset_meta)


# =============================================================================
# T002: dates field in to_datacite() output
# =============================================================================


class TestDataciteDatesField:
    """T002: the DataCite payload must carry a ``dates`` list with an Issued entry."""

    @staticmethod
    def _attributes(meta: Dict[str, Any]) -> Dict[str, Any]:
        """Convert *meta* with to_datacite() and return its attributes mapping."""
        return to_datacite(meta)["data"]["attributes"]

    @staticmethod
    def _issued_entries(attributes: Dict[str, Any]) -> list:
        """Return the entries of ``dates`` whose dateType is 'Issued'."""
        return [
            entry
            for entry in attributes.get("dates", [])
            if entry.get("dateType") == "Issued"
        ]

    @skipif_no_doi_prefix
    def test_dates_field_present(self, metadata_with_publish: Dict[str, Any]) -> None:
        """The converted attributes include a 'dates' key."""
        attributes = self._attributes(metadata_with_publish)
        assert "dates" in attributes, "dates field missing from DataCite output"

    @skipif_no_doi_prefix
    def test_dates_field_has_issued_type(
        self, metadata_with_publish: Dict[str, Any]
    ) -> None:
        """Exactly one 'dates' entry has dateType 'Issued'."""
        issued = self._issued_entries(self._attributes(metadata_with_publish))
        assert len(issued) == 1, "Expected exactly one Issued date entry"

    @skipif_no_doi_prefix
    def test_dates_field_value_matches_publication_year(
        self, metadata_with_publish: Dict[str, Any]
    ) -> None:
        """The Issued date string contains the reported publicationYear."""
        attributes = self._attributes(metadata_with_publish)
        issued = self._issued_entries(attributes)
        assert len(issued) == 1
        # Only the year is compared: the date string must contain publicationYear.
        year = attributes["publicationYear"]
        assert year in issued[0]["date"]


# =============================================================================
# T003: IsVersionOf/HasVersion relation support in to_datacite()
# =============================================================================


class TestDataciteConceptDoiRelations:
    """T003: to_datacite() links a version to its concept DOI on request."""

    @staticmethod
    def _is_version_of(datacite: Dict[str, Any]) -> list:
        """Return the relatedIdentifiers entries with relationType 'IsVersionOf'."""
        related = datacite["data"]["attributes"].get("relatedIdentifiers", [])
        return [
            entry for entry in related if entry.get("relationType") == "IsVersionOf"
        ]

    @skipif_no_doi_prefix
    def test_is_version_of_relation_added(
        self, metadata_with_publish: Dict[str, Any]
    ) -> None:
        """Passing concept_doi yields one IsVersionOf entry pointing at that DOI."""
        _, bare_id = metadata_with_publish["identifier"].split(":")[:2]
        concept_doi = f"{DOI_PREFIX}/{INSTANCE_NAME.lower()}.{bare_id}"

        matches = self._is_version_of(
            to_datacite(metadata_with_publish, concept_doi=concept_doi)
        )

        assert len(matches) == 1, "Expected one IsVersionOf relation"
        relation = matches[0]
        assert relation["relatedIdentifier"] == concept_doi
        assert relation["relatedIdentifierType"] == "DOI"

    @skipif_no_doi_prefix
    def test_no_concept_doi_no_relation(
        self, metadata_with_publish: Dict[str, Any]
    ) -> None:
        """Without concept_doi the output has no IsVersionOf entry."""
        matches = self._is_version_of(to_datacite(metadata_with_publish))
        assert (
            len(matches) == 0
        ), "No IsVersionOf relation expected without concept_doi"


# =============================================================================
# T004: DANDI identifier in alternateIdentifiers + version in Version property
# =============================================================================


class TestDataciteVersionProperty:
    """The DataCite payload is expected to expose the Dandiset version."""

    @skipif_no_doi_prefix
    def test_version_property_populated(
        self, metadata_with_publish: Dict[str, Any]
    ) -> None:
        """The 'version' attribute exists and equals the input metadata's version."""
        attributes = to_datacite(metadata_with_publish)["data"]["attributes"]
        # "version" is expected to be present already; also pin it to the input.
        assert "version" in attributes
        expected_version = metadata_with_publish["version"]
        assert attributes["version"] == expected_version
15 changes: 15 additions & 0 deletions dandischema/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,13 @@
if _INSTANCE_CONFIG.doi_prefix is not None
else rf"^({_INNER_DANDI_DOI_PATTERN}|)$" # This matches an empty string as well
)
# Concept DOI pattern: like version DOI but without the version suffix,
# i.e. <DOI prefix>/<lowercased ID pattern>.<six digits>
_INNER_CONCEPT_DOI_PATTERN = rf"{DOI_PREFIX_PATTERN}/{ID_PATTERN.lower()}\.\d{{6}}"
# Anchored form of the pattern above. When the instance has no DOI prefix
# configured, the alternative with the empty branch also matches "".
DANDI_CONCEPT_DOI_PATTERN = (
rf"^{_INNER_CONCEPT_DOI_PATTERN}$"
if _INSTANCE_CONFIG.doi_prefix is not None
else rf"^({_INNER_CONCEPT_DOI_PATTERN}|)$"
)
DANDI_PUBID_PATTERN = rf"^{ID_PATTERN}:{VERSION_PATTERN}$"
DANDI_NSKEY = "dandi" # Namespace for DANDI ontology

Expand Down Expand Up @@ -1700,6 +1707,14 @@ def contributor_musthave_contact(
),
]

doi: Optional[str] = Field(
default=None,
title="Concept DOI",
description="A version-independent DOI for the Dandiset as a whole.",
pattern=DANDI_CONCEPT_DOI_PATTERN,

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@candleindark I believe this would be one more case where subclass would overload the constraint, and thus we would have difficulty expressing in linkml right?

I feel that in such cases we just need to define slot without any constraint and in specific class uses of the slots define their specific constraints thus making it all work without warning... can we do that?

NB this is just a note/question to @candleindark , not request to change anything here

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@candleindark I believe this would be one more case where subclass would overload the constraint, and thus we would have difficulty expressing in linkml right?

Yes — not exactly right here, though this change will cause it. With this, we will have difficulty translating the doi slot_usage entry in PublishedDandiset. This is because LinkML has monotonic behavior per https://linkml.io/linkml/schemas/advanced.html#unions-as-ranges, and pattern is a constraint metadata slot in LinkML to which the monotonic behavior applies. (However, we can actually do it currently because of a bug, but I wouldn't count on it.)

@candleindark candleindark May 7, 2026

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel that in such cases we just need to define slot without any constraint and in specific class uses of the slots define their specific constraints thus making it all work without warning... can we do that?

No. In the schema-level slot definitions of our LinkML schema for dandischema, a definition already contains only the common set of properties shared by all uses of the slot. The problem originates from inheritance. Since PublishedDandiset inherits from Dandiset, the doi slot in PublishedDandiset has all the properties of the doi slot in Dandiset, and overriding the value of pattern in the doi slot in PublishedDandiset is not possible per the monotonic behavior.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NB this is just a note/question to @candleindark , not request to change anything here

I agree. This adds one more problem to a category that we already have. I will just have to find a way to solve the whole category of problems rather than any particular one.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doi slot in PublishedDandiset has all the properties of the doi slot in Dandiset, and overriding the value of pattern in the doi slot in PublishedDandiset is not possible per the monotonic behavior.

that's what I am saying -- it will not override but only add it where actually used, i.e. not defined in the slot definition but only in the classes which would use that slot.

anyways -- remind where the issue on this is, we could continue there

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's what I am saying -- it will not override but only add it where actually used, i.e. not defined in the slot definition but only in the classes which would use that slot.

I don't think that will work conceptually. Where a slot is defined doesn't change the slot inheritance behavior in LinkML. A more detailed post is in the issue you are looking for #389 (comment).

However, overriding still works in practice because of the bug.

json_schema_extra={"readOnly": True, "nskey": DANDI_NSKEY},
)

name: str = Field(
title="Dandiset title",
description="A title associated with the Dandiset.",
Expand Down
Loading