Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 111 additions & 7 deletions dandischema/datacite/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,29 @@
from copy import deepcopy
from functools import lru_cache
import json
import logging
from pathlib import Path
import re
from typing import Any, Dict, Union
from typing import Any, Dict, List, Optional, Tuple, Union
from warnings import warn

from jsonschema import Draft7Validator
from pydantic import ValidationError
import requests

from ..models import NAME_PATTERN, Organization, Person, PublishedDandiset, RoleType
from ..models import (
NAME_PATTERN,
Dandiset,
LicenseType,
Organization,
Person,
PublishedDandiset,
RelationType,
Resource,
RoleType,
)

logger = logging.getLogger(__name__)

DATACITE_CONTRTYPE = {
"ContactPerson",
Expand Down Expand Up @@ -65,17 +80,102 @@
DATACITE_MAP = {el.lower(): el for el in DATACITE_IDENTYPE}


def construct_unvalidated_dandiset(meta_dict: dict) -> Dandiset:
    """
    Build a ``Dandiset`` from a plain dictionary, bypassing model validation.

    Nested values are converted before the final object is assembled:
    license entries become ``LicenseType``, each contributor's ``roleName``
    entries become ``RoleType``, and each related resource's ``relation``
    becomes ``RelationType``. Contributors and related resources are
    themselves created with ``model_construct``.

    The input is deep-copied first, so the caller's dictionary is not
    modified. Contributors whose ``schemaKey`` is neither ``"Person"`` nor
    ``"Organization"`` are left out of the result.
    """
    data = deepcopy(meta_dict)

    licenses = data.get("license")
    if licenses:
        data["license"] = [LicenseType(entry) for entry in licenses]

    raw_contributors = data.get("contributor")
    if raw_contributors:
        contributors: List[Union[Person, Organization]] = []
        for contributor in raw_contributors:
            roles = contributor.get("roleName")
            if roles:
                contributor["roleName"] = [RoleType(role) for role in roles]

            # Only the two known contributor kinds are turned into models.
            kind = contributor.get("schemaKey")
            if kind == "Person":
                contributors.append(Person.model_construct(**contributor))
            elif kind == "Organization":
                contributors.append(Organization.model_construct(**contributor))
        data["contributor"] = contributors

    raw_resources = data.get("relatedResource")
    if raw_resources:
        resources = []
        for resource in raw_resources:
            # Converted whenever the key is present, even if its value is falsy.
            if "relation" in resource:
                resource["relation"] = RelationType(resource["relation"])
            resources.append(Resource.model_construct(**resource))
        data["relatedResource"] = resources

    return Dandiset.model_construct(**data)


def to_datacite(
meta: Union[dict, PublishedDandiset],
validate: bool = False,
publish: bool = False,
Comment thread
asmacdo marked this conversation as resolved.
*,
event: Optional[str] = None,
) -> dict:
"""Convert published Dandiset metadata to Datacite"""
if not isinstance(meta, PublishedDandiset):
meta = PublishedDandiset(**meta)
"""
Convert Dandiset metadata to DataCite payload.

This function tries to validate the metadata against PublishedDandiset model.
If strict validation fails, it falls back to using construct_unvalidated_dandiset()
to build the model without validation but with properly handled nested types.
"""
if isinstance(meta, dict):
meta = deepcopy(meta)
try:
meta = PublishedDandiset(**meta)
except ValidationError:
# mypy can't track that meta is still dict after failed PublishedDandiset(**meta)
assert isinstance(meta, dict)
if meta.get("version") == "draft":

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the version is a draft, it won't have fields like datePublished. However, this can also happen when we are creating a Dandiset DOI from a published version: in that case the metadata is the published version, but the doi and url fields won't pass validation (neither of them includes the version).

Previously I modified PublishedDandiset to accept either format for url and doi, but I don't think that really makes sense: those values aren't valid for a published dandiset, and we wouldn't want our output schema to reflect flexibility that's not really there.

logger.debug("Falling back to unvalidated dandiset for draft version")
else:
logger.warning(

Check warning on line 153 in dandischema/datacite/__init__.py

View check run for this annotation

Codecov / codecov/patch

dandischema/datacite/__init__.py#L153

Added line #L153 was not covered by tests
"Validation failed for %s, using construct_unvalidated_dandiset()",
meta.get("id", "unknown"),
)
meta = construct_unvalidated_dandiset(meta) # type: ignore[assignment]

# At this point, meta is always a model object (PublishedDandiset or Dandiset)
assert isinstance(meta, (PublishedDandiset, Dandiset))
attributes: Dict[str, Any] = {}
if publish:

if event is not None and publish:
raise ValueError(
"Cannot use both 'event' and deprecated 'publish'. Use only 'event'."
)

# If there is no attributes["event"] a Draft DOI is minted
if event is not None:
if event not in {"publish", "hide"}:
raise ValueError("Invalid event value: must be 'publish' or 'hide'")
attributes["event"] = event
elif publish:
warn(
"'publish' is deprecated; use 'event=\"publish\"' instead",
DeprecationWarning,
stacklevel=2,
)
attributes["event"] = "publish"

attributes["alternateIdentifiers"] = [
Expand Down Expand Up @@ -103,7 +203,11 @@
"publisherIdentifierScheme": "RRID",
"lang": "en",
}
attributes["publicationYear"] = str(meta.datePublished.year)

# publicationYear is not available for draft dandisets
if hasattr(meta, "datePublished") and meta.datePublished:
attributes["publicationYear"] = str(meta.datePublished.year)

# not sure about it dandi-api had "resourceTypeGeneral": "NWB"
attributes["types"] = {
"resourceType": "Neural Data",
Expand Down
146 changes: 138 additions & 8 deletions dandischema/datacite/tests/test_datacite.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from datetime import datetime
import json
import os
from pathlib import Path
import random
from typing import Any, Dict, Tuple
from typing import Any, Dict, Optional, Tuple

from jsonschema import Draft7Validator
import pytest
Expand Down Expand Up @@ -57,6 +58,33 @@
return _get_datacite_schema()


@pytest.fixture(scope="function")
def metadata_draft() -> Dict[str, Any]:
    """Return draft dandiset metadata that lacks the published-only fields.

    The identifier is randomized per test. ``datePublished`` and
    ``publishedBy`` are intentionally absent, and ``url``/``doi`` carry no
    version component.
    """
    numeric_id = f"000{random.randrange(100, 999)}"
    prefixed_id = f"DANDI:{numeric_id}"

    contact_person = {
        "name": "A_last, A_first",
        "email": "nemo@example.com",
        "roleName": [RoleType("dcite:ContactPerson")],
        "schemaKey": "Person",
    }

    return {
        "identifier": prefixed_id,
        "id": f"{prefixed_id}/draft",
        "name": "testing draft dataset",
        "description": "testing draft",
        "contributor": [contact_person],
        "license": [LicenseType("spdx:CC-BY-4.0")],
        # Dandiset landing page URL, not a version-specific URL
        "url": f"https://dandiarchive.org/dandiset/{numeric_id}",
        "doi": f"10.80507/dandi.{numeric_id}",
        "version": "draft",
    }


@pytest.fixture(scope="function")
def metadata_basic() -> Dict[str, Any]:
dandi_id_noprefix = f"000{random.randrange(100, 999)}"
Expand Down Expand Up @@ -414,10 +442,10 @@
metadata_basic.update(_basic_publishmeta(dandi_id=dandi_id_noprefix))

# creating and validating datacite objects
datacite = to_datacite(metadata_basic, publish=True, validate=True)
with pytest.warns(DeprecationWarning, match="'publish' is deprecated"):
datacite = to_datacite(metadata_basic, publish=True, validate=True)

assert datacite == {
# 'data': {}
expected = {
"data": {
"id": f"10.80507/dandi.{dandi_id_noprefix}/{version}",
"type": "dois",
Expand Down Expand Up @@ -463,7 +491,7 @@
"alternateIdentifierType": "URL",
},
],
"publicationYear": "1970",
"publicationYear": str(datetime.now().year),
"publisher": {
"name": "DANDI Archive",
"publisherIdentifier": "https://scicrunch.org/resolver/RRID:SCR_017571",
Expand All @@ -489,6 +517,7 @@
},
}
}
assert datacite == expected


@pytest.mark.parametrize(
Expand Down Expand Up @@ -526,9 +555,6 @@
),
],
)
@pytest.mark.skipif(
not os.getenv("DATACITE_DEV_PASSWORD"), reason="no datacite password available"
)
def test_datacite_related_res_url(
metadata_basic: Dict[str, Any],
related_res_url: Dict[str, Any],
Expand All @@ -549,3 +575,107 @@
relIdent = datacite["data"]["attributes"]["relatedIdentifiers"][0]
assert relIdent["relatedIdentifier"] == related_ident_exp[0].lower()
assert relIdent["relatedIdentifierType"] == related_ident_exp[1]


@pytest.mark.parametrize(
    "event_param, expected_event_in_output",
    [
        (None, None),  # explicit event=None: no "event" attribute in output
        ("publish", "publish"),  # event="publish" is passed through
        ("hide", "hide"),  # event="hide" is passed through
        ("no_param", None),  # sentinel: call to_datacite() without `event`
    ],
)
def test_event_parameter(
    metadata_basic: Dict[str, Any],
    event_param: Optional[str],
    expected_event_in_output: Optional[str],
) -> None:
    """Check how ``to_datacite`` reflects the ``event`` argument in its output.

    ``event_param`` is ``Optional[str]`` because the parametrization passes
    ``None`` as well as strings. The string ``"no_param"`` is a sentinel
    meaning the ``event`` keyword is omitted from the call entirely.
    """
    dandi_id_noprefix = metadata_basic["identifier"].split(":")[1]
    metadata_basic.update(_basic_publishmeta(dandi_id=dandi_id_noprefix))

    if event_param == "no_param":
        # Exercise the default value of the keyword.
        datacite = to_datacite(metadata_basic)
    else:
        datacite = to_datacite(metadata_basic, event=event_param)

    attributes = datacite["data"]["attributes"]
    if expected_event_in_output is None:
        assert "event" not in attributes
    else:
        assert attributes["event"] == expected_event_in_output

def test_invalid_event(metadata_basic: Dict[str, Any]) -> None:
    """An ``event`` value that is not accepted must raise ``ValueError``."""
    dandi_id_noprefix = metadata_basic["identifier"].split(":")[1]
    publish_fields = _basic_publishmeta(dandi_id=dandi_id_noprefix)
    metadata_basic.update(publish_fields)

    with pytest.raises(ValueError, match="Invalid event value"):
        to_datacite(metadata_basic, event="invalid")


def test_event_and_publish_conflict(metadata_basic: Dict[str, Any]) -> None:
    """Passing ``event`` together with ``publish=True`` must raise ``ValueError``."""
    dandi_id_noprefix = metadata_basic["identifier"].split(":")[1]
    publish_fields = _basic_publishmeta(dandi_id=dandi_id_noprefix)
    metadata_basic.update(publish_fields)

    expected_message = "Cannot use both 'event' and deprecated 'publish'"
    with pytest.raises(ValueError, match=expected_message):
        to_datacite(metadata_basic, event="publish", publish=True)


def test_deprecated_publish_parameter(metadata_basic: Dict[str, Any]) -> None:
    """``publish=True`` still sets the publish event but emits a deprecation warning."""
    dandi_id_noprefix = metadata_basic["identifier"].split(":")[1]
    publish_fields = _basic_publishmeta(dandi_id=dandi_id_noprefix)
    metadata_basic.update(publish_fields)

    with pytest.warns(DeprecationWarning, match="'publish' is deprecated"):
        datacite = to_datacite(metadata_basic, publish=True)

    # The deprecated flag must produce the same event as event="publish".
    attributes = datacite["data"]["attributes"]
    assert attributes["event"] == "publish"


def test_draft_dandiset_unvalidated_fallback(metadata_draft: Dict[str, Any]) -> None:
    """Draft metadata must convert without raising and yield a sane payload."""
    payload = to_datacite(metadata_draft)
    data = payload["data"]

    # Top-level structure
    assert data["type"] == "dois"
    assert data["id"] == metadata_draft["doi"]

    # Attributes copied over from the draft metadata
    attributes = data["attributes"]
    assert attributes["doi"] == metadata_draft["doi"]
    assert attributes["version"] == "draft"
    assert attributes["titles"][0]["title"] == metadata_draft["name"]
    assert (
        attributes["descriptions"][0]["description"] == metadata_draft["description"]
    )

    # The contributor field must produce both creators and contributors
    assert len(attributes["creators"]) > 0
    assert len(attributes["contributors"]) > 0

    # The draft fixture has no datePublished, so no publicationYear is expected
    assert "publicationYear" not in attributes


@pytest.mark.skipif(
    not os.getenv("DATACITE_DEV_PASSWORD"), reason="no datacite password available"
)
def test_draft_dandiset_datacite_api(metadata_draft: Dict[str, Any]) -> None:
    """Post a payload built from draft metadata via ``datacite_post``.

    Skipped unless ``DATACITE_DEV_PASSWORD`` is set in the environment.
    """
    payload = to_datacite(metadata_draft)
    doi = metadata_draft["doi"]

    # Hand the generated payload to the DataCite posting helper
    datacite_post(payload, doi)

Check warning on line 681 in dandischema/datacite/tests/test_datacite.py

View check run for this annotation

Codecov / codecov/patch

dandischema/datacite/tests/test_datacite.py#L681

Added line #L681 was not covered by tests
2 changes: 1 addition & 1 deletion dandischema/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def _basic_publishmeta(
Returned fields are additional to fields required by Dandiset
"""
publish_meta = {
"datePublished": str(datetime.now().year),
"datePublished": datetime.now(),
"publishedBy": {
"id": "urn:uuid:08fffc59-9f1b-44d6-8e02-6729d266d1b6",
"name": "DANDI publish",
Expand Down
Loading