diff --git a/tests/test_modality_registry.py b/tests/test_modality_registry.py index 9acaf15..d4a9f9b 100644 --- a/tests/test_modality_registry.py +++ b/tests/test_modality_registry.py @@ -85,3 +85,24 @@ def test_base_py_imports_the_registry_derived_sets(): assert base._FILE_BEARING_CATEGORIES is registry.FILE_BEARING_CATEGORIES assert base._TABULAR_FAMILY_CATEGORIES is registry.TABULAR_FAMILY_CATEGORIES assert base._SELF_SUPERVISED_CATEGORIES is registry.SELF_SUPERVISED_CATEGORIES + + +def test_data_format_valid_and_matches_conventions(): + """Every spec carries a valid DataFormat (P3d), and conventions' + _data_format_for reads it from the registry (single source).""" + from tracebloc_ingestor.cli.conventions import _data_format_for + from tracebloc_ingestor.utils.constants import DataFormat + + valid = set(DataFormat.get_all_formats()) + for category, spec in REGISTRY.items(): + assert ( + spec.data_format in valid + ), f"{category}: bad data_format {spec.data_format!r}" + assert _data_format_for(category) == spec.data_format + + +def test_transfer_present_iff_file_bearing(): + """The sidecar transfer factory is set exactly for file-bearing categories + (P3c invariant).""" + for category, spec in REGISTRY.items(): + assert spec.is_file_bearing == (spec.transfer is not None) diff --git a/tracebloc_ingestor/cli/conventions.py b/tracebloc_ingestor/cli/conventions.py index 3bf7eae..666cc0c 100644 --- a/tracebloc_ingestor/cli/conventions.py +++ b/tracebloc_ingestor/cli/conventions.py @@ -27,42 +27,54 @@ from dataclasses import dataclass, field from typing import Any, Dict, FrozenSet, List, Optional -from ..utils.constants import DataFormat, Intent, TaskCategory - +from ..modalities.registry import spec_for +from ..utils.constants import TaskCategory # --------------------------------------------------------------------------- # Category groupings — used both here and by the entrypoint when deciding # which sidecar paths matter. Single source of truth. # --------------------------------------------------------------------------- -IMAGE_CATEGORIES: FrozenSet[str] = frozenset({ - TaskCategory.IMAGE_CLASSIFICATION, - TaskCategory.OBJECT_DETECTION, - TaskCategory.KEYPOINT_DETECTION, - TaskCategory.SEMANTIC_SEGMENTATION, -}) +IMAGE_CATEGORIES: FrozenSet[str] = frozenset( + { + TaskCategory.IMAGE_CLASSIFICATION, + TaskCategory.OBJECT_DETECTION, + TaskCategory.KEYPOINT_DETECTION, + TaskCategory.SEMANTIC_SEGMENTATION, + } +) -TEXT_CATEGORIES: FrozenSet[str] = frozenset({ - TaskCategory.TEXT_CLASSIFICATION, - TaskCategory.TOKEN_CLASSIFICATION, -}) +TEXT_CATEGORIES: FrozenSet[str] = frozenset( + { + TaskCategory.TEXT_CLASSIFICATION, + TaskCategory.TOKEN_CLASSIFICATION, + } +) -TABULAR_CATEGORIES: FrozenSet[str] = frozenset({ - TaskCategory.TABULAR_CLASSIFICATION, - TaskCategory.TABULAR_REGRESSION, -}) +TABULAR_CATEGORIES: FrozenSet[str] = frozenset( + { + TaskCategory.TABULAR_CLASSIFICATION, + TaskCategory.TABULAR_REGRESSION, + } +) -TIME_SERIES_CATEGORIES: FrozenSet[str] = frozenset({ - TaskCategory.TIME_SERIES_FORECASTING, -}) +TIME_SERIES_CATEGORIES: FrozenSet[str] = frozenset( + { + TaskCategory.TIME_SERIES_FORECASTING, + } +) -TIME_TO_EVENT_CATEGORIES: FrozenSet[str] = frozenset({ - TaskCategory.TIME_TO_EVENT_PREDICTION, -}) +TIME_TO_EVENT_CATEGORIES: FrozenSet[str] = frozenset( + { + TaskCategory.TIME_TO_EVENT_PREDICTION, + } +) -MLM_CATEGORIES: FrozenSet[str] = frozenset({ - TaskCategory.MASKED_LANGUAGE_MODELING, -}) +MLM_CATEGORIES: FrozenSet[str] = frozenset( + { + TaskCategory.MASKED_LANGUAGE_MODELING, + } +) # Categories where the label is a numeric prediction target rather than # class metadata. The schema requires `label.policy` for these so the raw @@ -101,12 +113,18 @@ # framework's own samples round-trip through the documented happy-path # config with zero overrides. Production users with differently-sized data # should set `spec.file_options` in their YAML. - TaskCategory.IMAGE_CLASSIFICATION: {"target_size": [256, 256], "extension": ".jpeg"}, - TaskCategory.SEMANTIC_SEGMENTATION: {"target_size": [512, 512], "extension": ".jpg"}, - TaskCategory.OBJECT_DETECTION: {"target_size": [1920, 1080], "extension": ".jpg"}, + TaskCategory.IMAGE_CLASSIFICATION: { + "target_size": [256, 256], + "extension": ".jpeg", + }, + TaskCategory.SEMANTIC_SEGMENTATION: { + "target_size": [512, 512], + "extension": ".jpg", + }, + TaskCategory.OBJECT_DETECTION: {"target_size": [1920, 1080], "extension": ".jpg"}, # keypoint_detection: no target_size default — the customer's pose model # dictates input resolution, so the schema requires it top-level. - TaskCategory.KEYPOINT_DETECTION: {"extension": ".jpg"}, + TaskCategory.KEYPOINT_DETECTION: {"extension": ".jpg"}, } DEFAULT_TEXT_FILE_OPTIONS: Dict[str, Any] = { @@ -122,6 +140,7 @@ # Resolved configuration — what the entrypoint actually consumes. # --------------------------------------------------------------------------- + @dataclass class ResolvedConfig: """A fully-resolved ingest configuration. @@ -183,6 +202,7 @@ class ResolvedConfig: # Resolver # --------------------------------------------------------------------------- + def resolve(config: Dict[str, Any]) -> ResolvedConfig: """Translate a validated ingest.yaml dict into a :class:`ResolvedConfig`. @@ -302,25 +322,16 @@ def resolve(config: Dict[str, Any]) -> ResolvedConfig: # Helpers # --------------------------------------------------------------------------- + def _data_format_for(category: str) -> str: - """Map ``category`` to the ``DataFormat`` value the framework expects.""" - if category in IMAGE_CATEGORIES: - return DataFormat.IMAGE - if category in TEXT_CATEGORIES: - return DataFormat.TEXT - if ( - category in TABULAR_CATEGORIES - or category in TIME_SERIES_CATEGORIES - or category in TIME_TO_EVENT_CATEGORIES - ): - return DataFormat.TABULAR - if category in MLM_CATEGORIES: - return DataFormat.TEXT - raise ValueError( - f"Unknown category {category!r}; cannot derive data_format. " - "If this is a new category, add it to the relevant CATEGORY set " - "in conventions.py and to the schema enum." - ) + """Map ``category`` to the ``DataFormat`` value the framework expects. + + Reads the single source of truth — the ModalityRegistry — rather than the + per-format frozensets in this module (structural refactor backend#796, + P3d). ``spec_for`` raises ``ValueError`` on an unknown category, same as + the previous ladder did. + """ + return spec_for(category).data_format def _default_file_options_for(category: str) -> Dict[str, Any]: diff --git a/tracebloc_ingestor/modalities/registry.py b/tracebloc_ingestor/modalities/registry.py index f660e86..8b99a84 100644 --- a/tracebloc_ingestor/modalities/registry.py +++ b/tracebloc_ingestor/modalities/registry.py @@ -12,7 +12,7 @@ from typing import Dict -from ..utils.constants import TaskCategory +from ..utils.constants import DataFormat, TaskCategory from . import transfer as t from . import validators as v from .spec import ModalitySpec @@ -20,7 +20,8 @@ # One entry per supported category — the single source of truth. P3a: the three # behavior flags (was three frozensets in ingestors/base.py). P3b: the # validator factory (was the map_validators if/elif arm). P3c: the sidecar -# transfer factory (was the map_file_transfer if/elif arm). +# transfer factory (was the map_file_transfer if/elif arm). P3d: data_format +# (was the 6-frozenset ladder in conventions._data_format_for). _SPECS = ( # File-bearing categories (per-row sidecar files under SRC_PATH). ModalitySpec( @@ -28,6 +29,7 @@ is_file_bearing=True, is_tabular_family=False, is_self_supervised=False, + data_format=DataFormat.IMAGE, build_validators=v.image_classification, transfer=t.image_classification, ), @@ -36,6 +38,7 @@ is_file_bearing=True, is_tabular_family=False, is_self_supervised=False, + data_format=DataFormat.IMAGE, build_validators=v.object_detection, transfer=t.object_detection, ), @@ -44,6 +47,7 @@ is_file_bearing=True, is_tabular_family=False, is_self_supervised=False, + data_format=DataFormat.IMAGE, build_validators=v.keypoint_detection, transfer=t.keypoint_detection, ), @@ -52,6 +56,7 @@ is_file_bearing=True, is_tabular_family=False, is_self_supervised=False, + data_format=DataFormat.IMAGE, build_validators=v.semantic_segmentation, transfer=t.semantic_segmentation, ), @@ -60,6 +65,7 @@ is_file_bearing=True, is_tabular_family=False, is_self_supervised=False, + data_format=DataFormat.TEXT, build_validators=v.text_classification, transfer=t.text_classification, ), @@ -68,6 +74,7 @@ is_file_bearing=True, is_tabular_family=False, is_self_supervised=False, + data_format=DataFormat.TEXT, build_validators=v.token_classification, transfer=t.token_classification, ), @@ -77,6 +84,7 @@ is_file_bearing=True, is_tabular_family=False, is_self_supervised=True, + data_format=DataFormat.TEXT, build_validators=v.masked_language_modeling, transfer=t.masked_language_modeling, ), @@ -86,6 +94,7 @@ is_file_bearing=False, is_tabular_family=True, is_self_supervised=False, + data_format=DataFormat.TABULAR, build_validators=v.tabular_classification, ), ModalitySpec( @@ -93,6 +102,7 @@ is_file_bearing=False, is_tabular_family=True, is_self_supervised=False, + data_format=DataFormat.TABULAR, build_validators=v.tabular_regression, ), ModalitySpec( @@ -100,6 +110,7 @@ is_file_bearing=False, is_tabular_family=True, is_self_supervised=False, + data_format=DataFormat.TABULAR, build_validators=v.time_series_forecasting, ), ModalitySpec( @@ -107,6 +118,7 @@ is_file_bearing=False, is_tabular_family=True, is_self_supervised=False, + data_format=DataFormat.TABULAR, build_validators=v.time_to_event_prediction, ), ) diff --git a/tracebloc_ingestor/modalities/spec.py b/tracebloc_ingestor/modalities/spec.py index ec31ce7..e86ef68 100644 --- a/tracebloc_ingestor/modalities/spec.py +++ b/tracebloc_ingestor/modalities/spec.py @@ -34,6 +34,9 @@ class ModalitySpec: Attributes: category: the ``TaskCategory`` value this spec describes. + data_format: the ``DataFormat`` the framework expects for this category + (P3d). Read by ``conventions._data_format_for`` (was a 6-frozenset + ladder there). build_validators: ``(file_options) -> [validators]`` — the validator set this category runs (P3b). Replaces the corresponding ``map_validators`` if/elif arm; the factory bodies live in @@ -61,6 +64,7 @@ class ModalitySpec: is_file_bearing: bool is_tabular_family: bool is_self_supervised: bool + data_format: str build_validators: Callable[[Dict[str, Any]], List] transfer: Optional[ Callable[[Dict[str, Any], Dict[str, Any]], Optional[Dict[str, Any]]]