tracebloc · divyasinghds · Jun 15, 2026 · Jun 15, 2026
diff --git a/tests/test_modality_registry.py b/tests/test_modality_registry.py
@@ -0,0 +1,87 @@
+"""Tests for the ModalityRegistry (structural refactor — backend#796, P3a).
+
+The registry is the single source of truth for per-category behavior. These
+pin the invariant that makes a half-wired modality (the instance_segmentation
+zombie, #240/#99) unrepresentable: every category the engine/schema knows has
+a spec, and the registry-derived category sets match the spec flags.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from tracebloc_ingestor.modalities import (
+    FILE_BEARING_CATEGORIES,
+    REGISTRY,
+    SELF_SUPERVISED_CATEGORIES,
+    TABULAR_FAMILY_CATEGORIES,
+    registry,
+    spec_for,
+)
+from tracebloc_ingestor.utils.constants import TaskCategory
+
+SCHEMA_PATH = (
+    Path(__file__).resolve().parent.parent
+    / "tracebloc_ingestor"
+    / "schema"
+    / "ingest.v1.json"
+)
+SCHEMA_CATEGORIES = set(
+    json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))["properties"]["category"][
+        "enum"
+    ]
+)
+
+
+def test_registry_covers_taskcategory_and_schema_enum_exactly():
+    """The registry must cover exactly the engine's categories and the
+    customer-facing schema enum — no extras, no gaps. A category in the schema
+    without a spec is the half-wired-zombie failure mode."""
+    engine = set(TaskCategory.get_all_categories())
+    assert set(REGISTRY) == engine == SCHEMA_CATEGORIES
+
+
+def test_derived_sets_match_spec_flags():
+    assert FILE_BEARING_CATEGORIES == {
+        c for c, s in REGISTRY.items() if s.is_file_bearing
+    }
+    assert TABULAR_FAMILY_CATEGORIES == {
+        c for c, s in REGISTRY.items() if s.is_tabular_family
+    }
+    assert SELF_SUPERVISED_CATEGORIES == {
+        c for c, s in REGISTRY.items() if s.is_self_supervised
+    }
+
+
+def test_spec_for_raises_on_unknown_category():
+    with pytest.raises(ValueError, match="No ModalitySpec"):
+        spec_for("totally_not_a_category")
+
+
+def test_known_flag_values():
+    # Lock the data that used to live in the three base.py frozensets.
+    mlm = spec_for(TaskCategory.MASKED_LANGUAGE_MODELING)
+    assert mlm.is_file_bearing and mlm.is_self_supervised and not mlm.is_tabular_family
+
+    tab = spec_for(TaskCategory.TABULAR_CLASSIFICATION)
+    assert (
+        tab.is_tabular_family and not tab.is_file_bearing and not tab.is_self_supervised
+    )
+
+    img = spec_for(TaskCategory.IMAGE_CLASSIFICATION)
+    assert (
+        img.is_file_bearing and not img.is_tabular_family and not img.is_self_supervised
+    )
+
+
+def test_base_py_imports_the_registry_derived_sets():
+    """base.py must consume the registry's sets (not redefine its own), so the
+    flags can't drift from the single source."""
+    from tracebloc_ingestor.ingestors import base
+
+    assert base._FILE_BEARING_CATEGORIES is registry.FILE_BEARING_CATEGORIES
+    assert base._TABULAR_FAMILY_CATEGORIES is registry.TABULAR_FAMILY_CATEGORIES
+    assert base._SELF_SUPERVISED_CATEGORIES is registry.SELF_SUPERVISED_CATEGORIES
diff --git a/tracebloc_ingestor/ingestors/base.py b/tracebloc_ingestor/ingestors/base.py
@@ -27,65 +27,29 @@
 from ..file_transfer import map_file_transfer
 from ..reporting import ConsoleRenderer
 
+# Per-category behavior flags now live in the ModalityRegistry (the single
+# source of truth — backend#796, P3a), derived from one ModalitySpec per
+# category instead of three hand-maintained frozensets here. Imported under
+# the previous names so the ``category in <set>`` checks below are unchanged
+# (and their None/unknown -> False semantics are preserved).
+from ..modalities.registry import (
+    FILE_BEARING_CATEGORIES as _FILE_BEARING_CATEGORIES,
+    SELF_SUPERVISED_CATEGORIES as _SELF_SUPERVISED_CATEGORIES,
+    TABULAR_FAMILY_CATEGORIES as _TABULAR_FAMILY_CATEGORIES,
+)
+
 # Logger for this module. Level is set by `setup_logging()` on the root
 # logger when the user script calls it; child loggers inherit that level.
 logger = logging.getLogger(__name__)
 
 __all__ = ["BaseIngestor", "IngestionSummary"]
 
 
-# Tabular-family categories carry `number_of_columns` in file_options;
-# image / text categories do not (a schema may still be supplied — e.g.
-# keypoint_detection's "Visibility" column — but a column count there
-# would be a misleading metric).
-_TABULAR_FAMILY_CATEGORIES = frozenset({
-    TaskCategory.TABULAR_CLASSIFICATION,
-    TaskCategory.TABULAR_REGRESSION,
-    TaskCategory.TIME_SERIES_FORECASTING,
-    TaskCategory.TIME_TO_EVENT_PREDICTION,
-})
-
-# File-bearing categories: every record references sidecar files (images,
-# annotations, masks, texts, sequences) that live under ``config.SRC_PATH``
-# and must be copied to DEST_PATH by ``map_file_transfer``. This single set
-# drives BOTH uses:
-#   1. the SRC_PATH preflight in ``validate_data`` — if SRC_PATH is
-#      empty/unset/missing, every file lookup silently falls through to a
-#      relative path and the user sees N copies of "Source image not
-#      found: images/x.jpg", blaming the data when the real cause is
-#      "SRC_PATH was never staged on the PVC";
-#   2. the per-record ``map_file_transfer`` gate in ``_ingest_with_lock``.
-# Keeping these two on one set is deliberate: when they were separate
-# lists, instance_segmentation sat in one but not the other and shipped
-# half-wired — rows reached MySQL and the backend API with zero files
-# staged (the silent-half-ingest pattern from #99).
-# ``tests/test_category_congruence.py`` anchors this set to the schema
-# enum's file-bearing categories.
-# Tabular / time-series have no sidecar dirs under SRC_PATH, so they're
-# excluded (the CSV path itself is checked elsewhere).
-_FILE_BEARING_CATEGORIES = frozenset({
-    TaskCategory.IMAGE_CLASSIFICATION,
-    TaskCategory.OBJECT_DETECTION,
-    TaskCategory.KEYPOINT_DETECTION,
-    TaskCategory.SEMANTIC_SEGMENTATION,
-    TaskCategory.TEXT_CLASSIFICATION,
-    TaskCategory.TOKEN_CLASSIFICATION,
-    TaskCategory.MASKED_LANGUAGE_MODELING,
-})
-
-
-# Self-supervised categories have no `label` column — the CSV manifest just
-# points at sidecar files and the model creates its own targets at training
-# time (e.g. masked_language_modeling masks tokens on-the-fly). The backend
-# correspondingly stores no edge-label metadata for these datasets, so the
-# `send_generate_edge_label_meta` call is a no-op at best and a misleading
-# HTTP 400 ("No data found for table X") at worst — see issue #213.
-# The schema (schema/ingest.v1.json) now rejects `label:` on these categories
-# at submission time; this set + the gate below are the defensive in-ingestor
-# half (script-driven runs that bypass the schema still skip the wasted call).
-_SELF_SUPERVISED_CATEGORIES = frozenset({
-    TaskCategory.MASKED_LANGUAGE_MODELING,
-})
+# NOTE: _TABULAR_FAMILY_CATEGORIES, _FILE_BEARING_CATEGORIES and
+# _SELF_SUPERVISED_CATEGORIES are imported above from modalities.registry
+# (derived from the per-category ModalitySpec flags — backend#796 P3a). The
+# ``self.category in <set>`` checks throughout this module use them unchanged;
+# the rationale for each set now lives on ModalitySpec's field docstrings.
 
 
 class IngestionSummary(NamedTuple):

diff --git a/tracebloc_ingestor/modalities/__init__.py b/tracebloc_ingestor/modalities/__init__.py
@@ -0,0 +1,20 @@
+"""Per-modality specification registry — the single source of truth for each
+task category's ingestion behavior (structural refactor — backend#796, P3)."""
+
+from .registry import (
+    FILE_BEARING_CATEGORIES,
+    REGISTRY,
+    SELF_SUPERVISED_CATEGORIES,
+    TABULAR_FAMILY_CATEGORIES,
+    spec_for,
+)
+from .spec import ModalitySpec
+
+__all__ = [
+    "ModalitySpec",
+    "REGISTRY",
+    "spec_for",
+    "FILE_BEARING_CATEGORIES",
+    "TABULAR_FAMILY_CATEGORIES",
+    "SELF_SUPERVISED_CATEGORIES",
+]
diff --git a/tracebloc_ingestor/modalities/registry.py b/tracebloc_ingestor/modalities/registry.py
@@ -0,0 +1,125 @@
+"""ModalityRegistry — one ModalitySpec per task category.
+
+The single place a category's behavior is declared (built up over P3a–P3d;
+see modalities/spec.py). ``tests/test_modality_registry.py`` and
+``tests/test_category_congruence.py`` pin the registry to the schema enum so a
+category the customer can submit always has a complete spec — making the
+``instance_segmentation`` half-wired-zombie class of bug (#240 / #99)
+unrepresentable.
+"""
+
+from __future__ import annotations
+
+from typing import Dict
+
+from ..utils.constants import TaskCategory
+from .spec import ModalitySpec
+
+# One entry per supported category. P3a populates the three behavior flags
+# that were hand-maintained frozensets in ingestors/base.py.
+_SPECS = (
+    # File-bearing categories (per-row sidecar files under SRC_PATH).
+    ModalitySpec(
+        TaskCategory.IMAGE_CLASSIFICATION,
+        is_file_bearing=True,
+        is_tabular_family=False,
+        is_self_supervised=False,
+    ),
+    ModalitySpec(
+        TaskCategory.OBJECT_DETECTION,
+        is_file_bearing=True,
+        is_tabular_family=False,
+        is_self_supervised=False,
+    ),
+    ModalitySpec(
+        TaskCategory.KEYPOINT_DETECTION,
+        is_file_bearing=True,
+        is_tabular_family=False,
+        is_self_supervised=False,
+    ),
+    ModalitySpec(
+        TaskCategory.SEMANTIC_SEGMENTATION,
+        is_file_bearing=True,
+        is_tabular_family=False,
+        is_self_supervised=False,
+    ),
+    ModalitySpec(
+        TaskCategory.TEXT_CLASSIFICATION,
+        is_file_bearing=True,
+        is_tabular_family=False,
+        is_self_supervised=False,
+    ),
+    ModalitySpec(
+        TaskCategory.TOKEN_CLASSIFICATION,
+        is_file_bearing=True,
+        is_tabular_family=False,
+        is_self_supervised=False,
+    ),
+    # masked_language_modeling is file-bearing AND self-supervised (no label).
+    ModalitySpec(
+        TaskCategory.MASKED_LANGUAGE_MODELING,
+        is_file_bearing=True,
+        is_tabular_family=False,
+        is_self_supervised=True,
+    ),
+    # Tabular family (structured feature tables; no sidecar files).
+    ModalitySpec(
+        TaskCategory.TABULAR_CLASSIFICATION,
+        is_file_bearing=False,
+        is_tabular_family=True,
+        is_self_supervised=False,
+    ),
+    ModalitySpec(
+        TaskCategory.TABULAR_REGRESSION,
+        is_file_bearing=False,
+        is_tabular_family=True,
+        is_self_supervised=False,
+    ),
+    ModalitySpec(
+        TaskCategory.TIME_SERIES_FORECASTING,
+        is_file_bearing=False,
+        is_tabular_family=True,
+        is_self_supervised=False,
+    ),
+    ModalitySpec(
+        TaskCategory.TIME_TO_EVENT_PREDICTION,
+        is_file_bearing=False,
+        is_tabular_family=True,
+        is_self_supervised=False,
+    ),
+)
+
+REGISTRY: Dict[str, ModalitySpec] = {spec.category: spec for spec in _SPECS}
+
+
+def spec_for(category: str) -> ModalitySpec:
+    """Return the spec for ``category``, raising on an unknown one.
+
+    Strict by design: a category that reaches the engine without a spec is a
+    wiring bug, not something to silently no-op (that was the
+    ``instance_segmentation`` failure mode). Call sites that must tolerate
+    ``None`` / non-categories use the derived ``*_CATEGORIES`` frozensets
+    below, whose ``in`` test returns False — preserving the prior
+    ``category in frozenset`` semantics exactly.
+    """
+    try:
+        return REGISTRY[category]
+    except KeyError:
+        raise ValueError(
+            f"No ModalitySpec registered for category {category!r}. Add one in "
+            f"modalities/registry.py or remove the category from the schema enum."
+        )
+
+
+# Registry-derived category sets. These REPLACE the three hand-maintained
+# frozensets that lived in ingestors/base.py: derived from the specs so they
+# can no longer drift from the single source. ``base.py`` imports these under
+# its previous names, so its ``category in <set>`` checks (and their
+# None/unknown -> False semantics) are unchanged.
+FILE_BEARING_CATEGORIES = frozenset(c for c, s in REGISTRY.items() if s.is_file_bearing)
+TABULAR_FAMILY_CATEGORIES = frozenset(
+    c for c, s in REGISTRY.items() if s.is_tabular_family
+)
+SELF_SUPERVISED_CATEGORIES = frozenset(
+    c for c, s in REGISTRY.items() if s.is_self_supervised
+)
diff --git a/tracebloc_ingestor/modalities/spec.py b/tracebloc_ingestor/modalities/spec.py
@@ -0,0 +1,52 @@
+"""ModalitySpec — the single source of truth for one task category's
+ingestion behavior.
+
+Today "what is modality X" is spread across separate per-category dispatch
+sites (validator mapping, file-transfer dispatch, three frozensets in
+ingestors/base.py, the conventions groupings, the schema enum). They are kept
+consistent only by ``tests/test_category_congruence.py`` — consistency by
+test, not by construction. The registry (modalities/registry.py) makes one
+``ModalitySpec`` per category the source the call sites are *derived from*, so
+they can't drift.
+
+This is delivered in slices (structural refactor — backend#796, phase P3):
+
+- **P3a (this slice):** the three per-category FLAGS that were frozensets in
+  ingestors/base.py.
+- P3b: the validator factory (replaces utils/validators_mapping.py).
+- P3c: the sidecar transfer plan (replaces file_transfer.map_file_transfer;
+  folds in the semseg mask sidecar / #136).
+- P3d: the conventions defaults (data_format, default file/csv options,
+  regression-class flag).
+
+So the spec is intentionally small here and grows over P3b–P3d.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class ModalitySpec:
+    """Per-category behavior flags (P3a). More fields land in P3b–P3d.
+
+    Attributes:
+        category: the ``TaskCategory`` value this spec describes.
+        is_file_bearing: every record references sidecar files (images,
+            annotations, masks, texts, sequences) under ``SRC_PATH`` that must
+            be copied to ``DEST_PATH``. Drives both the SRC_PATH preflight and
+            the per-record file-transfer gate. Tabular / time-series are False.
+        is_tabular_family: a structured-feature table — inject
+            ``number_of_columns`` into the backend metadata. Image / text
+            categories are False (a column count there would mislead).
+        is_self_supervised: no ``label`` column; the model creates its own
+            targets at train time (e.g. masked language modeling). The backend
+            stores no edge-label metadata, so the edge-label call is skipped
+            (#213).
+    """
+
+    category: str
+    is_file_bearing: bool
+    is_tabular_family: bool
+    is_self_supervised: bool