From 3152f2f70124caa1fbe950b27d3b11f225da17bd Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Mon, 15 Jun 2026 10:41:58 +0200 Subject: [PATCH] refactor(P3a): move per-category behavior flags into a ModalityRegistry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First slice of P3 (backend#796): introduce tracebloc_ingestor/modalities/ — one ModalitySpec per task category as the single source of truth — and migrate the three hand-maintained frozensets in ingestors/base.py (_TABULAR_FAMILY_CATEGORIES, _FILE_BEARING_CATEGORIES, _SELF_SUPERVISED_CATEGORIES) onto it. - modalities/spec.py: ModalitySpec dataclass (the 3 behavior flags; grows in P3b-P3d with the validator factory, sidecar transfer plan, conventions defaults). - modalities/registry.py: REGISTRY (one spec per category) + spec_for() + the three category sets DERIVED from the specs, so they can't drift. - base.py: deletes the 3 frozenset literals and imports the derived sets under their previous names — every `category in ` check (and its None/unknown -> False semantics) is unchanged. Behaviour-preserving: the derived sets equal the old frozensets (7 file-bearing / 4 tabular / 1 self-supervised). Full suite 1033 passed, 97.3% coverage, modalities/ 100%. tests/test_modality_registry.py pins the registry <-> TaskCategory <-> schema-enum invariant (making the instance_segmentation half-wired-zombie class, #240/#99, unrepresentable); the existing test_category_congruence + base tests pass unchanged via the aliased imports. P3b (validators) and P3c (transfer plan; folds in semseg / #136) follow. Co-Authored-By: Claude Fable 5 --- tests/test_modality_registry.py | 87 +++++++++++++++ tracebloc_ingestor/ingestors/base.py | 68 +++--------- tracebloc_ingestor/modalities/__init__.py | 20 ++++ tracebloc_ingestor/modalities/registry.py | 125 ++++++++++++++++++++++ tracebloc_ingestor/modalities/spec.py | 52 +++++++++ 5 files changed, 300 insertions(+), 52 deletions(-) create mode 100644 tests/test_modality_registry.py create mode 100644 tracebloc_ingestor/modalities/__init__.py create mode 100644 tracebloc_ingestor/modalities/registry.py create mode 100644 tracebloc_ingestor/modalities/spec.py diff --git a/tests/test_modality_registry.py b/tests/test_modality_registry.py new file mode 100644 index 0000000..9acaf15 --- /dev/null +++ b/tests/test_modality_registry.py @@ -0,0 +1,87 @@ +"""Tests for the ModalityRegistry (structural refactor — backend#796, P3a). + +The registry is the single source of truth for per-category behavior. These +pin the invariant that makes a half-wired modality (the instance_segmentation +zombie, #240/#99) unrepresentable: every category the engine/schema knows has +a spec, and the registry-derived category sets match the spec flags. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from tracebloc_ingestor.modalities import ( + FILE_BEARING_CATEGORIES, + REGISTRY, + SELF_SUPERVISED_CATEGORIES, + TABULAR_FAMILY_CATEGORIES, + registry, + spec_for, +) +from tracebloc_ingestor.utils.constants import TaskCategory + +SCHEMA_PATH = ( + Path(__file__).resolve().parent.parent + / "tracebloc_ingestor" + / "schema" + / "ingest.v1.json" +) +SCHEMA_CATEGORIES = set( + json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))["properties"]["category"][ + "enum" + ] +) + + +def test_registry_covers_taskcategory_and_schema_enum_exactly(): + """The registry must cover exactly the engine's categories and the + customer-facing schema enum — no extras, no gaps. A category in the schema + without a spec is the half-wired-zombie failure mode.""" + engine = set(TaskCategory.get_all_categories()) + assert set(REGISTRY) == engine == SCHEMA_CATEGORIES + + +def test_derived_sets_match_spec_flags(): + assert FILE_BEARING_CATEGORIES == { + c for c, s in REGISTRY.items() if s.is_file_bearing + } + assert TABULAR_FAMILY_CATEGORIES == { + c for c, s in REGISTRY.items() if s.is_tabular_family + } + assert SELF_SUPERVISED_CATEGORIES == { + c for c, s in REGISTRY.items() if s.is_self_supervised + } + + +def test_spec_for_raises_on_unknown_category(): + with pytest.raises(ValueError, match="No ModalitySpec"): + spec_for("totally_not_a_category") + + +def test_known_flag_values(): + # Lock the data that used to live in the three base.py frozensets. + mlm = spec_for(TaskCategory.MASKED_LANGUAGE_MODELING) + assert mlm.is_file_bearing and mlm.is_self_supervised and not mlm.is_tabular_family + + tab = spec_for(TaskCategory.TABULAR_CLASSIFICATION) + assert ( + tab.is_tabular_family and not tab.is_file_bearing and not tab.is_self_supervised + ) + + img = spec_for(TaskCategory.IMAGE_CLASSIFICATION) + assert ( + img.is_file_bearing and not img.is_tabular_family and not img.is_self_supervised + ) + + +def test_base_py_imports_the_registry_derived_sets(): + """base.py must consume the registry's sets (not redefine its own), so the + flags can't drift from the single source.""" + from tracebloc_ingestor.ingestors import base + + assert base._FILE_BEARING_CATEGORIES is registry.FILE_BEARING_CATEGORIES + assert base._TABULAR_FAMILY_CATEGORIES is registry.TABULAR_FAMILY_CATEGORIES + assert base._SELF_SUPERVISED_CATEGORIES is registry.SELF_SUPERVISED_CATEGORIES diff --git a/tracebloc_ingestor/ingestors/base.py b/tracebloc_ingestor/ingestors/base.py index 4a5781a..6aea4d0 100644 --- a/tracebloc_ingestor/ingestors/base.py +++ b/tracebloc_ingestor/ingestors/base.py @@ -27,6 +27,17 @@ from ..file_transfer import map_file_transfer from ..reporting import ConsoleRenderer +# Per-category behavior flags now live in the ModalityRegistry (the single +# source of truth — backend#796, P3a), derived from one ModalitySpec per +# category instead of three hand-maintained frozensets here. Imported under +# the previous names so the ``category in `` checks below are unchanged +# (and their None/unknown -> False semantics are preserved). +from ..modalities.registry import ( + FILE_BEARING_CATEGORIES as _FILE_BEARING_CATEGORIES, + SELF_SUPERVISED_CATEGORIES as _SELF_SUPERVISED_CATEGORIES, + TABULAR_FAMILY_CATEGORIES as _TABULAR_FAMILY_CATEGORIES, +) + # Logger for this module. Level is set by `setup_logging()` on the root # logger when the user script calls it; child loggers inherit that level. logger = logging.getLogger(__name__) @@ -34,58 +45,11 @@ __all__ = ["BaseIngestor", "IngestionSummary"] -# Tabular-family categories carry `number_of_columns` in file_options; -# image / text categories do not (a schema may still be supplied — e.g. -# keypoint_detection's "Visibility" column — but a column count there -# would be a misleading metric). -_TABULAR_FAMILY_CATEGORIES = frozenset({ - TaskCategory.TABULAR_CLASSIFICATION, - TaskCategory.TABULAR_REGRESSION, - TaskCategory.TIME_SERIES_FORECASTING, - TaskCategory.TIME_TO_EVENT_PREDICTION, -}) - -# File-bearing categories: every record references sidecar files (images, -# annotations, masks, texts, sequences) that live under ``config.SRC_PATH`` -# and must be copied to DEST_PATH by ``map_file_transfer``. This single set -# drives BOTH uses: -# 1. the SRC_PATH preflight in ``validate_data`` — if SRC_PATH is -# empty/unset/missing, every file lookup silently falls through to a -# relative path and the user sees N copies of "Source image not -# found: images/x.jpg", blaming the data when the real cause is -# "SRC_PATH was never staged on the PVC"; -# 2. the per-record ``map_file_transfer`` gate in ``_ingest_with_lock``. -# Keeping these two on one set is deliberate: when they were separate -# lists, instance_segmentation sat in one but not the other and shipped -# half-wired — rows reached MySQL and the backend API with zero files -# staged (the silent-half-ingest pattern from #99). -# ``tests/test_category_congruence.py`` anchors this set to the schema -# enum's file-bearing categories. -# Tabular / time-series have no sidecar dirs under SRC_PATH, so they're -# excluded (the CSV path itself is checked elsewhere). -_FILE_BEARING_CATEGORIES = frozenset({ - TaskCategory.IMAGE_CLASSIFICATION, - TaskCategory.OBJECT_DETECTION, - TaskCategory.KEYPOINT_DETECTION, - TaskCategory.SEMANTIC_SEGMENTATION, - TaskCategory.TEXT_CLASSIFICATION, - TaskCategory.TOKEN_CLASSIFICATION, - TaskCategory.MASKED_LANGUAGE_MODELING, -}) - - -# Self-supervised categories have no `label` column — the CSV manifest just -# points at sidecar files and the model creates its own targets at training -# time (e.g. masked_language_modeling masks tokens on-the-fly). The backend -# correspondingly stores no edge-label metadata for these datasets, so the -# `send_generate_edge_label_meta` call is a no-op at best and a misleading -# HTTP 400 ("No data found for table X") at worst — see issue #213. -# The schema (schema/ingest.v1.json) now rejects `label:` on these categories -# at submission time; this set + the gate below are the defensive in-ingestor -# half (script-driven runs that bypass the schema still skip the wasted call). -_SELF_SUPERVISED_CATEGORIES = frozenset({ - TaskCategory.MASKED_LANGUAGE_MODELING, -}) +# NOTE: _TABULAR_FAMILY_CATEGORIES, _FILE_BEARING_CATEGORIES and +# _SELF_SUPERVISED_CATEGORIES are imported above from modalities.registry +# (derived from the per-category ModalitySpec flags — backend#796 P3a). The +# ``self.category in `` checks throughout this module use them unchanged; +# the rationale for each set now lives on ModalitySpec's field docstrings. class IngestionSummary(NamedTuple): diff --git a/tracebloc_ingestor/modalities/__init__.py b/tracebloc_ingestor/modalities/__init__.py new file mode 100644 index 0000000..d64c0f5 --- /dev/null +++ b/tracebloc_ingestor/modalities/__init__.py @@ -0,0 +1,20 @@ +"""Per-modality specification registry — the single source of truth for each +task category's ingestion behavior (structural refactor — backend#796, P3).""" + +from .registry import ( + FILE_BEARING_CATEGORIES, + REGISTRY, + SELF_SUPERVISED_CATEGORIES, + TABULAR_FAMILY_CATEGORIES, + spec_for, +) +from .spec import ModalitySpec + +__all__ = [ + "ModalitySpec", + "REGISTRY", + "spec_for", + "FILE_BEARING_CATEGORIES", + "TABULAR_FAMILY_CATEGORIES", + "SELF_SUPERVISED_CATEGORIES", +] diff --git a/tracebloc_ingestor/modalities/registry.py b/tracebloc_ingestor/modalities/registry.py new file mode 100644 index 0000000..d80a860 --- /dev/null +++ b/tracebloc_ingestor/modalities/registry.py @@ -0,0 +1,125 @@ +"""ModalityRegistry — one ModalitySpec per task category. + +The single place a category's behavior is declared (built up over P3a–P3d; +see modalities/spec.py). ``tests/test_modality_registry.py`` and +``tests/test_category_congruence.py`` pin the registry to the schema enum so a +category the customer can submit always has a complete spec — making the +``instance_segmentation`` half-wired-zombie class of bug (#240 / #99) +unrepresentable. +""" + +from __future__ import annotations + +from typing import Dict + +from ..utils.constants import TaskCategory +from .spec import ModalitySpec + +# One entry per supported category. P3a populates the three behavior flags +# that were hand-maintained frozensets in ingestors/base.py. +_SPECS = ( + # File-bearing categories (per-row sidecar files under SRC_PATH). + ModalitySpec( + TaskCategory.IMAGE_CLASSIFICATION, + is_file_bearing=True, + is_tabular_family=False, + is_self_supervised=False, + ), + ModalitySpec( + TaskCategory.OBJECT_DETECTION, + is_file_bearing=True, + is_tabular_family=False, + is_self_supervised=False, + ), + ModalitySpec( + TaskCategory.KEYPOINT_DETECTION, + is_file_bearing=True, + is_tabular_family=False, + is_self_supervised=False, + ), + ModalitySpec( + TaskCategory.SEMANTIC_SEGMENTATION, + is_file_bearing=True, + is_tabular_family=False, + is_self_supervised=False, + ), + ModalitySpec( + TaskCategory.TEXT_CLASSIFICATION, + is_file_bearing=True, + is_tabular_family=False, + is_self_supervised=False, + ), + ModalitySpec( + TaskCategory.TOKEN_CLASSIFICATION, + is_file_bearing=True, + is_tabular_family=False, + is_self_supervised=False, + ), + # masked_language_modeling is file-bearing AND self-supervised (no label). + ModalitySpec( + TaskCategory.MASKED_LANGUAGE_MODELING, + is_file_bearing=True, + is_tabular_family=False, + is_self_supervised=True, + ), + # Tabular family (structured feature tables; no sidecar files). + ModalitySpec( + TaskCategory.TABULAR_CLASSIFICATION, + is_file_bearing=False, + is_tabular_family=True, + is_self_supervised=False, + ), + ModalitySpec( + TaskCategory.TABULAR_REGRESSION, + is_file_bearing=False, + is_tabular_family=True, + is_self_supervised=False, + ), + ModalitySpec( + TaskCategory.TIME_SERIES_FORECASTING, + is_file_bearing=False, + is_tabular_family=True, + is_self_supervised=False, + ), + ModalitySpec( + TaskCategory.TIME_TO_EVENT_PREDICTION, + is_file_bearing=False, + is_tabular_family=True, + is_self_supervised=False, + ), +) + +REGISTRY: Dict[str, ModalitySpec] = {spec.category: spec for spec in _SPECS} + + +def spec_for(category: str) -> ModalitySpec: + """Return the spec for ``category``, raising on an unknown one. + + Strict by design: a category that reaches the engine without a spec is a + wiring bug, not something to silently no-op (that was the + ``instance_segmentation`` failure mode). Call sites that must tolerate + ``None`` / non-categories use the derived ``*_CATEGORIES`` frozensets + below, whose ``in`` test returns False — preserving the prior + ``category in frozenset`` semantics exactly. + """ + try: + return REGISTRY[category] + except KeyError: + raise ValueError( + f"No ModalitySpec registered for category {category!r}. Add one in " + f"modalities/registry.py or remove the category from the schema enum." + ) + + +# Registry-derived category sets. These REPLACE the three hand-maintained +# frozensets that lived in ingestors/base.py: derived from the specs so they +# can no longer drift from the single source. ``base.py`` imports these under +# its previous names, so its ``category in `` checks (and their +# None/unknown -> False semantics) are unchanged. +FILE_BEARING_CATEGORIES = frozenset(c for c, s in REGISTRY.items() if s.is_file_bearing) +TABULAR_FAMILY_CATEGORIES = frozenset( + c for c, s in REGISTRY.items() if s.is_tabular_family +) +SELF_SUPERVISED_CATEGORIES = frozenset( + c for c, s in REGISTRY.items() if s.is_self_supervised +) diff --git a/tracebloc_ingestor/modalities/spec.py b/tracebloc_ingestor/modalities/spec.py new file mode 100644 index 0000000..8b83f69 --- /dev/null +++ b/tracebloc_ingestor/modalities/spec.py @@ -0,0 +1,52 @@ +"""ModalitySpec — the single source of truth for one task category's +ingestion behavior. + +Today "what is modality X" is spread across separate per-category dispatch +sites (validator mapping, file-transfer dispatch, three frozensets in +ingestors/base.py, the conventions groupings, the schema enum). They are kept +consistent only by ``tests/test_category_congruence.py`` — consistency by +test, not by construction. The registry (modalities/registry.py) makes one +``ModalitySpec`` per category the source the call sites are *derived from*, so +they can't drift. + +This is delivered in slices (structural refactor — backend#796, phase P3): + +- **P3a (this slice):** the three per-category FLAGS that were frozensets in + ingestors/base.py. +- P3b: the validator factory (replaces utils/validators_mapping.py). +- P3c: the sidecar transfer plan (replaces file_transfer.map_file_transfer; + folds in the semseg mask sidecar / #136). +- P3d: the conventions defaults (data_format, default file/csv options, + regression-class flag). + +So the spec is intentionally small here and grows over P3b–P3d. +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ModalitySpec: + """Per-category behavior flags (P3a). More fields land in P3b–P3d. + + Attributes: + category: the ``TaskCategory`` value this spec describes. + is_file_bearing: every record references sidecar files (images, + annotations, masks, texts, sequences) under ``SRC_PATH`` that must + be copied to ``DEST_PATH``. Drives both the SRC_PATH preflight and + the per-record file-transfer gate. Tabular / time-series are False. + is_tabular_family: a structured-feature table — inject + ``number_of_columns`` into the backend metadata. Image / text + categories are False (a column count there would mislead). + is_self_supervised: no ``label`` column; the model creates its own + targets at train time (e.g. masked language modeling). The backend + stores no edge-label metadata, so the edge-label call is skipped + (#213). + """ + + category: str + is_file_bearing: bool + is_tabular_family: bool + is_self_supervised: bool