Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions tests/test_modality_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Tests for the ModalityRegistry (structural refactor — backend#796, P3a).

The registry is the single source of truth for per-category behavior. These
pin the invariant that makes a half-wired modality (the instance_segmentation
zombie, #240/#99) unrepresentable: every category the engine/schema knows has
a spec, and the registry-derived category sets match the spec flags.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from tracebloc_ingestor.modalities import (
FILE_BEARING_CATEGORIES,
REGISTRY,
SELF_SUPERVISED_CATEGORIES,
TABULAR_FAMILY_CATEGORIES,
registry,
spec_for,
)
from tracebloc_ingestor.utils.constants import TaskCategory

SCHEMA_PATH = (
Path(__file__).resolve().parent.parent
/ "tracebloc_ingestor"
/ "schema"
/ "ingest.v1.json"
)
SCHEMA_CATEGORIES = set(
json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))["properties"]["category"][
"enum"
]
)


def test_registry_covers_taskcategory_and_schema_enum_exactly():
"""The registry must cover exactly the engine's categories and the
customer-facing schema enum — no extras, no gaps. A category in the schema
without a spec is the half-wired-zombie failure mode."""
engine = set(TaskCategory.get_all_categories())
assert set(REGISTRY) == engine == SCHEMA_CATEGORIES


def test_derived_sets_match_spec_flags():
assert FILE_BEARING_CATEGORIES == {
c for c, s in REGISTRY.items() if s.is_file_bearing
}
assert TABULAR_FAMILY_CATEGORIES == {
c for c, s in REGISTRY.items() if s.is_tabular_family
}
assert SELF_SUPERVISED_CATEGORIES == {
c for c, s in REGISTRY.items() if s.is_self_supervised
}


def test_spec_for_raises_on_unknown_category():
with pytest.raises(ValueError, match="No ModalitySpec"):
spec_for("totally_not_a_category")


def test_known_flag_values():
# Lock the data that used to live in the three base.py frozensets.
mlm = spec_for(TaskCategory.MASKED_LANGUAGE_MODELING)
assert mlm.is_file_bearing and mlm.is_self_supervised and not mlm.is_tabular_family

tab = spec_for(TaskCategory.TABULAR_CLASSIFICATION)
assert (
tab.is_tabular_family and not tab.is_file_bearing and not tab.is_self_supervised
)

img = spec_for(TaskCategory.IMAGE_CLASSIFICATION)
assert (
img.is_file_bearing and not img.is_tabular_family and not img.is_self_supervised
)


def test_base_py_imports_the_registry_derived_sets():
"""base.py must consume the registry's sets (not redefine its own), so the
flags can't drift from the single source."""
from tracebloc_ingestor.ingestors import base

assert base._FILE_BEARING_CATEGORIES is registry.FILE_BEARING_CATEGORIES
assert base._TABULAR_FAMILY_CATEGORIES is registry.TABULAR_FAMILY_CATEGORIES
assert base._SELF_SUPERVISED_CATEGORIES is registry.SELF_SUPERVISED_CATEGORIES
68 changes: 16 additions & 52 deletions tracebloc_ingestor/ingestors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,65 +27,29 @@
from ..file_transfer import map_file_transfer
from ..reporting import ConsoleRenderer

# Per-category behavior flags now live in the ModalityRegistry (the single
# source of truth — backend#796, P3a), derived from one ModalitySpec per
# category instead of three hand-maintained frozensets here. Imported under
# the previous names so the ``category in <set>`` checks below are unchanged
# (and their None/unknown -> False semantics are preserved).
from ..modalities.registry import (
FILE_BEARING_CATEGORIES as _FILE_BEARING_CATEGORIES,
SELF_SUPERVISED_CATEGORIES as _SELF_SUPERVISED_CATEGORIES,
TABULAR_FAMILY_CATEGORIES as _TABULAR_FAMILY_CATEGORIES,
)

# Logger for this module. Level is set by `setup_logging()` on the root
# logger when the user script calls it; child loggers inherit that level.
logger = logging.getLogger(__name__)

__all__ = ["BaseIngestor", "IngestionSummary"]


# Tabular-family categories carry `number_of_columns` in file_options;
# image / text categories do not (a schema may still be supplied — e.g.
# keypoint_detection's "Visibility" column — but a column count there
# would be a misleading metric).
_TABULAR_FAMILY_CATEGORIES = frozenset({
TaskCategory.TABULAR_CLASSIFICATION,
TaskCategory.TABULAR_REGRESSION,
TaskCategory.TIME_SERIES_FORECASTING,
TaskCategory.TIME_TO_EVENT_PREDICTION,
})

# File-bearing categories: every record references sidecar files (images,
# annotations, masks, texts, sequences) that live under ``config.SRC_PATH``
# and must be copied to DEST_PATH by ``map_file_transfer``. This single set
# drives BOTH uses:
# 1. the SRC_PATH preflight in ``validate_data`` — if SRC_PATH is
# empty/unset/missing, every file lookup silently falls through to a
# relative path and the user sees N copies of "Source image not
# found: images/x.jpg", blaming the data when the real cause is
# "SRC_PATH was never staged on the PVC";
# 2. the per-record ``map_file_transfer`` gate in ``_ingest_with_lock``.
# Keeping these two on one set is deliberate: when they were separate
# lists, instance_segmentation sat in one but not the other and shipped
# half-wired — rows reached MySQL and the backend API with zero files
# staged (the silent-half-ingest pattern from #99).
# ``tests/test_category_congruence.py`` anchors this set to the schema
# enum's file-bearing categories.
# Tabular / time-series have no sidecar dirs under SRC_PATH, so they're
# excluded (the CSV path itself is checked elsewhere).
_FILE_BEARING_CATEGORIES = frozenset({
TaskCategory.IMAGE_CLASSIFICATION,
TaskCategory.OBJECT_DETECTION,
TaskCategory.KEYPOINT_DETECTION,
TaskCategory.SEMANTIC_SEGMENTATION,
TaskCategory.TEXT_CLASSIFICATION,
TaskCategory.TOKEN_CLASSIFICATION,
TaskCategory.MASKED_LANGUAGE_MODELING,
})


# Self-supervised categories have no `label` column — the CSV manifest just
# points at sidecar files and the model creates its own targets at training
# time (e.g. masked_language_modeling masks tokens on-the-fly). The backend
# correspondingly stores no edge-label metadata for these datasets, so the
# `send_generate_edge_label_meta` call is a no-op at best and a misleading
# HTTP 400 ("No data found for table X") at worst — see issue #213.
# The schema (schema/ingest.v1.json) now rejects `label:` on these categories
# at submission time; this set + the gate below are the defensive in-ingestor
# half (script-driven runs that bypass the schema still skip the wasted call).
_SELF_SUPERVISED_CATEGORIES = frozenset({
TaskCategory.MASKED_LANGUAGE_MODELING,
})
# NOTE: _TABULAR_FAMILY_CATEGORIES, _FILE_BEARING_CATEGORIES and
# _SELF_SUPERVISED_CATEGORIES are imported above from modalities.registry
# (derived from the per-category ModalitySpec flags — backend#796 P3a). The
# ``self.category in <set>`` checks throughout this module use them unchanged;
# the rationale for each set now lives on ModalitySpec's field docstrings.


class IngestionSummary(NamedTuple):
Expand Down
20 changes: 20 additions & 0 deletions tracebloc_ingestor/modalities/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Per-modality specification registry — the single source of truth for each
task category's ingestion behavior (structural refactor — backend#796, P3)."""

from .registry import (
FILE_BEARING_CATEGORIES,
REGISTRY,
SELF_SUPERVISED_CATEGORIES,
TABULAR_FAMILY_CATEGORIES,
spec_for,
)
from .spec import ModalitySpec

__all__ = [
"ModalitySpec",
"REGISTRY",
"spec_for",
"FILE_BEARING_CATEGORIES",
"TABULAR_FAMILY_CATEGORIES",
"SELF_SUPERVISED_CATEGORIES",
]
125 changes: 125 additions & 0 deletions tracebloc_ingestor/modalities/registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""ModalityRegistry — one ModalitySpec per task category.

The single place a category's behavior is declared (built up over P3a–P3d;
see modalities/spec.py). ``tests/test_modality_registry.py`` and
``tests/test_category_congruence.py`` pin the registry to the schema enum so a
category the customer can submit always has a complete spec — making the
``instance_segmentation`` half-wired-zombie class of bug (#240 / #99)
unrepresentable.
"""

from __future__ import annotations

from typing import Dict

from ..utils.constants import TaskCategory
from .spec import ModalitySpec

# One entry per supported category. P3a populates the three behavior flags
# that were hand-maintained frozensets in ingestors/base.py.
_SPECS = (
# File-bearing categories (per-row sidecar files under SRC_PATH).
ModalitySpec(
TaskCategory.IMAGE_CLASSIFICATION,
is_file_bearing=True,
is_tabular_family=False,
is_self_supervised=False,
),
ModalitySpec(
TaskCategory.OBJECT_DETECTION,
is_file_bearing=True,
is_tabular_family=False,
is_self_supervised=False,
),
ModalitySpec(
TaskCategory.KEYPOINT_DETECTION,
is_file_bearing=True,
is_tabular_family=False,
is_self_supervised=False,
),
ModalitySpec(
TaskCategory.SEMANTIC_SEGMENTATION,
is_file_bearing=True,
is_tabular_family=False,
is_self_supervised=False,
),
ModalitySpec(
TaskCategory.TEXT_CLASSIFICATION,
is_file_bearing=True,
is_tabular_family=False,
is_self_supervised=False,
),
ModalitySpec(
TaskCategory.TOKEN_CLASSIFICATION,
is_file_bearing=True,
is_tabular_family=False,
is_self_supervised=False,
),
# masked_language_modeling is file-bearing AND self-supervised (no label).
ModalitySpec(
TaskCategory.MASKED_LANGUAGE_MODELING,
is_file_bearing=True,
is_tabular_family=False,
is_self_supervised=True,
),
# Tabular family (structured feature tables; no sidecar files).
ModalitySpec(
TaskCategory.TABULAR_CLASSIFICATION,
is_file_bearing=False,
is_tabular_family=True,
is_self_supervised=False,
),
ModalitySpec(
TaskCategory.TABULAR_REGRESSION,
is_file_bearing=False,
is_tabular_family=True,
is_self_supervised=False,
),
ModalitySpec(
TaskCategory.TIME_SERIES_FORECASTING,
is_file_bearing=False,
is_tabular_family=True,
is_self_supervised=False,
),
ModalitySpec(
TaskCategory.TIME_TO_EVENT_PREDICTION,
is_file_bearing=False,
is_tabular_family=True,
is_self_supervised=False,
),
)

REGISTRY: Dict[str, ModalitySpec] = {spec.category: spec for spec in _SPECS}


def spec_for(category: str) -> ModalitySpec:
"""Return the spec for ``category``, raising on an unknown one.

Strict by design: a category that reaches the engine without a spec is a
wiring bug, not something to silently no-op (that was the
``instance_segmentation`` failure mode). Call sites that must tolerate
``None`` / non-categories use the derived ``*_CATEGORIES`` frozensets
below, whose ``in`` test returns False — preserving the prior
``category in frozenset`` semantics exactly.
"""
try:
return REGISTRY[category]
except KeyError:
raise ValueError(
f"No ModalitySpec registered for category {category!r}. Add one in "
f"modalities/registry.py or remove the category from the schema enum."
)


# Registry-derived category sets. These REPLACE the three hand-maintained
# frozensets that lived in ingestors/base.py: derived from the specs so they
# can no longer drift from the single source. ``base.py`` imports these under
# its previous names, so its ``category in <set>`` checks (and their
# None/unknown -> False semantics) are unchanged.
FILE_BEARING_CATEGORIES = frozenset(c for c, s in REGISTRY.items() if s.is_file_bearing)
TABULAR_FAMILY_CATEGORIES = frozenset(
c for c, s in REGISTRY.items() if s.is_tabular_family
)
SELF_SUPERVISED_CATEGORIES = frozenset(
c for c, s in REGISTRY.items() if s.is_self_supervised
)
52 changes: 52 additions & 0 deletions tracebloc_ingestor/modalities/spec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""ModalitySpec — the single source of truth for one task category's
ingestion behavior.

Today "what is modality X" is spread across separate per-category dispatch
sites (validator mapping, file-transfer dispatch, three frozensets in
ingestors/base.py, the conventions groupings, the schema enum). They are kept
consistent only by ``tests/test_category_congruence.py`` — consistency by
test, not by construction. The registry (modalities/registry.py) makes one
``ModalitySpec`` per category the source the call sites are *derived from*, so
they can't drift.

This is delivered in slices (structural refactor — backend#796, phase P3):

- **P3a (this slice):** the three per-category FLAGS that were frozensets in
ingestors/base.py.
- P3b: the validator factory (replaces utils/validators_mapping.py).
- P3c: the sidecar transfer plan (replaces file_transfer.map_file_transfer;
folds in the semseg mask sidecar / #136).
- P3d: the conventions defaults (data_format, default file/csv options,
regression-class flag).

So the spec is intentionally small here and grows over P3b–P3d.
"""

from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class ModalitySpec:
"""Per-category behavior flags (P3a). More fields land in P3b–P3d.

Attributes:
category: the ``TaskCategory`` value this spec describes.
is_file_bearing: every record references sidecar files (images,
annotations, masks, texts, sequences) under ``SRC_PATH`` that must
be copied to ``DEST_PATH``. Drives both the SRC_PATH preflight and
the per-record file-transfer gate. Tabular / time-series are False.
is_tabular_family: a structured-feature table — inject
``number_of_columns`` into the backend metadata. Image / text
categories are False (a column count there would mislead).
is_self_supervised: no ``label`` column; the model creates its own
targets at train time (e.g. masked language modeling). The backend
stores no edge-label metadata, so the edge-label call is skipped
(#213).
"""

category: str
is_file_bearing: bool
is_tabular_family: bool
is_self_supervised: bool
Loading