tracebloc · shujaatTracebloc · Jun 17, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/tests/test_ingestor_base.py b/tests/test_ingestor_base.py
@@ -965,3 +965,88 @@ def test_check_src_path_required_for_token_classification():
     from tracebloc_ingestor.ingestors.base import _FILE_BEARING_CATEGORIES
 
     assert TaskCategory.TOKEN_CLASSIFICATION in _FILE_BEARING_CATEGORIES
+
+
+# ---------------------------------------------------------------------------
+# #805 Task 2: tokenizer fingerprint registration on the global-metadata channel
+# ---------------------------------------------------------------------------
+
+_FINGERPRINT = {
+    "vocab_size": 15,
+    "mask_token_id": 4,
+    "pad_token_id": 0,
+    "tokenizer_type": "WordLevel",
+}
+
+
+@pytest.mark.parametrize(
+    "category",
+    ["MASKED_LANGUAGE_MODELING", "TEXT_CLASSIFICATION", "TOKEN_CLASSIFICATION"],
+)
+def test_ingest_registers_tokenizer_fingerprint_for_nlp(category):
+    """For every NLP category, a shipped tokenizer's 4-field fingerprint is
+    attached to file_options so it rides the existing global-metadata channel."""
+    from tracebloc_ingestor.utils.constants import TaskCategory
+
+    cat = getattr(TaskCategory, category)
+    records = [{"a": "1", "filename": "f1"}]
+    ing = make_ingestor(records=records, category=cat, label_column=None)
+    with patch.object(base_mod, "Session") as Sess, patch.object(
+        ing, "validate_data", return_value=True
+    ), patch.object(
+        base_mod, "map_file_transfer", side_effect=lambda c, r, o, cfg=None: r
+    ), patch.object(
+        base_mod, "get_shipped_tokenizer_metadata", return_value=_FINGERPRINT
+    ):
+        Sess.return_value.__enter__.return_value = MagicMock()
+        ing.ingest("src", batch_size=10)
+    args, _ = ing.api_client.send_global_meta_meta.call_args
+    assert args[2].get("tokenizer") == _FINGERPRINT
+
+
+def test_ingest_warns_and_skips_tokenizer_when_absent_for_nlp():
+    """A site that ships no tokenizer.json still registers cleanly — the
+    fingerprint is simply omitted (the epic's legacy/skipped path)."""
+    from tracebloc_ingestor.utils.constants import TaskCategory
+
+    records = [{"a": "1", "filename": "f1"}]
+    ing = make_ingestor(
+        records=records,
+        category=TaskCategory.TEXT_CLASSIFICATION,
+        label_column="a",
+    )
+    with patch.object(base_mod, "Session") as Sess, patch.object(
+        ing, "validate_data", return_value=True
+    ), patch.object(
+        base_mod, "map_file_transfer", side_effect=lambda c, r, o, cfg=None: r
+    ), patch.object(
+        base_mod, "get_shipped_tokenizer_metadata", return_value=None
+    ):
+        Sess.return_value.__enter__.return_value = MagicMock()
+        ing.ingest("src", batch_size=10)
+    args, _ = ing.api_client.send_global_meta_meta.call_args
+    assert "tokenizer" not in args[2]
+    # Registration still completes — absence is non-fatal.
+    ing.api_client.create_dataset.assert_called_once()
+
+
+def test_ingest_does_not_register_tokenizer_for_non_nlp():
+    """Non-NLP categories never touch the tokenizer path."""
+    from tracebloc_ingestor.utils.constants import TaskCategory
+
+    records = [{"a": "1", "filename": "f1"}]
+    ing = make_ingestor(
+        records=records,
+        category=TaskCategory.IMAGE_CLASSIFICATION,
+        label_column="a",
+    )
+    with patch.object(base_mod, "Session") as Sess, patch.object(
+        ing, "validate_data", return_value=True
+    ), patch.object(
+        base_mod, "map_file_transfer", side_effect=lambda c, r, o, cfg=None: r
+    ), patch.object(base_mod, "get_shipped_tokenizer_metadata") as get_meta:
+        Sess.return_value.__enter__.return_value = MagicMock()
+        ing.ingest("src", batch_size=10)
+    get_meta.assert_not_called()
+    args, _ = ing.api_client.send_global_meta_meta.call_args
+    assert "tokenizer" not in args[2]
diff --git a/tests/test_modality_registry.py b/tests/test_modality_registry.py
@@ -15,6 +15,7 @@
 
 from tracebloc_ingestor.modalities import (
     FILE_BEARING_CATEGORIES,
+    NLP_CATEGORIES,
     REGISTRY,
     SELF_SUPERVISED_CATEGORIES,
     TABULAR_FAMILY_CATEGORIES,
@@ -54,6 +55,17 @@ def test_derived_sets_match_spec_flags():
     assert SELF_SUPERVISED_CATEGORIES == {
         c for c, s in REGISTRY.items() if s.is_self_supervised
     }
+    assert NLP_CATEGORIES == {c for c, s in REGISTRY.items() if s.is_nlp}
+
+
+def test_nlp_categories_are_the_three_text_categories():
+    """#805 Task 2: the tokenizer-fingerprint set is exactly the NLP text
+    categories (text/token classification + MLM) — never image/tabular."""
+    assert NLP_CATEGORIES == {
+        TaskCategory.TEXT_CLASSIFICATION,
+        TaskCategory.TOKEN_CLASSIFICATION,
+        TaskCategory.MASKED_LANGUAGE_MODELING,
+    }
 
 
 def test_spec_for_raises_on_unknown_category():

diff --git a/tests/test_tokenizer_fingerprint.py b/tests/test_tokenizer_fingerprint.py
@@ -0,0 +1,237 @@
+"""Tests for the NLP tokenizer fingerprint extracted + registered at ingest.
+
+Covers issue #805 Task 2: the 4 structural fields (vocab_size /
+mask_token_id / pad_token_id / tokenizer_type) extracted from a shipped
+``tokenizer.json`` and sent on the global-metadata channel. The FL
+guardrail is that ONLY these scalars leave the cluster — never vocabulary
+content and never a hash.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from tracebloc_ingestor import file_transfer
+from tracebloc_ingestor.validators.tokenizer_validator import (
+    _special_token_id,
+    extract_tokenizer_metadata,
+    load_tokenizer_metadata,
+)
+
+# A BERT/WordPiece-style tokenizer.json with the full classification +
+# MLM special-token set, special tokens declared in added_tokens.
+_BERT_STYLE = {
+    "version": "1.0",
+    "model": {
+        "type": "WordPiece",
+        "vocab": {
+            "[PAD]": 0,
+            "[UNK]": 1,
+            "[CLS]": 2,
+            "[SEP]": 3,
+            "[MASK]": 4,
+            "hello": 5,
+            "world": 6,
+        },
+    },
+    "added_tokens": [
+        {"id": 0, "content": "[PAD]", "special": True},
+        {"id": 1, "content": "[UNK]", "special": True},
+        {"id": 2, "content": "[CLS]", "special": True},
+        {"id": 3, "content": "[SEP]", "special": True},
+        {"id": 4, "content": "[MASK]", "special": True},
+    ],
+}
+
+# A classification tokenizer that has [PAD] but no [MASK] (no masking task).
+_NO_MASK = {
+    "model": {"type": "WordPiece", "vocab": {"[PAD]": 0, "[UNK]": 1, "a": 2}},
+    "added_tokens": [{"id": 0, "content": "[PAD]", "special": True}],
+}
+
+# A Unigram tokenizer stores its vocab as a list of [token, score] pairs.
+_UNIGRAM = {
+    "model": {
+        "type": "Unigram",
+        "vocab": [["[PAD]", 0.0], ["[MASK]", 0.0], ["x", -1.0], ["y", -2.0]],
+    },
+    "added_tokens": [
+        {"id": 0, "content": "[PAD]", "special": True},
+        {"id": 1, "content": "[MASK]", "special": True},
+    ],
+}
+
+
+# ---------------------------------------------------------------------------
+# extract_tokenizer_metadata
+# ---------------------------------------------------------------------------
+
+
+def test_extract_full_fingerprint_bert_style():
+    meta = extract_tokenizer_metadata(_BERT_STYLE)
+    assert meta == {
+        "vocab_size": 7,
+        "mask_token_id": 4,
+        "pad_token_id": 0,
+        "tokenizer_type": "WordPiece",
+    }
+
+
+def test_extract_classification_without_mask_yields_none_mask_id():
+    meta = extract_tokenizer_metadata(_NO_MASK)
+    assert meta["mask_token_id"] is None
+    assert meta["pad_token_id"] == 0
+    assert meta["vocab_size"] == 3
+    assert meta["tokenizer_type"] == "WordPiece"
+
+
+def test_extract_unigram_list_vocab():
+    meta = extract_tokenizer_metadata(_UNIGRAM)
+    assert meta["vocab_size"] == 4
+    assert meta["mask_token_id"] == 1
+    assert meta["pad_token_id"] == 0
+    assert meta["tokenizer_type"] == "Unigram"
+
+
+def test_fl_guardrail_only_four_scalar_keys():
+    """No vocabulary content or hash may cross to the backend — only the 4
+    scalar fields (integer sizes/ids, a type string, or None)."""
+    meta = extract_tokenizer_metadata(_BERT_STYLE)
+    assert set(meta) == {
+        "vocab_size",
+        "mask_token_id",
+        "pad_token_id",
+        "tokenizer_type",
+    }
+    for value in meta.values():
+        assert not isinstance(value, (dict, list))
+
+
+def test_extract_handles_empty_or_unknown_structure():
+    meta = extract_tokenizer_metadata({})
+    assert meta == {
+        "vocab_size": 0,
+        "mask_token_id": None,
+        "pad_token_id": None,
+        "tokenizer_type": None,
+    }
+
+
+# ---------------------------------------------------------------------------
+# _special_token_id
+# ---------------------------------------------------------------------------
+
+
+def test_special_token_id_prefers_added_tokens():
+    data = {
+        "model": {"vocab": {"[PAD]": 99}},
+        "added_tokens": [{"id": 7, "content": "[PAD]"}],
+    }
+    assert _special_token_id(data, "[PAD]") == 7
+
+
+def test_special_token_id_falls_back_to_vocab():
+    data = {"model": {"vocab": {"[PAD]": 3}}, "added_tokens": []}
+    assert _special_token_id(data, "[PAD]") == 3
+
+
+def test_special_token_id_idless_added_token_falls_back_to_vocab():
+    """A malformed added_tokens entry with no ``id`` must not shadow the
+    model.vocab mapping that does hold the id (bugbot)."""
+    data = {
+        "model": {"vocab": {"[PAD]": 5}},
+        "added_tokens": [{"content": "[PAD]"}],  # no "id"
+    }
+    assert _special_token_id(data, "[PAD]") == 5
+
+
+def test_special_token_id_absent_returns_none():
+    assert _special_token_id(_NO_MASK, "[MASK]") is None
+
+
+# ---------------------------------------------------------------------------
+# load_tokenizer_metadata
+# ---------------------------------------------------------------------------
+
+
+def test_load_reads_file(tmp_path):
+    p = tmp_path / "tokenizer.json"
+    p.write_text(json.dumps(_BERT_STYLE))
+    meta = load_tokenizer_metadata(str(p))
+    assert meta["vocab_size"] == 7
+    assert meta["mask_token_id"] == 4
+
+
+def test_load_missing_file_returns_none(tmp_path):
+    assert load_tokenizer_metadata(str(tmp_path / "nope.json")) is None
+
+
+def test_load_malformed_json_returns_none(tmp_path):
+    p = tmp_path / "tokenizer.json"
+    p.write_text("{ not valid json ")
+    assert load_tokenizer_metadata(str(p)) is None
+
+
+# ---------------------------------------------------------------------------
+# file_transfer helpers (SRC_PATH / DEST_PATH backed)
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def dirs(tmp_path, monkeypatch):
+    """Point file_transfer's Config at tmp src + storage dirs (mirrors the
+    fixture in test_file_transfer_transfers.py)."""
+    src = tmp_path / "src"
+    storage = tmp_path / "storage"
+    src.mkdir()
+    storage.mkdir()
+    monkeypatch.setenv("SRC_PATH", str(src))
+    monkeypatch.setenv("TABLE_NAME", "tbl")
+    monkeypatch.setattr(file_transfer.config, "STORAGE_PATH", str(storage))
+    return src, storage / "tbl"
+
+
+def test_get_shipped_tokenizer_metadata_reads_dest(dirs):
+    """Fingerprints the STAGED tokenizer (DEST) — the file the client uses."""
+    _, dest = dirs
+    dest.mkdir(parents=True, exist_ok=True)
+    (dest / "tokenizer.json").write_text(json.dumps(_BERT_STYLE))
+    meta = file_transfer.get_shipped_tokenizer_metadata()
+    assert meta["vocab_size"] == 7
+    assert meta["pad_token_id"] == 0
+
+
+def test_get_shipped_tokenizer_metadata_none_when_absent(dirs):
+    assert file_transfer.get_shipped_tokenizer_metadata() is None
+
+
+def test_get_shipped_fingerprints_dest_not_src(dirs):
+    """On a re-ingest the copy is skipped (DEST already exists), so the client
+    trains on DEST; the registered fingerprint must describe DEST, not a
+    changed SRC (bugbot)."""
+    src, dest = dirs
+    dest.mkdir(parents=True, exist_ok=True)
+    # SRC has a 3-token tokenizer; DEST (already staged) has the 7-token one.
+    (src / "tokenizer.json").write_text(json.dumps(_NO_MASK))
+    (dest / "tokenizer.json").write_text(json.dumps(_BERT_STYLE))
+    meta = file_transfer.get_shipped_tokenizer_metadata()
+    assert meta["vocab_size"] == 7  # DEST, not SRC's 3
+
+
+def test_copy_tokenizer_returns_fingerprint_on_copy(dirs):
+    src, dest = dirs
+    # In production text_transfer creates DEST_PATH before the tokenizer copy;
+    # mirror that here since we call the helper in isolation.
+    dest.mkdir(parents=True, exist_ok=True)
+    (src / "tokenizer.json").write_text(json.dumps(_BERT_STYLE))
+    meta = file_transfer._copy_tokenizer_if_present()
+    assert (dest / "tokenizer.json").exists()
+    assert meta["vocab_size"] == 7
+    # Already copied: a second call is a no-op and returns None.
+    assert file_transfer._copy_tokenizer_if_present() is None
+
+
+def test_copy_tokenizer_none_when_absent(dirs):
+    assert file_transfer._copy_tokenizer_if_present() is None