tracebloc · divyasinghds · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/tests/test_api_client_methods.py b/tests/test_api_client_methods.py
@@ -157,6 +157,53 @@ def test_prepare_dataset_local_mode():
     get.assert_not_called()
 
 
+def test_prepare_dataset_error_captures_response_body():
+    """On HTTP error, `prepare_dataset` must stash the backend response body
+    on `self.last_prepare_error` so callers can surface the actual reason
+    in their user-visible error — instead of pointing at "the logged API
+    error above". Issue #251.
+
+    The body retained must include the status code and the response text
+    (capped) so a downstream RuntimeError can include both.
+    """
+    client = _client()
+    body = '{"message":"Please provide atleast 2 labels."}'
+    with patch.object(client.session, "get", return_value=_resp(400, text=body)):
+        ok = client.prepare_dataset(
+            TaskCategory.TABULAR_CLASSIFICATION, "ing", "tabular", "train"
+        )
+    assert ok is False
+    assert client.last_prepare_error is not None
+    # Status code + body both surface so the user sees the backend reason.
+    assert "HTTP 400" in client.last_prepare_error
+    assert "Please provide" in client.last_prepare_error
+
+
+def test_prepare_dataset_last_error_starts_unset():
+    """A freshly constructed client has `last_prepare_error` set to None;
+    base.py substitutes its generic 'see the logged API error above'
+    detail whenever this attribute is missing, None, or empty."""
+    client = _client()
+    assert client.last_prepare_error is None
+
+
+def test_prepare_dataset_network_error_captures_string():
+    """When `e.response` is None (DNS / connection refused / timeout),
+    last_prepare_error should still be populated with the stringified
+    exception — never silently fall through to None."""
+    import requests as _req
+
+    client = _client()
+    err = _req.exceptions.ConnectionError("name resolution failed")
+    with patch.object(client.session, "get", side_effect=err):
+        ok = client.prepare_dataset(
+            TaskCategory.TABULAR_CLASSIFICATION, "ing", "tabular", "train"
+        )
+    assert ok is False
+    assert client.last_prepare_error is not None
+    assert "name resolution failed" in client.last_prepare_error
+
+
 # ---------------------------------------------------------------------------
 # create_dataset
 # ---------------------------------------------------------------------------

diff --git a/tests/test_label_diversity_validator.py b/tests/test_label_diversity_validator.py
@@ -0,0 +1,212 @@
+"""Tests for LabelDiversityValidator — fail-fast on single-label classification.
+
+Surfaces the cause locally (with the actual distinct label values) instead
+of letting the backend reject with the misleading "Backend failed to
+prepare the dataset; it was NOT registered" cascade once rows have already
+landed in MySQL. Issue #251.
+"""
+
+from __future__ import annotations
+
+import pandas as pd
+import pytest
+
+from tracebloc_ingestor.validators.label_diversity_validator import (
+    LabelDiversityValidator,
+)
+
+
+# ---------------------------------------------------------------------------
+# Positive cases — must pass
+# ---------------------------------------------------------------------------
+
+def test_two_distinct_labels_passes():
+    df = pd.DataFrame({"a": [1, 2, 3, 4], "label": ["A", "B", "A", "B"]})
+    result = LabelDiversityValidator().validate(df)
+    assert result.is_valid
+
+
+def test_many_distinct_labels_passes():
+    df = pd.DataFrame({"label": ["A", "B", "C", "D", "E"]})
+    result = LabelDiversityValidator().validate(df)
+    assert result.is_valid
+
+
+def test_distinct_labels_with_nulls_counts_only_non_null():
+    """Null cells don't count toward distinct labels; if there are still
+    ≥2 distinct non-null values the dataset is fine."""
+    df = pd.DataFrame({"label": ["A", "B", None, None]})
+    result = LabelDiversityValidator().validate(df)
+    assert result.is_valid
+    assert result.metadata["distinct_count"] == 2
+
+
+def test_label_column_case_insensitive_match():
+    """A CSV header ``Label`` should still resolve when the validator is
+    configured for ``label`` (default). Matches the case-insensitive
+    pattern BIOLabelValidator uses."""
+    df = pd.DataFrame({"a": [1, 2], "Label": ["X", "Y"]})
+    result = LabelDiversityValidator().validate(df)
+    assert result.is_valid
+
+
+# ---------------------------------------------------------------------------
+# Failure cases — must reject with a CLEAR message
+# ---------------------------------------------------------------------------
+
+def test_single_label_fails_with_distinct_value_listed():
+    """A 10-row dataset where every row has ``label = "X"`` is not a
+    classification dataset. The error must name the offending distinct
+    value(s) and the count so the user immediately sees what's wrong."""
+    df = pd.DataFrame({"label": ["X"] * 10})
+    result = LabelDiversityValidator().validate(df)
+    assert not result.is_valid
+    err = result.errors[0]
+    # User-facing message must include the actual single value found.
+    assert "'X'" in err or "'X'" in str(result.metadata.get("value_counts", {}))
+    # And explain WHY it's rejected.
+    assert "classification" in err.lower()
+    assert "distinct" in err.lower()
+
+
+def test_single_label_only_nulls_fails():
+    """All-null label column → 0 distinct → rejected."""
+    df = pd.DataFrame({"label": [None, None, None]})
+    result = LabelDiversityValidator().validate(df)
+    assert not result.is_valid
+
+
+def test_single_label_one_value_with_some_nulls_fails():
+    """One distinct value plus nulls is still only 1 distinct value."""
+    df = pd.DataFrame({"label": ["A", "A", None, "A", None]})
+    result = LabelDiversityValidator().validate(df)
+    assert not result.is_valid
+
+
+def test_error_mentions_regression_alternative():
+    """A user who has a continuous target shouldn't be told 'add a fake
+    second label' — the error should point them at regression-family
+    categories which legitimately accept a single target column."""
+    df = pd.DataFrame({"label": ["A"] * 5})
+    result = LabelDiversityValidator().validate(df)
+    assert not result.is_valid
+    assert "regression" in result.errors[0].lower()
+
+
+# ---------------------------------------------------------------------------
+# Defensive paths — must not double-report (other validators own those)
+# ---------------------------------------------------------------------------
+
+def test_empty_dataframe_passes_silently():
+    """Empty input is the 'no rows' / empty-CSV class — handled by other
+    validators with their own clear messages. Don't double-report."""
+    result = LabelDiversityValidator().validate(pd.DataFrame())
+    assert result.is_valid
+    assert result.metadata["rows_checked"] == 0
+
+
+def test_label_column_missing_passes_with_warning():
+    """If the CSV has no label column at all, that's a schema-mismatch
+    case handled by other layers. This validator just warns and skips."""
+    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
+    result = LabelDiversityValidator().validate(df)
+    assert result.is_valid
+    assert any("not found in CSV" in w for w in (result.warnings or []))
+
+
+# ---------------------------------------------------------------------------
+# CSV-path streaming check — must not load the whole wide CSV into memory
+# ---------------------------------------------------------------------------
+
+def test_csv_path_reads_only_the_label_column(tmp_path):
+    """For a wide CSV (one label + many feature columns), the validator
+    should read only the label column — counting distinct labels doesn't
+    need the features, and a multi-GB proteomics panel would otherwise
+    OOM (cf. DataValidator's chunked path). NOTE: this test only pins the
+    result on a wide file; it does not assert which columns were read."""
+    p = tmp_path / "wide.csv"
+    cols = ",".join([f"f{i:02d}" for i in range(50)] + ["label"])
+    rows = "\n".join(
+        [",".join(["0.5"] * 50 + [("A" if i % 2 else "B")]) for i in range(20)]
+    )
+    p.write_text(cols + "\n" + rows + "\n")
+    result = LabelDiversityValidator().validate(str(p))
+    assert result.is_valid
+    assert result.metadata["distinct_count"] == 2
+
+
+def test_csv_path_rejects_single_label(tmp_path):
+    """End-to-end CSV-path test of the failure case — mirrors the
+    adversarial test against v0.3.10-rc1 that surfaced #251."""
+    p = tmp_path / "single.csv"
+    p.write_text("id,label\n1,X\n2,X\n3,X\n")
+    result = LabelDiversityValidator().validate(str(p))
+    assert not result.is_valid
+    assert "1 distinct" in result.errors[0]
+    assert "'X'" in result.errors[0] or "'X'" in str(result.metadata.get("value_counts", {}))
+
+
+def test_csv_quoted_header_does_not_skew_multilabel(tmp_path):
+    """A quoted/comma-bearing header must not trip the column resolution.
+
+    Regression (bugbot #252, medium): the old loader resolved the label
+    column with a naive ``header_line.split(",")``, which splits inside
+    quoted headers and diverges from pandas. When it failed to find the
+    column it fell back to ``nrows=1`` and counted distinct labels on that
+    single row — rejecting a perfectly diverse dataset. Resolving against
+    pandas' own header parse (nrows=0) fixes it, so a header like
+    ``"feature,with,commas"`` alongside ``label`` reads the full column.
+    """
+    p = tmp_path / "quoted.csv"
+    # The first column's header literally contains commas (quoted).
+    p.write_text(
+        '"feature,with,commas",label\n'
+        + "\n".join(f"{i},{'A' if i % 2 else 'B'}" for i in range(20))
+        + "\n"
+    )
+    result = LabelDiversityValidator().validate(str(p))
+    assert result.is_valid, f"expected valid; errors={result.errors}"
+    assert result.metadata["distinct_count"] == 2
+
+
+def test_csv_read_error_fails_closed_not_skipped(tmp_path, monkeypatch):
+    """A read failure must FAIL the check, not silently pass.
+
+    Regression (bugbot #252, high): ``_load_data`` previously swallowed any
+    read exception and returned ``None``, which ``validate`` treats as an
+    empty/benign dataset → valid. A single-label CSV whose targeted read
+    errored could sail through preflight and hit the backend rejection this
+    validator exists to prevent. Read errors now propagate to ``validate``'s
+    handler and fail the check.
+    """
+    p = tmp_path / "boom.csv"
+    p.write_text("id,label\n1,X\n2,X\n")
+
+    real_read_csv = pd.read_csv
+
+    def _boom(path, *args, **kwargs):
+        # Let the cheap header probe (nrows=0) succeed, then blow up on the
+        # actual data read — mimics a usecols/encoding failure mid-load.
+        if kwargs.get("nrows") == 0:
+            return real_read_csv(path, *args, **kwargs)
+        raise ValueError("simulated CSV read failure")
+
+    monkeypatch.setattr(pd, "read_csv", _boom)
+    result = LabelDiversityValidator().validate(str(p))
+    assert not result.is_valid
+    assert "validation error" in result.errors[0].lower()
+
+
+# ---------------------------------------------------------------------------
+# Custom column name
+# ---------------------------------------------------------------------------
+
+def test_custom_label_column_name():
+    """When the user configures a non-default label column, the
+    validator must check THAT column (mirrors BIOLabelValidator's
+    behavior with custom columns)."""
+    df = pd.DataFrame({"target": ["A", "B"], "label": ["X", "X"]})
+    # Custom column is the diverse one — should pass.
+    assert LabelDiversityValidator(label_column="target").validate(df).is_valid
+    # The default `label` column is single-value here — should fail.
+    assert not LabelDiversityValidator(label_column="label").validate(df).is_valid
diff --git a/tests/test_validators_mapping.py b/tests/test_validators_mapping.py
@@ -24,6 +24,9 @@
     KeypointVisibilityValidator,
 )
 from tracebloc_ingestor.validators.tokenizer_validator import TokenizerValidator
+from tracebloc_ingestor.validators.label_diversity_validator import (
+    LabelDiversityValidator,
+)
 
 
 IMAGE_OPTS = {"extension": FileExtension.JPG, "target_size": [224, 224]}
@@ -38,11 +41,40 @@ def test_image_classification():
     assert _types(v) == [
         FileTypeValidator,
         ImageResolutionValidator,
+        LabelDiversityValidator,
         TableNameValidator,
         DuplicateValidator,
     ]
 
 
+def test_classification_categories_include_label_diversity():
+    """Single-label classification is caught at preflight across every
+    classification-family category — image/object/semantic/keypoint/
+    tabular/text — but NOT token_classification (its label is a per-token
+    BIO sequence, not a single class) or the regression / self-supervised
+    families (issue #251)."""
+    for cat in (
+        TaskCategory.IMAGE_CLASSIFICATION,
+        TaskCategory.OBJECT_DETECTION,
+        TaskCategory.SEMANTIC_SEGMENTATION,
+        TaskCategory.KEYPOINT_DETECTION,
+        TaskCategory.TABULAR_CLASSIFICATION,
+        TaskCategory.TEXT_CLASSIFICATION,
+    ):
+        assert LabelDiversityValidator in _types(map_validators(cat, IMAGE_OPTS)), cat
+
+    for cat in (
+        TaskCategory.TOKEN_CLASSIFICATION,
+        TaskCategory.TABULAR_REGRESSION,
+        TaskCategory.TIME_SERIES_FORECASTING,
+        TaskCategory.TIME_TO_EVENT_PREDICTION,
+        TaskCategory.MASKED_LANGUAGE_MODELING,
+    ):
+        assert LabelDiversityValidator not in _types(
+            map_validators(cat, {"schema": {"a": "INT"}})
+        ), cat
+
+
 def test_object_detection_includes_xml_validator():
     v = map_validators(TaskCategory.OBJECT_DETECTION, IMAGE_OPTS)
     types = _types(v)

diff --git a/tracebloc_ingestor/api/client.py b/tracebloc_ingestor/api/client.py
@@ -41,6 +41,13 @@ def __init__(self, config: Config):
         self.config = config
         self.session = self._create_session()
 
+        # Last `prepare_dataset` HTTP-error body, retained so callers
+        # can include the actual backend reason in the user-visible
+        # RuntimeError instead of just pointing at "the logged API
+        # error above" (issue #251). Set by `prepare_dataset`'s error
+        # handler; remains None on a clean run.
+        self.last_prepare_error: Optional[str] = None
+
         # Auth resolution order:
         #   1. local mode  → mock token, no network call
         #   2. BACKEND_TOKEN set → use it directly (preferred; mirrors the
@@ -414,6 +421,13 @@ def prepare_dataset(
         Returns:
             bool: True if successful, False otherwise
         """
+        # Clear any error stashed by a previous prepare_dataset call up front,
+        # so an early `return False` below (local mode, invalid category)
+        # can't leave a stale message that base.py then attaches to an
+        # unrelated failure (bugbot #252). Only the exception handler in THIS
+        # call should ever populate it.
+        self.last_prepare_error = None
+
         # Skip API calls in local mode
         if self.config.EDGE_ENV == "local":
             logger.info(f"Mock: Would prepare dataset {category}")
@@ -454,8 +468,18 @@ def prepare_dataset(
                     f"{RED}Error preparing data: "
                     f"HTTP {e.response.status_code}: {body}{RESET}"
                 )
+                # Stash the backend's response so callers can surface the
+                # actual reason (e.g. "Please provide atleast 2 labels.")
+                # in the user-visible error — instead of pointing at "the
+                # logged API error above" which the user has to grep for.
+                # Issue #251: misleading "Backend failed to prepare the
+                # dataset" message that buried a clear backend reason.
+                self.last_prepare_error = (
+                    f"HTTP {e.response.status_code}: {body}"
+                )
             else:
                 logger.error(f"{RED}Error preparing data: {str(e)[:500]}{RESET}")
+                self.last_prepare_error = str(e)[:500]
             return False
 
     def create_dataset(

diff --git a/tracebloc_ingestor/ingestors/base.py b/tracebloc_ingestor/ingestors/base.py
@@ -966,10 +966,20 @@ def _ingest_with_lock(
                     self.data_format,
                     self.intent,
                 ):
+                    # Surface the BACKEND'S actual reason in the user-visible
+                    # error — not just "see the logged API error above" which
+                    # forces the user to grep the log for the real cause.
+                    # Issue #251: a misleading "Backend failed to prepare the
+                    # dataset" message buried the real reason (e.g. "Please
+                    # provide atleast 2 labels.") in a preceding ERROR line.
+                    detail = (
+                        getattr(self.api_client, "last_prepare_error", None)
+                        or "see the logged API error above"
+                    )
                     raise RuntimeError(
-                        "Backend failed to prepare the dataset; it was NOT "
-                        "registered (its rows are already in the database). See "
-                        "the logged API error above."
+                        f"Backend failed to prepare the dataset; it was NOT "
+                        f"registered (its rows are already in the database). "
+                        f"Backend response: {detail}"
                     )
 
                 self.api_client.create_dataset(