Skip to content
47 changes: 47 additions & 0 deletions tests/test_api_client_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,53 @@ def test_prepare_dataset_local_mode():
get.assert_not_called()


def test_prepare_dataset_error_captures_response_body():
    """An HTTP failure in `prepare_dataset` is recorded on the client.

    The session's ``get`` is patched to return a 400 response whose text
    is a backend-style JSON message. The call must return ``False`` and
    leave `last_prepare_error` populated with both the status code and
    the response text, so a caller can put the backend's reason into its
    own user-visible error. Issue #251.
    """
    api = _client()
    payload = '{"message":"Please provide atleast 2 labels."}'
    failing_get = patch.object(
        api.session, "get", return_value=_resp(400, text=payload)
    )
    with failing_get:
        succeeded = api.prepare_dataset(
            TaskCategory.TABULAR_CLASSIFICATION, "ing", "tabular", "train"
        )

    assert succeeded is False
    recorded = api.last_prepare_error
    assert recorded is not None
    # Both halves must be present: the status code and the backend's text.
    assert "HTTP 400" in recorded
    assert "Please provide" in recorded


def test_prepare_dataset_last_error_starts_unset():
    """A freshly constructed client reports no prepare error.

    `last_prepare_error` must be ``None`` before any `prepare_dataset`
    call has failed; base.py substitutes its generic 'see the logged API
    error above' text whenever this attribute is missing or falsy.
    """
    fresh_client = _client()
    assert fresh_client.last_prepare_error is None


def test_prepare_dataset_network_error_captures_string():
    """A transport-level failure is recorded as the exception's text.

    The session's ``get`` is patched to raise a ``ConnectionError`` (the
    kind of failure that carries no HTTP response). `prepare_dataset`
    must return ``False`` and `last_prepare_error` must contain the
    exception message rather than staying ``None``.
    """
    import requests as _req

    api = _client()
    failure = _req.exceptions.ConnectionError("name resolution failed")
    with patch.object(api.session, "get", side_effect=failure):
        succeeded = api.prepare_dataset(
            TaskCategory.TABULAR_CLASSIFICATION, "ing", "tabular", "train"
        )

    assert succeeded is False
    recorded = api.last_prepare_error
    assert recorded is not None
    assert "name resolution failed" in recorded


# ---------------------------------------------------------------------------
# create_dataset
# ---------------------------------------------------------------------------
Expand Down
161 changes: 161 additions & 0 deletions tests/test_label_diversity_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""Tests for LabelDiversityValidator — fail-fast on single-label classification.

Surfaces the cause locally (with the actual distinct label values) instead
of letting the backend reject with the misleading "Backend failed to
prepare the dataset; it was NOT registered" cascade once rows have already
landed in MySQL. Issue #251.
"""

from __future__ import annotations

import pandas as pd
import pytest

from tracebloc_ingestor.validators.label_diversity_validator import (
LabelDiversityValidator,
)


# ---------------------------------------------------------------------------
# Positive cases — must pass
# ---------------------------------------------------------------------------

def test_two_distinct_labels_passes():
    """Two distinct label values spread over four rows validate cleanly."""
    frame = pd.DataFrame({"a": [1, 2, 3, 4], "label": ["A", "B", "A", "B"]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert outcome.is_valid


def test_many_distinct_labels_passes():
    """Five distinct label values (one per row) validate cleanly."""
    frame = pd.DataFrame({"label": ["A", "B", "C", "D", "E"]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert outcome.is_valid


def test_distinct_labels_with_nulls_counts_only_non_null():
    """Null cells are excluded from the distinct-label count.

    Two real values plus two nulls must still validate, and the reported
    ``distinct_count`` must be 2 (the nulls are not counted).
    """
    frame = pd.DataFrame({"label": ["A", "B", None, None]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert outcome.is_valid
    assert outcome.metadata["distinct_count"] == 2


def test_label_column_case_insensitive_match():
    """A capitalised ``Label`` header is accepted by a default validator.

    The frame has no lowercase ``label`` column, only ``Label``; a
    validator built with default arguments must still find it and pass.
    """
    frame = pd.DataFrame({"a": [1, 2], "Label": ["X", "Y"]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert outcome.is_valid


# ---------------------------------------------------------------------------
# Failure cases — must reject with a CLEAR message
# ---------------------------------------------------------------------------

def test_single_label_fails_with_distinct_value_listed():
    """Ten rows that all carry ``label = "X"`` are rejected with context.

    The offending value must be visible either in the first error message
    or in the ``value_counts`` metadata, and the message itself must
    mention both "classification" and "distinct" so the reason is clear.
    """
    frame = pd.DataFrame({"label": ["X"] * 10})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert not outcome.is_valid

    first_error = outcome.errors[0]
    # The single value found must be surfaced to the user somewhere.
    assert "'X'" in first_error or "'X'" in str(
        outcome.metadata.get("value_counts", {})
    )
    # The message must also explain why the dataset is rejected.
    lowered = first_error.lower()
    assert "classification" in lowered
    assert "distinct" in lowered


def test_single_label_only_nulls_fails():
    """A label column that contains nothing but nulls is rejected."""
    frame = pd.DataFrame({"label": [None, None, None]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert not outcome.is_valid


def test_single_label_one_value_with_some_nulls_fails():
    """A single real value mixed with nulls is still rejected."""
    frame = pd.DataFrame({"label": ["A", "A", None, "A", None]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert not outcome.is_valid


def test_error_mentions_regression_alternative():
    """The single-label error points the user towards regression.

    Someone with a continuous target should be steered to a
    regression-family category rather than told to invent a second
    label, so the first error message must contain the word "regression".
    """
    frame = pd.DataFrame({"label": ["A"] * 5})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert not outcome.is_valid
    first_error = outcome.errors[0]
    assert "regression" in first_error.lower()


# ---------------------------------------------------------------------------
# Defensive paths — must not double-report (other validators own those)
# ---------------------------------------------------------------------------

def test_empty_dataframe_passes_silently():
    """An empty DataFrame passes and reports zero rows checked.

    Empty input is treated as someone else's problem to report, so this
    validator must not flag it a second time.
    """
    empty_frame = pd.DataFrame()
    validator = LabelDiversityValidator()
    outcome = validator.validate(empty_frame)
    assert outcome.is_valid
    assert outcome.metadata["rows_checked"] == 0


def test_label_column_missing_passes_with_warning():
    """A frame without any label column passes but emits a warning.

    The result must be valid, and at least one warning must contain the
    text "not found in CSV".
    """
    frame = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert outcome.is_valid
    # `warnings` may be None/empty on the result, so normalise to a list.
    notes = outcome.warnings or []
    assert any("not found in CSV" in note for note in notes)


# ---------------------------------------------------------------------------
# CSV-path streaming check — must not load the whole wide CSV into memory
# ---------------------------------------------------------------------------

def test_csv_path_reads_only_the_label_column(tmp_path):
    """A wide CSV passed by path validates and reports two labels.

    The file has 50 feature columns (``f00``..``f49``) followed by a
    ``label`` column alternating between "B" and "A" over 20 rows. The
    intent is that the validator only needs the label column of such a
    file; this test checks the outcome (valid, ``distinct_count == 2``)
    and does not itself inspect which columns were read.
    """
    csv_file = tmp_path / "wide.csv"

    header = [f"f{idx:02d}" for idx in range(50)] + ["label"]
    lines = [",".join(header)]
    for row_no in range(20):
        tag = "A" if row_no % 2 else "B"
        lines.append(",".join(["0.5"] * 50 + [tag]))
    csv_file.write_text("\n".join(lines) + "\n")

    validator = LabelDiversityValidator()
    outcome = validator.validate(str(csv_file))
    assert outcome.is_valid
    assert outcome.metadata["distinct_count"] == 2


def test_csv_path_rejects_single_label(tmp_path):
    """A CSV file whose label column holds only "X" is rejected.

    Exercises the path-based entry point end to end: the first error must
    contain "1 distinct", and the value "X" must be visible either in
    that message or in the ``value_counts`` metadata. Issue #251.
    """
    csv_file = tmp_path / "single.csv"
    csv_file.write_text("id,label\n1,X\n2,X\n3,X\n")

    validator = LabelDiversityValidator()
    outcome = validator.validate(str(csv_file))
    assert not outcome.is_valid

    first_error = outcome.errors[0]
    assert "1 distinct" in first_error
    assert "'X'" in first_error or "'X'" in str(
        outcome.metadata.get("value_counts", {})
    )


# ---------------------------------------------------------------------------
# Custom column name
# ---------------------------------------------------------------------------

def test_custom_label_column_name():
    """The validator inspects whichever column it was configured with.

    The same frame holds a diverse ``target`` column and a single-valued
    ``label`` column, so the verdict must flip with `label_column`.
    """
    frame = pd.DataFrame({"target": ["A", "B"], "label": ["X", "X"]})

    # Pointed at the diverse custom column: accepted.
    on_target = LabelDiversityValidator(label_column="target").validate(frame)
    assert on_target.is_valid

    # Pointed at the single-valued `label` column: rejected.
    on_label = LabelDiversityValidator(label_column="label").validate(frame)
    assert not on_label.is_valid
17 changes: 17 additions & 0 deletions tracebloc_ingestor/api/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ def __init__(self, config: Config):
self.config = config
self.session = self._create_session()

# Last `prepare_dataset` HTTP-error body, retained so callers
# can include the actual backend reason in the user-visible
# RuntimeError instead of just pointing at "the logged API
# error above" (issue #251). Set by `prepare_dataset`'s error
# handler; remains None on a clean run.
self.last_prepare_error: Optional[str] = None

# Auth resolution order:
# 1. local mode → mock token, no network call
# 2. BACKEND_TOKEN set → use it directly (preferred; mirrors the
Expand Down Expand Up @@ -454,8 +461,18 @@ def prepare_dataset(
f"{RED}Error preparing data: "
f"HTTP {e.response.status_code}: {body}{RESET}"
)
# Stash the backend's response so callers can surface the
# actual reason (e.g. "Please provide atleast 2 labels.")
# in the user-visible error — instead of pointing at "the
# logged API error above" which the user has to grep for.
# Issue #251: misleading "Backend failed to prepare the
# dataset" message that buried a clear backend reason.
self.last_prepare_error = (
f"HTTP {e.response.status_code}: {body}"
)
else:
logger.error(f"{RED}Error preparing data: {str(e)[:500]}{RESET}")
self.last_prepare_error = str(e)[:500]
return False

def create_dataset(
Expand Down
16 changes: 13 additions & 3 deletions tracebloc_ingestor/ingestors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -966,10 +966,20 @@ def _ingest_with_lock(
self.data_format,
self.intent,
):
# Surface the BACKEND'S actual reason in the user-visible
# error — not just "see the logged API error above" which
# forces the user to grep the log for the real cause.
# Issue #251: a misleading "Backend failed to prepare the
# dataset" message buried the real reason (e.g. "Please
# provide atleast 2 labels.") in a preceding ERROR line.
detail = (
getattr(self.api_client, "last_prepare_error", None)
or "see the logged API error above"
)
raise RuntimeError(
"Backend failed to prepare the dataset; it was NOT "
"registered (its rows are already in the database). See "
"the logged API error above."
f"Backend failed to prepare the dataset; it was NOT "
f"registered (its rows are already in the database). "
f"Backend response: {detail}"
)

self.api_client.create_dataset(
Expand Down
27 changes: 27 additions & 0 deletions tracebloc_ingestor/utils/validators_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,38 @@
from tracebloc_ingestor.validators.tokenizer_validator import TokenizerValidator
from tracebloc_ingestor.validators.file_pairing_validator import FilePairingValidator
from tracebloc_ingestor.validators.bio_label_validator import BIOLabelValidator
from tracebloc_ingestor.validators.label_diversity_validator import (
LabelDiversityValidator,
)
from tracebloc_ingestor.utils.constants import TaskCategory, FileExtension


def _label_diversity_validator(options: Dict[str, Any]) -> LabelDiversityValidator:
    """Build the LabelDiversityValidator shared by classification branches.

    The label column comes from ``options["label_column"]``; when that key
    is missing or falsy, the default name ``label`` is used. Keeping the
    construction in one place means every task-category branch wires the
    validator the same way.

    Issue #251: a classification dataset with a single distinct label is
    rejected by the backend at ``/global_meta/prepare/`` with
    ``HTTP 400: "Please provide atleast 2 labels."``. Running this check
    at preflight reports the real cause (including the offending label
    value(s)) before the rows land in MySQL, instead of the misleading
    "Backend failed to prepare the dataset" message afterwards.
    """
    column_name = options.get("label_column")
    if not column_name:
        # Missing, None, or empty string all fall back to the default.
        column_name = "label"
    return LabelDiversityValidator(label_column=column_name)


def map_validators(
task_category: TaskCategory, options: Dict[str, Any]
) -> List[BaseValidator]:
if task_category == TaskCategory.IMAGE_CLASSIFICATION:
return [
FileTypeValidator(allowed_extension=options["extension"], path="images"),
ImageResolutionValidator(expected_resolution=options["target_size"]),
_label_diversity_validator(options),
TableNameValidator(),
DuplicateValidator(),
]
Expand All @@ -48,6 +70,7 @@ def map_validators(
sidecar_label="annotation",
),
ImageResolutionValidator(expected_resolution=options["target_size"]),
_label_diversity_validator(options),
TableNameValidator(),
DuplicateValidator(),
]
Expand All @@ -57,6 +80,7 @@ def map_validators(
# Add data validator if schema is provided
if options.get("schema"):
validators.append(DataValidator(schema=options["schema"]))
validators.append(_label_diversity_validator(options))
validators.append(TableNameValidator())
validators.append(DuplicateValidator())

Expand All @@ -80,6 +104,7 @@ def map_validators(
if options.get("schema"):
validators.append(DataValidator(schema=options["schema"]))

validators.append(_label_diversity_validator(options))
validators.append(TableNameValidator())
validators.append(DuplicateValidator())

Expand Down Expand Up @@ -189,6 +214,7 @@ def map_validators(
sidecar_suffix="_mask",
),
ImageResolutionValidator(expected_resolution=options["target_size"]),
_label_diversity_validator(options),
TableNameValidator(),
DuplicateValidator(),
]
Expand All @@ -206,6 +232,7 @@ def map_validators(
num_keypoints=options.get("number_of_keypoints")
),
KeypointVisibilityValidator(),
_label_diversity_validator(options),
TableNameValidator(),
DuplicateValidator(),
]
Expand Down
Loading
Loading