Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions tests/test_ingestor_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,40 @@ def test_process_record_applies_bucket_label_policy():
assert rec["label"] != "12345"


def test_process_record_strips_whitespace_from_string_label():
    """Check that a padded string label is trimmed by ``process_record``.

    Regression test for issue #261: the raw value ``" A "`` must come
    out as ``"A"``, so a CSV that mixes ``" A "`` with ``"A"`` does not
    end up with two distinct classes (silent label-set corruption).
    """
    raw_record = {"lbl": " A ", "filename": "f"}
    ingestor = make_ingestor(label_column="lbl", category=None)
    cleaned = ingestor.process_record(raw_record)
    # The label is expected back unchanged apart from the surrounding
    # whitespace being removed.
    assert cleaned["label"] == "A", f"expected stripped 'A', got {cleaned['label']!r}"


def test_process_record_label_strip_makes_whitespace_variants_equivalent():
    """Check that ``" A "`` and ``"A"`` yield the same cleaned label.

    Two records that differ only in whitespace around the label value
    are processed by the same ingestor; both must end up labelled
    ``"A"``.
    """
    ingestor = make_ingestor(label_column="lbl", category=None)
    padded = ingestor.process_record({"lbl": " A ", "filename": "f1"})
    bare = ingestor.process_record({"lbl": "A", "filename": "f2"})
    assert padded["label"] == bare["label"] == "A"


def test_process_record_label_strip_preserves_non_string_labels():
    """Check that a non-string label (here the int ``42``) is returned as-is.

    Non-string labels have no surrounding whitespace to remove, so the
    whitespace handling must leave them untouched.
    """
    numeric_record = {"lbl": 42, "filename": "f"}
    ingestor = make_ingestor(label_column="lbl", category=None)
    cleaned = ingestor.process_record(numeric_record)
    # The integer label must come back equal to the input value.
    assert cleaned["label"] == 42


def test_process_record_preserves_none_for_sql_null():
"""Null-like values (Python None, NaN, pd.NA, NaT) must round-trip as
Python None so the DB binder writes SQL NULL — not as the literal
Expand Down
39 changes: 39 additions & 0 deletions tests/test_label_diversity_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,45 @@ def test_string_schema_label_no_numeric_collapse(tmp_path):
assert result.metadata["distinct_count"] == 4


# ---------------------------------------------------------------------------
# Whitespace handling — silent-corruption fix (#261)
# ---------------------------------------------------------------------------

def test_whitespace_duplicates_collapsed_and_warned():
    """Check that whitespace-padded duplicates collapse and raise a warning.

    The label column holds ``" A "``, ``"A"``, ``"B"`` and ``" A"``.
    After whitespace variants are collapsed, only two classes remain
    (``A`` and ``B``), so the dataset is still valid. The validator must
    also emit a warning about the whitespace variants (issue #261).
    """
    frame = pd.DataFrame({"label": [" A ", "A", "B", " A"]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    # Two classes remain once the padded variants of "A" are merged.
    assert outcome.is_valid
    assert outcome.metadata["distinct_count"] == 2
    # A warning has to be present and has to mention whitespace.
    # NOTE(review): the message is not checked for the offending raw
    # variants themselves; only the word "whitespace" is asserted.
    assert outcome.warnings, "expected a whitespace-duplicate warning"
    first_warning = outcome.warnings[0]
    assert "whitespace" in first_warning.lower()


def test_whitespace_only_single_value_still_fails():
    """Check that variants of one label still count as a single class.

    ``[" A ", "A", " A"]`` collapses to the single value ``A``, so the
    validator must reject the dataset: whitespace differences must not
    inflate the distinct count past the diversity requirement.
    """
    frame = pd.DataFrame({"label": [" A ", "A", " A"]})
    validator = LabelDiversityValidator()
    outcome = validator.validate(frame)
    assert not outcome.is_valid
    assert "1 distinct" in outcome.errors[0]
    # The error text must tell the user that whitespace stripping was
    # applied before counting.
    assert "whitespace" in outcome.errors[0].lower()


# ---------------------------------------------------------------------------
# Custom column name
# ---------------------------------------------------------------------------
Expand Down
16 changes: 16 additions & 0 deletions tracebloc_ingestor/ingestors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,22 @@ def _map_unique_id(
label_val = label_val.item()
except (ValueError, AttributeError):
pass
# Strip surrounding whitespace from string label values before
# the policy runs — protects against silent label-set
# corruption (issue #261) where ``" A "`` and ``"A"`` would
# otherwise land as distinct classes in MySQL. A user
# copy-pasting from Excel / another tool routinely has
# whitespace they can't see; the framework's contract for
# the label column is "the class identifier", and class
# identifiers don't carry whitespace semantics. The strip
# mirrors what the framework already does for the
# ``data_id`` column (line below) and for column headers
# (``chunk.columns.str.strip()`` in csv_ingestor).
#
# Non-string labels (INT class IDs, BIOLabelValidator's
# space-separated tags, etc.) pass through unchanged.
if isinstance(label_val, str):
label_val = label_val.strip()
cleaned_record["label"] = label_policy_module.apply(
label_val, self.label_policy
)
Expand Down
67 changes: 57 additions & 10 deletions tracebloc_ingestor/validators/label_diversity_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,40 +96,87 @@ def validate(self, data: Any, **kwargs) -> ValidationResult:
metadata={"label_column": self.label_column},
)

distinct = df[col].dropna().unique()
# Surface whitespace-collapsable duplicates before counting
# distinct values (issue #261). A user CSV with values like
# ``" A "`` mixed with ``"A"`` looks fine in a notebook
# (pandas treats them as distinct, and so do we), inserts
# into MySQL with both stored verbatim, and trains a model
# with one extra class the user never intended — silent
# label-set corruption. Spot the pattern here so the warning
# reaches the user at preflight, and ingestion strips at the
# write side (BaseIngestor.process_record) so MySQL only
# sees the trimmed value.
warnings: list = []
raw_distinct = df[col].dropna().unique()
# Build the strip-collapsed set on string-typed values only
# (label columns are VARCHAR/CHAR/TEXT in our schema; INT or
# FLOAT labels — if anyone ever has them — have no whitespace
# to collapse).
collapsed: dict = {}
for v in raw_distinct:
if isinstance(v, str):
stripped = v.strip()
collapsed.setdefault(stripped, []).append(v)
else:
collapsed.setdefault(v, []).append(v)
whitespace_dupes = {
stripped: variants
for stripped, variants in collapsed.items()
if len(variants) > 1
}
if whitespace_dupes:
# Cap the message length — a wholly-messy dataset shouldn't
# produce a 10kB warning.
sample = dict(list(whitespace_dupes.items())[:3])
warnings.append(
f"label column '{col}' contains values that differ only "
f"in surrounding whitespace and will be stored as "
f"separate classes unless cleaned upstream: {sample}. "
f"Ingestion strips whitespace from the label column at "
f"write time, so MySQL stores the trimmed value β€” but "
f"if you intended these to be DIFFERENT classes, fix "
f"the CSV before re-running (see issue #261)."
)

# Count distinct AFTER collapsing whitespace duplicates — those
# land as ONE class in MySQL after the write-side strip, so the
# validator must use the same number when deciding whether the
# dataset crosses the min_distinct gate.
distinct = list(collapsed.keys())
n = len(distinct)
if n < self.min_distinct:
# Show the actual values found, capped — a user with a
# 50k-row degenerate dataset doesn't need the full list,
# but the first few values plus the count tell them
# exactly what's wrong with the input.
sample = list(distinct[:5])
sample = distinct[:5]
# Surface counts per distinct value to make "all one
# class" stand out clearly: "{'X': 10}" vs "{'X': 10000}"
# both clearly read as single-class but the latter gives
# the user the full row count for free.
value_counts = df[col].value_counts(dropna=True).head(5).to_dict()
raw_counts = df[col].value_counts(dropna=True).head(5).to_dict()
return self._create_result(
is_valid=False,
errors=[
f"Classification category requires at least "
f"{self.min_distinct} distinct label values in column "
f"'{col}'; this dataset has {n} distinct value(s): "
f"{sample}. Value counts: {value_counts}. If this is "
f"intentional (e.g. you have a continuous target), "
f"pick a regression-family category like "
f"tabular_regression or time_series_forecasting "
f"instead."
f"'{col}' (after whitespace stripping); this dataset "
f"has {n} distinct value(s): {sample}. Raw value "
f"counts: {raw_counts}. If this is intentional "
f"(e.g. you have a continuous target), pick a "
f"regression-family category like tabular_regression "
f"or time_series_forecasting instead."
],
metadata={
"label_column": col,
"distinct_count": n,
"value_counts": value_counts,
"value_counts": raw_counts,
},
)

return self._create_result(
is_valid=True,
warnings=warnings or None,
metadata={
"label_column": col,
"distinct_count": n,
Expand Down
Loading