From 2ac0f772ac0644dfd2084950631b1fea0ba7ee8f Mon Sep 17 00:00:00 2001
From: Divya <divyasingh@tracebloc.io>
Date: Mon, 15 Jun 2026 16:10:48 +0530
Subject: [PATCH] fix(label): strip whitespace from label values to prevent
 silent class duplication (issue #261)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A new-user CSV with ``"  A  "`` (whitespace-padded) mixed with ``"A"``
landed in MySQL as TWO distinct classes — the ingestor reported
``🎉 Ingestion completed successfully!``, the LabelDiversityValidator
happily passed (2 distinct values ≥ 2), and a model trained on the
output learned 3 classes instead of the 2 the user intended. Silent
label-set corruption.

The framework already strips column HEADERS (``chunk.columns.str.strip()``
in csv_ingestor) and the ``data_id`` column (line above in
process_record). Extending the same convention to label-column VALUES is
internally consistent — a label is a class identifier, and class
identifiers don't carry whitespace semantics. (Other VARCHAR columns —
free-text descriptions, names, etc. — are left alone; only the label
column gets this treatment.)

## Two changes

### 1. Write-side strip — ``BaseIngestor.process_record``

Strip ``label_val.strip()`` for string labels before ``label_policy.apply``
runs. Numeric labels (INT class IDs) and other non-strings pass through
unchanged.

### 2. Preflight WARNING — ``LabelDiversityValidator``

Collapse whitespace-equivalent values BEFORE counting distinct, AND
surface a WARNING listing the pre-strip variants when collapsing
changed anything. The validator and the write side now use the same
distinct count, so a dataset with ``["  A  ", "A", "B"]``:
  - Validator: distinct=2 (A, B), warning lists ``{"A": ["  A  ", "A"]}``
  - Ingest: stores ``A`` and ``B`` — 2 classes in MySQL, as intended

A dataset where ALL variants collapse to a single class
(``["  A  ", "A", "  A"]``) still fails diversity:
  - Validator: distinct=1, rejects with a message that explicitly
    mentions whitespace stripping so the user knows what the validator
    did

## Tests

``test_label_diversity_validator.py`` (2 new):
  - ``test_whitespace_duplicates_collapsed_and_warned`` — mixed
    dataset passes; warning names the variants
  - ``test_whitespace_only_single_value_still_fails`` — all-collapse
    case rejects with a clear message

``test_ingestor_base.py`` (3 new):
  - ``test_process_record_strips_whitespace_from_string_label`` —
    ``"  A  "`` → ``"A"``
  - ``test_process_record_label_strip_makes_whitespace_variants_equivalent``
    — two records with whitespace variants produce identical cleaned
    labels
  - ``test_process_record_label_strip_preserves_non_string_labels`` —
    INT 42 passes through unchanged

## Surfaced by

Adversarial new-user test pass against v0.3.10-rc2 (scenario N15):
shipped CSV had ``"  A  "`` from an Excel paste, MySQL ended up with 3
distinct classes ``{"  A  ", "A", "B"}`` and the ingestor reported
success.

## Related

- #251: LabelDiversityValidator caught the inverse case (1 distinct).
  This fixes the silent-but-more-distinct sibling: validator accepts
  because the strings are bytewise distinct, but the user has fewer
  semantic classes than that.
- The whitespace strip on column HEADERS in csv_ingestor (already in
  develop) establishes the convention; this extends it to label values.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_ingestor_base.py                   | 34 ++++++++++
 tests/test_label_diversity_validator.py       | 39 +++++++++++
 tracebloc_ingestor/ingestors/base.py          | 16 +++++
 .../validators/label_diversity_validator.py   | 67 ++++++++++++++++---
 4 files changed, 146 insertions(+), 10 deletions(-)

diff --git a/tests/test_ingestor_base.py b/tests/test_ingestor_base.py
index 4afe527..27c88e3 100644
--- a/tests/test_ingestor_base.py
+++ b/tests/test_ingestor_base.py
@@ -138,6 +138,40 @@ def test_process_record_applies_bucket_label_policy():
     assert rec["label"] != "12345"
 
 
+def test_process_record_strips_whitespace_from_string_label():
+    """Issue #261: a raw label value like ``"  A  "`` must be stripped
+    before the label policy runs, so MySQL stores ``"A"`` and a CSV
+    with ``"  A  "`` mixed with ``"A"`` doesn't land as two distinct
+    classes (silent label-set corruption).
+
+    The strip mirrors what the framework does for ``data_id`` (line
+    below in process_record) and for column headers (csv_ingestor).
+    """
+    ing = make_ingestor(label_column="lbl", category=None)
+    rec = ing.process_record({"lbl": "  A  ", "filename": "f"})
+    # PASSTHROUGH policy: label lands verbatim, but stripped.
+    assert rec["label"] == "A", f"expected stripped 'A', got {rec['label']!r}"
+
+
+def test_process_record_label_strip_makes_whitespace_variants_equivalent():
+    """End-to-end check that two records with ``"  A  "`` and ``"A"``
+    produce the SAME cleaned label — the contract the corruption fix
+    establishes."""
+    ing = make_ingestor(label_column="lbl", category=None)
+    rec1 = ing.process_record({"lbl": "  A  ", "filename": "f1"})
+    rec2 = ing.process_record({"lbl": "A", "filename": "f2"})
+    assert rec1["label"] == rec2["label"] == "A"
+
+
+def test_process_record_label_strip_preserves_non_string_labels():
+    """INT class IDs and other non-string labels (which have no
+    whitespace to strip) must pass through unchanged."""
+    ing = make_ingestor(label_column="lbl", category=None)
+    rec = ing.process_record({"lbl": 42, "filename": "f"})
+    # Numeric labels pass through the policy unchanged.
+    assert rec["label"] == 42
+
+
 def test_process_record_preserves_none_for_sql_null():
     """Null-like values (Python None, NaN, pd.NA, NaT) must round-trip as
     Python None so the DB binder writes SQL NULL — not as the literal
diff --git a/tests/test_label_diversity_validator.py b/tests/test_label_diversity_validator.py
index 6ba25d2..c7586ed 100644
--- a/tests/test_label_diversity_validator.py
+++ b/tests/test_label_diversity_validator.py
@@ -268,6 +268,45 @@ def test_string_schema_label_no_numeric_collapse(tmp_path):
     assert result.metadata["distinct_count"] == 4
 
 
+# ---------------------------------------------------------------------------
+# Whitespace handling — silent-corruption fix (#261)
+# ---------------------------------------------------------------------------
+
+def test_whitespace_duplicates_collapsed_and_warned():
+    """Whitespace-padded label values must collapse to the same class
+    AND surface a WARNING so the user knows the framework will strip
+    them at write time. Without this, a CSV with ``"  A  "`` and
+    ``"A"`` mixed would pass diversity (2 distinct) but train a model
+    with 3 classes after ingest — silent label-set corruption (#261).
+    """
+    df = pd.DataFrame({"label": ["  A  ", "A", "B", "  A"]})
+    result = LabelDiversityValidator().validate(df)
+    # After collapsing whitespace duplicates, distinct count == 2 (A, B).
+    assert result.is_valid
+    assert result.metadata["distinct_count"] == 2
+    # Warning must name the offending pre-strip variants so the user
+    # can fix their upstream data if they meant them to be different.
+    assert result.warnings, "expected a whitespace-duplicate warning"
+    w = result.warnings[0]
+    assert "whitespace" in w.lower()
+
+
+def test_whitespace_only_single_value_still_fails():
+    """If the dataset has ``["  A  ", "A", "  A"]`` (all collapse to ``A``),
+    after stripping it's a single-class dataset and the validator must
+    still reject — the silent-corruption fix doesn't undo the diversity
+    requirement, it just refuses to inflate distinct count via
+    whitespace differences.
+    """
+    df = pd.DataFrame({"label": ["  A  ", "A", "  A"]})
+    result = LabelDiversityValidator().validate(df)
+    assert not result.is_valid
+    assert "1 distinct" in result.errors[0]
+    # The error should mention whitespace stripping so the user knows
+    # what the validator did.
+    assert "whitespace" in result.errors[0].lower()
+
+
 # ---------------------------------------------------------------------------
 # Custom column name
 # ---------------------------------------------------------------------------
diff --git a/tracebloc_ingestor/ingestors/base.py b/tracebloc_ingestor/ingestors/base.py
index a0bd49b..87bf12c 100644
--- a/tracebloc_ingestor/ingestors/base.py
+++ b/tracebloc_ingestor/ingestors/base.py
@@ -313,6 +313,22 @@ def _map_unique_id(
                     label_val = label_val.item()
                 except (ValueError, AttributeError):
                     pass
+            # Strip surrounding whitespace from string label values before
+            # the policy runs — protects against silent label-set
+            # corruption (issue #261) where ``"  A  "`` and ``"A"`` would
+            # otherwise land as distinct classes in MySQL. A user
+            # copy-pasting from Excel / another tool routinely has
+            # whitespace they can't see; the framework's contract for
+            # the label column is "the class identifier", and class
+            # identifiers don't carry whitespace semantics. The strip
+            # mirrors what the framework already does for the
+            # ``data_id`` column (line below) and for column headers
+            # (``chunk.columns.str.strip()`` in csv_ingestor).
+            #
+            # Non-string labels (INT class IDs, BIOLabelValidator's
+            # space-separated tags, etc.) pass through unchanged.
+            if isinstance(label_val, str):
+                label_val = label_val.strip()
             cleaned_record["label"] = label_policy_module.apply(
                 label_val, self.label_policy
             )
diff --git a/tracebloc_ingestor/validators/label_diversity_validator.py b/tracebloc_ingestor/validators/label_diversity_validator.py
index 954b341..9c0d75c 100644
--- a/tracebloc_ingestor/validators/label_diversity_validator.py
+++ b/tracebloc_ingestor/validators/label_diversity_validator.py
@@ -96,40 +96,87 @@ def validate(self, data: Any, **kwargs) -> ValidationResult:
                     metadata={"label_column": self.label_column},
                 )
 
-            distinct = df[col].dropna().unique()
+            # Surface whitespace-collapsable duplicates before counting
+            # distinct values (issue #261). A user CSV with values like
+            # ``"  A  "`` mixed with ``"A"`` looks fine in a notebook
+            # (pandas treats them as distinct, and so do we), inserts
+            # into MySQL with both stored verbatim, and trains a model
+            # with one extra class the user never intended — silent
+            # label-set corruption. Spot the pattern here so the warning
+            # reaches the user at preflight, and ingestion strips at the
+            # write side (BaseIngestor.process_record) so MySQL only
+            # sees the trimmed value.
+            warnings: list = []
+            raw_distinct = df[col].dropna().unique()
+            # Build the strip-collapsed set on string-typed values only
+            # (label columns are VARCHAR/CHAR/TEXT in our schema; INT or
+            # FLOAT labels — if anyone ever has them — have no whitespace
+            # to collapse).
+            collapsed: dict = {}
+            for v in raw_distinct:
+                if isinstance(v, str):
+                    stripped = v.strip()
+                    collapsed.setdefault(stripped, []).append(v)
+                else:
+                    collapsed.setdefault(v, []).append(v)
+            whitespace_dupes = {
+                stripped: variants
+                for stripped, variants in collapsed.items()
+                if len(variants) > 1
+            }
+            if whitespace_dupes:
+                # Cap the message length — a wholly-messy dataset shouldn't
+                # produce a 10kB warning.
+                sample = dict(list(whitespace_dupes.items())[:3])
+                warnings.append(
+                    f"label column '{col}' contains values that differ only "
+                    f"in surrounding whitespace and will be stored as "
+                    f"separate classes unless cleaned upstream: {sample}. "
+                    f"Ingestion strips whitespace from the label column at "
+                    f"write time, so MySQL stores the trimmed value — but "
+                    f"if you intended these to be DIFFERENT classes, fix "
+                    f"the CSV before re-running (see issue #261)."
+                )
+
+            # Count distinct AFTER collapsing whitespace duplicates — those
+            # land as ONE class in MySQL after the write-side strip, so the
+            # validator must use the same number when deciding whether the
+            # dataset crosses the min_distinct gate.
+            distinct = list(collapsed.keys())
             n = len(distinct)
             if n < self.min_distinct:
                 # Show the actual values found, capped — a user with a
                 # 50k-row degenerate dataset doesn't need the full list,
                 # but the first few values plus the count tell them
                 # exactly what's wrong with the input.
-                sample = list(distinct[:5])
+                sample = distinct[:5]
                 # Surface counts per distinct value to make "all one
                 # class" stand out clearly: "{'X': 10}" vs "{'X': 10000}"
                 # both clearly read as single-class but the latter gives
                 # the user the full row count for free.
-                value_counts = df[col].value_counts(dropna=True).head(5).to_dict()
+                raw_counts = df[col].value_counts(dropna=True).head(5).to_dict()
                 return self._create_result(
                     is_valid=False,
                     errors=[
                         f"Classification category requires at least "
                         f"{self.min_distinct} distinct label values in column "
-                        f"'{col}'; this dataset has {n} distinct value(s): "
-                        f"{sample}. Value counts: {value_counts}. If this is "
-                        f"intentional (e.g. you have a continuous target), "
-                        f"pick a regression-family category like "
-                        f"tabular_regression or time_series_forecasting "
-                        f"instead."
+                        f"'{col}' (after whitespace stripping); this dataset "
+                        f"has {n} distinct value(s): {sample}. Raw value "
+                        f"counts: {raw_counts}. If this is intentional "
+                        f"(e.g. you have a continuous target), pick a "
+                        f"regression-family category like tabular_regression "
+                        f"or time_series_forecasting instead."
                     ],
                     metadata={
                         "label_column": col,
                         "distinct_count": n,
-                        "value_counts": value_counts,
+                        "value_counts": raw_counts,
                     },
                 )
 
             return self._create_result(
                 is_valid=True,
+                warnings=warnings or None,
                 metadata={
                     "label_column": col,
                     "distinct_count": n,