From 2ac0f772ac0644dfd2084950631b1fea0ba7ee8f Mon Sep 17 00:00:00 2001 From: Divya Date: Mon, 15 Jun 2026 16:10:48 +0530 Subject: [PATCH] fix(label): strip whitespace from label values to prevent silent class duplication (issue #261) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new-user CSV with ``" A "`` (whitespace-padded) mixed with ``"A"`` landed in MySQL as TWO distinct classes — the ingestor reported ``🎉 Ingestion completed successfully!``, the LabelDiversityValidator happily passed (2 distinct values ≥ 2), and a model trained on the output learned 3 classes instead of the 2 the user intended. Silent label-set corruption. The framework already strips column HEADERS (``chunk.columns.str.strip()`` in csv_ingestor) and the ``data_id`` column (line above in process_record). Extending the same convention to label-column VALUES is internally consistent — a label is a class identifier, and class identifiers don't carry whitespace semantics. (Other VARCHAR columns — free-text descriptions, names, etc. — are left alone; only the label column gets this treatment.) ## Two changes ### 1. Write-side strip — ``BaseIngestor.process_record`` Strip ``label_val.strip()`` for string labels before ``label_policy.apply`` runs. Numeric labels (INT class IDs) and other non-strings pass through unchanged. ### 2. Preflight WARNING — ``LabelDiversityValidator`` Collapse whitespace-equivalent values BEFORE counting distinct, AND surface a WARNING listing the pre-strip variants when collapsing changed anything. The validator and the write side now use the same distinct count, so a dataset with ``[" A ", "A", "B"]``: - Validator: distinct=2 (A, B), warning lists ``{"A": [" A ", "A"]}`` - Ingest: stores ``A`` and ``B`` — 2 classes in MySQL, as intended A dataset where ALL variants collapse to a single class (``[" A ", "A", " A"]``) still fails diversity: - Validator: distinct=1, rejects with a message that explicitly mentions whitespace stripping so the user knows what the validator did ## Tests ``test_label_diversity_validator.py`` (2 new): - ``test_whitespace_duplicates_collapsed_and_warned`` — mixed dataset passes; warning names the variants - ``test_whitespace_only_single_value_still_fails`` — all-collapse case rejects with a clear message ``test_ingestor_base.py`` (3 new): - ``test_process_record_strips_whitespace_from_string_label`` — ``" A "`` → ``"A"`` - ``test_process_record_label_strip_makes_whitespace_variants_equivalent`` — two records with whitespace variants produce identical cleaned labels - ``test_process_record_label_strip_preserves_non_string_labels`` — INT 42 passes through unchanged ## Surfaced by Adversarial new-user test pass against v0.3.10-rc2 (scenario N15): shipped CSV had ``" A "`` from an Excel paste, MySQL ended up with 3 distinct classes ``{" A ", "A", "B"}`` and the ingestor reported success. ## Related - #251: LabelDiversityValidator caught the inverse case (1 distinct). This fixes the silent-but-more-distinct sibling: validator accepts because the strings are bytewise distinct, but the user has fewer semantic classes than that. - The whitespace strip on column HEADERS in csv_ingestor (already in develop) establishes the convention; this extends it to label values. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_ingestor_base.py | 34 ++++++++++ tests/test_label_diversity_validator.py | 39 +++++++++++ tracebloc_ingestor/ingestors/base.py | 16 +++++ .../validators/label_diversity_validator.py | 67 ++++++++++++++++--- 4 files changed, 146 insertions(+), 10 deletions(-) diff --git a/tests/test_ingestor_base.py b/tests/test_ingestor_base.py index 4afe527..27c88e3 100644 --- a/tests/test_ingestor_base.py +++ b/tests/test_ingestor_base.py @@ -138,6 +138,40 @@ def test_process_record_applies_bucket_label_policy(): assert rec["label"] != "12345" +def test_process_record_strips_whitespace_from_string_label(): + """Issue #261: a raw label value like ``" A "`` must be stripped + before the label policy runs, so MySQL stores ``"A"`` and a CSV + with ``" A "`` mixed with ``"A"`` doesn't land as two distinct + classes (silent label-set corruption). + + The strip mirrors what the framework does for ``data_id`` (line + below in process_record) and for column headers (csv_ingestor). + """ + ing = make_ingestor(label_column="lbl", category=None) + rec = ing.process_record({"lbl": " A ", "filename": "f"}) + # PASSTHROUGH policy: label lands verbatim, but stripped. + assert rec["label"] == "A", f"expected stripped 'A', got {rec['label']!r}" + + +def test_process_record_label_strip_makes_whitespace_variants_equivalent(): + """End-to-end check that two records with ``" A "`` and ``"A"`` + produce the SAME cleaned label — the contract the corruption fix + establishes.""" + ing = make_ingestor(label_column="lbl", category=None) + rec1 = ing.process_record({"lbl": " A ", "filename": "f1"}) + rec2 = ing.process_record({"lbl": "A", "filename": "f2"}) + assert rec1["label"] == rec2["label"] == "A" + + +def test_process_record_label_strip_preserves_non_string_labels(): + """INT class IDs and other non-string labels (which have no + whitespace to strip) must pass through unchanged.""" + ing = make_ingestor(label_column="lbl", category=None) + rec = ing.process_record({"lbl": 42, "filename": "f"}) + # Numeric labels pass through the policy unchanged. + assert rec["label"] == 42 + + def test_process_record_preserves_none_for_sql_null(): """Null-like values (Python None, NaN, pd.NA, NaT) must round-trip as Python None so the DB binder writes SQL NULL — not as the literal diff --git a/tests/test_label_diversity_validator.py b/tests/test_label_diversity_validator.py index 6ba25d2..c7586ed 100644 --- a/tests/test_label_diversity_validator.py +++ b/tests/test_label_diversity_validator.py @@ -268,6 +268,45 @@ def test_string_schema_label_no_numeric_collapse(tmp_path): assert result.metadata["distinct_count"] == 4 +# --------------------------------------------------------------------------- +# Whitespace handling — silent-corruption fix (#261) +# --------------------------------------------------------------------------- + +def test_whitespace_duplicates_collapsed_and_warned(): + """Whitespace-padded label values must collapse to the same class + AND surface a WARNING so the user knows the framework will strip + them at write time. Without this, a CSV with ``" A "`` and + ``"A"`` mixed would pass diversity (2 distinct) but train a model + with 3 classes after ingest — silent label-set corruption (#261). + """ + df = pd.DataFrame({"label": [" A ", "A", "B", " A"]}) + result = LabelDiversityValidator().validate(df) + # After collapsing whitespace duplicates, distinct count == 2 (A, B). + assert result.is_valid + assert result.metadata["distinct_count"] == 2 + # Warning must name the offending pre-strip variants so the user + # can fix their upstream data if they meant them to be different. + assert result.warnings, "expected a whitespace-duplicate warning" + w = result.warnings[0] + assert "whitespace" in w.lower() + + +def test_whitespace_only_single_value_still_fails(): + """If the dataset has ``[" A ", "A", " A"]`` (all collapse to ``A``), + after stripping it's a single-class dataset and the validator must + still reject — the silent-corruption fix doesn't undo the diversity + requirement, it just refuses to inflate distinct count via + whitespace differences. + """ + df = pd.DataFrame({"label": [" A ", "A", " A"]}) + result = LabelDiversityValidator().validate(df) + assert not result.is_valid + assert "1 distinct" in result.errors[0] + # The error should mention whitespace stripping so the user knows + # what the validator did. + assert "whitespace" in result.errors[0].lower() + + # --------------------------------------------------------------------------- # Custom column name # --------------------------------------------------------------------------- diff --git a/tracebloc_ingestor/ingestors/base.py b/tracebloc_ingestor/ingestors/base.py index a0bd49b..87bf12c 100644 --- a/tracebloc_ingestor/ingestors/base.py +++ b/tracebloc_ingestor/ingestors/base.py @@ -313,6 +313,22 @@ def _map_unique_id( label_val = label_val.item() except (ValueError, AttributeError): pass + # Strip surrounding whitespace from string label values before + # the policy runs — protects against silent label-set + # corruption (issue #261) where ``" A "`` and ``"A"`` would + # otherwise land as distinct classes in MySQL. A user + # copy-pasting from Excel / another tool routinely has + # whitespace they can't see; the framework's contract for + # the label column is "the class identifier", and class + # identifiers don't carry whitespace semantics. The strip + # mirrors what the framework already does for the + # ``data_id`` column (line below) and for column headers + # (``chunk.columns.str.strip()`` in csv_ingestor). + # + # Non-string labels (INT class IDs, BIOLabelValidator's + # space-separated tags, etc.) pass through unchanged. + if isinstance(label_val, str): + label_val = label_val.strip() cleaned_record["label"] = label_policy_module.apply( label_val, self.label_policy ) diff --git a/tracebloc_ingestor/validators/label_diversity_validator.py b/tracebloc_ingestor/validators/label_diversity_validator.py index 954b341..9c0d75c 100644 --- a/tracebloc_ingestor/validators/label_diversity_validator.py +++ b/tracebloc_ingestor/validators/label_diversity_validator.py @@ -96,40 +96,87 @@ def validate(self, data: Any, **kwargs) -> ValidationResult: metadata={"label_column": self.label_column}, ) - distinct = df[col].dropna().unique() + # Surface whitespace-collapsable duplicates before counting + # distinct values (issue #261). A user CSV with values like + # ``" A "`` mixed with ``"A"`` looks fine in a notebook + # (pandas treats them as distinct, and so do we), inserts + # into MySQL with both stored verbatim, and trains a model + # with one extra class the user never intended — silent + # label-set corruption. Spot the pattern here so the warning + # reaches the user at preflight, and ingestion strips at the + # write side (BaseIngestor.process_record) so MySQL only + # sees the trimmed value. + warnings: list = [] + raw_distinct = df[col].dropna().unique() + # Build the strip-collapsed set on string-typed values only + # (label columns are VARCHAR/CHAR/TEXT in our schema; INT or + # FLOAT labels — if anyone ever has them — have no whitespace + # to collapse). + collapsed: dict = {} + for v in raw_distinct: + if isinstance(v, str): + stripped = v.strip() + collapsed.setdefault(stripped, []).append(v) + else: + collapsed.setdefault(v, []).append(v) + whitespace_dupes = { + stripped: variants + for stripped, variants in collapsed.items() + if len(variants) > 1 + } + if whitespace_dupes: + # Cap the message length — a wholly-messy dataset shouldn't + # produce a 10kB warning. + sample = dict(list(whitespace_dupes.items())[:3]) + warnings.append( + f"label column '{col}' contains values that differ only " + f"in surrounding whitespace and will be stored as " + f"separate classes unless cleaned upstream: {sample}. " + f"Ingestion strips whitespace from the label column at " + f"write time, so MySQL stores the trimmed value — but " + f"if you intended these to be DIFFERENT classes, fix " + f"the CSV before re-running (see issue #261)." + ) + + # Count distinct AFTER collapsing whitespace duplicates — those + # land as ONE class in MySQL after the write-side strip, so the + # validator must use the same number when deciding whether the + # dataset crosses the min_distinct gate. + distinct = list(collapsed.keys()) n = len(distinct) if n < self.min_distinct: # Show the actual values found, capped — a user with a # 50k-row degenerate dataset doesn't need the full list, # but the first few values plus the count tell them # exactly what's wrong with the input. - sample = list(distinct[:5]) + sample = distinct[:5] # Surface counts per distinct value to make "all one # class" stand out clearly: "{'X': 10}" vs "{'X': 10000}" # both clearly read as single-class but the latter gives # the user the full row count for free. - value_counts = df[col].value_counts(dropna=True).head(5).to_dict() + raw_counts = df[col].value_counts(dropna=True).head(5).to_dict() return self._create_result( is_valid=False, errors=[ f"Classification category requires at least " f"{self.min_distinct} distinct label values in column " - f"'{col}'; this dataset has {n} distinct value(s): " - f"{sample}. Value counts: {value_counts}. If this is " - f"intentional (e.g. you have a continuous target), " - f"pick a regression-family category like " - f"tabular_regression or time_series_forecasting " - f"instead." + f"'{col}' (after whitespace stripping); this dataset " + f"has {n} distinct value(s): {sample}. Raw value " + f"counts: {raw_counts}. If this is intentional " + f"(e.g. you have a continuous target), pick a " + f"regression-family category like tabular_regression " + f"or time_series_forecasting instead." ], metadata={ "label_column": col, "distinct_count": n, - "value_counts": value_counts, + "value_counts": raw_counts, }, ) return self._create_result( is_valid=True, + warnings=warnings or None, metadata={ "label_column": col, "distinct_count": n,