diff --git a/tests/test_csv_ingestor.py b/tests/test_csv_ingestor.py
index a542e17..1353487 100644
--- a/tests/test_csv_ingestor.py
+++ b/tests/test_csv_ingestor.py
@@ -96,11 +96,24 @@ def test_read_data_unique_id_column_missing_raises(make_csv):
         list(ing.read_data(str(path)))
 
 
-def test_read_data_empty_file_returns_nothing(tmp_path):
+def test_read_data_empty_file_raises_with_clear_message(tmp_path):
+    """An empty (zero-byte) CSV is a hard input error, not a successful
+    '0 rows' run. The old code logged a WARNING and silently returned an
+    empty generator — the ingestor then created an empty MySQL table and
+    called `send_generate_edge_label_meta`, which 400'd with the misleading
+    'No data found for table X' message (same cascade #213 traced for
+    self-supervised + label mismatch). The user blamed the backend.
+
+    Fail fast at the read layer with a clear, source-truthful message
+    naming the path and pointing at staging as the likely cause —
+    DataValidator's existing 'No data found to validate' path catches it
+    before any backend round-trip.
+    """
     p = tmp_path / "empty.csv"
     p.write_text("")
     ing = make_csv_ingestor(schema={})
-    assert list(ing.read_data(str(p))) == []
+    with pytest.raises(ValueError, match="Empty CSV file"):
+        list(ing.read_data(str(p)))
 
 
 def test_validate_csv_type_coercion():
diff --git a/tracebloc_ingestor/ingestors/csv_ingestor.py b/tracebloc_ingestor/ingestors/csv_ingestor.py
index a8ff312..46c6f93 100644
--- a/tracebloc_ingestor/ingestors/csv_ingestor.py
+++ b/tracebloc_ingestor/ingestors/csv_ingestor.py
@@ -490,8 +490,25 @@ def read_data(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
                     yield record
 
         except pd.errors.EmptyDataError:
-            logger.warning(f"{YELLOW}Empty CSV file: {file_path}{RESET}")
-            return
+            # An empty (zero-byte) CSV is a hard input error, not a
+            # successful "0 rows" run. Previously this branch logged a
+            # WARNING and silently returned an empty generator — the
+            # ingestor then proceeded to create an empty MySQL table and
+            # called `send_generate_edge_label_meta`, which 400'd with
+            # the misleading "No data found for table X" message that
+            # blamed the BACKEND instead of the input. (Same misleading
+            # cascade #213 traced for self-supervised + label mismatch.)
+            # Raise here so DataValidator's existing "No data found to
+            # validate" error path surfaces the empty-input cause with a
+            # clear, source-truthful message — and no backend round-trip.
+            # `from None` keeps the pandas traceback out of the user's view.
+            raise ValueError(
+                f"{RED}Empty CSV file: {file_path}. The file has no "
+                f"header and no rows. Either stage a non-empty CSV at "
+                f"this path, or check the cluster-side path (the chart "
+                f"mounts your PVC at /data/shared/ — confirm staging "
+                f"completed before helm install).{RESET}"
+            ) from None
         except (pd.errors.ParserError, Exception):
             raise
 