Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions tests/test_csv_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,24 @@ def test_read_data_unique_id_column_missing_raises(make_csv):
list(ing.read_data(str(path)))


def test_read_data_empty_file_returns_nothing(tmp_path):
def test_read_data_empty_file_raises_with_clear_message(tmp_path):
"""An empty (zero-byte) CSV is a hard input error, not a successful
'0 rows' run. The old code logged a WARNING and silently returned an
empty generator — the ingestor then created an empty MySQL table and
called `send_generate_edge_label_meta`, which failed with HTTP 400 and
the misleading 'No data found for table X' message (the same cascade
that #213 traced for the self-supervised + label-mismatch case). As a
result, the user blamed the backend instead of the input.

Fail fast at the read layer with a clear message that names the actual
source of the problem: it gives the file path and points at staging as
the likely cause. Raising here lets DataValidator's existing 'No data
found to validate' error path surface the empty-input cause before any
backend round-trip.
"""
p = tmp_path / "empty.csv"
p.write_text("")
ing = make_csv_ingestor(schema={})
assert list(ing.read_data(str(p))) == []
with pytest.raises(ValueError, match="Empty CSV file"):
list(ing.read_data(str(p)))


def test_validate_csv_type_coercion():
Expand Down
20 changes: 18 additions & 2 deletions tracebloc_ingestor/ingestors/csv_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,8 +490,24 @@ def read_data(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
yield record

except pd.errors.EmptyDataError:
logger.warning(f"{YELLOW}Empty CSV file: {file_path}{RESET}")
return
# An empty (zero-byte) CSV is a hard input error, not a
# successful "0 rows" run. Previously this branch logged a
# WARNING and silently returned an empty generator — the
# ingestor then proceeded to create an empty MySQL table and
# called `send_generate_edge_label_meta`, which failed with HTTP
# 400 and the misleading "No data found for table X" message
# that blamed the BACKEND instead of the input. (The same
# misleading cascade that #213 traced for the self-supervised +
# label-mismatch case.)
# Raise here so DataValidator's existing "No data found to
# validate" error path surfaces the empty-input cause with a
# clear, source-truthful message — and no backend round-trip.
raise ValueError(
f"{RED}Empty CSV file: {file_path}. The file has no "
f"header and no rows. Either stage a non-empty CSV at "
f"this path, or check the cluster-side path (the chart "
f"mounts your PVC at /data/shared/ — confirm staging "
f"completed before helm install).{RESET}"
)

except (pd.errors.ParserError, Exception):
raise
Expand Down
Loading