Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions packages/markitdown/src/markitdown/converters/_csv_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/csv",
"application/csv",
"text/tab-separated-values",
"text/tsv",
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find `text/tsv` listed as an IANA-recognized MIME type. Why has this been added? IANA

]
ACCEPTED_FILE_EXTENSIONS = [".csv"]
ACCEPTED_FILE_EXTENSIONS = [".csv", ".tsv"]

SNIFF_SAMPLE_SIZE = 8192


class CsvConverter(DocumentConverter):
"""
Converts CSV files to Markdown tables.
Converts CSV and TSV files to Markdown tables.
"""

def __init__(self):
Expand Down Expand Up @@ -47,8 +51,12 @@ def convert(
else:
content = str(from_bytes(file_stream.read()).best())

# Parse CSV content
reader = csv.reader(io.StringIO(content))
# Auto-detect the delimiter
extension = (stream_info.extension or "").lower()
delimiter = self._detect_delimiter(content, extension)

# Parse content
reader = csv.reader(io.StringIO(content), delimiter=delimiter)
rows = list(reader)

if not rows:
Expand All @@ -57,8 +65,9 @@ def convert(
# Create markdown table
markdown_table = []

# Add header row
markdown_table.append("| " + " | ".join(rows[0]) + " |")
# Add header row (with pipe escaping)
header = [self._escape_cell(cell) for cell in rows[0]]
markdown_table.append("| " + " | ".join(header) + " |")

# Add separator row
markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
Expand All @@ -70,8 +79,24 @@ def convert(
row.append("")
# Truncate if row has more columns than header
row = row[: len(rows[0])]
markdown_table.append("| " + " | ".join(row) + " |")
escaped = [self._escape_cell(cell) for cell in row]
markdown_table.append("| " + " | ".join(escaped) + " |")

result = "\n".join(markdown_table)

return DocumentConverterResult(markdown=result)

def _detect_delimiter(self, content: str, extension: str) -> str:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After a little more digging, I have found that this implementation is, in my opinion, fairly brittle.

  1. Using `csv.Sniffer` to detect the delimiter is highly brittle: it deduces the delimiter using heuristic methods (such as frequency counting), so it tends to produce false positives and false negatives, and you don't check whether the guess is correct. docs
  2. In the case of `csv.Error`, you only check whether the file's extension is ".tsv". But when files come from object storage (AWS S3, Google Cloud Storage), they usually don't have any extension at all, so checking the MIME type along with the extension is a better fallback. Currently, every file without an extension that hits `csv.Error` simply falls back to CSV.

"""Auto-detect the delimiter using csv.Sniffer, with sensible fallbacks."""
try:
sample = content[:SNIFF_SAMPLE_SIZE]
dialect = csv.Sniffer().sniff(sample)
return dialect.delimiter
except csv.Error:
if extension == ".tsv":
return "\t"
return ","

def _escape_cell(self, cell: str) -> str:
"""Escape characters that would break a Markdown table."""
return cell.replace("|", "\\|").replace("\n", " ").replace("\r", "")
14 changes: 14 additions & 0 deletions packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,20 @@ class FileTestVector(object):
],
must_not_include=[],
),
FileTestVector(
filename="test.tsv",
mimetype="text/tsv",
charset="ascii",
url=None,
must_include=[
"| Name | Age | City | Notes |",
"| --- | --- | --- | --- |",
"| Alice | 30 | New York | Likes coffee |",
"| Bob | 25 | San Francisco | Uses pipes \\| often |",
"| Charlie | 35 | Chicago | N/A |",
],
must_not_include=[],
),
FileTestVector(
filename="test.json",
mimetype="application/json",
Expand Down
4 changes: 4 additions & 0 deletions packages/markitdown/tests/test_files/test.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name Age City Notes
Alice 30 New York Likes coffee
Bob 25 San Francisco Uses pipes | often
Charlie 35 Chicago N/A