Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions packages/markitdown/src/markitdown/converters/_csv_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
# File extensions associated with CSV input.
# NOTE(review): the code that consumes this list is not visible here — confirm usage.
ACCEPTED_FILE_EXTENSIONS = [".csv"]


def _escape_cell(value: str) -> str:
# A literal pipe or newline inside a cell would split it into extra
# columns or rows when rendered, so escape pipes and flatten newlines.
return value.replace("|", "\\|").replace("\r\n", " ").replace("\n", " ").replace(
"\r", " "
)


class CsvConverter(DocumentConverter):
"""
Converts CSV files to Markdown tables.
Expand Down Expand Up @@ -58,7 +66,9 @@ def convert(
markdown_table = []

# Add header row
markdown_table.append("| " + " | ".join(rows[0]) + " |")
markdown_table.append(
"| " + " | ".join(_escape_cell(cell) for cell in rows[0]) + " |"
)

# Add separator row
markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
Expand All @@ -70,7 +80,9 @@ def convert(
row.append("")
# Truncate if row has more columns than header
row = row[: len(rows[0])]
markdown_table.append("| " + " | ".join(row) + " |")
markdown_table.append(
"| " + " | ".join(_escape_cell(cell) for cell in row) + " |"
)

result = "\n".join(markdown_table)

Expand Down
29 changes: 29 additions & 0 deletions packages/markitdown/tests/test_csv_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import io

from markitdown.converters._csv_converter import CsvConverter
from markitdown._stream_info import StreamInfo

# Stream metadata passed to the converter in these tests: a ".csv" extension
# with the "text/csv" MIME type.
CSV_STREAM_INFO = StreamInfo(extension=".csv", mimetype="text/csv")


def _convert(data: bytes) -> str:
    """Run *data* through ``CsvConverter`` and return the resulting Markdown."""
    stream = io.BytesIO(data)
    result = CsvConverter().convert(stream, CSV_STREAM_INFO)
    return result.markdown


def test_pipe_in_cell_is_escaped():
    """A pipe inside a cell is escaped and leaves the column count unchanged."""
    rendered = _convert(b"name,note\nAlice,a|b\nBob,plain\n")
    table_rows = rendered.splitlines()
    # One header, one separator and two data rows.
    assert len(table_rows) == 4

    def separator_count(row: str) -> int:
        # Drop escaped pipes so only real column separators are counted.
        return row.replace("\\|", "").count("|")

    # The header cells contain no pipes, so its raw count is the reference.
    reference = table_rows[0].count("|")
    for table_row in table_rows:
        assert separator_count(table_row) == reference
    assert "a\\|b" in rendered


def test_newline_in_cell_does_not_break_row():
    """A quoted cell containing a newline stays on a single table row."""
    rendered = _convert(b'name,note\n"Bob","line1\nline2"\n')
    table_rows = rendered.splitlines()
    # Header, separator and exactly one data row.
    assert len(table_rows) == 3
    data_row = table_rows[2]
    assert "line1 line2" in data_row