From 2b687fbe362f77ab78eb3b0c5452f759c2991b1a Mon Sep 17 00:00:00 2001 From: LeSingh1 Date: Sun, 31 May 2026 22:50:59 -0700 Subject: [PATCH] Escape pipes and newlines in CSV to Markdown table cells CsvConverter wrote cell values straight into the Markdown table without escaping. A cell containing a literal pipe added an extra column separator, and a quoted field with an embedded newline split the row in two. Either case produces a malformed table whose data rows no longer match the header column count. Add an _escape_cell helper that escapes pipe as \| and flattens embedded newlines to spaces, and apply it to header and data cells. --- .../markitdown/converters/_csv_converter.py | 16 ++++++++-- .../markitdown/tests/test_csv_converter.py | 29 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 packages/markitdown/tests/test_csv_converter.py diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py index 7e9631e1b..988de75e7 100644 --- a/packages/markitdown/src/markitdown/converters/_csv_converter.py +++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py @@ -12,6 +12,14 @@ ACCEPTED_FILE_EXTENSIONS = [".csv"] +def _escape_cell(value: str) -> str: + # A literal pipe or newline inside a cell would split it into extra + # columns or rows when rendered, so escape pipes and flatten newlines. + return value.replace("|", "\\|").replace("\r\n", " ").replace("\n", " ").replace( + "\r", " " + ) + + class CsvConverter(DocumentConverter): """ Converts CSV files to Markdown tables. 
import io

from markitdown.converters._csv_converter import CsvConverter
from markitdown._stream_info import StreamInfo

CSV_STREAM_INFO = StreamInfo(extension=".csv", mimetype="text/csv")


def _convert(data: bytes) -> str:
    """Convert raw CSV bytes with CsvConverter and return the Markdown text."""
    stream = io.BytesIO(data)
    result = CsvConverter().convert(stream, CSV_STREAM_INFO)
    return result.markdown


def test_pipe_in_cell_is_escaped():
    """A literal pipe in a data cell must not add a column to its row."""
    markdown = _convert(b"name,note\nAlice,a|b\nBob,plain\n")
    table_rows = markdown.splitlines()

    # Header, separator and two data rows.
    assert len(table_rows) == 4

    # Once escaped pipes are removed, every row must carry exactly as many
    # column separators as the header row does.
    header_separators = table_rows[0].count("|")
    separators_per_row = [
        table_row.replace("\\|", "").count("|") for table_row in table_rows
    ]
    assert all(count == header_separators for count in separators_per_row)

    assert "a\\|b" in markdown


def test_newline_in_cell_does_not_break_row():
    """A quoted field with an embedded newline must stay on one table row."""
    markdown = _convert(b'name,note\n"Bob","line1\nline2"\n')
    table_rows = markdown.splitlines()

    # Header, separator and a single data row.
    assert len(table_rows) == 3
    data_row = table_rows[2]
    assert "line1 line2" in data_row