From a10d86ecbdb65c4bb4533811d0922cf83db369e4 Mon Sep 17 00:00:00 2001 From: Andrii Date: Sun, 31 May 2026 23:07:45 +0300 Subject: [PATCH 1/2] feat: add image auto-download and renaming for HTML converter (part of #2012) --- .../src/markitdown/converters/_markdownify.py | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index 19e8a2984..4f6361010 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -1,9 +1,12 @@ +import os import re -import markdownify - +import urllib.request +import warnings from typing import Any, Optional from urllib.parse import quote, unquote, urlparse, urlunparse +import markdownify + class _CustomMarkdownify(markdownify.MarkdownConverter): """ @@ -13,11 +16,18 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): - Removing javascript hyperlinks. - Truncating images with large data:uri sources. - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax + - Supporting optional local image downloading and sequential renaming. 
""" def __init__(self, **options: Any): options["heading_style"] = options.get("heading_style", markdownify.ATX) options["keep_data_uris"] = options.get("keep_data_uris", False) + + # Options for downloading images locally + self.download_images: bool = options.pop("download_images", False) + self.output_dir: str = options.pop("output_dir", ".") + self.image_counter: int = 0 + # Explicitly cast options to the expected type if necessary super().__init__(**options) @@ -89,7 +99,7 @@ def convert_img( convert_as_inline: Optional[bool] = False, **kwargs, ) -> str: - """Same as usual converter, but removes data URIs""" + """Same as usual converter, but removes data URIs and handles auto-downloading""" alt = el.attrs.get("alt", None) or "" src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or "" @@ -107,6 +117,38 @@ def convert_img( if src.startswith("data:") and not self.options["keep_data_uris"]: src = src.split(",")[0] + "..." + # Download remote images locally and assign a sequential filename if enabled + if self.download_images and src.startswith(("http://", "https://")): + try: + self.image_counter += 1 + + parsed_path = urlparse(src).path + ext = os.path.splitext(parsed_path)[1] or ".png" + + new_filename = f"figure-{self.image_counter:03d}{ext}" + + os.makedirs(self.output_dir, exist_ok=True) + full_save_path = os.path.join(self.output_dir, new_filename) + + req = urllib.request.Request( + src, + headers={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0" + }, + ) + with ( + urllib.request.urlopen(req) as response, + open(full_save_path, "wb") as out_file, + ): + out_file.write(response.read()) + + src = new_filename + except Exception as e: + warnings.warn( + f"Could not download image {src}: {e}", + RuntimeWarning, + ) + return "![%s](%s%s)" % (alt, src, title_part) def convert_input( From f2334c0b4e91418e7a78427579883488e5d0bf72 Mon Sep 17 00:00:00 2001 
From: Andrii Date: Mon, 1 Jun 2026 22:13:23 +0300 Subject: [PATCH 2/2] feat: add SVG-to-Mermaid conversion for standalone files and inline HTML (part of #2012) - Add SvgConverter(DocumentConverter) for standalone .svg files using LLM client; falls back to fenced XML block on missing LLM, errors, or SKIP response - Implement convert_svg() in _CustomMarkdownify to detect and convert inline elements in HTML into Mermaid code blocks - Add _llm_svg() helper to send SVG to OpenAI-compatible client and return parsed response - Add test_svg_converter.py covering accepts(), Mermaid conversion, and all fallback cases - Add inline SVG tests in test_module_misc.py for Mermaid conversion and fallback scenarios --- .../markitdown/src/markitdown/_markitdown.py | 2 + .../src/markitdown/converters/__init__.py | 2 + .../src/markitdown/converters/_llm_svg.py | 88 ++++++++++++++ .../src/markitdown/converters/_markdownify.py | 39 ++++++ .../markitdown/converters/_svg_converter.py | 72 +++++++++++ packages/markitdown/tests/test_module_misc.py | 71 +++++++++++ .../markitdown/tests/test_svg_converter.py | 115 ++++++++++++++++++ 7 files changed, 389 insertions(+) create mode 100644 packages/markitdown/src/markitdown/converters/_llm_svg.py create mode 100644 packages/markitdown/src/markitdown/converters/_svg_converter.py create mode 100644 packages/markitdown/tests/test_svg_converter.py diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f6aa4df0e..463920faa 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -37,6 +37,7 @@ OutlookMsgConverter, ZipConverter, EpubConverter, + SvgConverter, DocumentIntelligenceConverter, ContentUnderstandingConverter, CsvConverter, @@ -197,6 +198,7 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(XlsConverter()) self.register_converter(PptxConverter()) self.register_converter(AudioConverter()) + 
"""Utility helpers for converting SVG diagrams to Mermaid via LLM.

This module is used by the HTML conversion pipeline to translate inline SVG
content into Mermaid diagrams when a supported LLM client is configured.
The conversion is intentionally narrow: the model is instructed to return only
raw Mermaid source, and to return SKIP if the SVG is decorative or cannot be
expressed as a diagram.
"""

import re
from typing import TYPE_CHECKING, Any, BinaryIO, Union

if TYPE_CHECKING:
    # Only needed for the annotation on llm_svg(); guarding it keeps this
    # helper importable without pulling in the rest of the package.
    from .._stream_info import StreamInfo

# Upper bound on the number of SVG characters forwarded to the model.
_MAX_SVG_CHARS = 12_000
# Upper bound on the number of tokens requested from the model.
MAX_RESPONSE_TOKENS = 2048

# System prompt used when the caller supplies none (or a blank one).
# Every fragment ends with a space so adjacent sentences never fuse together.
_DEFAULT_PROMPT = (
    "You are a diagram-analysis assistant. "
    "Your task is to read an SVG element and convert it into a Mermaid "
    "diagram that faithfully represents the same visual structure. "
    "Reply with ONLY the raw Mermaid source. "
    "Do not include markdown fences or explanations. "
    'Start your reply with the Mermaid diagram type keyword (e.g. "flowchart LR", "sequenceDiagram"). '
    "If the SVG is decorative and has no logical diagram structure, reply with exactly: SKIP"
)

# Marker appended to a truncated payload so the model knows the SVG is partial.
_TRUNCATION_MARKER = "\n<!-- SVG truncated -->"

# Matches the first fenced block in a reply. The closing fence is optional
# (``\Z`` alternative) so a reply cut short mid-block is still unwrapped.
_FENCE_RE = re.compile(
    r"```(?:mermaid)?\s*\n?(.*?)(?:```|\Z)", re.DOTALL | re.IGNORECASE
)


def llm_svg(
    file_stream: BinaryIO,
    stream_info: "StreamInfo",
    *,
    client: Any,
    model: str,
    prompt: Union[str, None] = None,
) -> Union[None, str]:
    """Convert streamed SVG content into Mermaid source using an LLM.

    Args:
        file_stream: Binary stream holding the SVG. It is read from its current
            position and rewound to that position before returning.
        stream_info: Stream metadata; only ``charset`` is read (UTF-8 is used
            when it is empty).
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name forwarded to the client.
        prompt: Optional system prompt. ``None`` or a blank string selects the
            built-in prompt.

    Returns:
        The Mermaid source with any Markdown fence removed, or ``None`` when
        the SVG is empty, the reply is empty, or the reply is ``SKIP``
        (case-insensitive).

    Raises:
        Whatever the client call, the stream, or ``bytes.decode`` raises (for
        example ``LookupError`` for an unknown charset); nothing is caught here.
    """
    if prompt is None or not prompt.strip():
        prompt = _DEFAULT_PROMPT

    # Preserve the stream position so this helper is non-destructive to the caller
    encoding = stream_info.charset or "utf-8"
    cur_pos = file_stream.tell()
    try:
        raw = file_stream.read()
    finally:
        file_stream.seek(cur_pos)

    svg_text = raw.decode(encoding, errors="replace")
    if not svg_text.strip():
        return None

    # Truncate large SVGs to keep the token count within reasonable limits,
    # and say so explicitly instead of handing over silently cut-off markup.
    payload = svg_text[:_MAX_SVG_CHARS]
    if len(svg_text) > _MAX_SVG_CHARS:
        payload += _TRUNCATION_MARKER

    messages = [
        {"role": "system", "content": prompt},
        {
            "role": "user",
            "content": "Convert the following SVG to Mermaid:\n\n" + payload,
        },
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=MAX_RESPONSE_TOKENS,
    )

    raw_reply: str = response.choices[0].message.content or ""
    result = _clean_mermaid_response(raw_reply)

    # Return None if the LLM explicitly skipped the image or returned an empty response
    if not result or result.upper() == "SKIP":
        return None

    return result


def _clean_mermaid_response(text: str) -> str:
    """Extract the raw Mermaid code from a Markdown fence, or return text as is.

    The first fenced block wins. If the opening fence is never closed, the
    remainder of the reply is treated as the block body so the fence marker
    does not leak into the result.
    """
    text = text.strip()
    match = _FENCE_RE.search(text)
    if match:
        return match.group(1).strip()
    return text
def convert_svg(
    self,
    el: Any,
    text: str,
    convert_as_inline: Optional[bool] = False,
    **kwargs,
) -> str:
    """Convert an inline <svg> element to a Mermaid block via an LLM, if configured.

    The LLM path runs only when both the ``llm_client`` and ``llm_model``
    options are set. Any failure there is reported as a ``RuntimeWarning``
    and the method falls back to the original markup.

    Args:
        el: The element being converted; ``str(el)`` is used as the SVG source.
        text: Already-converted child text (unused; the raw markup is kept).
        convert_as_inline: Accepted for signature parity with the other
            ``convert_*`` methods; unused.
        **kwargs: Ignored.

    Returns:
        A fenced ``mermaid`` block when the LLM yields a diagram, otherwise a
        fenced ``xml`` block holding the stripped SVG source. Both are padded
        with blank lines.
    """
    llm_client = self.options.get("llm_client")
    llm_model = self.options.get("llm_model")
    svg_source = str(el)

    if llm_client is not None and llm_model is not None:
        stream = io.BytesIO(svg_source.encode("utf-8"))
        stream_info = StreamInfo(
            mimetype="image/svg+xml", extension=".svg", charset="utf-8"
        )
        try:
            mermaid = llm_svg(
                stream,
                stream_info,
                client=llm_client,
                model=llm_model,
                prompt=self.options.get("llm_prompt"),
            )
        except Exception as e:
            # Best-effort feature: never fail the whole conversion, but do not
            # hide the failure either (same approach as the image download).
            warnings.warn(
                f"Could not convert inline SVG to Mermaid: {e}",
                RuntimeWarning,
            )
            mermaid = None

        if mermaid:
            return f"\n\n```mermaid\n{mermaid}\n```\n\n"

    # Fallback: preserve the original inline SVG source when Mermaid extraction fails.
    # The fence is made longer than any backtick run inside the markup so the
    # SVG's own text can never terminate the code block early.
    body = svg_source.strip()
    longest_run = max((len(run) for run in re.findall(r"`+", body)), default=0)
    fence = "`" * max(3, longest_run + 1)
    return f"\n\n{fence}xml\n{body}\n{fence}\n\n"
import re
import warnings
from typing import Any, BinaryIO

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._llm_svg import llm_svg

ACCEPTED_MIME_TYPE_PREFIXES = [
    "image/svg+xml",
    "image/svg",
]

ACCEPTED_FILE_EXTENSIONS = [".svg"]


class SvgConverter(DocumentConverter):
    """
    Converts SVG files to Markdown.
    When an LLM client is configured, attempts to produce a Mermaid diagram.
    Falls back to a fenced xml code block to preserve the SVG source.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for a ``.svg`` extension or an SVG MIME type.

        Both checks are case-insensitive; the stream itself is not inspected.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple, so no explicit loop is needed.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert an SVG stream to a Mermaid block, or to a fenced xml block.

        The LLM path runs only when both ``llm_client`` and ``llm_model`` are
        passed in ``kwargs`` (``llm_prompt`` is optional). A failure on that
        path is reported as a ``RuntimeWarning`` and the SVG source is emitted
        instead. The stream is rewound to its starting position afterwards.
        """
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")

        if llm_client is not None and llm_model is not None:
            try:
                mermaid = llm_svg(
                    file_stream,
                    stream_info,
                    client=llm_client,
                    model=llm_model,
                    prompt=kwargs.get("llm_prompt"),
                )
            except Exception as e:
                # Best-effort feature: fall back rather than fail, but surface
                # the reason instead of discarding it.
                warnings.warn(
                    f"Could not convert SVG to Mermaid: {e}",
                    RuntimeWarning,
                )
                mermaid = None

            if mermaid:
                return DocumentConverterResult(markdown=f"```mermaid\n{mermaid}\n```")

        # Fallback: preserve the SVG source in a fenced xml block
        cur_pos = file_stream.tell()
        try:
            raw = file_stream.read()
        finally:
            file_stream.seek(cur_pos)

        try:
            svg_text = raw.decode(stream_info.charset or "utf-8", errors="replace")
        except LookupError:
            # Unknown charset label: the fallback must still succeed, so decode
            # as UTF-8 with replacement rather than abort the conversion.
            svg_text = raw.decode("utf-8", errors="replace")

        # Use a fence longer than any backtick run in the source so the SVG's
        # own text cannot close the code block early.
        body = svg_text.strip()
        longest_run = max((len(run) for run in re.findall(r"`+", body)), default=0)
        fence = "`" * max(3, longest_run + 1)
        return DocumentConverterResult(markdown=f"{fence}xml\n{body}\n{fence}")
b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..df708cb3b 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -531,6 +531,74 @@ def test_markitdown_llm() -> None: # Standard alt text is included validate_strings(result, PPTX_TEST_STRINGS) +def test_inline_svg_converts_to_mermaid_block() -> None: + """An inline inside HTML is converted to a mermaid block when LLM is configured.""" + svg_html = ( + "" + '' + '' + "" + "" + ) + + client = MagicMock() + client.chat.completions.create.return_value = MagicMock( + choices=[MagicMock(message=MagicMock(content="flowchart LR\n A --> B"))] + ) + markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o") + + result = markitdown.convert_stream( + io.BytesIO(svg_html.encode("utf-8")), + stream_info=StreamInfo(mimetype="text/html", extension=".html"), + ) + + assert "```mermaid" in result.markdown + assert client.chat.completions.create.called + + +def test_inline_svg_fallback_to_xml_block_when_no_llm() -> None: + """When no LLM is configured, an inline is preserved as an xml code block.""" + svg_html = ( + "" + '' + '' + "" + "" + ) + + markitdown = MarkItDown() + result = markitdown.convert_stream( + io.BytesIO(svg_html.encode("utf-8")), + stream_info=StreamInfo(mimetype="text/html", extension=".html"), + ) + + assert "```xml" in result.markdown + assert " None: + """When LLM responds with SKIP, inline falls back to an xml block.""" + svg_html = ( + "" + '' + "" + ) + + client = MagicMock() + client.chat.completions.create.return_value = MagicMock( + choices=[MagicMock(message=MagicMock(content="SKIP"))] + ) + markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o") + + result = markitdown.convert_stream( + io.BytesIO(svg_html.encode("utf-8")), + stream_info=StreamInfo(mimetype="text/html", extension=".html"), + ) + + assert "```xml" in result.markdown + assert "```mermaid" not in result.markdown + if __name__ == "__main__": """Runs this 
file's tests from the command line.""" @@ -547,6 +615,9 @@ def test_markitdown_llm() -> None: test_markitdown_exiftool, test_markitdown_llm_parameters, test_markitdown_llm, + test_inline_svg_converts_to_mermaid_block, + test_inline_svg_fallback_to_xml_block_when_no_llm, + test_inline_svg_fallback_when_llm_returns_skip, ]: print(f"Running {test.__name__}...", end="") test() diff --git a/packages/markitdown/tests/test_svg_converter.py b/packages/markitdown/tests/test_svg_converter.py new file mode 100644 index 000000000..7253683dc --- /dev/null +++ b/packages/markitdown/tests/test_svg_converter.py @@ -0,0 +1,115 @@ +"""Tests for SvgConverter.""" + +import io +from unittest.mock import MagicMock + +from markitdown._stream_info import StreamInfo +from markitdown.converters._svg_converter import SvgConverter + +SIMPLE_SVG = b'' + + +def _make_llm_client(reply: str) -> MagicMock: + """Create a mock LLM client that returns the given response.""" + client = MagicMock() + client.chat.completions.create.return_value = MagicMock( + choices=[MagicMock(message=MagicMock(content=reply))] + ) + return client + + +class TestSvgConverterAccepts: + """Test accepts() for .svg extension and image/svg+xml MIME type.""" + + def test_accepts_svg_extension(self) -> None: + conv = SvgConverter() + assert conv.accepts(io.BytesIO(SIMPLE_SVG), StreamInfo(extension=".svg")) + + def test_accepts_svg_mimetype(self) -> None: + conv = SvgConverter() + assert conv.accepts( + io.BytesIO(SIMPLE_SVG), StreamInfo(mimetype="image/svg+xml") + ) + + def test_rejects_html_mimetype(self) -> None: + conv = SvgConverter() + assert not conv.accepts(io.BytesIO(b""), StreamInfo(mimetype="text/html")) + + def test_rejects_png_extension(self) -> None: + conv = SvgConverter() + assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".png")) + + +class TestSvgConverterConvert: + """Test convert() success and fallback cases.""" + + def test_llm_response_produces_mermaid_block(self) -> None: + """When LLM is 
configured and returns valid Mermaid, output is a mermaid code block.""" + conv = SvgConverter() + llm_client = _make_llm_client("flowchart LR\n A --> B") + + result = conv.convert( + io.BytesIO(SIMPLE_SVG), + StreamInfo(extension=".svg", mimetype="image/svg+xml"), + llm_client=llm_client, + llm_model="gpt-4o", + ) + + assert "```mermaid" in result.markdown + assert "flowchart LR" in result.markdown + assert "A --> B" in result.markdown + + def test_fallback_no_llm_client_returns_xml_block(self) -> None: + """When no LLM client is provided, the SVG source is wrapped in an xml block.""" + conv = SvgConverter() + + result = conv.convert( + io.BytesIO(SIMPLE_SVG), + StreamInfo(extension=".svg", mimetype="image/svg+xml"), + ) + + assert "```xml" in result.markdown + assert " None: + """When LLM replies with SKIP, converter falls back to xml block.""" + conv = SvgConverter() + llm_client = _make_llm_client("SKIP") + + result = conv.convert( + io.BytesIO(SIMPLE_SVG), + StreamInfo(extension=".svg", mimetype="image/svg+xml"), + llm_client=llm_client, + llm_model="gpt-4o", + ) + + assert "```xml" in result.markdown + assert " None: + """When the LLM call raises an exception, converter falls back.""" + conv = SvgConverter() + client = MagicMock() + client.chat.completions.create.side_effect = RuntimeError("API error") + + result = conv.convert( + io.BytesIO(SIMPLE_SVG), + StreamInfo(extension=".svg", mimetype="image/svg+xml"), + llm_client=client, + llm_model="gpt-4o", + ) + + assert "```xml" in result.markdown + assert " None: + """convert() must not consume the stream permanently.""" + conv = SvgConverter() + stream = io.BytesIO(SIMPLE_SVG) + stream.seek(0) + + conv.convert(stream, StreamInfo(extension=".svg")) + + # Verify the stream pointer was rewound to prevent data loss for subsequent readers + assert stream.tell() == 0 or stream.seek(0) == 0