Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
OutlookMsgConverter,
ZipConverter,
EpubConverter,
SvgConverter,
DocumentIntelligenceConverter,
ContentUnderstandingConverter,
CsvConverter,
Expand Down Expand Up @@ -197,6 +198,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
self.register_converter(AudioConverter())
self.register_converter(SvgConverter())
self.register_converter(ImageConverter())
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ContentUnderstandingFileType,
)
from ._epub_converter import EpubConverter
from ._svg_converter import SvgConverter
from ._csv_converter import CsvConverter

__all__ = [
Expand All @@ -50,5 +51,6 @@
"ContentUnderstandingConverter",
"ContentUnderstandingFileType",
"EpubConverter",
"SvgConverter",
"CsvConverter",
]
88 changes: 88 additions & 0 deletions packages/markitdown/src/markitdown/converters/_llm_svg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Utility helpers for converting SVG diagrams to Mermaid via LLM.

This module is used by the HTML conversion pipeline (for inline <svg> elements)
and by the standalone SVG file converter to translate SVG content into Mermaid
diagrams when a supported LLM client is configured.
The conversion is intentionally narrow: the model is instructed to return only
raw Mermaid source, and to return SKIP if the SVG is decorative or cannot be
expressed as a diagram.
"""

import re
from typing import Any, BinaryIO, Union

from .._stream_info import StreamInfo

# Upper bound on the number of SVG characters sent to the model; longer
# documents are cut off at this length before the request is built.
_MAX_SVG_CHARS = 12_000
# Passed as ``max_tokens`` to the chat-completions request to cap the reply size.
MAX_RESPONSE_TOKENS = 2048


def llm_svg(
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    *,
    client: Any,
    model: str,
    prompt: Union[str, None] = None,
) -> Union[None, str]:
    """Convert streamed SVG content into Mermaid source using an LLM.

    Args:
        file_stream: Binary stream holding the SVG document. It is read from
            its current position to the end, and the position is restored
            afterwards so the caller can re-read the same bytes.
        stream_info: Stream metadata; only ``charset`` is consulted. UTF-8 is
            used when it is missing or names an unknown codec.
        client: Object exposing ``chat.completions.create(...)`` in the style
            of the OpenAI client.
        model: Model identifier forwarded to the client.
        prompt: Optional system prompt. ``None`` or a blank string selects the
            built-in Mermaid-conversion prompt.

    Returns:
        The Mermaid source without surrounding Markdown fences, or ``None``
        when the SVG is empty, the reply is empty, or the model answers SKIP.

    Raises:
        Exception: Anything raised by the client call propagates to the caller.
    """

    if prompt is None or prompt.strip() == "":
        # Every fragment except the last ends with a space so the implicit
        # string concatenation yields properly separated sentences.
        prompt = (
            "You are a diagram-analysis assistant. "
            "Your task is to read an SVG element and convert it into a Mermaid "
            "diagram that faithfully represents the same visual structure. "
            "Reply with ONLY the raw Mermaid source. Do not include markdown fences or explanations. "
            'Start your reply with the Mermaid diagram type keyword (e.g. "flowchart LR", "sequenceDiagram"). '
            "If the SVG is decorative and has no logical diagram structure, reply with exactly: SKIP"
        )

    # Preserve the stream position so this helper is non-destructive to the caller
    encoding = stream_info.charset or "utf-8"
    cur_pos = file_stream.tell()
    try:
        raw = file_stream.read()
    finally:
        file_stream.seek(cur_pos)

    try:
        svg_text = raw.decode(encoding, errors="replace")
    except LookupError:
        # The declared charset is not a codec Python knows; errors="replace"
        # only covers undecodable bytes, not unknown codec names.
        svg_text = raw.decode("utf-8", errors="replace")

    if not svg_text.strip():
        return None

    # Truncate large SVGs to keep the token count within reasonable limits
    truncated = len(svg_text) > _MAX_SVG_CHARS
    payload = svg_text[:_MAX_SVG_CHARS]
    if truncated:
        payload += "\n<!-- SVG truncated for brevity -->"

    messages = [
        {"role": "system", "content": prompt},
        {
            "role": "user",
            "content": "Convert the following SVG to Mermaid:\n\n" + payload,
        },
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=MAX_RESPONSE_TOKENS,
    )

    raw_reply: str = response.choices[0].message.content or ""
    result = _clean_mermaid_response(raw_reply)

    # Return None if the LLM explicitly skipped the image or returned an empty
    # response. Surrounding quotes, backticks and trailing punctuation are
    # ignored so that replies such as "SKIP." are not mistaken for a diagram.
    verdict = result.strip(" \t\r\n.!`'\"").upper()
    if not result or verdict == "SKIP":
        return None

    return result


def _clean_mermaid_response(text: str) -> str:
    """Extract the raw Mermaid code from Markdown fences, or return text as is.

    Args:
        text: The model's reply, possibly wrapped in a ``` or ```mermaid fence
            and possibly surrounded by extra prose.

    Returns:
        The stripped content of the first complete fenced block when one is
        present. If the reply starts with an opening fence that is never
        closed (for example because the reply was cut off by a token limit),
        the opening fence is removed and the remainder is returned. Otherwise
        the stripped reply is returned unchanged.
    """
    text = text.strip()

    # Preferred case: a complete fenced block anywhere in the reply.
    match = re.search(r"```(?:mermaid)?\s*\n?(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1).strip()

    # Unterminated fence: drop only a leading opener so it does not leak into
    # the diagram source. Anchored at the start to avoid discarding content
    # when a stray fence appears elsewhere in the reply.
    opener = re.match(r"```(?:mermaid)?\s*", text, re.IGNORECASE)
    if opener:
        return text[opener.end():].strip()

    return text
87 changes: 84 additions & 3 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import io
import os
import re
import markdownify

import urllib.request
import warnings
from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse

import markdownify

from .._stream_info import StreamInfo
from ._llm_svg import llm_svg


class _CustomMarkdownify(markdownify.MarkdownConverter):
"""
Expand All @@ -13,11 +20,19 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
- Removing javascript hyperlinks.
- Truncating images with large data:uri sources.
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
- Supporting optional local image downloading and sequential renaming.
- Converting inline <svg> elements to Mermaid diagrams via an LLM (if configured).
"""

def __init__(self, **options: Any):
    """Initialise the converter from keyword options.

    ``heading_style`` and ``keep_data_uris`` receive defaults when the caller
    did not supply them. ``download_images`` and ``output_dir`` are removed
    from ``options`` and stored on the instance, so they are not forwarded to
    the base-class constructor; all remaining options are.
    """
    # Fill in defaults without overriding values supplied by the caller.
    options.setdefault("heading_style", markdownify.ATX)
    options.setdefault("keep_data_uris", False)

    # Local image-download settings; popped so the base class never sees them.
    self.download_images: bool = options.pop("download_images", False)
    self.output_dir: str = options.pop("output_dir", ".")
    # Starts at zero for every new converter instance.
    self.image_counter: int = 0

    # Everything left over is handed to markdownify's converter.
    super().__init__(**options)

Expand Down Expand Up @@ -89,7 +104,7 @@ def convert_img(
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual converter, but removes data URIs"""
"""Same as usual converter, but removes data URIs and handles auto-downloading"""

alt = el.attrs.get("alt", None) or ""
src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
Expand All @@ -107,6 +122,38 @@ def convert_img(
if src.startswith("data:") and not self.options["keep_data_uris"]:
src = src.split(",")[0] + "..."

# Download remote images locally and assign a sequential filename if enabled
if self.download_images and src.startswith(("http://", "https://")):
try:
self.image_counter += 1

parsed_path = urlparse(src).path
ext = os.path.splitext(parsed_path)[1] or ".png"

new_filename = f"figure-{self.image_counter:03d}{ext}"

os.makedirs(self.output_dir, exist_ok=True)
full_save_path = os.path.join(self.output_dir, new_filename)

req = urllib.request.Request(
src,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
},
)
with (
urllib.request.urlopen(req) as response,
open(full_save_path, "wb") as out_file,
):
out_file.write(response.read())

src = new_filename
except Exception as e:
warnings.warn(
f"Could not download image {src}: {e}",
RuntimeWarning,
)

return "![%s](%s%s)" % (alt, src, title_part)

def convert_input(
Expand All @@ -122,5 +169,39 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""

def convert_svg(
    self,
    el: Any,
    text: str,
    convert_as_inline: Optional[bool] = False,
    **kwargs,
) -> str:
    """Convert an inline <svg> element via an LLM to a Mermaid diagram, if configured.

    The ``llm_client``, ``llm_model`` and ``llm_prompt`` entries of
    ``self.options`` control the conversion. The LLM is only consulted when
    both a client and a model are present.

    Args:
        el: The ``<svg>`` element; ``str(el)`` is used as the SVG source.
        text: Unused.
        convert_as_inline: Unused.
        **kwargs: Unused.

    Returns:
        A fenced ``mermaid`` block when the LLM produced a diagram, otherwise
        a fenced ``xml`` block containing the original SVG source. Both are
        surrounded by blank lines.
    """
    llm_client = self.options.get("llm_client")
    llm_model = self.options.get("llm_model")
    svg_source = str(el)

    if llm_client is not None and llm_model is not None:
        stream = io.BytesIO(svg_source.encode("utf-8"))
        stream_info = StreamInfo(
            mimetype="image/svg+xml", extension=".svg", charset="utf-8"
        )
        try:
            mermaid = llm_svg(
                stream,
                stream_info,
                client=llm_client,
                model=llm_model,
                prompt=self.options.get("llm_prompt"),
            )
        except Exception as e:
            # Conversion stays best-effort, but the failure is reported the
            # same way image-download failures are in this class, so that a
            # misconfigured client does not go unnoticed.
            warnings.warn(
                f"Could not convert inline SVG to Mermaid: {e}",
                RuntimeWarning,
            )
            mermaid = None

        if mermaid:
            return f"\n\n```mermaid\n{mermaid}\n```\n\n"

    # Fallback: preserve the original inline SVG source when Mermaid extraction fails
    return f"\n\n```xml\n{svg_source.strip()}\n```\n\n"

def convert_soup(self, soup: Any) -> str:
    """Delegate to the parent class's ``convert_soup`` and return its result unchanged."""
    rendered: str = super().convert_soup(soup)  # type: ignore
    return rendered
72 changes: 72 additions & 0 deletions packages/markitdown/src/markitdown/converters/_svg_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from typing import Any, BinaryIO

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._llm_svg import llm_svg

# MIME type prefixes accepted by SvgConverter.accepts(). Matching is done with
# str.startswith on the lower-cased MIME type, so "image/svg" also covers
# "image/svg+xml".
ACCEPTED_MIME_TYPE_PREFIXES = [
    "image/svg+xml",
    "image/svg",
]

# File extensions (with leading dot) accepted by SvgConverter.accepts(); the
# stream's extension is lower-cased before it is compared against this list.
ACCEPTED_FILE_EXTENSIONS = [".svg"]


class SvgConverter(DocumentConverter):
    """
    Converts SVG files to Markdown.
    When an LLM client is configured, attempts to produce a Mermaid diagram.
    Falls back to a fenced xml code block to preserve the SVG source.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream looks like SVG.

        A stream is accepted when its extension is listed in
        ``ACCEPTED_FILE_EXTENSIONS`` or its MIME type starts with one of
        ``ACCEPTED_MIME_TYPE_PREFIXES``. Both comparisons are case-insensitive
        and the stream itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple, so one call covers every prefix.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert an SVG stream to Markdown.

        Args:
            file_stream: Binary stream holding the SVG document. Its position
                is restored after the fallback read.
            stream_info: Stream metadata; ``charset`` selects the decoding
                (UTF-8 when missing or unknown).
            **kwargs: ``llm_client``, ``llm_model`` and ``llm_prompt`` are
                read from here. The LLM is only used when both client and
                model are provided.

        Returns:
            A result whose markdown is a fenced ``mermaid`` block when the
            LLM produced a diagram, otherwise a fenced ``xml`` block holding
            the SVG source.
        """
        # Imported locally: this module does not import warnings at file level.
        import warnings

        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")

        if llm_client is not None and llm_model is not None:
            try:
                mermaid = llm_svg(
                    file_stream,
                    stream_info,
                    client=llm_client,
                    model=llm_model,
                    prompt=kwargs.get("llm_prompt"),
                )
            except Exception as e:
                # Best-effort: report the failure and fall through to the
                # xml fallback instead of aborting the conversion.
                warnings.warn(
                    f"Could not convert SVG to Mermaid: {e}",
                    RuntimeWarning,
                )
                mermaid = None

            if mermaid:
                return DocumentConverterResult(markdown=f"```mermaid\n{mermaid}\n```")

        # Fallback: preserve the SVG source in a fenced xml block.
        # NOTE(review): this re-reads from the current position, so it relies
        # on llm_svg leaving the stream position where it found it.
        encoding = stream_info.charset or "utf-8"
        cur_pos = file_stream.tell()
        try:
            raw = file_stream.read()
        finally:
            file_stream.seek(cur_pos)

        try:
            svg_text = raw.decode(encoding, errors="replace")
        except LookupError:
            # Unknown codec name; errors="replace" does not cover this case.
            svg_text = raw.decode("utf-8", errors="replace")

        return DocumentConverterResult(markdown=f"```xml\n{svg_text.strip()}\n```")
Loading