diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
index 3be65b016..c3aa86061 100644
--- a/packages/markitdown/src/markitdown/converters/_epub_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -2,6 +2,7 @@
import zipfile
from defusedxml import minidom
from xml.dom.minidom import Document
+from urllib.parse import unquote, urljoin, urlparse
from typing import BinaryIO, Any, Dict, List
@@ -88,11 +89,8 @@ def convert(
spine_order = [item.getAttribute("idref") for item in spine_items]
# Convert spine order to actual file paths
- base_path = "/".join(
- opf_path.split("/")[:-1]
- ) # Get base directory of content.opf
spine = [
- f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
+ self._resolve_manifest_href(opf_path, manifest[item_id])
for item_id in spine_order
if item_id in manifest
]
@@ -144,3 +142,9 @@ def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
if node.firstChild and hasattr(node.firstChild, "nodeValue"):
texts.append(node.firstChild.nodeValue.strip())
return texts
+
+    def _resolve_manifest_href(self, opf_path: str, href: str) -> str:
+        """Map a manifest href, taken relative to the OPF file, to a percent-decoded ZIP member path."""
+        # Everything up to and including the last "/" of opf_path; empty when opf_path has no "/".
+        opf_dir = "".join(opf_path.rpartition("/")[:2])
+        return unquote(urlparse(urljoin(opf_dir, href)).path)
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..906b34e05 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
import os
import re
import shutil
+import zipfile
import pytest
from unittest.mock import MagicMock
@@ -252,6 +253,45 @@ def test_file_uris() -> None:
assert path == "/path/to/file.txt"
+def test_epub_percent_encoded_manifest_href() -> None:
+ epub = io.BytesIO()
+ with zipfile.ZipFile(epub, "w") as z:
+ z.writestr(
+ "META-INF/container.xml",
+ """
+
visible text marker
+""", + ) + + epub.seek(0) + result = MarkItDown().convert(epub, stream_info=StreamInfo(extension=".epub")) + + assert "Encoded Chapter" in result.markdown + assert "visible text marker" in result.markdown + + def test_docx_comments() -> None: # Test DOCX processing, with comments and setting style_map on init markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")