microsoft · xujiantop-crypto · Jun 1, 2026
diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -2,6 +2,7 @@
 import zipfile
 from defusedxml import minidom
 from xml.dom.minidom import Document
+from urllib.parse import unquote, urljoin, urlparse
 
 from typing import BinaryIO, Any, Dict, List
 
@@ -88,11 +89,8 @@ def convert(
             spine_order = [item.getAttribute("idref") for item in spine_items]
 
             # Convert spine order to actual file paths
-            base_path = "/".join(
-                opf_path.split("/")[:-1]
-            )  # Get base directory of content.opf
             spine = [
-                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
+                self._resolve_manifest_href(opf_path, manifest[item_id])
                 for item_id in spine_order
                 if item_id in manifest
             ]
@@ -144,3 +142,9 @@ def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
             if node.firstChild and hasattr(node.firstChild, "nodeValue"):
                 texts.append(node.firstChild.nodeValue.strip())
         return texts
+
+    def _resolve_manifest_href(self, opf_path: str, href: str) -> str:
+        """Resolve a manifest href relative to the OPF file into a ZIP member path."""
+        base_uri = f"{'/'.join(opf_path.split('/')[:-1])}/" if "/" in opf_path else ""  # OPF directory with a trailing "/", or "" when opf_path contains no "/"
+        parsed = urlparse(urljoin(base_uri, href))  # join href onto the OPF directory as URL references, then split so only the path component is kept
+        return unquote(parsed.path)  # percent-decodes the whole joined path, OPF directory part included; NOTE(review): confirm ZIP directory names never hold a literal "%"
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
 import os
 import re
 import shutil
+import zipfile
 import pytest
 from unittest.mock import MagicMock
 
@@ -252,6 +253,45 @@ def test_file_uris() -> None:
     assert path == "/path/to/file.txt"
 
 
+def test_epub_percent_encoded_manifest_href() -> None:  # manifest href "chapter%201.xhtml" vs. ZIP member "OPS/chapter 1.xhtml"
+    epub = io.BytesIO()  # the EPUB archive is built in memory
+    with zipfile.ZipFile(epub, "w") as z:  # three members: container.xml, the OPF package, and one chapter
+        z.writestr(
+            "META-INF/container.xml",
+            """<?xml version="1.0"?>
+<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
+  <rootfiles>
+    <rootfile full-path="OPS/content.opf" media-type="application/oebps-package+xml"/>
+  </rootfiles>
+</container>""",
+        )
+        z.writestr(
+            "OPS/content.opf",
+            """<?xml version="1.0" encoding="UTF-8"?>
+<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
+  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <dc:title>Encoded href test</dc:title>
+  </metadata>
+  <manifest>
+    <item id="chapter" href="chapter%201.xhtml" media-type="application/xhtml+xml"/>
+  </manifest>
+  <spine><itemref idref="chapter"/></spine>
+</package>""",
+        )
+        z.writestr(
+            "OPS/chapter 1.xhtml",
+            """<html xmlns="http://www.w3.org/1999/xhtml">
+<body><h1>Encoded Chapter</h1><p>visible text marker</p></body>
+</html>""",
+        )
+
+    epub.seek(0)  # rewind the buffer before handing it to the converter
+    result = MarkItDown().convert(epub, stream_info=StreamInfo(extension=".epub"))  # the stream has no filename, so the extension is supplied via StreamInfo
+
+    assert "Encoded Chapter" in result.markdown  # chapter heading text appears in the output
+    assert "visible text marker" in result.markdown  # chapter body text appears in the output
+
+
 def test_docx_comments() -> None:
     # Test DOCX processing, with comments and setting style_map on init
     markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")