diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py index 3be65b016..c3aa86061 100644 --- a/packages/markitdown/src/markitdown/converters/_epub_converter.py +++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py @@ -2,6 +2,7 @@ import zipfile from defusedxml import minidom from xml.dom.minidom import Document +from urllib.parse import unquote, urljoin, urlparse from typing import BinaryIO, Any, Dict, List @@ -88,11 +89,8 @@ def convert( spine_order = [item.getAttribute("idref") for item in spine_items] # Convert spine order to actual file paths - base_path = "/".join( - opf_path.split("/")[:-1] - ) # Get base directory of content.opf spine = [ - f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id] + self._resolve_manifest_href(opf_path, manifest[item_id]) for item_id in spine_order if item_id in manifest ] @@ -144,3 +142,9 @@ def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]: if node.firstChild and hasattr(node.firstChild, "nodeValue"): texts.append(node.firstChild.nodeValue.strip()) return texts + + def _resolve_manifest_href(self, opf_path: str, href: str) -> str: + """Resolve a manifest href relative to the OPF file into a ZIP member path.""" + base_uri = f"{'/'.join(opf_path.split('/')[:-1])}/" if "/" in opf_path else "" + parsed = urlparse(urljoin(base_uri, href)) + return unquote(parsed.path) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..906b34e05 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -3,6 +3,7 @@ import os import re import shutil +import zipfile import pytest from unittest.mock import MagicMock @@ -252,6 +253,45 @@ def test_file_uris() -> None: assert path == "/path/to/file.txt" +def test_epub_percent_encoded_manifest_href() -> None: + epub = io.BytesIO() + with zipfile.ZipFile(epub, "w") as z: + z.writestr( + "META-INF/container.xml", + """ + + + + +""", + ) + z.writestr( + "OPS/content.opf", + """ + + + Encoded href test + + + + + +""", + ) + z.writestr( + "OPS/chapter 1.xhtml", + """ +

Encoded Chapter

visible text marker

+""", + ) + + epub.seek(0) + result = MarkItDown().convert(epub, stream_info=StreamInfo(extension=".epub")) + + assert "Encoded Chapter" in result.markdown + assert "visible text marker" in result.markdown + + def test_docx_comments() -> None: # Test DOCX processing, with comments and setting style_map on init markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")