Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions packages/markitdown/src/markitdown/converters/_epub_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import zipfile
from defusedxml import minidom
from xml.dom.minidom import Document
from urllib.parse import unquote, urljoin, urlparse

from typing import BinaryIO, Any, Dict, List

Expand Down Expand Up @@ -88,11 +89,8 @@ def convert(
spine_order = [item.getAttribute("idref") for item in spine_items]

# Convert spine order to actual file paths
base_path = "/".join(
opf_path.split("/")[:-1]
) # Get base directory of content.opf
spine = [
f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
self._resolve_manifest_href(opf_path, manifest[item_id])
for item_id in spine_order
if item_id in manifest
]
Expand Down Expand Up @@ -144,3 +142,9 @@ def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
if node.firstChild and hasattr(node.firstChild, "nodeValue"):
texts.append(node.firstChild.nodeValue.strip())
return texts

def _resolve_manifest_href(self, opf_path: str, href: str) -> str:
    """Resolve a manifest href relative to the OPF file into a ZIP member path.

    Only ``href`` is interpreted as a URL reference: its query string and
    fragment are dropped, ``.`` and ``..`` segments are resolved, and
    percent-escapes are decoded. The directory part of ``opf_path`` is used
    verbatim, because it is already a literal archive member path; running
    it through URL parsing would mangle directory names that contain
    characters such as ``#``, ``?`` or ``%``.

    Args:
        opf_path: Archive path of the OPF package document, using ``/`` as
            the separator (e.g. ``"OPS/content.opf"``).
        href: The ``href`` attribute of a manifest item, possibly
            percent-encoded (e.g. ``"chapter%201.xhtml"``).

    Returns:
        The ``/``-separated member path with no leading slash. ``..``
        segments that would climb above the archive root are discarded.
    """
    # Everything before the last "/" is the directory holding the OPF file;
    # it is empty when the OPF file sits at the archive root.
    opf_dir = opf_path.rpartition("/")[0]
    href_path = urlparse(href).path

    # A root-relative href ("/x") is resolved from the archive root rather
    # than from the OPF directory.
    if href_path.startswith("/"):
        segments: List[str] = []
    else:
        segments = [part for part in opf_dir.split("/") if part]

    for part in href_path.split("/"):
        if part in ("", "."):
            continue
        if part == "..":
            # Never climb above the archive root.
            if segments:
                segments.pop()
            continue
        # Decode after splitting so an escaped ".." (%2E%2E) is never
        # treated as a parent-directory reference.
        segments.append(unquote(part))

    return "/".join(segments)
40 changes: 40 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import shutil
import zipfile
import pytest
from unittest.mock import MagicMock

Expand Down Expand Up @@ -252,6 +253,45 @@ def test_file_uris() -> None:
assert path == "/path/to/file.txt"


def test_epub_percent_encoded_manifest_href() -> None:
    """Check that a percent-encoded manifest href is matched to its archive member.

    The manifest refers to ``chapter%201.xhtml`` while the archive stores the
    chapter as ``OPS/chapter 1.xhtml``; the chapter's heading and paragraph
    text must both appear in the converted Markdown.
    """
    container_xml = """<?xml version="1.0"?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile full-path="OPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>"""
    package_opf = """<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Encoded href test</dc:title>
</metadata>
<manifest>
<item id="chapter" href="chapter%201.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine><itemref idref="chapter"/></spine>
</package>"""
    chapter_xhtml = """<html xmlns="http://www.w3.org/1999/xhtml">
<body><h1>Encoded Chapter</h1><p>visible text marker</p></body>
</html>"""

    # Insertion order is kept, so members are written in the listed order.
    archive_members = {
        "META-INF/container.xml": container_xml,
        "OPS/content.opf": package_opf,
        "OPS/chapter 1.xhtml": chapter_xhtml,
    }

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        for member_name, payload in archive_members.items():
            archive.writestr(member_name, payload)

    # Rewind so the converter reads the archive from its first byte.
    buffer.seek(0)
    converted = MarkItDown().convert(
        buffer, stream_info=StreamInfo(extension=".epub")
    )

    for expected_text in ("Encoded Chapter", "visible text marker"):
        assert expected_text in converted.markdown


def test_docx_comments() -> None:
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
Expand Down