Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import traceback
import io
from dataclasses import dataclass
from email.message import Message
from email.utils import collapse_rfc2231_value
from importlib.metadata import entry_points
from typing import Any, List, Dict, Optional, Union, BinaryIO
from pathlib import Path
Expand Down Expand Up @@ -44,13 +46,31 @@

from ._base_converter import DocumentConverter, DocumentConverterResult


from ._exceptions import (
FileConversionException,
UnsupportedFormatException,
FailedConversionAttempt,
)


def _get_content_disposition_filename(content_disposition: str) -> Optional[str]:
message = Message()
message["content-disposition"] = content_disposition

fallback_filename: Optional[str] = None
extended_filename: Optional[str] = None
for key, value in message.get_params(header="content-disposition", unquote=True):
if key != "filename":
continue
if isinstance(value, tuple):
extended_filename = collapse_rfc2231_value(value)
elif fallback_filename is None:
fallback_filename = value

return extended_filename or fallback_filename


# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
Expand Down Expand Up @@ -512,9 +532,10 @@ def convert_response(
filename: Optional[str] = None
extension: Optional[str] = None
if "content-disposition" in response.headers:
m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
if m:
filename = m.group(1).strip("\"'")
filename = _get_content_disposition_filename(
response.headers["content-disposition"]
)
if filename is not None:
_, _extension = os.path.splitext(filename)
if len(_extension) > 0:
extension = _extension
Expand Down
39 changes: 39 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,45 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content


def _mock_response(content_disposition: str) -> MagicMock:
response = MagicMock()
response.headers = {"content-disposition": content_disposition}
response.url = "https://example.com/download"
response.iter_content.return_value = [b"name,value\nalpha,beta\n"]
response.raise_for_status.return_value = None
return response


def test_convert_response_uses_rfc5987_content_disposition_filename() -> None:
    """A response whose only filename is the extended ``filename*`` form
    should convert to the expected Markdown table."""
    header = "attachment; filename*=UTF-8''data.csv"
    expected_rows = (
        "| name | value |",
        "| --- | --- |",
        "| alpha | beta |",
    )

    converted = MarkItDown().convert_response(_mock_response(header))

    assert converted.markdown == "\n".join(expected_rows)


def test_convert_response_prefers_extended_content_disposition_filename() -> None:
    """A response advertising both ``filename`` (.txt) and ``filename*`` (.csv)
    should convert to the expected Markdown table."""
    header = "attachment; filename=fallback.txt; filename*=UTF-8''data.csv"
    expected_rows = (
        "| name | value |",
        "| --- | --- |",
        "| alpha | beta |",
    )

    converted = MarkItDown().convert_response(_mock_response(header))

    assert converted.markdown == "\n".join(expected_rows)


def test_deeply_nested_html_fallback() -> None:
"""Large, deeply nested HTML should fall back to plain-text extraction
instead of silently returning unconverted HTML (issue #1636).
Expand Down