def _get_content_disposition_filename(content_disposition: str) -> Optional[str]:
    """Extract the filename advertised by a Content-Disposition header value.

    The header is parsed with the standard-library e-mail machinery instead of
    a regular expression, so quoted strings and RFC 2231 / RFC 5987 extended
    parameters (``filename*=UTF-8''name.ext``) are understood.

    Args:
        content_disposition: Raw header value, for example
            ``attachment; filename="report.pdf"``.

    Returns:
        The decoded ``filename*`` value when one is present and non-empty,
        otherwise the first non-empty plain ``filename`` value, otherwise
        ``None``. An empty string is never returned.
    """
    message = Message()
    message["content-disposition"] = content_disposition

    fallback_filename: Optional[str] = None
    extended_filename: Optional[str] = None
    for key, value in message.get_params(header="content-disposition", unquote=True):
        if key != "filename":
            continue
        if isinstance(value, tuple):
            # Extended parameters come back as (charset, language, text).
            extended_filename = collapse_rfc2231_value(value)
        elif not fallback_filename:
            # get_params() only removes double quotes. Some servers wrap the
            # name in single quotes, which would otherwise leak into the
            # derived file extension, so drop a matched surrounding pair.
            if len(value) >= 2 and value[0] == value[-1] == "'":
                value = value[1:-1]
            fallback_filename = value

    # An empty filename carries no information; report it as missing.
    return extended_filename or fallback_filename or None
def _mock_response(content_disposition: str) -> MagicMock:
    """Build a stand-in HTTP response whose streamed body is a small CSV.

    The only header set is ``content-disposition`` and the URL has no file
    extension, so the tests below exercise filename handling from that header.
    """
    fake = MagicMock()
    fake.url = "https://example.com/download"
    fake.headers = {"content-disposition": content_disposition}
    fake.raise_for_status.return_value = None
    fake.iter_content.return_value = [b"name,value\nalpha,beta\n"]
    return fake


def test_convert_response_uses_rfc5987_content_disposition_filename() -> None:
    """A header carrying only ``filename*`` is still converted as CSV."""
    header = "attachment; filename*=UTF-8''data.csv"
    converted = MarkItDown().convert_response(_mock_response(header))

    expected_rows = ("| name | value |", "| --- | --- |", "| alpha | beta |")
    assert converted.markdown == "\n".join(expected_rows)


def test_convert_response_prefers_extended_content_disposition_filename() -> None:
    """When both forms are present, ``filename*`` wins over ``filename``."""
    header = "attachment; filename=fallback.txt; filename*=UTF-8''data.csv"
    converted = MarkItDown().convert_response(_mock_response(header))

    expected_rows = ("| name | value |", "| --- | --- |", "| alpha | beta |")
    assert converted.markdown == "\n".join(expected_rows)
None: """Large, deeply nested HTML should fall back to plain-text extraction instead of silently returning unconverted HTML (issue #1636).