From 49584083efc3a185bbed928a87e431d996d29a06 Mon Sep 17 00:00:00 2001 From: doitgo Date: Sun, 31 May 2026 22:14:54 +0800 Subject: [PATCH 1/4] fix: remove mammoth dependency exception handling --- .../src/markitdown/converters/_plain_text_converter.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 6f1306fe8..b5114fc5d 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -8,11 +8,6 @@ # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later _dependency_exc_info = None -try: - import mammoth # noqa: F401 -except ImportError: - # Preserve the error and stack trace for later - _dependency_exc_info = sys.exc_info() ACCEPTED_MIME_TYPE_PREFIXES = [ "text/", From fa85621433a43fed64f4c1b1cde399d7cbaa4476 Mon Sep 17 00:00:00 2001 From: doitgo Date: Sun, 31 May 2026 22:35:33 +0800 Subject: [PATCH 2/4] fix: initialize md_text before conditional blocks in RSS converter (#1946) --- packages/markitdown/src/markitdown/converters/_rss_converter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index bec42484f..e36b2b31e 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -143,6 +143,7 @@ def _parse_rss_type(self, doc: Document) -> DocumentConverterResult: channel_title = self._get_data_by_tag_name(channel, "title") channel_description = self._get_data_by_tag_name(channel, "description") items = channel.getElementsByTagName("item") + md_text = "" if channel_title: md_text = f"# {channel_title}\n" if channel_description: From 815227a596cf7a3dd91aa97f6d5dc4af0f493d0a Mon Sep 17 00:00:00 2001 From: doitgo Date: Mon, 1 Jun 2026 18:44:25 +0800 Subject: [PATCH 3/4] fix: avoid rendering "# None" title in WikipediaConverter When main_title is None, the converter previously rendered "# None" as the markdown heading. Now it only adds the title prefix when main_title is truthy. Fixes #1968 Co-Authored-By: Claude Opus 4.7 --- .../src/markitdown/converters/_wikipedia_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index c20018659..3692a3793 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -75,7 +75,8 @@ def convert( main_title = title_elm.string # Convert the page - webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify( + title_prefix = f"# {main_title}\n\n" if main_title else "" + webpage_text = title_prefix + _CustomMarkdownify( **kwargs ).convert_soup(body_elm) else: From 4ad18e6b551e7bd64c60dceac582c9722233b239 Mon Sep 17 00:00:00 2001 From: doitgo Date: Mon, 1 Jun 2026 19:02:37 +0800 Subject: [PATCH 4/4] fix: use utf-8 instead of locale encoding in exiftool JSON parsing locale.getpreferredencoding() returns cp936/GBK on Chinese Windows, which causes UnicodeDecodeError when parsing JSON output from exiftool. Using utf-8 directly fixes this. Fixes #1972 Co-Authored-By: Claude Opus 4.7 --- packages/markitdown/src/markitdown/converters/_exiftool.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_exiftool.py b/packages/markitdown/src/markitdown/converters/_exiftool.py index f605024fd..d684d9e1d 100644 --- a/packages/markitdown/src/markitdown/converters/_exiftool.py +++ b/packages/markitdown/src/markitdown/converters/_exiftool.py @@ -1,5 +1,4 @@ import json -import locale import subprocess from typing import Any, BinaryIO, Union @@ -45,8 +44,6 @@ def exiftool_metadata( text=False, ).stdout - return json.loads( - output.decode(locale.getpreferredencoding(False)), - )[0] + return json.loads(output.decode("utf-8"))[0] finally: file_stream.seek(cur_pos)