Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
2613ebb
Add OCR test data and implement tests for various document formats
lesyk Jan 26, 2026
2e83594
Merge branch 'main' into u/vilesyk/inline_image
lesyk Jan 26, 2026
2a3a3ef
Enhance OCR functionality and validation in document converters
lesyk Jan 27, 2026
f4fab9b
Add support for scanned PDFs with full-page OCR fallback and implemen…
lesyk Jan 27, 2026
40e0be5
Bump version to 0.1.6b1 in __about__.py
lesyk Feb 12, 2026
9daaeff
Refactor OCR services to support LLM Vision, update README and tests …
lesyk Feb 13, 2026
bd9c98d
Add OCR-enabled converters and ensure consistent OCR format across do…
lesyk Feb 13, 2026
6732692
Refactor converters to improve import organization and enhance OCR fu…
lesyk Feb 13, 2026
678ea75
Refactor exception imports for consistency across converters and tests
lesyk Feb 13, 2026
dfd57e0
Fix OCR tests to match MockOCRService output and fix cross-platform f…
lesyk Feb 16, 2026
550243a
Merge origin/main into u/vilesyk/inline_image
lesyk Feb 16, 2026
222ec95
Bump version to 0.1.6b1 in __about__.py
lesyk Feb 16, 2026
ce21005
Skip DOCX/XLSX/PPTX OCR tests when optional dependencies are missing
lesyk Feb 16, 2026
6e7cf50
Add comprehensive OCR test suite for various document formats
lesyk Feb 24, 2026
0816de8
Merge branch 'u/vilesyk/inline_image' of https://github.com/lesyk/mar…
lesyk Feb 24, 2026
a23087a
Remove obsolete HTML test files and refactor test cases for file URIs…
lesyk Feb 24, 2026
f7ee5ef
Refactor OCR processing in PdfConverterWithOCR and enhance unit tests…
lesyk Feb 24, 2026
fefc3b6
Revert
lesyk Feb 24, 2026
1ef0d50
Revert
lesyk Feb 24, 2026
9d485bd
Update REDMEs
lesyk Feb 24, 2026
207e58c
Merge branch 'main' into u/vilesyk/inline_image
lesyk Feb 25, 2026
b8e28c0
Refactor import statements for consistency and improve formatting in …
lesyk Feb 25, 2026
aff82a3
Merge branch 'u/vilesyk/inline_image' of https://github.com/lesyk/mar…
lesyk Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,89 @@ result = md.convert("example.jpg")
print(result.text_content)
```

To extract text from images embedded in documents using OCR (requires Tesseract):

```python
from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR

# Create OCR service with Tesseract
ocr_service = MultiBackendOCRService(backends=[OCRBackend.TESSERACT])

# Convert PDF with OCR
converter = PdfConverterWithOCR()
with open("document.pdf", "rb") as f:
result = converter.convert(f, ocr_service=ocr_service)
print(result.text_content)
```

To use an LLM (GPT-4o, Gemini, etc.) for OCR instead of Tesseract:

```python
from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR
from openai import OpenAI

# Create OCR service with LLM Vision backend
client = OpenAI()
ocr_service = MultiBackendOCRService(
backends=[OCRBackend.LLM_VISION],
llm_client=client,
llm_model="gpt-4o"
)

# Convert PDF with LLM-based OCR
converter = PdfConverterWithOCR()
with open("document.pdf", "rb") as f:
result = converter.convert(f, ocr_service=ocr_service)
print(result.text_content)
```

Multi-backend fallback (tries Tesseract first, falls back to LLM if Tesseract fails):

```python
ocr_service = MultiBackendOCRService(
backends=[OCRBackend.TESSERACT, OCRBackend.LLM_VISION],
llm_client=client,
llm_model="gpt-4o"
)
```

OCR converters are available for PDF, DOCX, XLSX (multi-sheet), and PPTX formats. Images are extracted with context preservation (page numbers, cell references, relationship IDs).

#### Scanned PDF Support

MarkItDown automatically detects scanned PDFs (documents with no extractable text) and falls back to full-page OCR. When text extraction from a PDF returns empty or whitespace-only results, the converter:

1. Renders each page as a high-resolution image (300 DPI)
2. Performs OCR on the full page image
3. Preserves page structure with page markers
4. Indicates which OCR backend was used

This works seamlessly with any OCR backend (Tesseract, EasyOCR, LLM Vision):

```python
from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR

# Create OCR service
ocr_service = MultiBackendOCRService(backends=[OCRBackend.TESSERACT])

# Convert scanned PDF - fallback is automatic
converter = PdfConverterWithOCR()
with open("scanned_invoice.pdf", "rb") as f:
result = converter.convert(f, ocr_service=ocr_service)
print(result.text_content)
```

The fallback triggers automatically when:

- PDF has no extractable text (truly scanned documents)
- Text extraction returns only whitespace
- No embedded text is found via pdfminer or pdfplumber

No additional configuration is needed - just provide an OCR service and the converter handles the rest.

### Docker

```sh
Expand Down
2 changes: 1 addition & 1 deletion packages/markitdown/src/markitdown/__about__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.5b1"
__version__ = "0.1.6b1"
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
Enhanced DOCX Converter with OCR support for embedded images.
Extracts images from Word documents and performs OCR while maintaining context.
"""

import io
import logging
import re
import sys
from html import escape as html_escape
from typing import BinaryIO, Any, Optional

from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from ._ocr_service import MultiBackendOCRService

# Try loading dependencies
_dependency_exc_info = None
try:
import mammoth
from docx import Document
except ImportError:
_dependency_exc_info = sys.exc_info()


class DocxConverterWithOCR(HtmlConverter):
    """
    Enhanced DOCX Converter with OCR support for embedded images.

    The document is converted to HTML with mammoth and then to Markdown via
    an internal ``HtmlConverter``. When an ``ocr_service`` keyword argument is
    supplied to ``convert``, embedded images are OCR'd and every ``<img>`` tag
    in the intermediate HTML is replaced by the recognised text, so no base64
    image data reaches the output.
    """

    def __init__(self):
        super().__init__()
        # Separate instance used to turn the mammoth HTML into Markdown.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """
        Return True when the stream looks like a DOCX document.

        Args:
            file_stream: Input stream (not read here).
            stream_info: Metadata carrying the extension and mimetype.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            True if the extension is ``.docx`` or the mimetype starts with the
            OOXML wordprocessing prefix, False otherwise.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension == ".docx":
            return True

        return mimetype.startswith(
            "application/vnd.openxmlformats-officedocument.wordprocessingml"
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a DOCX stream to Markdown, optionally OCR-ing embedded images.

        Args:
            file_stream: Seekable DOCX stream.
            stream_info: Stream metadata (not inspected here).
            **kwargs: Optional ``ocr_service`` (enables image OCR) and
                ``style_map`` (forwarded to mammoth). All keyword arguments
                are also forwarded to the HTML-to-Markdown conversion.

        Returns:
            The result of converting the (possibly OCR-augmented) HTML.

        Raises:
            MissingDependencyException: If mammoth or python-docx could not be
                imported.
        """
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".docx",
                    feature="docx",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )  # type: ignore[union-attr]

        # Get OCR service if available
        ocr_service: Optional[MultiBackendOCRService] = kwargs.get("ocr_service")

        image_ocr_map: dict[str, str] = {}
        if ocr_service:
            # Extract and OCR images before mammoth processing, then rewind so
            # mammoth reads the document from the start.
            file_stream.seek(0)
            image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service)
            file_stream.seek(0)

        # Single mammoth pipeline shared by the OCR and non-OCR paths.
        pre_process_stream = pre_process_docx(file_stream)
        html_result = mammoth.convert_to_html(
            pre_process_stream, style_map=kwargs.get("style_map")
        ).value

        if ocr_service:
            # Inject OCR results into HTML (a no-op when nothing was recognised)
            html_result = self._inject_ocr_into_html(html_result, image_ocr_map)

        return self._html_converter.convert_string(html_result, **kwargs)

    def _extract_and_ocr_images(
        self, file_stream: BinaryIO, ocr_service: MultiBackendOCRService
    ) -> dict[str, str]:
        """
        Extract images from DOCX and OCR them.

        This is best-effort: a document that cannot be opened yields an empty
        map, and an image that cannot be read or OCR'd is skipped. Failures
        are logged rather than raised.

        Args:
            file_stream: DOCX file stream
            ocr_service: OCR service to use

        Returns:
            Dict mapping image relationship IDs to OCR text wrapped in
            ``[Image OCR: <rId>]`` / ``[End OCR]`` markers. Images whose OCR
            text is empty or whitespace-only are omitted.
        """
        logger = logging.getLogger(__name__)
        ocr_map: dict[str, str] = {}

        try:
            file_stream.seek(0)
            doc = Document(file_stream)
            relationships = list(doc.part.rels.values())
        except Exception:
            # Best-effort: fall back to conversion without OCR text.
            logger.warning(
                "Could not open DOCX for image extraction; skipping OCR",
                exc_info=True,
            )
            return ocr_map

        for rel in relationships:
            try:
                # NOTE(review): images are recognised by an "image" substring
                # in the relationship target, so parts whose path lacks that
                # substring are not OCR'd - confirm this is intended.
                if "image" not in rel.target_ref.lower():
                    continue

                # Create stream for OCR
                image_stream = io.BytesIO(rel.target_part.blob)

                # Perform OCR
                ocr_result = ocr_service.extract_text(image_stream)

                if ocr_result.text.strip():
                    # Store with relationship ID
                    ocr_map[rel.rId] = (
                        f"\n[Image OCR: {rel.rId}]\n{ocr_result.text}\n[End OCR]\n"
                    )
            except Exception:
                # One unreadable image must not prevent OCR of the others.
                logger.debug(
                    "OCR failed for DOCX relationship %s; skipping",
                    getattr(rel, "rId", "?"),
                    exc_info=True,
                )
                continue

        return ocr_map

    def _inject_ocr_into_html(self, html: str, ocr_map: dict[str, str]) -> str:
        """
        Replace image tags with OCR text inline (no base64 images).

        OCR text is HTML-escaped before insertion so that characters such as
        ``<`` or ``&`` in recognised text are kept as literal text instead of
        being parsed as markup.

        NOTE(review): OCR texts are paired with ``<img>`` tags purely by
        position (map order against tag order), not by relationship ID. If an
        earlier image produced no OCR text, or relationship order differs from
        document order, text may be placed at a different image's position.

        Args:
            html: HTML content from mammoth
            ocr_map: Map of image IDs to OCR text

        Returns:
            HTML with images replaced by OCR text. ``<img>`` tags beyond the
            number of OCR texts are removed; OCR texts beyond the number of
            ``<img>`` tags are appended in one trailing paragraph.
        """
        if not ocr_map:
            return html

        # Consumed one entry per <img> tag, in map order.
        pending_texts = iter(ocr_map.values())

        def replace_img(match):
            # Replace the entire image tag with OCR text (no base64!)
            ocr_text = next(pending_texts, None)
            if ocr_text is None:
                return ""  # Remove image if no OCR text available
            return f"<p><em>{html_escape(ocr_text, quote=False)}</em></p>"

        # Replace ALL img tags (including base64) with OCR text
        result = re.sub(r"<img[^>]*>", replace_img, html)

        # If there are remaining OCR texts (images that weren't in HTML), append them
        remaining_ocr = list(pending_texts)
        if remaining_ocr:
            joined = html_escape("".join(remaining_ocr), quote=False)
            result += f"<p><em>{joined}</em></p>"

        return result
Loading