microsoft · afourney · Mar 10, 2026 · Jan 26, 2026 · Jan 26, 2026 · Jan 27, 2026
diff --git a/README.md b/README.md
@@ -176,6 +176,89 @@ result = md.convert("example.jpg")
 print(result.text_content)
 ```
 
+To extract text from images embedded in documents using OCR (requires Tesseract):
+
+```python
+from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
+from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR
+
+# Create OCR service with Tesseract
+ocr_service = MultiBackendOCRService(backends=[OCRBackend.TESSERACT])
+
+# Convert PDF with OCR
+converter = PdfConverterWithOCR()
+with open("document.pdf", "rb") as f:
+    result = converter.convert(f, ocr_service=ocr_service)
+    print(result.text_content)
+```
+
+To use an LLM (GPT-4o, Gemini, etc.) for OCR instead of Tesseract:
+
+```python
+from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
+from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR
+from openai import OpenAI
+
+# Create OCR service with LLM Vision backend
+client = OpenAI()
+ocr_service = MultiBackendOCRService(
+    backends=[OCRBackend.LLM_VISION],
+    llm_client=client,
+    llm_model="gpt-4o"
+)
+
+# Convert PDF with LLM-based OCR
+converter = PdfConverterWithOCR()
+with open("document.pdf", "rb") as f:
+    result = converter.convert(f, ocr_service=ocr_service)
+    print(result.text_content)
+```
+
+To configure multi-backend fallback (Tesseract is tried first; the LLM is used if Tesseract fails), reuse the `client` created in the previous example:
+
+```python
+ocr_service = MultiBackendOCRService(
+    backends=[OCRBackend.TESSERACT, OCRBackend.LLM_VISION],
+    llm_client=client,
+    llm_model="gpt-4o"
+)
+```
+
+OCR converters are available for PDF, DOCX, XLSX (multi-sheet), and PPTX formats. Images are extracted with context preservation (page numbers, cell references, relationship IDs).
+
+#### Scanned PDF Support
+
+MarkItDown automatically detects scanned PDFs (documents with no extractable text) and falls back to full-page OCR. When PDF text extraction returns empty or whitespace-only results, the converter:
+
+1. Renders each page as a high-resolution image (300 DPI)
+2. Performs OCR on the full page image
+3. Preserves page structure with page markers
+4. Indicates which OCR backend was used
+
+This works seamlessly with any OCR backend (Tesseract, EasyOCR, LLM Vision):
+
+```python
+from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
+from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR
+
+# Create OCR service
+ocr_service = MultiBackendOCRService(backends=[OCRBackend.TESSERACT])
+
+# Convert scanned PDF - fallback is automatic
+converter = PdfConverterWithOCR()
+with open("scanned_invoice.pdf", "rb") as f:
+    result = converter.convert(f, ocr_service=ocr_service)
+    print(result.text_content)
+```
+
+The fallback triggers automatically when:
+
+- PDF has no extractable text (truly scanned documents)
+- Text extraction returns only whitespace
+- No embedded text is found via pdfminer or pdfplumber
+
+No additional configuration is needed - just provide an OCR service and the converter handles the rest.
+
 ### Docker
 
 ```sh

diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.5b1"
+__version__ = "0.1.6b1"
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter_with_ocr.py b/packages/markitdown/src/markitdown/converters/_docx_converter_with_ocr.py
@@ -0,0 +1,184 @@
+"""
+Enhanced DOCX Converter with OCR support for embedded images.
+Extracts images from Word documents and performs OCR while maintaining context.
+"""
+
+import sys
+import io
+import re
+from typing import BinaryIO, Any, Optional
+
+from ._html_converter import HtmlConverter
+from ..converter_utils.docx.pre_process import pre_process_docx
+from .._base_converter import DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from ._ocr_service import MultiBackendOCRService
+
+# Try loading dependencies
+_dependency_exc_info = None
+try:
+    import mammoth
+    from docx import Document
+except ImportError:
+    _dependency_exc_info = sys.exc_info()
+
+
+class DocxConverterWithOCR(HtmlConverter):
+    """
+    Enhanced DOCX Converter with OCR support for embedded images.
+
+    The document is converted to HTML with mammoth and then handed to an
+    ``HtmlConverter``. When an ``ocr_service`` keyword argument is passed to
+    ``convert``, images reachable through the document's relationships are
+    OCR'd and the ``<img>`` tags in the HTML are replaced by the recognized
+    text, so the text appears inline instead of a base64 image.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Dedicated converter used to turn the (possibly OCR-augmented) HTML
+        # into the final result.
+        self._html_converter = HtmlConverter()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """
+        Decide whether this converter handles the given stream.
+
+        Args:
+            file_stream: Input stream (not read here)
+            stream_info: Stream metadata; only extension and mimetype are used
+
+        Returns:
+            True for a ".docx" extension or a wordprocessingml mimetype
+        """
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension == ".docx":
+            return True
+
+        if mimetype.startswith(
+            "application/vnd.openxmlformats-officedocument.wordprocessingml"
+        ):
+            return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """
+        Convert a DOCX stream, OCR'ing embedded images when possible.
+
+        Args:
+            file_stream: DOCX file stream
+            stream_info: Stream metadata (unused by this method)
+            **kwargs: Optional "ocr_service" (enables image OCR) and
+                "style_map" (forwarded to mammoth); all kwargs are also
+                forwarded to the HTML converter
+
+        Returns:
+            Result produced by the HTML converter from the mammoth HTML
+
+        Raises:
+            MissingDependencyException: If mammoth or python-docx could not
+                be imported when this module was loaded
+        """
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".docx",
+                    feature="docx",
+                )
+            ) from _dependency_exc_info[1].with_traceback(
+                _dependency_exc_info[2]
+            )  # type: ignore[union-attr]
+
+        # Get OCR service if available
+        ocr_service: Optional[MultiBackendOCRService] = kwargs.get("ocr_service")
+        style_map = kwargs.get("style_map")
+
+        image_ocr_map: dict[str, str] = {}
+        if ocr_service:
+            # Extract and OCR images before mammoth processing, then rewind
+            # because the extraction consumed the stream.
+            file_stream.seek(0)
+            image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service)
+            file_stream.seek(0)
+
+        # Both the OCR and the plain path share the same mammoth pipeline.
+        pre_process_stream = pre_process_docx(file_stream)
+        html_result = mammoth.convert_to_html(
+            pre_process_stream, style_map=style_map
+        ).value
+
+        if ocr_service:
+            # Inject OCR results into HTML (no-op when nothing was recognized)
+            html_result = self._inject_ocr_into_html(html_result, image_ocr_map)
+
+        return self._html_converter.convert_string(html_result, **kwargs)
+
+    def _extract_and_ocr_images(
+        self, file_stream: BinaryIO, ocr_service: MultiBackendOCRService
+    ) -> dict[str, str]:
+        """
+        Extract images from DOCX and OCR them.
+
+        This is best effort: a document that cannot be opened yields an empty
+        map, and an image that cannot be read or OCR'd is skipped. Images
+        whose OCR text is empty or whitespace-only are not included.
+
+        Args:
+            file_stream: DOCX file stream
+            ocr_service: OCR service to use
+
+        Returns:
+            Dict mapping image relationship IDs to OCR text wrapped in
+            "[Image OCR: <rId>]" / "[End OCR]" markers
+        """
+        ocr_map: dict[str, str] = {}
+
+        try:
+            file_stream.seek(0)
+            doc = Document(file_stream)
+
+            # Extract images from document relationships
+            for rel in doc.part.rels.values():
+                if "image" not in rel.target_ref.lower():
+                    continue
+
+                try:
+                    # NOTE(review): relationships without an embedded part
+                    # (e.g. external links) presumably raise here and are
+                    # skipped by the handler below - confirm.
+                    image_stream = io.BytesIO(rel.target_part.blob)
+                    ocr_result = ocr_service.extract_text(image_stream)
+
+                    if ocr_result.text.strip():
+                        # Store with relationship ID
+                        ocr_map[rel.rId] = (
+                            f"\n[Image OCR: {rel.rId}]\n{ocr_result.text}\n[End OCR]\n"
+                        )
+                except Exception:
+                    # Deliberate best effort: one bad image must not abort
+                    # OCR for the remaining images.
+                    continue
+
+        except Exception:
+            # Deliberate best effort: if the document cannot be inspected,
+            # conversion continues with whatever was collected so far.
+            pass
+
+        return ocr_map
+
+    def _inject_ocr_into_html(self, html: str, ocr_map: dict[str, str]) -> str:
+        """
+        Replace image tags with OCR text inline (no base64 images).
+
+        OCR texts are consumed in map order, one per ``<img>`` tag in order of
+        appearance; tags are not matched by relationship ID, so the pairing is
+        positional. ``<img>`` tags left over once the texts run out are
+        removed, and texts left over once the tags run out are appended in a
+        single trailing paragraph. OCR text is HTML-escaped so characters such
+        as "<" or "&" are kept as text instead of being parsed as markup.
+
+        Args:
+            html: HTML content from mammoth
+            ocr_map: Map of image IDs to OCR text
+
+        Returns:
+            HTML with images replaced by OCR text; the input unchanged when
+            ocr_map is empty
+        """
+        if not ocr_map:
+            return html
+
+        # Local import: the ``html`` parameter shadows the module name inside
+        # this function, so only the needed function is imported under an alias.
+        from html import escape as escape_html
+
+        # quote=False: the text lands in element content, not in an attribute.
+        pending = iter(
+            [escape_html(text, quote=False) for text in ocr_map.values()]
+        )
+
+        def replace_img(_match):
+            # Replace the entire image tag with the next unused OCR text, or
+            # drop the tag when no OCR text is left.
+            text = next(pending, None)
+            if text is None:
+                return ""
+            return f"<p><em>{text}</em></p>"
+
+        # Replace ALL img tags (including base64) with OCR text
+        result = re.sub(r"<img[^>]*>", replace_img, html)
+
+        # If there are remaining OCR texts (images that weren't in HTML), append them
+        remaining_ocr = list(pending)
+        if remaining_ocr:
+            result += f"<p><em>{''.join(remaining_ocr)}</em></p>"
+
+        return result