Skip to content
Merged
2 changes: 2 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
!requirements_test.txt
!LICENSE
!README.md
!repo/tesserocr
!repo/tesseract

# avoid .git and __pycache__ etc:
!ocrd_tesserocr/**/*.py
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ test-workspace
/.coverage
/htmlcov
/.cache
build_tesseract
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[submodule "repo/tesserocr"]
path = repo/tesserocr
url = https://github.com/sirfz/tesserocr/
[submodule "repo/tesseract"]
path = repo/tesseract
url = https://github.com/tesseract-ocr/tesseract
68 changes: 59 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
FROM ocrd/core
FROM ocrd/core:v2.62.0 AS base
# set proper locales
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# install ocrd-tesserocr (until here commands for installing tesseract-ocr)
ARG VCS_REF
ARG BUILD_DATE
LABEL \
Expand All @@ -7,32 +11,78 @@ LABEL \
org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \
org.label-schema.build-date=$BUILD_DATE

ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONIOENCODING utf8

# set TESSDATA_PREFIX
ENV TESSDATA_PREFIX /usr/local/share/tessdata

# set frontend non-interactive to silence interactive tzdata config
ARG DEBIAN_FRONTEND=noninteractive


# Install common build tools and the development libraries needed to compile
# Tesseract.
# - Leptonica is taken from the distribution package (libleptonica-dev).
# - tzdata is required for the timezone configuration performed further below.
# NOTE(review): g++ is presumably already pulled in by build-essential, and
# xzgv is an image viewer rather than a build tool — confirm both are needed.
RUN apt-get update && apt-get install -y \
apt-utils \
build-essential \
g++ \
git \
libjpeg-dev \
libgif-dev \
libwebp-dev \
libopenjp2-7-dev \
libpng-dev \
libtiff-dev \
libtool \
pkg-config \
tzdata \
xzgv \
zlib1g-dev \
libleptonica-dev \
libpango1.0-dev \
libicu-dev \
autotools-dev \
automake \
libcurl4-nss-dev \
libarchive-dev
Comment thread
bertsky marked this conversation as resolved.
Outdated

# Configure the container's date and timezone (Europe/Berlin): write the zone
# name, point /etc/localtime at the zone file, then let tzdata reconfigure
# itself non-interactively.
RUN echo "Europe/Berlin" > /etc/timezone \
 && ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime \
 && dpkg-reconfigure -f noninteractive tzdata

# diagnostic output - check timezone settings
# RUN cat /etc/timezone

# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share

WORKDIR /build-ocrd
WORKDIR /build-ocrd_tesserocr
COPY setup.py .
COPY ocrd_tesserocr/ocrd-tool.json .
COPY README.md .
COPY requirements.txt .
COPY requirements_test.txt .
COPY ocrd_tesserocr ./ocrd_tesserocr
COPY repo/tesserocr ./repo/tesserocr
COPY repo/tesseract ./repo/tesseract
COPY Makefile .
RUN make deps-ubuntu && \
Comment thread
bertsky marked this conversation as resolved.
apt-get install -y --no-install-recommends \
g++ \
&& make deps install \
&& rm -rf /build-ocrd \
RUN apt-get install -y --no-install-recommends \
python3 \
python3-pip \
Comment thread
bertsky marked this conversation as resolved.
Outdated
&& make deps install-tesseract install-tesserocr install \
&& rm -rf /build-ocrd_tesserocr \
&& apt-get -y remove --auto-remove g++ libtesseract-dev make

# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
# The next line fails because the tesseract-ocr-eng package is not installed. It is unclear whether it is still needed, so it is skipped.
# RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
# Pre-download the Fraktur, deu and eng models for ocrd-tesserocr-recognize
# via the OCR-D resource manager.
RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata \
 && ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata \
 && ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
Comment thread
bertsky marked this conversation as resolved.

WORKDIR /data
VOLUME /data
34 changes: 33 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,20 @@ LOG_LEVEL = INFO
PYTHONIOENCODING=utf8
LC_ALL = C.UTF-8
LANG = C.UTF-8
export
# Installation prefix for the self-built Tesseract: the active Python virtual
# environment when VIRTUAL_ENV is set, otherwise /usr/local.
ifndef VIRTUAL_ENV
TESSERACT_PREFIX = /usr/local
else
TESSERACT_PREFIX = $(VIRTUAL_ENV)
endif

# Put the prefix's pkg-config directory first on PKG_CONFIG_PATH, keeping any
# value inherited from the environment behind it.
ifneq ($(PKG_CONFIG_PATH),)
PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig:$(PKG_CONFIG_PATH)
else
PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig
endif
export PKG_CONFIG_PATH

# Export all make variables to the environment of recipe commands.
export

# pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'
PYTEST_ARGS =
Expand Down Expand Up @@ -85,6 +97,20 @@ docker:
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) .

# Install the tesserocr Python bindings from the repo/tesserocr submodule.
# `&&` (not `;`) makes the recipe stop if the submodule directory is missing,
# instead of running `$(PIP) install .` in the current directory.
.PHONY: install-tesserocr
install-tesserocr:
	cd repo/tesserocr && $(PIP) install .

# Build Tesseract from the repo/tesseract submodule and install it under
# $(TESSERACT_PREFIX).
# - autogen.sh runs inside the source tree; configure and the build run
#   out-of-tree in $(CURDIR)/build_tesseract.
# - configure is called with OpenMP and shared libraries disabled, and with
#   -fPIC in CXXFLAGS (presumably so the static library can be linked into the
#   tesserocr extension module — confirm).
# - Every step is chained with `&&` or is its own recipe line, so a failing
#   autogen/configure aborts the target instead of continuing.
# - The final prefix check uses POSIX `case` rather than bash-only `[[ ]]`, so
#   it also works when make runs recipes with /bin/sh; ldconfig is only run for
#   prefixes starting with /usr.
.PHONY: install-tesseract
install-tesseract:
	cd repo/tesseract && ./autogen.sh
	mkdir -p $(CURDIR)/build_tesseract
	cd $(CURDIR)/build_tesseract && $(CURDIR)/repo/tesseract/configure \
		--prefix=$(TESSERACT_PREFIX) \
		--disable-openmp \
		--disable-shared \
		'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC'
	cd $(CURDIR)/build_tesseract && $(MAKE) install
	case "$(TESSERACT_PREFIX)" in /usr*) ldconfig ;; esac

# Install this package
install: deps
$(PIP) install .
Expand Down Expand Up @@ -135,6 +161,12 @@ repo/assets:
mkdir -p $(dir $@)
git clone https://github.com/OCR-D/assets "$@"

# Remove test asset symlinks and all Tesseract build artefacts.
.PHONY: clean
clean: assets-clean tesseract-clean

# Delete the out-of-tree Tesseract build directory and, if the submodule itself
# was ever configured in-tree (i.e. it contains a Makefile), run its distclean.
# The guard keeps `make clean` from failing when repo/tesseract has no Makefile;
# $(MAKE) is used instead of a literal `make` so flags such as -n and -j are
# passed on to the sub-make.
.PHONY: tesseract-clean
tesseract-clean:
	rm -rf $(CURDIR)/build_tesseract
	if [ -f repo/tesseract/Makefile ]; then $(MAKE) -C repo/tesseract distclean; fi

.PHONY: assets-clean
# Remove symlinks in test/assets
Expand Down
1 change: 1 addition & 0 deletions repo/tesseract
Submodule tesseract added at 8ee020
1 change: 1 addition & 0 deletions repo/tesserocr
Submodule tesserocr added at 1f960e