import org.bytedeco.tesseract.TessBaseAPI;
/**
 * Minimal smoke test for the Tesseract native binding: creates an API instance,
 * initializes it with a tessdata directory and a language, reports the outcome
 * and releases the engine again.
 */
public class TesseractInitExample {
    /**
     * Initializes Tesseract once and prints whether initialization succeeded.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create a new Tesseract API instance
        TessBaseAPI api = new TessBaseAPI();
        try {
            // Set the path to tessdata and language (change accordingly)
            String dataPath = "/root/GitHub/tessdata_best";
            String language = "eng";
            // Initialize Tesseract; any non-zero return value is treated as failure
            if (api.Init(dataPath, language) != 0) {
                System.err.println("Could not initialize Tesseract.");
                return;
            }
            System.out.println("Tesseract initialized successfully!");
        } finally {
            // Cleanup: runs on success, on the early return above, and when an
            // exception escapes, so the native engine is always released.
            api.End();
        }
    }
}
下面是 Dockerfile:
# ---- Build stage: Gradle 8.11.1 with JDK 17 on Alpine ----
FROM gradle:8.11.1-jdk17-alpine AS build
# git is required for the clone in the next step
RUN apk add --no-cache git
# Clone the main branch of tessdata_best into /root/GitHub/tessdata_best
# (the same path the Java code passes to Init as dataPath)
RUN git clone --branch main --single-branch https://github.com/tesseract-ocr/tessdata_best.git /root/GitHub/tessdata_best
# Copy the project sources into the image, owned by the gradle user
COPY --chown=gradle:gradle . /home/gradle/src
WORKDIR /home/gradle/src
# Build the project without the Gradle daemon, skipping the test task
RUN gradle build --no-daemon -x test
# ---- Runtime stage: Temurin JDK 17.0.13+11 on Alpine ----
FROM eclipse-temurin:17.0.13_11-jdk-alpine
# Bring the cloned tessdata_best directory over from the build stage
COPY --from=build /root/GitHub/tessdata_best /root/GitHub/tessdata_best
# Install the Alpine tesseract-ocr and leptonica-dev packages
RUN apk add --no-cache \
tesseract-ocr \
leptonica-dev
VOLUME /tmp
# JAVA_OPTS can be supplied with --build-arg and is exported as an environment variable.
# NOTE(review): the exec-form ENTRYPOINT below never references JAVA_OPTS - confirm whether that is intended.
ARG JAVA_OPTS
ENV JAVA_OPTS=$JAVA_OPTS
RUN mkdir /app
# Copy the jar produced by the build stage to a fixed name
COPY --from=build /home/gradle/src/build/libs/*.jar /app/extract.jar
# Start the application jar
ENTRYPOINT ["java", "-XX:+UnlockExperimentalVMOptions", "-Djava.security.egd=file:/dev/./urandom","-jar","/app/extract.jar"]
运行容器时,我在 init 期间遇到以下错误:
3.362 seconds (process running for 4.095)
#
# A fatal error has been detected by the Java Runtime Environment:
#
# SIGSEGV (0xb) at pc=0x000000000005f2b0, pid=1, tid=7
#
# JRE version: OpenJDK Runtime Environment Temurin-17.0.13+11 (17.0.13+11) (build 17.0.13+11)
# Java VM: OpenJDK 64-Bit Server VM Temurin-17.0.13+11 (17.0.13+11, mixed mode, sharing, tiered, compressed oops, compressed class ptrs, g1 gc, linux-amd64)
# Problematic frame:
# C [libtesseract.so.5.5+0x14b9c8] tesseract::CHAR_FRAGMENT::parse_from_string(char const*)+0xc8
#
# Core dump will be written. Default location: Core dumps may be processed with "/wsl-capture-crash %t %E %p %s" (or dumping to //core.1)
#
# An error report file with more information is saved as:
# //hs_err_pid1.log
#
# If you would like to submit a bug report, please visit:
# https://github.com/adoptium/adoptium-support/issues
# The crash happened outside the Java Virtual Machine in native code.
# See problematic frame for where to report the bug.
#
我无法找出此错误的原因。我该如何解决这个问题(在本地 Windows 上运行正常)?
您遇到的问题可能是由基于 Alpine 的镜像与本机(native)Tesseract 库之间的兼容性问题造成的。Alpine 使用 musl 而不是 glibc,这有时会导致在使用 libtesseract.so 等本机库时出现段错误(segmentation fault)或其他意外行为。切换到基于 Debian 或 Ubuntu 的镜像通常可以解决此类问题。
这是一个使用基于 Debian 的镜像以获得更好兼容性的 Dockerfile:
# ---- Build stage: Gradle 8.11.1 with JDK 17 (non-Alpine image) ----
FROM gradle:8.11.1-jdk17 AS build
# Install git; the apt package lists are removed in the same layer so they
# are not stored in the image.
RUN apt-get update && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*
# Clone tessdata_best. --depth 1 fetches only the tip of main instead of the
# full history, which is all that is needed to read the data files.
RUN git clone --depth 1 --branch main --single-branch https://github.com/tesseract-ocr/tessdata_best.git /root/GitHub/tessdata_best
# Build the project with Gradle (no daemon, test task skipped)
COPY --chown=gradle:gradle . /home/gradle/src
WORKDIR /home/gradle/src
RUN gradle build --no-daemon -x test

# ---- Runtime stage: Temurin JDK 17.0.13+11 (non-Alpine image) ----
FROM eclipse-temurin:17.0.13_11-jdk
# Install Tesseract and dependencies; clean the apt package lists in the same
# layer to keep the final image smaller.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    libleptonica-dev \
    && rm -rf /var/lib/apt/lists/*
# Copy tessdata_best to the path the application uses as its data path
COPY --from=build /root/GitHub/tessdata_best /root/GitHub/tessdata_best
# Prepare the application
VOLUME /tmp
# JAVA_OPTS can be supplied with --build-arg and is exported as an environment variable.
# NOTE(review): the exec-form ENTRYPOINT below never references JAVA_OPTS, so
# the value is not passed to the JVM by this file - confirm whether that is intended.
ARG JAVA_OPTS
ENV JAVA_OPTS=$JAVA_OPTS
RUN mkdir /app
COPY --from=build /home/gradle/src/build/libs/*.jar /app/extract.jar
# Run the application
ENTRYPOINT ["java", "-XX:+UnlockExperimentalVMOptions", "-Djava.security.egd=file:/dev/./urandom","-jar","/app/extract.jar"]