我正在尝试检测项目中的文本,这些项目可能会向各个方向旋转。我尝试过使用 Tesseract、EasyOCR 和 EAST 进行文本检测和提取,但遇到旋转文本的问题。 Tesseract 给了我最接近的结果,但在旋转时它仍然错误地提取文本。
有没有可能的方法可以正确提取文本,无论其旋转如何?为了更好地理解,我提供了一些示例图像。
有人建议每次旋转图像并检测文本,但这个解决方案对我来说太耗时(每次运行 70 小时)。这是我使用的代码:
import os
import cv2
import pytesseract
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd

# Directory containing the images to be processed.
directory = 'Camera2/front'

# Point pytesseract at the Tesseract binary (Windows installs are not on PATH).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Adjust path as necessary

# Accumulates one {'ID': ..., 'text': ...} record per image with detected text.
results = []

# Collect the JPEG images in the directory. str.endswith accepts a tuple of
# suffixes, so one call replaces the chained `or` tests.
image_files = [f for f in os.listdir(directory) if f.endswith(('.jpeg', '.jpg'))]
def preprocess_image(image):
    """Prepare a BGR image for OCR.

    Returns a ``(gray, binary)`` pair: the grayscale conversion (kept for
    plotting) and a Gaussian adaptively-thresholded binary version that is
    fed to Tesseract.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresholded = cv2.adaptiveThreshold(
        grayscale,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,  # neighbourhood block size
        2,   # constant subtracted from the weighted mean
    )
    return grayscale, thresholded
def detect_text(image):
    """Run Tesseract OCR on a preprocessed copy of *image*.

    Returns ``(found, text, gray)`` where *found* is True when the OCR
    output contains any non-whitespace characters, *text* is the raw OCR
    string, and *gray* is the grayscale image used during preprocessing.
    """
    gray, binary = preprocess_image(image)
    # Page segmentation mode 3 (fully automatic), English, LSTM engine.
    extracted = pytesseract.image_to_string(binary, config='--psm 3 -l eng --oem 3')
    found = bool(extracted.strip())
    return found, extracted, gray
def rotate_image(image, angle):
    """Rotate *image* by *angle* degrees about its centre without cropping.

    The original implementation warped into the fixed original ``(w, h)``
    canvas, which clips every corner of a rotated image — text near the
    edges is lost before OCR ever sees it. Here the output canvas is
    enlarged to the rotated bounding box and the rotation matrix's
    translation is shifted so the whole image stays visible.
    """
    (h, w) = image.shape[:2]
    center = (w / 2, h / 2)
    matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
    # Bounding-box size of the rotated image, from the matrix's cos/sin terms.
    cos = abs(matrix[0, 0])
    sin = abs(matrix[0, 1])
    new_w = int(h * sin + w * cos)
    new_h = int(h * cos + w * sin)
    # Re-centre: map the image centre onto the centre of the new canvas.
    matrix[0, 2] += (new_w / 2) - center[0]
    matrix[1, 2] += (new_h / 2) - center[1]
    return cv2.warpAffine(image, matrix, (new_w, new_h))
# Iterate through each image, brute-force rotating until OCR finds text.
for filename in tqdm(image_files, desc="Processing images"):
    filepath = os.path.join(directory, filename)
    original_image = cv2.imread(filepath)
    if original_image is None:
        # cv2.imread returns None for unreadable/corrupt files; skip instead
        # of crashing later in cvtColor.
        print(f"Could not read {filepath}, skipping.")
        continue

    # Text-detection state for this image.
    has_text = False
    detected_text = ""
    gray_image = None

    # Try every rotation from 0 to 359 degrees, stopping at the first angle
    # at which Tesseract extracts any text.
    for angle in tqdm(range(0, 360), leave=False):
        rotated_image = rotate_image(original_image, angle)
        has_text, detected_text, gray_image = detect_text(rotated_image)
        if has_text:
            break

    # Show the original next to the grayscale image used for OCR.
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    axes[0].imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
    axes[0].set_title('Original Image')
    axes[0].axis('off')
    if gray_image is not None:
        axes[1].imshow(gray_image, cmap='gray')
        axes[1].set_title('Grayscale Image with Adjusted Thresholding')
        axes[1].axis('off')
    plt.tight_layout()
    plt.show()

    if has_text:
        # Bug fix: the original f-strings printed the literal "(unknown)"
        # instead of interpolating the filename.
        print(f"Text detected in {filename}:")
        print(detected_text)
        # Keep only results longer than 3 characters, ignoring the
        # surrounding whitespace/newlines Tesseract always emits.
        if len(detected_text.strip()) > 3:
            image_id = filename.replace('.jpeg', '').replace('.jpg', '')
            results.append({'ID': image_id, 'text': detected_text})
    else:
        print(f"No text detected in {filename}.")

results_df = pd.DataFrame(results)
如果您能够确定图像的对齐方式,则无需每次都将图像旋转一度。鉴于您的文本通常比单个字母长,其边界框应该是宽度>高度的矩形。
那么你可以:先计算文本的边界框 (x1, y1, x2, y2),再根据该矩形长边的方向估算文本的旋转角度,最后只需按这个角度把图像旋转一次(必要时再试反方向的 180°)并进行 OCR,而不是逐度旋转 360 次。
您必须首先计算边界框,但您只需要对初始图像执行一次。应该比您当前的方法快 180 倍左右。