Python OCR 技术入门与验证码识别实战

Python OCR 技术入门与验证码识别实战 | 极客日志

brew install tesseract
brew install tesseract-lang # 安装中文语言包

sudo apt-get update
sudo apt-get install tesseract-ocr
sudo apt-get install tesseract-ocr-chi-sim # 中文简体语言包
sudo apt-get install tesseract-ocr-eng # 英文语言包

tesseract --version
tesseract --list-langs # 查看已安装的语言包

pip install pytesseract pillow opencv-python numpy

import pytesseract
from PIL import Image
import os

# 如果 Tesseract 未添加到系统 PATH，可以手动指定路径
# Windows 示例：
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def test_tesseract():
    """Report whether pytesseract can reach a working Tesseract install.

    Prints the detected engine version on success; on any error prints the
    error together with a hint about installation and PATH. Returns nothing.
    """
    try:
        # Asking for the version only succeeds when the engine can be launched.
        detected = pytesseract.get_tesseract_version()
        print(f"Tesseract 版本：{detected}")
        print("环境配置成功！")
    except Exception as err:
        failure_lines = (
            f"环境配置失败：{err}",
            "请检查 Tesseract 是否正确安装并添加到 PATH",
        )
        for message in failure_lines:
            print(message)

# Run the environment check right away; it prints its outcome.
test_tesseract()

import pytesseract
from PIL import Image

def ocr_basic(image_path):
    """Run basic English OCR on an image file.

    :param image_path: path of the image to recognize
    :return: the text returned by Tesseract (also printed to stdout)
    """
    # The context manager releases the underlying file handle as soon as
    # recognition is finished instead of leaving it to garbage collection.
    with Image.open(image_path) as image:
        # lang='eng' selects the English language pack.
        text = pytesseract.image_to_string(image, lang='eng')
    print("识别结果：")
    print(text)
    return text

# 使用示例
# ocr_basic('sample.png')

# Recognize with the Simplified Chinese language pack only.
# NOTE(review): `image` is not defined at module scope in the visible code;
# presumably it is a PIL image opened beforehand — confirm before running.
text = pytesseract.image_to_string(image, lang='chi_sim')

# Recognize mixed Chinese/English text by joining language codes with '+'.
text = pytesseract.image_to_string(image, lang='chi_sim+eng')

import cv2
import numpy as np
from PIL import Image
import pytesseract

def preprocess_image(image_path):
    """Prepare an image for OCR: grayscale, denoise, adaptive threshold.

    :param image_path: path of the image to load with OpenCV
    :return: single-channel binarized image as a numpy array
    :raises OSError: if OpenCV cannot read the image
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising;
    # fail here with a clear message instead of a cryptic cvtColor error.
    if img is None:
        raise OSError(f"无法读取图像：{image_path}")
    # 1. Grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # 2. Denoise with a Gaussian blur
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # 3. Adaptive threshold: the cut-off is derived from each local
    #    neighbourhood, which copes with uneven lighting.
    binary = cv2.adaptiveThreshold(
        blurred,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,  # block size
        2    # constant subtracted from the local mean
    )
    # 4. Optional morphological opening to drop small specks.
    # NOTE(review): with a 1x1 kernel the opening leaves the image
    # effectively unchanged; enlarge it (e.g. 2x2) if real denoising is wanted.
    kernel = np.ones((1, 1), np.uint8)
    opening = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
    return opening

def ocr_with_preprocess(image_path):
    """Preprocess the image at ``image_path`` and return its English OCR text."""
    # preprocess_image yields a numpy array; pytesseract is handed a PIL image.
    cleaned_array = preprocess_image(image_path)
    return pytesseract.image_to_string(Image.fromarray(cleaned_array), lang='eng')

# 对比实验：直接识别 vs 预处理后识别
def compare_ocr(image_path):
    """Print raw OCR output next to OCR output after preprocessing.

    Only the first 200 characters of each result are shown.

    :param image_path: path of the image to recognize
    """
    print("=== 直接识别 ===")
    # Close the image file deterministically once the raw pass is done.
    with Image.open(image_path) as img_raw:
        text_raw = pytesseract.image_to_string(img_raw, lang='eng')
    print(text_raw[:200])
    print("\n=== 预处理后识别 ===")
    text_processed = ocr_with_preprocess(image_path)
    print(text_processed[:200])

# 使用示例
# compare_ocr('noisy_text.jpg')

import cv2
import numpy as np
import math

def correct_skew(image):
    """Rotate ``image`` by the median angle of its detected line segments.

    The image is converted with COLOR_BGR2GRAY, so a BGR array is expected.
    When the Hough transform finds no segments the input is returned as is.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Otsu threshold (inverted), then Canny edges feed the Hough transform.
    _, mask = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    edge_map = cv2.Canny(mask, 50, 150, apertureSize=3)
    segments = cv2.HoughLinesP(edge_map, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=10)
    if segments is None:
        return image

    # Angle, in degrees, of every detected segment.
    endpoints = (segment[0] for segment in segments)
    tilts = [
        math.degrees(math.atan2(y_b - y_a, x_b - x_a))
        for x_a, y_a, x_b, y_b in endpoints
    ]
    typical_tilt = np.median(tilts)

    # Rotate about the image centre, keeping the original canvas size.
    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    rotation = cv2.getRotationMatrix2D(pivot, typical_tilt, 1.0)
    return cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

def ocr_with_config(image_path, psm=6):
    """Run English OCR with an explicit page segmentation mode.

    :param image_path: path of the image to recognize
    :param psm: Tesseract page segmentation mode; 6 treats the image as a
        single uniform block of text
    :return: the recognized text
    """
    # --oem 3 selects the default OCR engine mode.
    custom_config = f'--oem 3 --psm {psm}'
    # Close the image file as soon as recognition is finished.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(
            image,
            lang='eng',
            config=custom_config
        )

# 不同 PSM 模式的对比
def test_psm_modes(image_path):
    """Print the first 100 OCR characters for PSM modes 6, 7, 8 and 13."""
    candidate_modes = (6, 7, 8, 13)
    for mode in candidate_modes:
        recognized = ocr_with_config(image_path, mode)
        print(f"PSM 模式 {mode}:")
        print(f"{recognized[:100]}...\n")

import cv2
import numpy as np
import pytesseract
from PIL import Image

def preprocess_captcha(image_path):
    """Prepare a CAPTCHA image for OCR.

    Pipeline: grayscale, 3x3 median blur, inverted Otsu threshold, then a
    2x2 morphological opening.

    :param image_path: path of the CAPTCHA image
    :return: single-channel binarized image as a numpy array
    :raises OSError: if OpenCV cannot read the image
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise OSError(f"无法读取图像：{image_path}")
    # 1. Grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # 2. Median filter: works well against salt-and-pepper noise
    denoised = cv2.medianBlur(gray, 3)
    # 3. Binarize with an automatically chosen Otsu threshold (inverted)
    _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # 4. Opening (erode then dilate) removes small white specks
    kernel = np.ones((2, 2), np.uint8)
    opening = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
    # 5. Optional follow-up not applied here: a light cv2.dilate pass on
    #    `opening` can reconnect characters with broken strokes.
    return opening

def recognize_captcha(image_path):
    """Return the alphanumeric text recognized in a CAPTCHA image.

    The image goes through preprocess_captcha, is recognized as a single
    word (--psm 8) restricted to digits and uppercase letters, and every
    non-alphanumeric character is then dropped from the result.
    """
    # --psm 8: single word; the whitelist limits output to 0-9 and A-Z.
    tesseract_options = r'--psm 8 -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    captcha_image = Image.fromarray(preprocess_captcha(image_path))
    raw_text = pytesseract.image_to_string(
        captcha_image,
        lang='eng',
        config=tesseract_options,
    )
    # Strip whitespace and any other non-alphanumeric characters.
    return ''.join(ch for ch in raw_text if ch.isalnum())

# Usage example.
# NOTE(review): unlike the other usage examples in this file, this one is not
# commented out or guarded, so it runs whenever the module is executed or
# imported and needs 'captcha.png' to be readable.
captcha_text = recognize_captcha('captcha.png')
print(f"验证码识别结果：{captcha_text}")

def segment_and_recognize(image_path):
    """Split a CAPTCHA into characters by contour and OCR each one.

    :param image_path: path of the CAPTCHA image
    :return: recognized characters concatenated from left to right
    :raises OSError: if OpenCV cannot read the image
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise OSError(f"无法读取图像：{image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # External contours only: one candidate region per connected blob.
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Keep boxes whose size is plausible for a character; this drops tiny
    # noise as well as oversized blobs.
    char_boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if 5 < w < 50 and 10 < h < 50:
            char_boxes.append((x, y, w, h))

    # Read left to right: order by the box's x coordinate.
    char_boxes.sort(key=lambda box: box[0])

    # The whitelist limits output to digits and uppercase letters;
    # --psm 10 treats each crop as a single character.
    custom_config = r'--psm 10 -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    pieces = []
    for x, y, w, h in char_boxes:
        char_img = binary[y:y+h, x:x+w]
        # Pad with a 5px black border (matching the inverted background)
        # so the glyph does not touch the image edge.
        char_with_border = cv2.copyMakeBorder(
            char_img, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=0
        )
        pil_char = Image.fromarray(char_with_border)
        char_text = pytesseract.image_to_string(pil_char, config=custom_config).strip()
        if char_text:
            pieces.append(char_text)
    return "".join(pieces)

from concurrent.futures import ThreadPoolExecutor
import hashlib
import pickle
import os

class CaptchaRecognizer:
    """CAPTCHA recognizer with an on-disk result cache.

    Results are stored as pickle files in ``cache_dir``, named after the MD5
    digest of the image file's bytes, so identical image content is only
    sent through OCR once per cache directory.
    """

    # Exceptions the pickle documentation lists for damaged input; a cache
    # entry that triggers one of them is treated as a cache miss.
    _CORRUPT_CACHE_ERRORS = (
        pickle.UnpicklingError,
        EOFError,
        AttributeError,
        ImportError,
        IndexError,
    )

    def __init__(self, cache_dir='captcha_cache'):
        """Create the recognizer and make sure ``cache_dir`` exists.

        :param cache_dir: directory holding the cached results
        """
        self.cache_dir = cache_dir
        # exist_ok avoids the check-then-create race between two recognizers
        # that start at the same time.
        os.makedirs(cache_dir, exist_ok=True)

    def _get_cache_key(self, image_path):
        """Return the MD5 hex digest of the image file's content."""
        with open(image_path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()

    def _recognize(self, image_path):
        """Run the actual OCR: preprocess, then single-word recognition."""
        processed = preprocess_captcha(image_path)
        pil_img = Image.fromarray(processed)
        custom_config = r'--psm 8 -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        text = pytesseract.image_to_string(pil_img, config=custom_config)
        return text.strip()

    def _load_cached(self, cache_file):
        """Return ``(True, value)`` for a readable cache entry, else ``(False, None)``."""
        try:
            with open(cache_file, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code; only point
                # cache_dir at a directory this program controls.
                return True, pickle.load(f)
        except FileNotFoundError:
            return False, None
        except self._CORRUPT_CACHE_ERRORS:
            # Truncated or damaged entry: fall back to recognizing again.
            return False, None

    def _store_cached(self, cache_file, result):
        """Write ``result`` to ``cache_file`` atomically."""
        import tempfile

        # Write to a temp file in the same directory and rename it into
        # place, so concurrent readers (see batch_recognize) never observe a
        # half-written entry.
        fd, tmp_path = tempfile.mkstemp(dir=self.cache_dir, suffix='.tmp')
        try:
            with os.fdopen(fd, 'wb') as f:
                pickle.dump(result, f)
            os.replace(tmp_path, cache_file)
        except BaseException:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

    def recognize(self, image_path):
        """Recognize one CAPTCHA, serving the result from cache when possible.

        :param image_path: path of the CAPTCHA image
        :return: the recognized text
        """
        cache_key = self._get_cache_key(image_path)
        cache_file = os.path.join(self.cache_dir, cache_key)
        hit, cached = self._load_cached(cache_file)
        if hit:
            return cached
        result = self._recognize(image_path)
        self._store_cached(cache_file, result)
        return result

    def batch_recognize(self, image_paths, max_workers=4):
        """Recognize several CAPTCHAs in parallel threads.

        :param image_paths: iterable of image paths
        :param max_workers: size of the thread pool
        :return: list of results in the same order as ``image_paths``
        """
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(self.recognize, image_paths))
        return results

import os
import pytesseract
from PIL import Image
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import cv2
import numpy as np
from datetime import datetime

class BatchOCRProcessor:
    """Batch OCR over a directory of images with export to Excel."""

    def __init__(self, input_dir, output_excel, lang='chi_sim+eng', use_preprocess=True):
        """Store the batch settings.

        :param input_dir: directory containing the input images
        :param output_excel: path of the Excel file to write
        :param lang: Tesseract language pack(s) to use
        :param use_preprocess: whether to preprocess images before OCR
        """
        self.input_dir = input_dir
        self.output_excel = output_excel
        self.lang = lang
        self.use_preprocess = use_preprocess
        # File extensions (compared in lower case) treated as images.
        self.image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')

    def preprocess_image(self, image_path):
        """Load an image and binarize it for OCR.

        :param image_path: path of the image to load with OpenCV
        :return: PIL image built from the thresholded array
        :raises OSError: if OpenCV cannot read the image
        """
        img = cv2.imread(image_path)
        # cv2.imread signals failure by returning None; raise a readable
        # error so the per-file status column says what went wrong.
        if img is None:
            raise OSError(f"无法读取图像：{image_path}")
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        denoised = cv2.medianBlur(gray, 3)
        # Adaptive threshold: cut-off derived from each local neighbourhood.
        binary = cv2.adaptiveThreshold(
            denoised,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2
        )
        return Image.fromarray(binary)

    def process_single_image(self, filename):
        """OCR one file from ``input_dir`` and return a result row.

        Never raises: any failure is reported in the row's status field.

        :param filename: file name relative to ``input_dir``
        :return: dict with the file name, recognized text, status and timestamp
        """
        file_path = os.path.join(self.input_dir, filename)
        try:
            if self.use_preprocess:
                text = self.ocr_with_multiple_configs(self.preprocess_image(file_path))
            else:
                # Close the file handle as soon as recognition is finished.
                with Image.open(file_path) as img:
                    text = self.ocr_with_multiple_configs(img)
            status = '成功'
        except Exception as e:
            # Deliberately broad: one bad file must not abort the batch.
            text = ''
            status = f'失败：{str(e)}'
        return {
            '文件名': filename,
            '识别内容': text,
            '状态': status,
            '处理时间': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

    def ocr_with_multiple_configs(self, img):
        """Try several PSM modes and return the text with most alphanumerics.

        :param img: image accepted by ``pytesseract.image_to_string``
        :return: stripped text of the best attempt (first one wins on ties)
        """
        psm_modes = [6, 7, 8, 13]
        results = []
        for psm in psm_modes:
            config = f'--psm {psm}'
            text = pytesseract.image_to_string(img, lang=self.lang, config=config)
            # Score each attempt by its number of alphanumeric characters.
            valid_chars = sum(c.isalnum() for c in text)
            results.append((valid_chars, text))
        best_result = max(results, key=lambda scored: scored[0])
        return best_result[1].strip()

    def run(self, max_workers=4):
        """Process every image in ``input_dir`` and write the Excel report.

        :param max_workers: size of the thread pool
        :return: DataFrame of result rows, or ``None`` when no image is found
        """
        print(f"开始扫描目录：{self.input_dir}")
        # Sorted so the report order does not depend on the arbitrary order
        # in which os.listdir returns entries.
        image_files = sorted(
            f for f in os.listdir(self.input_dir)
            if f.lower().endswith(self.image_extensions)
        )
        print(f"找到 {len(image_files)} 个图片文件")
        if not image_files:
            print("未找到图片文件")
            return
        results = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self.process_single_image, f) for f in image_files]
            # Collect in submission order so rows line up with image_files.
            for i, future in enumerate(futures):
                result = future.result()
                results.append(result)
                print(f"进度：{i+1}/{len(image_files)} - 已处理 {result['文件名']}")
        df = pd.DataFrame(results)
        df.to_excel(self.output_excel, index=False, engine='openpyxl')
        print(f"\n处理完成！结果已保存至：{self.output_excel}")
        succeeded = int((df['状态'] == '成功').sum())
        print(f"成功：{succeeded} 个，失败：{len(df) - succeeded} 个")
        return df

# Usage example
if __name__ == "__main__":
    processor = BatchOCRProcessor(
        input_dir='./images',      # directory containing the input images
        output_excel='./ocr_results.xlsx',  # Excel file to write
        lang='chi_sim+eng',        # mixed Chinese + English recognition
        use_preprocess=True        # preprocess images before OCR
    )
    results_df = processor.run(max_workers=4)

# 安装 PaddleOCR
# pip install paddlepaddle paddleocr
from paddleocr import PaddleOCR

def ocr_with_paddle(image_path):
    """Recognize text in an image with PaddleOCR.

    :param image_path: path of the image to recognize
    :return: recognized fragments, each followed by a single space
        (empty string when nothing is detected)
    """
    # NOTE: the engine is created on every call and the first run downloads
    # the models; hoist it out if this is called repeatedly.
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')
    result = ocr.ocr(image_path, cls=True)
    # When no text is detected the per-image entry (or the whole result) can
    # be None instead of a list, so guard both levels before iterating.
    fragments = []
    for line in result or []:
        if not line:
            continue
        # Each entry is [box, (text, confidence)]; keep the text only.
        fragments.extend(word_info[1][0] + ' ' for word_info in line)
    return ''.join(fragments)

def postprocess_text(text, known_words=None):
    """Clean up OCR output.

    Removes every character that is not a CJK ideograph (U+4E00-U+9FFF), an
    ASCII letter or digit, whitespace, or one of ``. , ; : ! ? ( )``, then
    collapses whitespace runs to single spaces and trims both ends.

    :param text: raw OCR text
    :param known_words: optional vocabulary reserved for spelling
        correction; no correction is performed yet, so it does not affect
        the result
    :return: the cleaned text
    """
    import re

    text = re.sub(r'[^\u4e00-\u9fff\u0041-\u005a\u0061-\u007a\u0030-\u0039\s\.,;:!?()]', '', text)
    # Normalize whitespace after filtering: removing a character that sat
    # between two spaces would otherwise leave a double or trailing space.
    text = ' '.join(text.split())
    # TODO: when known_words is given, replace unknown words with their
    # closest known word (a spelling-correction algorithm can be plugged in).
    return text

# Point pytesseract at the Tesseract executable explicitly (Windows default
# install location) for cases where it is not on the system PATH.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def enhance_image(image_path, min_width=800, min_height=600):
    """Sharpen an image and upscale it when it is small.

    The upscaling is a plain bicubic resize, not learned super-resolution.

    :param image_path: path of the image to load with OpenCV
    :param min_width: width below which the image is enlarged
    :param min_height: height below which the image is enlarged
    :return: the sharpened (and possibly enlarged) image as a numpy array
    :raises OSError: if OpenCV cannot read the image
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise OSError(f"无法读取图像：{image_path}")
    # 3x3 sharpening kernel: boosts the centre pixel against its neighbours.
    kernel_sharpen = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    sharpened = cv2.filter2D(img, -1, kernel_sharpen)
    height, width = sharpened.shape[:2]
    if width >= min_width and height >= min_height:
        return sharpened
    # One scale factor for both axes keeps the aspect ratio while making
    # sure both minimum dimensions are reached.
    scale = max(min_width / width, min_height / height)
    new_size = (int(width * scale), int(height * scale))
    return cv2.resize(sharpened, new_size, interpolation=cv2.INTER_CUBIC)

Python OCR 技术入门与验证码识别实战

Python OCR 技术入门与验证码识别实战

引言：当机器学会'阅读'

一、环境搭建：工欲善其事，必先利其器

1.1 安装 Tesseract OCR 引擎

1.2 安装 Python 库

1.3 验证环境配置

二、基础实战：从图片中提取文字

2.1 最简单的 OCR 识别

2.2 图像预处理：提高识别准确率的关键

2.3 倾斜校正

2.4 OCR 配置优化

三、进阶实战：验证码识别

3.1 验证码识别的挑战

3.2 验证码预处理流程

3.3 字符分割

3.4 验证码识别的性能优化

四、实战项目：批量图片文字识别与 Excel 导出

4.1 完整实现代码

4.2 使用 PaddleOCR 作为备选方案

五、性能优化与常见问题解决方案

5.1 识别准确率提升策略

5.2 常见问题及解决方案

更多推荐文章

相关免费在线工具

Python OCR 技术入门与验证码识别实战

Python OCR 技术入门与验证码识别实战

引言：当机器学会'阅读'

一、环境搭建：工欲善其事，必先利其器

1.1 安装 Tesseract OCR 引擎

1.2 安装 Python 库

1.3 验证环境配置

二、基础实战：从图片中提取文字

2.1 最简单的 OCR 识别

2.2 图像预处理：提高识别准确率的关键

2.3 倾斜校正

2.4 OCR 配置优化

三、进阶实战：验证码识别

3.1 验证码识别的挑战

3.2 验证码预处理流程

3.3 字符分割

3.4 验证码识别的性能优化

四、实战项目：批量图片文字识别与 Excel 导出

4.1 完整实现代码

4.2 使用 PaddleOCR 作为备选方案

五、性能优化与常见问题解决方案

5.1 识别准确率提升策略

5.2 常见问题及解决方案

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具