# deliveryman-api/app/core/ocr_service.py

import base64
import json
import logging
import re

from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.ocr.v20181119 import ocr_client, models

from app.core.config import settings

class OCRService:
    """Recognise parcel pickup codes in images via Tencent Cloud OCR.

    The service sends an image to the ``GeneralAccurateOCR`` endpoint and
    turns the detected text lines into pickup codes grouped by station.
    """

    _log = logging.getLogger(__name__)

    # A pickup code is either a shelf-style code such as ``15-4-223`` or a
    # plain run of 4-8 digits. The boundaries are digit lookarounds rather
    # than ``\b``: in Python 3 ``\b`` treats CJK characters as word
    # characters, so a code written directly after Chinese text
    # ("取件码15-4-223") would never match.
    _CODE_RE = re.compile(
        r'(?<!\d)(?:\d{1,2}-\d{1,2}-\d{2,3}|\d{4,8})(?!\d)'
    )

    # A text line containing any of these substrings is treated as a
    # station name.
    _STATION_KEYWORDS = ("驿站", "站点", "仓", "站", "分拨", "分拣")

    # Substring found in the OCR text -> app identifier reported to callers.
    _APP_KEYWORDS = {
        "菜鸟": "CAINIAO",
        "京东": "JD",
        "顺丰": "SF",
    }

    def __init__(self):
        """Build the Tencent Cloud OCR client from the application settings."""
        cred = credential.Credential(settings.TENCENT_SECRET_ID, settings.TENCENT_SECRET_KEY)
        http_profile = HttpProfile()
        http_profile.endpoint = "ocr.tencentcloudapi.com"

        client_profile = ClientProfile()
        client_profile.httpProfile = http_profile
        self.client = ocr_client.OcrClient(cred, settings.TENCENT_REGION, client_profile)

    async def recognize_pickup_code(self, image_content: bytes) -> dict:
        """Run OCR on a pickup-code image and extract the pickup information.

        Args:
            image_content: Raw bytes of the image to recognise.

        Returns:
            A dict of the form
            ``{"stations": [{"station_name": ..., "pickup_codes": [...]}],
            "app_type": ...}`` as produced by ``_extract_pickup_info``.

        Raises:
            Exception: If any step fails. The message is prefixed with
                "识别失败: " and the original error is attached as
                ``__cause__``.
        """
        try:
            # The OCR request carries the image as base64 text.
            img_base64 = base64.b64encode(image_content).decode()

            req = models.GeneralAccurateOCRRequest()
            req.ImageBase64 = img_base64

            # NOTE(review): this SDK call is not awaited, so it blocks the
            # event loop while the request is in flight. Consider offloading
            # it to a worker thread once the client is confirmed thread-safe.
            resp = self.client.GeneralAccurateOCR(req)
            result = json.loads(resp.to_json_string())

            # Debug-level logging instead of print(): the response contains
            # user data and should not be dumped to stdout unconditionally.
            self._log.debug("OCR response: %s", result)

            # ``or []`` also covers a response whose TextDetections is null.
            detections = result.get("TextDetections") or []
            text_list = [item["DetectedText"] for item in detections]

            return self._extract_pickup_info(text_list)

        except Exception as e:
            # Keep the generic exception type and message, but chain the
            # original error so the root cause stays in the traceback.
            raise Exception(f"识别失败: {e}") from e

    def _is_valid_pickup_code(self, text: str) -> bool:
        """Return True if ``text`` contains something shaped like a pickup code.

        Accepted shapes are ``N-N-N`` shelf codes (1-2, 1-2 and 2-3 digits)
        and standalone runs of 4-8 digits. A run counts as standalone when it
        is not adjacent to another digit.
        """
        return self._CODE_RE.search(text) is not None

    def _extract_pickup_info(self, text_list: list) -> dict:
        """Group the pickup codes found in OCR text lines by station.

        Lines are processed in order. A line containing a station keyword
        starts a new station, and codes found from that line onwards belong
        to it until the next station line. Codes that appear before any
        station line are reported under ``station_name`` ``None`` instead of
        being dropped.

        Args:
            text_list: Detected text lines, in reading order.

        Returns:
            ``{"stations": [...], "app_type": ...}`` where each station is
            ``{"station_name": str | None, "pickup_codes": [str, ...]}`` and
            ``app_type`` is "CAINIAO", "JD", "SF" or ``None``.
        """
        stations = []
        app_type = None
        current_station = None
        current_codes = []

        for text in text_list:
            # App detection: the last line mentioning a known app wins.
            for keyword, app in self._APP_KEYWORDS.items():
                if keyword in text:
                    app_type = app
                    break

            if any(keyword in text for keyword in self._STATION_KEYWORDS):
                # Close the group collected so far. current_station is None
                # when these codes preceded the first station line.
                if current_codes:
                    stations.append({
                        "station_name": current_station,
                        "pickup_codes": current_codes,
                    })
                current_station = text
                current_codes = []

            # Scan the raw text. Stripping non-digit characters first would
            # glue unrelated numbers together ("共2件 1234" -> "21234").
            for match in self._CODE_RE.finditer(text):
                code = match.group()
                if code not in current_codes:
                    current_codes.append(code)

        # Flush the trailing group, named or not.
        if current_codes:
            stations.append({
                "station_name": current_station,
                "pickup_codes": current_codes,
            })

        # No app keyword seen: a hyphenated code is taken to mean Cainiao.
        if app_type is None and any(
            '-' in code
            for station in stations
            for code in station["pickup_codes"]
        ):
            app_type = "CAINIAO"

        return {
            "stations": stations,
            "app_type": app_type,
        }

# Module-level instance created at import time; constructing it reads the
# Tencent credentials and region from settings and builds the OCR client.
ocr_service = OCRService()