deliveryman-api/app/core/ocr_service.py
2025-02-21 16:49:51 +08:00

154 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import base64
import json
import logging
import re

from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.ocr.v20181119 import ocr_client, models

from app.core.config import settings


class OCRService:
    """Recognise parcel pickup codes in screenshots via Tencent Cloud OCR.

    The service sends an image to the ``GeneralAccurateOCR`` endpoint and
    groups the pickup codes found in the detected text lines by the station
    heading that precedes them.
    """

    # Substrings that mark an OCR line as a station heading.
    # An empty string must never be listed here: "" is a substring of every
    # string, so every OCR line would be treated as a station heading.
    # NOTE(review): further heading keywords may be needed - confirm with
    # real screenshots.
    _STATION_KEYWORDS = ("驿站", "站点", "分拨", "分拣")

    # Keyword found in the OCR text -> application identifier.
    _APP_KEYWORDS = {
        "菜鸟": "CAINIAO",
        "京东": "JD",
        "顺丰": "SF",
    }

    # Accepted pickup-code shapes. Explicit lookarounds are used instead of
    # ``\b`` because ``\b`` treats CJK characters as word characters, which
    # would reject a code glued to its label (e.g. "取件码15-4-223"). A code
    # may not touch another digit, an ASCII letter or an underscore, so
    # fragments of longer numbers / alphanumeric waybill ids are not matched.
    _PICKUP_CODE_PATTERNS = (
        # Shelf-style code such as 15-4-223.
        re.compile(r'(?<![A-Za-z_\d])\d{1,2}-\d{1,2}-\d{2,3}(?![A-Za-z_\d])'),
        # Plain 4-8 digit code.
        re.compile(r'(?<![A-Za-z_\d])\d{4,8}(?![A-Za-z_\d])'),
    )

    def __init__(self):
        """Build the Tencent Cloud OCR client from the project settings."""
        cred = credential.Credential(
            settings.TENCENT_SECRET_ID, settings.TENCENT_SECRET_KEY
        )
        http_profile = HttpProfile()
        http_profile.endpoint = "ocr.tencentcloudapi.com"
        client_profile = ClientProfile()
        client_profile.httpProfile = http_profile
        self.client = ocr_client.OcrClient(
            cred, settings.TENCENT_REGION, client_profile
        )

    async def recognize_pickup_code(self, image_content: bytes) -> dict:
        """Run OCR on a pickup-code image and extract the pickup information.

        Args:
            image_content: Raw bytes of the image to recognise.

        Returns:
            A dict with the keys ``"stations"`` (list of
            ``{"station_name", "pickup_codes"}`` dicts) and ``"app_type"``
            (``"CAINIAO"``, ``"JD"``, ``"SF"`` or ``None``).

        Raises:
            Exception: If the OCR request or the parsing fails; the original
                error is attached as ``__cause__``.
        """
        try:
            req = models.GeneralAccurateOCRRequest()
            # The API expects the image as a base64-encoded string.
            req.ImageBase64 = base64.b64encode(image_content).decode()
            # NOTE(review): this SDK call is synchronous and blocks the event
            # loop while the request is in flight - consider running it in an
            # executor once the client's thread-safety is confirmed.
            resp = self.client.GeneralAccurateOCR(req)
            result = json.loads(resp.to_json_string())
            logging.getLogger(__name__).debug("OCR response: %s", result)

            # "TextDetections" may be absent or null when nothing is detected.
            detections = result.get("TextDetections") or []
            text_list = []
            for item in detections:
                detected = item.get("DetectedText")
                if detected:
                    text_list.append(detected)

            return self._extract_pickup_info(text_list)
        except Exception as e:
            raise Exception(f"识别失败: {str(e)}") from e

    def _find_pickup_codes(self, text: str) -> list:
        """Return every pickup code in *text*, in pattern order, without duplicates."""
        codes = []
        for pattern in self._PICKUP_CODE_PATTERNS:
            for match in pattern.finditer(text):
                code = match.group()
                if code not in codes:
                    codes.append(code)
        return codes

    def _is_valid_pickup_code(self, text: str) -> bool:
        """Return True if *text* contains at least one pickup code.

        Accepted shapes are ``xx-x-xxx``-style codes (e.g. ``15-4-223``) and
        plain runs of 4 to 8 digits.
        """
        return any(
            pattern.search(text) for pattern in self._PICKUP_CODE_PATTERNS
        )

    def _extract_pickup_info(self, text_list: list) -> dict:
        """Group pickup codes by station and detect the originating app.

        Lines are processed in order. A line containing a station keyword
        starts a new group; codes found on that line or on following lines
        belong to it. Groups without codes are dropped. Codes that appear
        before any station heading are reported with ``station_name`` None.

        Args:
            text_list: OCR text lines in reading order.

        Returns:
            ``{"stations": [...], "app_type": ...}`` where each station is
            ``{"station_name": str | None, "pickup_codes": list[str]}``.
        """
        stations = []
        app_type = None
        current_station = None
        current_codes = []

        for text in text_list:
            # App detection: a later matching line overrides an earlier one.
            for keyword, candidate in self._APP_KEYWORDS.items():
                if keyword in text:
                    app_type = candidate
                    break

            if any(keyword in text for keyword in self._STATION_KEYWORDS):
                # Flush the group collected so far. This also covers codes
                # seen before the first heading (current_station is None),
                # which would otherwise be lost when the list is reset.
                if current_codes:
                    stations.append({
                        "station_name": current_station,
                        "pickup_codes": current_codes,
                    })
                current_station = text
                current_codes = []

            # Search the raw line: stripping non-digit characters first would
            # glue unrelated numbers on the same line into one bogus code.
            for code in self._find_pickup_codes(text):
                if code not in current_codes:
                    current_codes.append(code)

        # Flush the trailing group (named station, or None if no heading).
        if current_codes:
            stations.append({
                "station_name": current_station,
                "pickup_codes": current_codes,
            })

        # No app keyword found: hyphenated codes are taken to mean Cainiao.
        if stations and not app_type:
            has_hyphenated_code = any(
                '-' in code
                for station in stations
                for code in station["pickup_codes"]
            )
            if has_hyphenated_code:
                app_type = "CAINIAO"

        return {
            "stations": stations,
            "app_type": app_type,
        }
# Module-level OCRService instance. It is created at import time, so importing
# this module runs OCRService.__init__, which reads the Tencent credentials
# and region from settings and builds the OCR client.
ocr_service = OCRService()