Get rid of local OCR; use GPT-4o instead

This commit is contained in:
robinrolle
2025-04-13 16:15:02 +02:00
parent 1ef525c4b3
commit 2decad1ae1
2 changed files with 67 additions and 48 deletions

View File

@ -1,13 +1,15 @@
import base64 import base64
import binascii import binascii
import io
from PIL import Image
from langchain_core.messages import HumanMessage, SystemMessage
from typing import Callable, Type, Any, TypeVar from typing import Callable, Type, Any, TypeVar
from langchain_core.runnables import Runnable from langchain_core.runnables import Runnable
from langchain_core.prompts import ChatPromptTemplate from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser from langchain_core.output_parsers import PydanticOutputParser
from langchain_openai.chat_models import ChatOpenAI from langchain_openai.chat_models import ChatOpenAI
from pydantic import BaseModel from pydantic import BaseModel
from utils.parsers import process_profile, process_account, process_description ,process_passport
from utils.parsers import process_profile, process_passport, process_account, process_description
from validation.from_account import FromAccount from validation.from_account import FromAccount
from validation.from_passport import FromPassport from validation.from_passport import FromPassport
from validation.from_profile import FromProfile from validation.from_profile import FromProfile
@ -54,23 +56,43 @@ def extract_account(client_data: dict[str, Any])-> FromAccount:
def extract_passport(client_data: dict[str, Any]) -> FromPassport:
    """Extract structured passport fields from a base64-encoded passport image.

    The image is sent to the GPT-4o vision model together with the format
    instructions generated for ``FromPassport``; the model's reply is then
    parsed back into a ``FromPassport`` instance.

    :param client_data: Mapping whose ``"passport"`` entry holds the
        base64-encoded passport image.
    :return: The ``FromPassport`` instance parsed from the model's answer.
    :raises ValueError: If the ``"passport"`` entry is missing or empty, or if
        it is not valid base64.

    Errors raised by Pillow when the decoded bytes are not a readable image
    propagate unchanged.
    """
    raw_file_data = client_data.get("passport")
    if not raw_file_data:
        raise ValueError("Missing passport base64 data")

    # Decode once, strictly: the result both validates the payload and feeds
    # the image sanity check below.
    try:
        image_bytes = base64.b64decode(raw_file_data, validate=True)
    except binascii.Error as e:
        raise ValueError(f"Invalid base64 data: {e}") from e

    # Open the image to make sure the payload really is one and to advertise
    # its actual MIME type in the data URL (falls back to JPEG when Pillow
    # cannot name the format).
    with Image.open(io.BytesIO(image_bytes)) as image:
        mime_type = Image.MIME.get(image.format, "image/jpeg")

    # Pydantic parser: provides the JSON format instructions and parses the
    # model's reply.
    parser = PydanticOutputParser(pydantic_object=FromPassport)
    format_instructions = parser.get_format_instructions()

    # Vision-capable model, deterministic output.
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    # Multimodal message: textual instructions plus the image as a data URL.
    messages = [
        SystemMessage(content="Tu es un assistant qui lit les passeports."),
        HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": f"Lis ce passeport et retourne les infos suivantes au format JSON :\n{format_instructions}",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64," + raw_file_data},
                },
            ]
        ),
    ]

    # Direct LLM call (no prompt chain), followed by structured parsing.
    result = llm.invoke(messages)
    return parser.parse(result.content)
def extract_profile(client_data: dict[str, Any]) -> FromProfile: def extract_profile(client_data: dict[str, Any]) -> FromProfile:

View File

@ -1,42 +1,39 @@
import base64 import base64
import io import io
from tempfile import NamedTemporaryFile from PIL import Image
from PIL import Image, ImageEnhance from langchain_openai import ChatOpenAI
import pytesseract from langchain.output_parsers import PydanticOutputParser
from passporteye import read_mrz from langchain_core.prompts import ChatPromptTemplate
import json from validation.from_passport import FromPassport
def process_passport(passport_b64: str) -> str:
    """Extract passport data from a base64-encoded image using GPT-4o.

    Steps:
    - Decode the base64 payload and check that it is a readable image.
    - Send the image to GPT-4o with a structured-extraction prompt.
    - Parse the answer with the ``FromPassport`` Pydantic model.

    :param passport_b64: Base64 string representing the passport image.
    :return: JSON serialization of the parsed ``FromPassport`` instance.
    """
    image_bytes = base64.b64decode(passport_b64)

    # Open the image only to validate it and to learn its real MIME type
    # (falls back to JPEG when Pillow cannot name the format).
    with Image.open(io.BytesIO(image_bytes)) as image:
        mime_type = Image.MIME.get(image.format, "image/jpeg")

    # Re-encode so the data URL never carries whitespace or other characters
    # that the lenient decode above silently ignored.
    image_b64 = base64.b64encode(image_bytes).decode("ascii")

    # Structured parser based on the FromPassport model.
    parser = PydanticOutputParser(pydantic_object=FromPassport)

    # Prompt with extraction instructions; the image is attached to the human
    # message as an image_url content part so the model actually receives it.
    prompt = ChatPromptTemplate.from_messages([
        ("system", "Tu es un assistant d'extraction de données de passeport."),
        (
            "human",
            [
                {
                    "type": "text",
                    "text": "Voici l'image d'un passeport. Extrais les informations dans ce format :\n\n{format_instructions}",
                },
                {"type": "image_url", "image_url": {"url": "{image_url}"}},
            ],
        ),
    ])

    # Vision-capable LLM (GPT-4o), deterministic output.
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    # The parser is a runnable: pipe the model output straight into it.
    chain = prompt | llm | parser

    result = chain.invoke({
        "format_instructions": parser.get_format_instructions(),
        "image_url": f"data:{mime_type};base64,{image_b64}",
    })

    return result.json()