From f4fcc5a0f8cc46f85a1042314c3186391a42c3ec Mon Sep 17 00:00:00 2001 From: robinrolle Date: Sun, 13 Apr 2025 09:33:41 +0200 Subject: [PATCH] Revert "add dumb mistral exemple" This reverts commit 987a6b938fe45c97a6c7072299777cbaefce5791. --- requirements.txt | 2 -- utils/parsers/passport_parser.py | 56 ++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0bb217a..20afd0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,5 +44,3 @@ langchain-openai numpy==2.2.4 pymupdf == 1.25.5 flask-cors==5.0.1 -mistralai==1.6.0 - diff --git a/utils/parsers/passport_parser.py b/utils/parsers/passport_parser.py index b2c4b9c..372008a 100644 --- a/utils/parsers/passport_parser.py +++ b/utils/parsers/passport_parser.py @@ -1,22 +1,42 @@ -import os -from mistralai import Mistral - -api_key = "XEOc110BYE4PMj8FQBauxxGZTitRTs2w" -client = Mistral(api_key=api_key) - +import base64 +import io +from tempfile import NamedTemporaryFile +from PIL import Image, ImageEnhance +import pytesseract +from passporteye import read_mrz +import json def process_passport(passport_b64: str) -> str: - mistral_image_url = f"data:image/png;base64,{passport_b64}" + """ + Traite le passport : + - Décodage de l'image en base64. + - Application de l'OCR pour extraire le texte. - ocr_response = client.ocr.process( - model="mistral-ocr-latest", - document={ - "type": "image_url", - "image_url": mistral_image_url + :param passport_b64: Chaîne base64 représentant l'image du passport. + :return: Texte extrait de l'image. + """ + image_bytes = base64.b64decode(passport_b64) + with NamedTemporaryFile(mode="wb") as tmp_img: + tmp_img.write(image_bytes) + with open(tmp_img.name, "rb") as read_img: + mrz_obj = read_mrz(read_img) + + image = Image.open(io.BytesIO(image_bytes)) + enhancer = ImageEnhance.Contrast(image) + image = enhancer.enhance(2.0) # 2.0 = double le contraste (1.0 = inchangé) + tesseract_text = pytesseract.image_to_string(image, lang='eng') + out_dict = {} + if not mrz_obj is None: + number_raw = str(mrz_obj.number) + # It's not called a 'Hack'athon for nothing... + number = number_raw.replace("B", "8") + out_dict = { + "country": mrz_obj.country, + "names": mrz_obj.names, + "number": number, + "surname": mrz_obj.surname, + "mrz": mrz_obj.aux["text"], } - ) - - # Extraire le markdown de toutes les pages - markdown_text = "\n\n".join(page.markdown for page in ocr_response.pages) - - return markdown_text # Tu peux aussi retourner juste le chemin si tu préfères \ No newline at end of file + out_dict["raw"] = tesseract_text + out = json.dumps(out_dict) + return out \ No newline at end of file