OCR

2025-04-13 10:03:32 +02:00
parent 692890e6a3
commit 38c2a1b767
1 changed file with 19 additions and 38 deletions
--- a/utils/parsers/passport_parser.py
+++ b/utils/parsers/passport_parser.py
@@ -1,42 +1,23 @@
-import base64
+import os
-import io
+from mistralai import Mistral
-from tempfile import NamedTemporaryFile
+import config
-from PIL import Image, ImageEnhance
+
-import pytesseract
+api_key = config.MISTRAL_API_KEY
-from passporteye import read_mrz
+client = Mistral(api_key=api_key)
-import json
+
 def process_passport(passport_b64: str) -> str:
-    """
+    mistral_image_url = f"data:image/jpeg;base64,{passport_b64}"
    Traite le passport :
    - Décodage de l'image en base64.
    - Application de l'OCR pour extraire le texte.
-    :param passport_b64: Chaîne base64 représentant l'image du passport.
+    ocr_response = client.ocr.process(
-    :return: Texte extrait de l'image.
+        model="mistral-ocr-latest",
-    """
+        document={
-    image_bytes = base64.b64decode(passport_b64)
+            "type": "image_url",
-    with NamedTemporaryFile(mode="wb") as tmp_img:
+            "image_url": mistral_image_url
        tmp_img.write(image_bytes)
        with open(tmp_img.name, "rb") as read_img:
            mrz_obj = read_mrz(read_img)
    image = Image.open(io.BytesIO(image_bytes))
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.0)  # 2.0 = double le contraste (1.0 = inchangé)
    tesseract_text = pytesseract.image_to_string(image, lang='eng')
    out_dict = {}
    if not mrz_obj is None:
        number_raw = str(mrz_obj.number)
        # It's not called a 'Hack'athon for nothing...
        number = number_raw.replace("B", "8")
        out_dict = {
            "country": mrz_obj.country,
            "names": mrz_obj.names,
            "number": number,
            "surname": mrz_obj.surname,
            "mrz": mrz_obj.aux["text"],
        }
-    out_dict["raw"] = tesseract_text
+    )
-    out = json.dumps(out_dict)
+
-    return out
+    # Extraire le markdown de toutes les pages
    markdown_text = "\n\n".join(page.markdown for page in ocr_response.pages)
    return markdown_text  # Tu peux aussi retourner juste le chemin si tu préfères