Files
julius_baer_onboarding/utils/parsers/passport_parser.py

40 lines
1.2 KiB
Python
Raw Normal View History

2025-04-12 10:51:07 +02:00
import base64
import io
2025-04-12 20:12:14 +02:00
from tempfile import NamedTemporaryFile
2025-04-12 10:51:07 +02:00
from PIL import Image
import pytesseract
2025-04-12 20:12:14 +02:00
from passporteye import read_mrz
import json
2025-04-12 10:51:07 +02:00
def process_passport(passport_b64: str) -> str:
    """
    Process a passport image and return the extracted data as a JSON string.

    Steps:
    - Decode the base64-encoded image.
    - Pass the image to ``passporteye.read_mrz`` to extract the MRZ fields.
    - Run Tesseract OCR (English) over the whole image for the raw text.

    :param passport_b64: Base64 string representing the passport image.
    :return: JSON-encoded object. It always contains ``"raw"`` (the Tesseract
        text). When ``read_mrz`` returned a result it also contains
        ``"country"``, ``"names"``, ``"number"``, ``"surname"`` and ``"mrz"``.
    :raises binascii.Error: If ``passport_b64`` is incorrectly padded.
    """
    image_bytes = base64.b64decode(passport_b64)

    with NamedTemporaryFile(mode="wb") as tmp_img:
        tmp_img.write(image_bytes)
        # The write above is buffered. Flush it to disk before reopening the
        # file by name, otherwise the second handle may see an empty or
        # truncated image and MRZ detection silently fails.
        tmp_img.flush()
        # NOTE(review): reopening a NamedTemporaryFile by name while it is
        # still open works on Unix but not on Windows.
        with open(tmp_img.name, "rb") as read_img:
            mrz_obj = read_mrz(read_img)

    # Independent full-page OCR pass, done straight from memory.
    with Image.open(io.BytesIO(image_bytes)) as image:
        tesseract_text = pytesseract.image_to_string(image, lang="eng")

    out_dict = {}
    if mrz_obj is not None:
        number_raw = str(mrz_obj.number)
        # HACK: OCR tends to misread the digit 8 as the letter B, so every
        # "B" is mapped back to "8". This is lossy for document numbers that
        # legitimately contain the letter B; kept because callers rely on it.
        number = number_raw.replace("B", "8")
        out_dict = {
            "country": mrz_obj.country,
            "names": mrz_obj.names,
            "number": number,
            "surname": mrz_obj.surname,
            "mrz": mrz_obj.aux["text"],
        }
    out_dict["raw"] = tesseract_text

    return json.dumps(out_dict)