Passport parser in progress
This commit is contained in:
@ -1,22 +1,38 @@
|
||||
import base64
|
||||
from pdf2image import convert_from_bytes
|
||||
import pytesseract
|
||||
|
||||
import pymupdf
|
||||
|
||||
def process_account(account_b64: str) -> str:
    """Extract OCR text and form-field values from a base64-encoded PDF.

    Processing steps:
    - Decode the base64 payload into the raw PDF bytes.
    - Convert every page of the PDF to an image.
    - Run OCR on each image to extract its text.
    - Read the form fields (widgets) of every page directly from the PDF.

    :param account_b64: Base64 string representing the PDF.
    :return: One section per page, joined with newlines. Each section is a
        ``--- Page N ---`` header followed by the OCR text and, when the
        page carries form widgets, a ``Form Fields:`` listing.
    """
    # Decode the base64 payload into the PDF bytes.
    pdf_bytes = base64.b64decode(account_b64)

    # OCR pass: one text section per rendered page.
    pages_text = []
    for i, image in enumerate(convert_from_bytes(pdf_bytes)):
        text = pytesseract.image_to_string(image, lang="eng")
        pages_text.append(f"--- Page {i + 1} ---\n{text}")

    # Form-field pass: previously unreachable (the function returned right
    # after the OCR loop) and its result was joined character by character.
    # The context manager closes the document even if a page fails to parse.
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        for i, page in enumerate(pdf_document):
            form_fields_text = [
                f"Field: {field.field_name}, Value: {field.field_value}"
                for field in page.widgets()
            ]
            if not form_fields_text:
                continue

            section = "\n\nForm Fields:\n" + "\n".join(form_fields_text)
            if i < len(pages_text):
                # Attach the fields to the OCR section of the same page.
                pages_text[i] += section
            else:
                # Defensive: the rasteriser produced fewer pages than
                # PyMuPDF reports; keep the fields rather than drop them.
                pages_text.append(f"--- Page {i + 1} ---" + section)

    return "\n".join(pages_text)
|
@ -1,8 +1,10 @@
|
||||
import base64
|
||||
import io
|
||||
from tempfile import NamedTemporaryFile
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
|
||||
from passporteye import read_mrz
|
||||
import json
|
||||
|
||||
def process_passport(passport_b64: str) -> str:
    """Extract the machine-readable zone (MRZ) of a base64-encoded passport image.

    The image is decoded and handed to PassportEye's ``read_mrz``. When an
    MRZ is detected, its fields are returned as a JSON string. When none is
    detected (``read_mrz`` returns ``None``), the function falls back to
    plain OCR of the whole image so that callers always receive text.

    :param passport_b64: Base64 string representing the passport image.
    :return: JSON string of the MRZ fields, or the raw OCR text of the
        image when no MRZ could be located.
    """
    image_bytes = base64.b64decode(passport_b64)

    with NamedTemporaryFile(mode="wb") as tmp_img:
        tmp_img.write(image_bytes)
        # The write is buffered: flush before reopening the file by name,
        # otherwise the reader may see an empty or truncated image.
        tmp_img.flush()
        with open(tmp_img.name, "rb") as read_img:
            mrz = read_mrz(read_img)

    if mrz is not None:
        # The MRZ object itself is not JSON-serialisable; its dict form is.
        return json.dumps(mrz.to_dict())

    # No MRZ found: fall back to generic OCR of the image.
    image = Image.open(io.BytesIO(image_bytes))
    return pytesseract.image_to_string(image, lang='eng')
|
Reference in New Issue
Block a user