Files
julius_baer_onboarding/poc_ocr.py

113 lines
3.5 KiB
Python
Raw Normal View History

2025-04-11 23:39:21 +02:00
import zipfile
import requests
import base64
from PIL import Image
import pytesseract
import io
import xml.etree.ElementTree as ET
2025-04-12 09:11:25 +02:00
import base64
import io
import pymupdf
from pathlib import Path
2025-04-11 23:39:21 +02:00
# === API configuration ===
api_url = "https://hackathon-api.mlo.sehlat.io/game/start"
# SECURITY NOTE(review): this key is stored in plain text in the source file.
# Consider loading it from the environment or a secrets store instead, and
# rotating the value that has been exposed here.
api_key = "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw"
team_name = "Welch"
headers = {
    "x-api-key": api_key,
    "Content-Type": "application/json",
}
payload = {
    "player_name": team_name,
}

# POST the player name to the game/start endpoint. requests applies no timeout
# unless one is given, so set one explicitly to avoid blocking forever on an
# unresponsive server.
response = requests.post(api_url, headers=headers, json=payload, timeout=30)
# Fail fast on a 4xx/5xx answer instead of hitting a confusing KeyError on
# "client_data" further down the script.
response.raise_for_status()
response_data = response.json()

print(response_data)
2025-04-11 23:39:21 +02:00
# === Passport processing ===
# The "passport" field is base64 text; once decoded it is opened as an image
# and run through Tesseract OCR with the English language model.
passport_b64 = response_data["client_data"]["passport"]
image_bytes = base64.b64decode(passport_b64)
image = Image.open(io.BytesIO(image_bytes))
extracted_text = pytesseract.image_to_string(image, lang="eng")

# Header and OCR result, separated by a newline exactly as two prints would be.
print("=== Texte extrait depuis le passeport ===\n", extracted_text, sep="\n")
# === DOCX profile processing ===
# The "profile" field is base64 text that decodes to a ZIP archive (a .docx).
# Paragraph text is read from word/document.xml and style definitions from
# word/styles.xml; paragraphs using a Heading 1-3 style are printed as
# Markdown-style headings, everything else as plain lines.
profile_b64 = response_data["client_data"]["profile"]
profile_bytes = base64.b64decode(profile_b64)

document_path = "word/document.xml"
styles_path = "word/styles.xml"
# Defined up front so it is available even when styles.xml is absent.
ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}


def _w(attr):
    """Return *attr* qualified with the WordprocessingML namespace."""
    return f"{{{ns['w']}}}{attr}"


def _heading_level(style_id, style_name):
    """Return 1, 2 or 3 when the style denotes a Heading 1-3, else None.

    The human-readable style name is commonly written "heading 1" or
    "Heading 1", while the style id is typically "Heading1". Both are
    compared case-insensitively with spaces removed, so either spelling is
    recognised.
    """
    for candidate in (style_name, style_id):
        normalized = (candidate or "").replace(" ", "").lower()
        for level in (1, 2, 3):
            if f"heading{level}" in normalized:
                return level
    return None


# Read both parts up front inside a context manager so the archive is always
# closed, even if XML parsing fails afterwards.
with zipfile.ZipFile(io.BytesIO(profile_bytes)) as zip_file:
    members = set(zip_file.namelist())
    styles_xml = zip_file.read(styles_path) if styles_path in members else None
    document_xml = (
        zip_file.read(document_path) if document_path in members else None
    )

# Build the style id -> style name map used to recognise headings.
style_map = {}
if styles_xml is not None:
    styles_tree = ET.fromstring(styles_xml)
    for style in styles_tree.findall(".//w:style", ns):
        name_elem = style.find("w:name", ns)
        if name_elem is not None:
            style_id = style.attrib.get(_w("styleId"))
            style_map[style_id] = name_elem.attrib.get(_w("val"), "")

# Parse the main document part.
if document_xml is not None:
    tree = ET.fromstring(document_xml)
    print("\n=== Texte structuré extrait depuis le profil ===\n")

    for para in tree.findall(".//w:p", ns):
        texts = [node.text for node in para.findall(".//w:t", ns) if node.text]
        if not texts:
            continue
        # NOTE(review): text runs are joined with a space; if a word is split
        # across runs this inserts a space inside it - confirm this is wanted.
        full_text = " ".join(texts).strip()

        # Detect the paragraph style (Heading 1, Heading 2, ...).
        p_style = para.find(".//w:pStyle", ns)
        style_id = p_style.attrib.get(_w("val")) if p_style is not None else None
        level = _heading_level(style_id, style_map.get(style_id, ""))
        if level is None:
            print(full_text)
        else:
            print(f"\n{'#' * level} {full_text}\n")
else:
    print("Le fichier 'word/document.xml' est introuvable dans le profil.")
# === Account file (PDF) processing ===
# NOTE: the header printed below mentions PyMuPDF, but the extraction in this
# section rasterizes the PDF with pdf2image and runs pytesseract OCR on each
# page image.
print("\n=== Texte extrait depuis le fichier ACCOUNT (via PyMuPDF) ===\n")
import base64
from pdf2image import convert_from_bytes
import pytesseract

# The "account" entry is read with .get(), so it may be missing; guard against
# that instead of letting base64.b64decode(None) raise a TypeError.
account_b64 = response_data["client_data"].get("account")
if account_b64:
    # Decode the base64 PDF.
    pdf_bytes = base64.b64decode(account_b64)
    # Convert each page to an image.
    images = convert_from_bytes(pdf_bytes)
    # Run OCR on every page. Dedicated loop names keep the module-level
    # passport `image` from being overwritten.
    for page_number, page_image in enumerate(images, start=1):
        text = pytesseract.image_to_string(page_image, lang="eng")
        print(f"--- Page {page_number} ---")
        print(text)
else:
    print("Le fichier 'account' est introuvable dans la réponse.")