Files
julius_baer_onboarding/poc_ocr.py
2025-04-12 09:11:25 +02:00

113 lines
3.5 KiB
Python

import zipfile
import requests
import base64
from PIL import Image
import pytesseract
import io
import xml.etree.ElementTree as ET
import base64
import io
import pymupdf
from pathlib import Path
# === API configuration ===
# Local import, in keeping with this script's existing mid-file imports.
import os

api_url = "https://hackathon-api.mlo.sehlat.io/game/start"
# NOTE(security): a credential is committed in source. The literal is kept
# only as a backward-compatible fallback; set HACKATHON_API_KEY to override
# it, and consider rotating the key and removing the literal altogether.
api_key = os.environ.get(
    "HACKATHON_API_KEY",
    "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw",
)
team_name = "Welch"
headers = {
    "x-api-key": api_key,
    "Content-Type": "application/json",
}
payload = {
    "player_name": team_name,
}

# POST to the /game/start endpoint. Without an explicit timeout, requests
# waits indefinitely on an unresponsive server.
response = requests.post(api_url, headers=headers, json=payload, timeout=30)
# Fail here with a clear HTTPError on 4xx/5xx instead of later on a
# confusing JSON decode error or KeyError.
response.raise_for_status()
response_data = response.json()
print(response_data)
# === Passport processing ===
# The passport is delivered as a base64 string under client_data; decode it
# and hand the raw bytes to Pillow through an in-memory stream.
passport_b64 = response_data["client_data"]["passport"]
image_bytes = base64.b64decode(passport_b64)
passport_stream = io.BytesIO(image_bytes)
image = Image.open(passport_stream)

# Run Tesseract OCR on the decoded image with the "eng" language setting.
extracted_text = pytesseract.image_to_string(image, lang="eng")

# Banner and OCR output in one call; sep="\n" yields the same bytes as two
# consecutive print() calls.
print("=== Texte extrait depuis le passeport ===\n", extracted_text, sep="\n")
# === DOCX profile processing ===
# The profile is a base64-encoded .docx, which is opened here as a ZIP
# archive containing XML parts.
profile_b64 = response_data["client_data"]["profile"]
profile_bytes = base64.b64decode(profile_b64)
document_path = "word/document.xml"
styles_path = "word/styles.xml"
ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
# Clark-notation prefix used to read w:-namespaced attributes.
w_attr = f"{{{ns['w']}}}"


def _heading_marker(style_id, style_name):
    """Return the Markdown prefix for a heading style, or "" if not a heading.

    Both the style display name and the style id are inspected after
    removing spaces and lowercasing, so "heading 1", "Heading 1" and
    "Heading1" are all recognized. The display name is tried first; the id
    is a fallback for when styles.xml is missing or has no matching entry.
    Only heading levels 1 to 3 are mapped, as in the original output format.
    """
    for candidate in (style_name, style_id):
        normalized = (candidate or "").replace(" ", "").lower()
        for level in (1, 2, 3):
            if f"heading{level}" in normalized:
                return "#" * level
    return ""


# Read both XML parts up front so the archive is closed deterministically.
styles_xml = None
document_xml = None
with zipfile.ZipFile(io.BytesIO(profile_bytes)) as zip_file:
    member_names = set(zip_file.namelist())
    if styles_path in member_names:
        styles_xml = zip_file.read(styles_path)
    if document_path in member_names:
        document_xml = zip_file.read(document_path)

# NOTE(review): the XML comes from a remote service; xml.etree does not
# protect against maliciously constructed documents. Confirm the source is
# trusted.

# Build the styleId -> style display name map used to recognize headings.
style_map = {}
if styles_xml is not None:
    styles_tree = ET.fromstring(styles_xml)
    for style in styles_tree.findall(".//w:style", ns):
        style_id = style.attrib.get(f"{w_attr}styleId")
        name_elem = style.find("w:name", ns)
        if name_elem is not None:
            style_map[style_id] = name_elem.attrib.get(f"{w_attr}val", "")

# Parse the main document part and print it paragraph by paragraph.
if document_xml is not None:
    tree = ET.fromstring(document_xml)
    print("\n=== Texte structuré extrait depuis le profil ===\n")
    for para in tree.findall(".//w:p", ns):
        texts = [node.text for node in para.findall(".//w:t", ns) if node.text]
        if not texts:
            continue  # paragraph without any text content
        # NOTE(review): runs are joined with a space, which may insert
        # spaces inside words when Word splits a word across runs. Kept
        # as-is to preserve the existing output; confirm whether "" is
        # preferable.
        full_text = " ".join(texts).strip()

        # Resolve the paragraph style (if any) to a Markdown heading prefix.
        marker = ""
        p_style = para.find(".//w:pStyle", ns)
        if p_style is not None:
            style_id = p_style.attrib.get(f"{w_attr}val")
            marker = _heading_marker(style_id, style_map.get(style_id, ""))

        if marker:
            print(f"\n{marker} {full_text}\n")
        else:
            print(full_text)
else:
    print("Le fichier 'word/document.xml' est introuvable dans le profil.")
# === Account file (PDF) processing ===
# NOTE(review): the banner below mentions PyMuPDF, but this section actually
# rasterizes the PDF pages with pdf2image and runs Tesseract OCR on them.
# The banner string is left unchanged so the script's output stays the same.
print("\n=== Texte extrait depuis le fichier ACCOUNT (via PyMuPDF) ===\n")
import base64
from pdf2image import convert_from_bytes
import pytesseract

# .get() yields None when the key is absent; guard it so the script reports
# the missing file instead of failing inside base64.b64decode with a
# TypeError.
account_b64 = response_data["client_data"].get("account")
if account_b64:
    # Decode the base64 PDF payload.
    pdf_bytes = base64.b64decode(account_b64)
    # Convert each page to an image.
    images = convert_from_bytes(pdf_bytes)
    # OCR each page. A dedicated loop variable avoids rebinding the
    # module-level `image` that holds the passport picture.
    for page_number, page_image in enumerate(images, start=1):
        text = pytesseract.image_to_string(page_image, lang="eng")
        print(f"--- Page {page_number} ---")
        print(text)
else:
    print("Le fichier 'account' est introuvable dans les données du client.")