julius_baer_onboarding/poc_ocr.py

import zipfile
import requests
import base64
from PIL import Image
import pytesseract
import io
import xml.etree.ElementTree as ET
import base64
import io
import pymupdf
from pathlib import Path

# === API configuration ===
import os  # local import: only this section needs it

api_url = "https://hackathon-api.mlo.sehlat.io/game/start"
# SECURITY NOTE(review): this key is committed in plain text. The literal is
# kept as a fallback so the script keeps working unchanged, but it should be
# rotated and supplied through the HACKATHON_API_KEY environment variable.
api_key = os.environ.get(
    "HACKATHON_API_KEY", "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw"
)
team_name = "Welch"

headers = {
    "x-api-key": api_key,
    "Content-Type": "application/json",
}
payload = {
    "player_name": team_name,
}

# POST to the game/start endpoint. Without an explicit timeout, requests
# waits indefinitely on an unresponsive server, so bound the wait.
response = requests.post(api_url, headers=headers, json=payload, timeout=30)
# Fail here with a clear HTTPError on 4xx/5xx instead of letting an error
# payload trigger a KeyError further down when "client_data" is read.
response.raise_for_status()
response_data = response.json()

print(response_data)

# === Passport processing ===
# The passport is delivered as a base64 string: decode it, wrap the bytes in
# an in-memory buffer so Pillow can open it, then run Tesseract (English
# language data) over the resulting image.
passport_b64 = response_data["client_data"]["passport"]
image = Image.open(io.BytesIO(base64.b64decode(passport_b64)))
extracted_text = pytesseract.image_to_string(image, lang="eng")

# One print call with a newline separator emits exactly the same output as
# printing the banner and the OCR text one after the other.
print("=== Texte extrait depuis le passeport ===\n", extracted_text, sep="\n")

# === DOCX profile processing ===
# The profile is a base64-encoded .docx, i.e. a ZIP archive of XML parts:
# paragraph text is read from word/document.xml and the style definitions
# from word/styles.xml.
profile_b64 = response_data["client_data"]["profile"]
profile_bytes = base64.b64decode(profile_b64)

document_path = "word/document.xml"
styles_path = "word/styles.xml"

# WordprocessingML namespace, plus the fully-qualified names ElementTree
# needs for attribute lookups and direct tag comparisons.
ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
_W = f"{{{ns['w']}}}"
_W_VAL = _W + "val"
_W_STYLE_ID = _W + "styleId"
_W_TYPE = _W + "type"
_W_R = _W + "r"
_W_T = _W + "t"
_W_TAB = _W + "tab"
_W_BR = _W + "br"
_W_CR = _W + "cr"


def _heading_prefix(style_id, style_name):
    """Return the Markdown prefix for a heading style, or None.

    Word usually stores the display name as "heading 1" (lower case, with a
    space) while the style id is "Heading1". Both are therefore compared
    case-insensitively with spaces removed; a literal "Heading1" substring
    test against the display name never matches such documents.
    """
    for candidate in (style_name, style_id):
        key = (candidate or "").replace(" ", "").lower()
        for level in (1, 2, 3):
            if f"heading{level}" in key:
                return "#" * level
    return None


def _paragraph_text(para):
    """Return the stripped text of a ``w:p`` element, or None if it has none.

    Run contents are concatenated without a separator because Word may split
    a single word across several runs; joining with spaces would break it.
    In-run tabs and line breaks are mapped to whitespace so adjacent fields
    do not fuse together.
    """
    pieces = []
    has_text = False
    for run in para.iter(_W_R):
        # Only direct children of a run are inspected, so the tab-stop
        # definitions under w:pPr/w:tabs (also named w:tab) are not counted.
        for child in run:
            if child.tag == _W_T:
                if child.text:
                    pieces.append(child.text)
                    has_text = True
            elif child.tag == _W_TAB:
                pieces.append("\t")
            elif child.tag == _W_CR:
                pieces.append("\n")
            elif child.tag == _W_BR:
                # Page and column breaks are layout-only; keep line breaks.
                if child.get(_W_TYPE, "textWrapping") == "textWrapping":
                    pieces.append("\n")
    if not has_text:
        return None
    return "".join(pieces).strip()


# Read both XML parts inside a context manager so the archive is closed.
style_map = {}
document_xml = None
with zipfile.ZipFile(io.BytesIO(profile_bytes)) as zip_file:
    member_names = set(zip_file.namelist())

    # Build the styleId -> display-name map used to recognise headings.
    if styles_path in member_names:
        styles_tree = ET.fromstring(zip_file.read(styles_path))
        for style in styles_tree.findall(".//w:style", ns):
            name_elem = style.find("w:name", ns)
            if name_elem is not None:
                style_map[style.get(_W_STYLE_ID)] = name_elem.get(_W_VAL, "")

    if document_path in member_names:
        document_xml = zip_file.read(document_path)

# Parse the main document and print it with Markdown-style headings.
if document_xml is not None:
    tree = ET.fromstring(document_xml)
    print("\n=== Texte structuré extrait depuis le profil ===\n")

    for para in tree.findall(".//w:p", ns):
        full_text = _paragraph_text(para)
        if full_text is None:
            continue

        # Resolve the paragraph style (if any) to a heading level.
        prefix = None
        p_style = para.find(".//w:pStyle", ns)
        if p_style is not None:
            style_id = p_style.get(_W_VAL)
            prefix = _heading_prefix(style_id, style_map.get(style_id, ""))

        if prefix:
            print(f"\n{prefix} {full_text}\n")
        else:
            print(full_text)
else:
    print("Le fichier 'word/document.xml' est introuvable dans le profil.")


# === Account file (PDF) processing ===
# NOTE(review): the banner below says "via PyMuPDF", but the pages are
# rasterised with pdf2image and read with Tesseract OCR. The text is left
# unchanged in case something matches on it; consider correcting it.
print("\n=== Texte extrait depuis le fichier ACCOUNT (via PyMuPDF) ===\n")
# base64 and pytesseract are already imported at the top of the file; only
# pdf2image needs to be brought in here.
from pdf2image import convert_from_bytes

# "account" is fetched with .get(), so it may be missing; passing None to
# b64decode would raise a TypeError, so report the absence instead.
account_b64 = response_data["client_data"].get("account")
if account_b64:
    # Decode the base64 PDF and render each page to an image.
    pdf_bytes = base64.b64decode(account_b64)
    images = convert_from_bytes(pdf_bytes)

    # OCR each page. The loop variable is deliberately not called `image`
    # so it does not overwrite the passport image bound earlier in the file.
    for page_number, page_image in enumerate(images, start=1):
        text = pytesseract.image_to_string(page_image, lang="eng")
        print(f"--- Page {page_number} ---")
        print(text)
else:
    print("Aucun fichier 'account' dans la réponse.")
add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00			`import zipfile`
petit poc requête + OCR du passport. Img base64 -> Text 2025-04-11 23:22:48 +02:00			`import requests`
			`import base64`
			`from PIL import Image`
			`import pytesseract`
			`import io`
Add OCR profile conservant sémantic word. 2025-04-11 23:42:48 +02:00			`import xml.etree.ElementTree as ET`
Add Account OCR 2025-04-12 09:11:25 +02:00			`import base64`
			`import io`
			`import pymupdf`
petit poc requête + OCR du passport. Img base64 -> Text 2025-04-11 23:22:48 +02:00			`from pathlib import Path`

add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00			`# === Configuration API ===`
petit poc requête + OCR du passport. Img base64 -> Text 2025-04-11 23:22:48 +02:00			`api_url = "https://hackathon-api.mlo.sehlat.io/game/start"`
			`api_key = "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw"`
			`team_name = "Welch"`

			`headers = {`
			`"x-api-key": api_key,`
			`"Content-Type": "application/json"`
			`}`
			`payload = {`
			`"player_name": team_name`
			`}`

			`response = requests.post(api_url, headers=headers, json=payload)`
			`response_data = response.json()`

Add Account OCR 2025-04-12 09:11:25 +02:00			`print(response_data)`

add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00			`# === Traitement du passeport ===`
petit poc requête + OCR du passport. Img base64 -> Text 2025-04-11 23:22:48 +02:00			`passport_b64 = response_data["client_data"]["passport"]`
			`image_bytes = base64.b64decode(passport_b64)`
			`image = Image.open(io.BytesIO(image_bytes))`
			`extracted_text = pytesseract.image_to_string(image, lang='eng')`

add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00			`print("=== Texte extrait depuis le passeport ===\n")`
			`print(extracted_text)`

Add OCR profile conservant sémantic word. 2025-04-11 23:42:48 +02:00			`# === Traitement du profil DOCX ===`
add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00			`profile_b64 = response_data["client_data"]["profile"]`
			`profile_bytes = base64.b64decode(profile_b64)`
			`zip_file = zipfile.ZipFile(io.BytesIO(profile_bytes))`

			`document_path = "word/document.xml"`
Add OCR profile conservant sémantic word. 2025-04-11 23:42:48 +02:00			`styles_path = "word/styles.xml"`

			`# Charger la carte des styles pour retrouver les titres`
			`style_map = {}`
			`if styles_path in zip_file.namelist():`
			`styles_xml = zip_file.read(styles_path)`
			`styles_tree = ET.fromstring(styles_xml)`

			`ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}`
			`for style in styles_tree.findall(".//w:style", ns):`
			`style_id = style.attrib.get(f"{{{ns['w']}}}styleId")`
			`name_elem = style.find("w:name", ns)`
			`if name_elem is not None:`
			`style_name = name_elem.attrib.get(f"{{{ns['w']}}}val", "")`
			`style_map[style_id] = style_name`

			`# Parse du document principal`
add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00			`if document_path in zip_file.namelist():`
Add OCR profile conservant sémantic word. 2025-04-11 23:42:48 +02:00			`document_xml = zip_file.read(document_path)`
			`tree = ET.fromstring(document_xml)`

			`ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}`
			`print("\n=== Texte structuré extrait depuis le profil ===\n")`
add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00
Add OCR profile conservant sémantic word. 2025-04-11 23:42:48 +02:00			`for para in tree.findall(".//w:p", ns):`
			`texts = [node.text for node in para.findall(".//w:t", ns) if node.text]`
			`if not texts:`
			`continue`
			`full_text = " ".join(texts).strip()`
add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00
Add OCR profile conservant sémantic word. 2025-04-11 23:42:48 +02:00			`# Détecter le style (Heading1, Heading2, etc.)`
			`p_style = para.find(".//w:pStyle", ns)`
			`if p_style is not None:`
			`style_id = p_style.attrib.get(f"{{{ns['w']}}}val")`
			`style_name = style_map.get(style_id, "")`
			`if "Heading1" in style_name:`
			`print(f"\n# {full_text}\n")`
			`elif "Heading2" in style_name:`
			`print(f"\n## {full_text}\n")`
			`elif "Heading3" in style_name:`
			`print(f"\n### {full_text}\n")`
			`else:`
			`print(full_text)`
			`else:`
			`print(full_text)`
add OCR profile. Zip -> Word -> txt 2025-04-11 23:39:21 +02:00			`else:`
Add Account OCR 2025-04-12 09:11:25 +02:00			`print("Le fichier 'word/document.xml' est introuvable dans le profil.")`


			`# === Traitement du fichier account (PDF) avec PyMuPDF ===`
			`print("\n=== Texte extrait depuis le fichier ACCOUNT (via PyMuPDF) ===\n")`
			`import base64`
			`from pdf2image import convert_from_bytes`
			`import pytesseract`

			`# Décode ton PDF base64`
			`account_b64 = response_data["client_data"].get("account")`
			`pdf_bytes = base64.b64decode(account_b64)`

			`# Convertit chaque page en image`
			`images = convert_from_bytes(pdf_bytes)`

			`# Applique OCR avec pytesseract`
			`for i, image in enumerate(images):`
			`text = pytesseract.image_to_string(image, lang="eng")`
			`print(f"--- Page {i + 1} ---")`
			`print(text)`