Add profile extraction that preserves the semantic structure (headings) of the Word document.

This commit is contained in:
robinrolle
2025-04-11 23:42:48 +02:00
parent dd4f8b2425
commit d64bf40d46

View File

@@ -4,7 +4,7 @@ import base64
from PIL import Image from PIL import Image
import pytesseract import pytesseract
import io import io
import re import xml.etree.ElementTree as ET
from pathlib import Path from pathlib import Path
# === Configuration API === # === Configuration API ===
@@ -12,7 +12,6 @@ api_url = "https://hackathon-api.mlo.sehlat.io/game/start"
api_key = "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw" api_key = "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw"
team_name = "Welch" team_name = "Welch"
# === Requête API ===
headers = { headers = {
"x-api-key": api_key, "x-api-key": api_key,
"Content-Type": "application/json" "Content-Type": "application/json"
@@ -33,21 +32,56 @@ extracted_text = pytesseract.image_to_string(image, lang='eng')
print("=== Texte extrait depuis le passeport ===\n") print("=== Texte extrait depuis le passeport ===\n")
print(extracted_text) print(extracted_text)
# === Traitement du profil === # === Traitement du profil DOCX ===
profile_b64 = response_data["client_data"]["profile"] profile_b64 = response_data["client_data"]["profile"]
profile_bytes = base64.b64decode(profile_b64) profile_bytes = base64.b64decode(profile_b64)
zip_file = zipfile.ZipFile(io.BytesIO(profile_bytes)) zip_file = zipfile.ZipFile(io.BytesIO(profile_bytes))
# Vérifie la présence du fichier principal du Word
document_path = "word/document.xml" document_path = "word/document.xml"
styles_path = "word/styles.xml"
# Charger la carte des styles pour retrouver les titres
style_map = {}
if styles_path in zip_file.namelist():
styles_xml = zip_file.read(styles_path)
styles_tree = ET.fromstring(styles_xml)
ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
for style in styles_tree.findall(".//w:style", ns):
style_id = style.attrib.get(f"{{{ns['w']}}}styleId")
name_elem = style.find("w:name", ns)
if name_elem is not None:
style_name = name_elem.attrib.get(f"{{{ns['w']}}}val", "")
style_map[style_id] = style_name
# Parse du document principal
if document_path in zip_file.namelist(): if document_path in zip_file.namelist():
with zip_file.open(document_path) as doc_file: document_xml = zip_file.read(document_path)
xml_content = doc_file.read().decode("utf-8", errors="ignore") tree = ET.fromstring(document_xml)
# Extraction naïve du texte (on peut améliorer avec XML parser si besoin) ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
text_only = re.sub(r"<[^>]+>", "", xml_content) print("\n=== Texte structuré extrait depuis le profil ===\n")
print("\n=== Texte extrait depuis le profil (document Word) ===\n") for para in tree.findall(".//w:p", ns):
print(text_only.strip()) texts = [node.text for node in para.findall(".//w:t", ns) if node.text]
if not texts:
continue
full_text = " ".join(texts).strip()
# Détecter le style (Heading1, Heading2, etc.)
p_style = para.find(".//w:pStyle", ns)
if p_style is not None:
style_id = p_style.attrib.get(f"{{{ns['w']}}}val")
style_name = style_map.get(style_id, "")
if "Heading1" in style_name:
print(f"\n# {full_text}\n")
elif "Heading2" in style_name:
print(f"\n## {full_text}\n")
elif "Heading3" in style_name:
print(f"\n### {full_text}\n")
else:
print(full_text)
else:
print(full_text)
else: else:
print("Le fichier 'word/document.xml' est introuvable dans l'archive du profil.") print("Le fichier 'word/document.xml' est introuvable dans le profil.")