From d64bf40d46159e1122fcd76a1ece3d1f42d03980 Mon Sep 17 00:00:00 2001
From: robinrolle <robinrolle1748@gmail.com>
Date: Fri, 11 Apr 2025 23:42:48 +0200
Subject: [PATCH] Add OCR profile extraction preserving Word semantics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 poc_ocr.py | 56 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 11 deletions(-)

diff --git a/poc_ocr.py b/poc_ocr.py
index c9471f3..7ef4cf4 100644
--- a/poc_ocr.py
+++ b/poc_ocr.py
@@ -4,7 +4,7 @@ import base64
 from PIL import Image
 import pytesseract
 import io
-import re
+import xml.etree.ElementTree as ET
 from pathlib import Path
 
 # === Configuration API ===
@@ -12,7 +12,6 @@ api_url = "https://hackathon-api.mlo.sehlat.io/game/start"
 api_key = "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw"
 team_name = "Welch"
 
-# === Requête API ===
 headers = {
     "x-api-key": api_key,
     "Content-Type": "application/json"
@@ -33,21 +32,56 @@ extracted_text = pytesseract.image_to_string(image, lang='eng')
 print("=== Texte extrait depuis le passeport ===\n")
 print(extracted_text)
 
-# === Traitement du profil ===
+# === Traitement du profil DOCX ===
 profile_b64 = response_data["client_data"]["profile"]
 profile_bytes = base64.b64decode(profile_b64)
 zip_file = zipfile.ZipFile(io.BytesIO(profile_bytes))
 
-# Vérifie la présence du fichier principal du Word
 document_path = "word/document.xml"
+styles_path = "word/styles.xml"
+
+# Build a styleId -> style name map from word/styles.xml so headings can be recognised later
+style_map = {}
+if styles_path in zip_file.namelist():
+    styles_xml = zip_file.read(styles_path)
+    styles_tree = ET.fromstring(styles_xml)
+
+    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
+    for style in styles_tree.findall(".//w:style", ns):
+        style_id = style.attrib.get(f"{{{ns['w']}}}styleId")  # None when the attribute is absent
+        name_elem = style.find("w:name", ns)
+        if name_elem is not None:
+            style_name = name_elem.attrib.get(f"{{{ns['w']}}}val", "")  # "" when w:val is absent
+            style_map[style_id] = style_name
+
+# Parse the main document part
 if document_path in zip_file.namelist():
-    with zip_file.open(document_path) as doc_file:
-        xml_content = doc_file.read().decode("utf-8", errors="ignore")
+    document_xml = zip_file.read(document_path)
+    tree = ET.fromstring(document_xml)
 
-    # Extraction naïve du texte (on peut améliorer avec XML parser si besoin)
-    text_only = re.sub(r"<[^>]+>", "", xml_content)
+    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
+    print("\n=== Texte structuré extrait depuis le profil ===\n")
 
-    print("\n=== Texte extrait depuis le profil (document Word) ===\n")
-    print(text_only.strip())
+    for para in tree.findall(".//w:p", ns):
+        texts = [node.text for node in para.findall(".//w:t", ns) if node.text]
+        if not texts:
+            continue  # paragraph has no non-empty text runs
+        full_text = " ".join(texts).strip()  # NOTE(review): a word split across runs gets a space
+
+        # Style names contain a space ("heading 1"); compare a lowercase, space-free form instead.
+        p_style = para.find(".//w:pStyle", ns)
+        if p_style is not None:
+            style_id = p_style.attrib.get(f"{{{ns['w']}}}val")
+            style_name = (style_map.get(style_id) or style_id or "").replace(" ", "").lower()
+            if "heading1" in style_name:
+                print(f"\n# {full_text}\n")
+            elif "heading2" in style_name:
+                print(f"\n## {full_text}\n")
+            elif "heading3" in style_name:
+                print(f"\n### {full_text}\n")
+            else:
+                print(full_text)
+        else:
+            print(full_text)
 else:
-    print("Le fichier 'word/document.xml' est introuvable dans l'archive du profil.")
\ No newline at end of file
+    print("Le fichier 'word/document.xml' est introuvable dans le profil.")
\ No newline at end of file