From d64bf40d46159e1122fcd76a1ece3d1f42d03980 Mon Sep 17 00:00:00 2001 From: robinrolle Date: Fri, 11 Apr 2025 23:42:48 +0200 Subject: [PATCH] =?UTF-8?q?Add=20OCR=20profile=20conservant=20s=C3=A9manti?= =?UTF-8?q?c=20word.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- poc_ocr.py | 56 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/poc_ocr.py b/poc_ocr.py index c9471f3..7ef4cf4 100644 --- a/poc_ocr.py +++ b/poc_ocr.py @@ -4,7 +4,7 @@ import base64 from PIL import Image import pytesseract import io -import re +import xml.etree.ElementTree as ET from pathlib import Path # === Configuration API === @@ -12,7 +12,6 @@ api_url = "https://hackathon-api.mlo.sehlat.io/game/start" api_key = "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw" team_name = "Welch" -# === Requête API === headers = { "x-api-key": api_key, "Content-Type": "application/json" @@ -33,21 +32,56 @@ extracted_text = pytesseract.image_to_string(image, lang='eng') print("=== Texte extrait depuis le passeport ===\n") print(extracted_text) -# === Traitement du profil === +# === Traitement du profil DOCX === profile_b64 = response_data["client_data"]["profile"] profile_bytes = base64.b64decode(profile_b64) zip_file = zipfile.ZipFile(io.BytesIO(profile_bytes)) -# Vérifie la présence du fichier principal du Word document_path = "word/document.xml" +styles_path = "word/styles.xml" + +# Charger la carte des styles pour retrouver les titres +style_map = {} +if styles_path in zip_file.namelist(): + styles_xml = zip_file.read(styles_path) + styles_tree = ET.fromstring(styles_xml) + + ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} + for style in styles_tree.findall(".//w:style", ns): + style_id = style.attrib.get(f"{{{ns['w']}}}styleId") + name_elem = style.find("w:name", ns) + if name_elem is not None: + style_name = name_elem.attrib.get(f"{{{ns['w']}}}val", "") + style_map[style_id] = style_name + +# Parse du document principal if document_path in zip_file.namelist(): - with zip_file.open(document_path) as doc_file: - xml_content = doc_file.read().decode("utf-8", errors="ignore") + document_xml = zip_file.read(document_path) + tree = ET.fromstring(document_xml) - # Extraction naïve du texte (on peut améliorer avec XML parser si besoin) - text_only = re.sub(r"<[^>]+>", "", xml_content) + ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} + print("\n=== Texte structuré extrait depuis le profil ===\n") - print("\n=== Texte extrait depuis le profil (document Word) ===\n") - print(text_only.strip()) + for para in tree.findall(".//w:p", ns): + texts = [node.text for node in para.findall(".//w:t", ns) if node.text] + if not texts: + continue + full_text = " ".join(texts).strip() + + # Détecter le style (Heading1, Heading2, etc.) + p_style = para.find(".//w:pStyle", ns) + if p_style is not None: + style_id = p_style.attrib.get(f"{{{ns['w']}}}val") + style_name = style_map.get(style_id, "") + if "Heading1" in style_name: + print(f"\n# {full_text}\n") + elif "Heading2" in style_name: + print(f"\n## {full_text}\n") + elif "Heading3" in style_name: + print(f"\n### {full_text}\n") + else: + print(full_text) + else: + print(full_text) else: - print("Le fichier 'word/document.xml' est introuvable dans l'archive du profil.") \ No newline at end of file + print("Le fichier 'word/document.xml' est introuvable dans le profil.") \ No newline at end of file