Add OCR profile extraction that preserves the Word document's semantic structure.
This commit is contained in:
56
poc_ocr.py
56
poc_ocr.py
@@ -4,7 +4,7 @@ import base64
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
import pytesseract
|
import pytesseract
|
||||||
import io
|
import io
|
||||||
import re
|
import xml.etree.ElementTree as ET
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
# === Configuration API ===
|
# === Configuration API ===
|
||||||
@@ -12,7 +12,6 @@ api_url = "https://hackathon-api.mlo.sehlat.io/game/start"
|
|||||||
api_key = "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw"
|
api_key = "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw"
|
||||||
team_name = "Welch"
|
team_name = "Welch"
|
||||||
|
|
||||||
# === Requête API ===
|
|
||||||
headers = {
|
headers = {
|
||||||
"x-api-key": api_key,
|
"x-api-key": api_key,
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json"
|
||||||
@@ -33,21 +32,56 @@ extracted_text = pytesseract.image_to_string(image, lang='eng')
|
|||||||
print("=== Texte extrait depuis le passeport ===\n")
|
print("=== Texte extrait depuis le passeport ===\n")
|
||||||
print(extracted_text)
|
print(extracted_text)
|
||||||
|
|
||||||
# === Traitement du profil ===
|
# === Traitement du profil DOCX ===
|
||||||
profile_b64 = response_data["client_data"]["profile"]
|
profile_b64 = response_data["client_data"]["profile"]
|
||||||
profile_bytes = base64.b64decode(profile_b64)
|
profile_bytes = base64.b64decode(profile_b64)
|
||||||
zip_file = zipfile.ZipFile(io.BytesIO(profile_bytes))
|
zip_file = zipfile.ZipFile(io.BytesIO(profile_bytes))
|
||||||
|
|
||||||
# Vérifie la présence du fichier principal du Word
|
|
||||||
document_path = "word/document.xml"
|
document_path = "word/document.xml"
|
||||||
|
styles_path = "word/styles.xml"
|
||||||
|
|
||||||
|
# Charger la carte des styles pour retrouver les titres
|
||||||
|
style_map = {}
|
||||||
|
if styles_path in zip_file.namelist():
|
||||||
|
styles_xml = zip_file.read(styles_path)
|
||||||
|
styles_tree = ET.fromstring(styles_xml)
|
||||||
|
|
||||||
|
ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
|
||||||
|
for style in styles_tree.findall(".//w:style", ns):
|
||||||
|
style_id = style.attrib.get(f"{{{ns['w']}}}styleId")
|
||||||
|
name_elem = style.find("w:name", ns)
|
||||||
|
if name_elem is not None:
|
||||||
|
style_name = name_elem.attrib.get(f"{{{ns['w']}}}val", "")
|
||||||
|
style_map[style_id] = style_name
|
||||||
|
|
||||||
|
# Parse du document principal
|
||||||
if document_path in zip_file.namelist():
|
if document_path in zip_file.namelist():
|
||||||
with zip_file.open(document_path) as doc_file:
|
document_xml = zip_file.read(document_path)
|
||||||
xml_content = doc_file.read().decode("utf-8", errors="ignore")
|
tree = ET.fromstring(document_xml)
|
||||||
|
|
||||||
# Extraction naïve du texte (on peut améliorer avec XML parser si besoin)
|
ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
|
||||||
text_only = re.sub(r"<[^>]+>", "", xml_content)
|
print("\n=== Texte structuré extrait depuis le profil ===\n")
|
||||||
|
|
||||||
print("\n=== Texte extrait depuis le profil (document Word) ===\n")
|
for para in tree.findall(".//w:p", ns):
|
||||||
print(text_only.strip())
|
texts = [node.text for node in para.findall(".//w:t", ns) if node.text]
|
||||||
|
if not texts:
|
||||||
|
continue
|
||||||
|
full_text = " ".join(texts).strip()
|
||||||
|
|
||||||
|
# Détecter le style (Heading1, Heading2, etc.)
|
||||||
|
p_style = para.find(".//w:pStyle", ns)
|
||||||
|
if p_style is not None:
|
||||||
|
style_id = p_style.attrib.get(f"{{{ns['w']}}}val")
|
||||||
|
style_name = style_map.get(style_id, "")
|
||||||
|
if "Heading1" in style_name:
|
||||||
|
print(f"\n# {full_text}\n")
|
||||||
|
elif "Heading2" in style_name:
|
||||||
|
print(f"\n## {full_text}\n")
|
||||||
|
elif "Heading3" in style_name:
|
||||||
|
print(f"\n### {full_text}\n")
|
||||||
|
else:
|
||||||
|
print(full_text)
|
||||||
|
else:
|
||||||
|
print(full_text)
|
||||||
else:
|
else:
|
||||||
print("Le fichier 'word/document.xml' est introuvable dans l'archive du profil.")
|
print("Le fichier 'word/document.xml' est introuvable dans le profil.")
|
Reference in New Issue
Block a user