Files
julius_baer_onboarding/utils/parsers/profile_parser.py
2025-04-12 10:51:07 +02:00

64 lines
2.6 KiB
Python

import base64
import io
import zipfile
import xml.etree.ElementTree as ET
_WORD_NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
_DOCUMENT_PATH = "word/document.xml"
_STYLES_PATH = "word/styles.xml"
_HEADING_LEVELS = (1, 2, 3)


def _w_attr(local_name: str) -> str:
    """Return the namespace-qualified (Clark notation) name of a ``w:`` attribute."""
    return f"{{{_WORD_NS['w']}}}{local_name}"


def _heading_level(label) -> int:
    """Return the heading level (1-3) designated by a style label, or 0 if none.

    The comparison ignores case and whitespace so that "heading 1",
    "Heading 1" and "Heading1" are all recognised as level 1.
    """
    normalized = "".join((label or "").split()).lower()
    for level in _HEADING_LEVELS:
        if f"heading{level}" in normalized:
            return level
    return 0


def _load_style_names(archive: zipfile.ZipFile, members: set) -> dict:
    """Map each style id declared in word/styles.xml to its ``w:name`` value.

    Returns an empty dict when the archive has no styles part.
    """
    style_names = {}
    if _STYLES_PATH not in members:
        return style_names
    styles_root = ET.fromstring(archive.read(_STYLES_PATH))
    for style in styles_root.findall(".//w:style", _WORD_NS):
        name_elem = style.find("w:name", _WORD_NS)
        if name_elem is not None:
            style_id = style.attrib.get(_w_attr("styleId"))
            style_names[style_id] = name_elem.attrib.get(_w_attr("val"), "")
    return style_names


def process_profile(profile_b64: str) -> str:
    """
    Extract structured text from a base64-encoded DOCX profile.

    - Decodes the DOCX file (a ZIP archive) from base64.
    - Reads the main document part (word/document.xml).
    - Uses word/styles.xml, when present, to resolve paragraph styles so that
      heading paragraphs (levels 1 to 3) are rendered as Markdown-style titles
      ("#", "##", "###") surrounded by blank lines.

    :param profile_b64: Base64 string representing the DOCX document.
    :return: Structured text extracted from the document, one paragraph per
        line. If the archive has no word/document.xml, a (French) message
        saying so is returned instead.
    :raises binascii.Error: if ``profile_b64`` is not valid base64.
    :raises zipfile.BadZipFile: if the decoded bytes are not a ZIP archive.
    :raises xml.etree.ElementTree.ParseError: if an XML part is malformed.
    """
    profile_bytes = base64.b64decode(profile_b64)
    result_lines = []

    # NOTE(review): if profiles come from outside the organisation this is
    # untrusted input; xml.etree.ElementTree is not hardened against
    # maliciously constructed XML and the archive size is not bounded here.
    # The context manager guarantees the archive is closed even on errors.
    with zipfile.ZipFile(io.BytesIO(profile_bytes)) as archive:
        members = set(archive.namelist())

        # Build the style-id -> style-name map.
        style_map = _load_style_names(archive, members)

        # Extract the main document.
        if _DOCUMENT_PATH not in members:
            result_lines.append("Fichier 'word/document.xml' introuvable dans le profile.")
            return "\n".join(result_lines)

        document_root = ET.fromstring(archive.read(_DOCUMENT_PATH))

    for para in document_root.findall(".//w:p", _WORD_NS):
        texts = [node.text for node in para.findall(".//w:t", _WORD_NS) if node.text]
        if not texts:
            continue
        # NOTE(review): runs are joined with a space, which can insert a space
        # inside a word that Word split across several runs; kept as-is to
        # preserve the existing output format.
        full_text = " ".join(texts).strip()

        level = 0
        p_style = para.find(".//w:pStyle", _WORD_NS)
        if p_style is not None:
            style_id = p_style.attrib.get(_w_attr("val"))
            # Prefer the declared style name; fall back to the style id so
            # headings are still detected when styles.xml is missing or the
            # name does not identify a heading.
            level = _heading_level(style_map.get(style_id, "")) or _heading_level(style_id)

        if level:
            result_lines.append(f"\n{'#' * level} {full_text}\n")
        else:
            result_lines.append(full_text)

    return "\n".join(result_lines)