"""Parsers for base64-encoded account (PDF), passport (image) and profile (DOCX) payloads."""

# Provenance: reflowed from the patch "add utils parsers" (commit c0a04c5),
# which created utils/parsers/__init__.py, account_parser.py,
# passport_parser.py and profile_parser.py. The three parsers are kept
# together here, so the package-relative re-exports are no longer needed.

import base64
import io
import xml.etree.ElementTree as ET
import zipfile
from typing import Optional

__all__ = [
    "process_passport",
    "process_profile",
    "process_account",
]

# WordprocessingML namespace shared by word/document.xml and word/styles.xml.
_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_NS = {"w": _W_NS}
_W_VAL = f"{{{_W_NS}}}val"
_W_STYLE_ID = f"{{{_W_NS}}}styleId"
_W_T = f"{{{_W_NS}}}t"
# Run-level tab / line-break elements; each is rendered as a single space.
_W_SPACERS = frozenset(f"{{{_W_NS}}}{tag}" for tag in ("tab", "br", "cr"))

_DOCUMENT_PATH = "word/document.xml"
_STYLES_PATH = "word/styles.xml"

# Normalised (lower-case, whitespace-free) style name -> Markdown prefix.
_HEADING_PREFIXES = (
    ("heading1", "#"),
    ("heading2", "##"),
    ("heading3", "###"),
)


# --- account (PDF) ---------------------------------------------------------


def process_account(account_b64: str) -> str:
    """Extract the text of a base64-encoded PDF with OCR.

    The PDF is decoded, every page is rendered to an image, and each image
    is passed through Tesseract (English language data).

    :param account_b64: Base64 string representing the PDF.
    :return: The OCR text of every page, each preceded by a
        ``--- Page N ---`` marker line, joined with newlines.
    :raises binascii.Error: If ``account_b64`` is not valid base64.
    """
    # Imported lazily so the DOCX parser stays usable without the OCR stack.
    from pdf2image import convert_from_bytes
    import pytesseract

    pdf_bytes = base64.b64decode(account_b64)
    sections = []
    for number, page in enumerate(convert_from_bytes(pdf_bytes), start=1):
        text = pytesseract.image_to_string(page, lang="eng")
        sections.append(f"--- Page {number} ---\n{text}")
    return "\n".join(sections)


# --- passport (image) ------------------------------------------------------


def process_passport(passport_b64: str) -> str:
    """Extract the text of a base64-encoded passport image with OCR.

    :param passport_b64: Base64 string representing the passport image.
    :return: The text Tesseract (English language data) reads from the image.
    :raises binascii.Error: If ``passport_b64`` is not valid base64.
    """
    # Imported lazily so the DOCX parser stays usable without the OCR stack.
    from PIL import Image
    import pytesseract

    image_bytes = base64.b64decode(passport_b64)
    # The context manager releases the decoder's resources once OCR is done.
    with Image.open(io.BytesIO(image_bytes)) as image:
        return pytesseract.image_to_string(image, lang="eng")


# --- profile (DOCX) --------------------------------------------------------


def _load_style_names(archive: zipfile.ZipFile) -> dict:
    """Return a ``styleId -> style name`` map read from word/styles.xml."""
    try:
        styles_xml = archive.read(_STYLES_PATH)
    except KeyError:  # The archive has no styles part.
        return {}
    style_names = {}
    for style in ET.fromstring(styles_xml).findall(".//w:style", _NS):
        name_elem = style.find("w:name", _NS)
        if name_elem is not None:
            style_names[style.get(_W_STYLE_ID)] = name_elem.get(_W_VAL, "")
    return style_names


def _paragraph_text(para: ET.Element) -> Optional[str]:
    """Return the visible text of a ``w:p`` element, or None if it has none."""
    pieces = []
    has_text = False
    for run in para.findall(".//w:r", _NS):
        for child in run:
            if child.tag == _W_T:
                if child.text:
                    pieces.append(child.text)
                    has_text = True
            elif child.tag in _W_SPACERS:
                pieces.append(" ")
    if not has_text:
        return None
    # Runs are concatenated verbatim: Word splits words across runs at
    # arbitrary points, so inserting a separator would corrupt the text.
    return "".join(pieces).strip()


def _heading_prefix(style_name: str) -> str:
    """Return the Markdown prefix for a heading style name, or ``""``."""
    # Style names appear as "heading 1", "Heading 1" or "Heading1" depending
    # on the producer; compare them case- and whitespace-insensitively.
    normalised = "".join(style_name.split()).lower()
    for key, prefix in _HEADING_PREFIXES:
        if key in normalised:
            return prefix
    return ""


def process_profile(profile_b64: str) -> str:
    """Extract structured text from a base64-encoded DOCX profile.

    The DOCX (a ZIP archive) is decoded and ``word/document.xml`` is read
    paragraph by paragraph. ``word/styles.xml`` is used to resolve paragraph
    styles so that headings of level 1 to 3 are rendered as Markdown titles
    (``#``, ``##``, ``###``) surrounded by blank lines. When a style id
    cannot be resolved through the styles part, the id itself is inspected.
    Paragraphs without text are skipped.

    :param profile_b64: Base64 string representing the DOCX document.
    :return: The paragraphs joined with newlines, or a fixed message when
        the archive contains no ``word/document.xml``.
    :raises binascii.Error: If ``profile_b64`` is not valid base64.
    :raises zipfile.BadZipFile: If the decoded bytes are not a ZIP archive.
    :raises xml.etree.ElementTree.ParseError: If an XML part is malformed.
    """
    profile_bytes = base64.b64decode(profile_b64)

    # NOTE(review): ElementTree is not hardened against maliciously crafted
    # XML; use a hardened parser if profiles come from untrusted sources.
    with zipfile.ZipFile(io.BytesIO(profile_bytes)) as archive:
        style_names = _load_style_names(archive)
        try:
            document_xml = archive.read(_DOCUMENT_PATH)
        except KeyError:
            return "Fichier 'word/document.xml' introuvable dans le profile."

    lines = []
    for para in ET.fromstring(document_xml).findall(".//w:p", _NS):
        text = _paragraph_text(para)
        if text is None:
            continue
        # The paragraph style lives in the paragraph's own properties.
        style = para.find("w:pPr/w:pStyle", _NS)
        style_id = style.get(_W_VAL) if style is not None else None
        prefix = _heading_prefix(style_names.get(style_id) or style_id or "")
        lines.append(f"\n{prefix} {text}\n" if prefix else text)
    return "\n".join(lines)