add utils parsers
9 utils/parsers/__init__.py Normal file
@@ -0,0 +1,9 @@
from .passport_parser import process_passport
from .profile_parser import process_profile
from .account_parser import process_account

__all__ = [
    "process_passport",
    "process_profile",
    "process_account",
]
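For reference, a minimal sketch of how these re-exports are meant to be consumed; it assumes the directory containing utils/ is on sys.path, which is a guess about the surrounding project layout:

# Illustrative only: import the parsers through the package facade.
from utils.parsers import process_account, process_passport, process_profile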
22 utils/parsers/account_parser.py Normal file
@@ -0,0 +1,22 @@
import base64
from pdf2image import convert_from_bytes
import pytesseract


def process_account(account_b64: str) -> str:
    """
    Processes the account document:
    - Decodes the base64-encoded PDF.
    - Converts each page of the PDF to an image.
    - Runs OCR on each image to extract the text.

    :param account_b64: Base64 string representing the PDF.
    :return: Text extracted from each page of the PDF.
    """
    pdf_bytes = base64.b64decode(account_b64)
    images = convert_from_bytes(pdf_bytes)
    pages_text = []
    for i, image in enumerate(images):
        text = pytesseract.image_to_string(image, lang="eng")
        pages_text.append(f"--- Page {i + 1} ---\n{text}")
    return "\n".join(pages_text)
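A hedged usage sketch for process_account; the file name statement.pdf is hypothetical, and it assumes the poppler binaries (needed by pdf2image) and the tesseract binary (needed by pytesseract) are installed on the host:

import base64

from utils.parsers import process_account

# Hypothetical input file: any PDF will do.
with open("statement.pdf", "rb") as f:
    account_b64 = base64.b64encode(f.read()).decode("ascii")

# Each page comes back prefixed with a "--- Page N ---" marker.
print(process_account(account_b64))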
19 utils/parsers/passport_parser.py Normal file
@@ -0,0 +1,19 @@
import base64
import io
from PIL import Image
import pytesseract


def process_passport(passport_b64: str) -> str:
    """
    Processes the passport:
    - Decodes the base64-encoded image.
    - Runs OCR to extract the text.

    :param passport_b64: Base64 string representing the passport image.
    :return: Text extracted from the image.
    """
    image_bytes = base64.b64decode(passport_b64)
    image = Image.open(io.BytesIO(image_bytes))
    text = pytesseract.image_to_string(image, lang="eng")
    return text
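Likewise, a minimal sketch for process_passport; passport.png is a hypothetical file name, and any image format Pillow can open should work:

import base64

from utils.parsers import process_passport

# Hypothetical input file: a scanned passport image.
with open("passport.png", "rb") as f:
    passport_b64 = base64.b64encode(f.read()).decode("ascii")

print(process_passport(passport_b64))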
65 utils/parsers/profile_parser.py Normal file
@@ -0,0 +1,65 @@
import base64
import io
import zipfile
import xml.etree.ElementTree as ET


def process_profile(profile_b64: str) -> str:
    """
    Processes the profile:
    - Decodes the base64-encoded DOCX file (a ZIP archive).
    - Extracts the main XML document (word/document.xml).
    - Uses word/styles.xml to map paragraph styles to text formatting (headings).

    :param profile_b64: Base64 string representing the DOCX document.
    :return: Structured text extracted from the document.
    """
    profile_bytes = base64.b64decode(profile_b64)
    zip_file = zipfile.ZipFile(io.BytesIO(profile_bytes))

    document_path = "word/document.xml"
    styles_path = "word/styles.xml"
    result_lines = []
    style_map = {}
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    # Build the style map (w:styleId -> human-readable w:name)
    if styles_path in zip_file.namelist():
        styles_xml = zip_file.read(styles_path)
        styles_tree = ET.fromstring(styles_xml)
        for style in styles_tree.findall(".//w:style", ns):
            style_id = style.attrib.get(f"{{{ns['w']}}}styleId")
            name_elem = style.find("w:name", ns)
            if name_elem is not None:
                style_name = name_elem.attrib.get(f"{{{ns['w']}}}val", "")
                style_map[style_id] = style_name

    # Extract the main document
    if document_path in zip_file.namelist():
        document_xml = zip_file.read(document_path)
        tree = ET.fromstring(document_xml)
        for para in tree.findall(".//w:p", ns):
            texts = [node.text for node in para.findall(".//w:t", ns) if node.text]
            if not texts:
                continue
            full_text = " ".join(texts).strip()
            p_style = para.find(".//w:pStyle", ns)
            if p_style is not None:
                style_id = p_style.attrib.get(f"{{{ns['w']}}}val") or ""
                # Word stores built-in heading names as "heading 1" but their
                # styleId as "Heading1"; match on both so headings are detected.
                style_name = f"{style_map.get(style_id, '')} {style_id}"
                if "Heading1" in style_name:
                    result_lines.append(f"\n# {full_text}\n")
                elif "Heading2" in style_name:
                    result_lines.append(f"\n## {full_text}\n")
                elif "Heading3" in style_name:
                    result_lines.append(f"\n### {full_text}\n")
                else:
                    result_lines.append(full_text)
            else:
                result_lines.append(full_text)
    else:
        result_lines.append("File 'word/document.xml' not found in the profile.")

    return "\n".join(result_lines)
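Finally, a usage sketch for process_profile; profile.docx is a hypothetical file name and must be a genuine DOCX (ZIP) archive, since the parser reads word/document.xml directly rather than going through a library like python-docx:

import base64

from utils.parsers import process_profile

# Hypothetical input file: a DOCX profile document.
with open("profile.docx", "rb") as f:
    profile_b64 = base64.b64encode(f.read()).decode("ascii")

# Paragraphs styled Heading1/2/3 come back as #, ##, ### lines.
print(process_profile(profile_b64))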