add OCR profile. Zip -> Word -> txt
This commit is contained in:
40
poc_ocr.py
40
poc_ocr.py
@ -1,16 +1,18 @@
|
|||||||
|
import base64
import html
import io
import os
import re
import zipfile
from pathlib import Path

import pytesseract
import requests
from PIL import Image
|
||||||
|
|
||||||
# === API configuration ===
api_url = "https://hackathon-api.mlo.sehlat.io/game/start"
# SECURITY: a live API key is committed to source control here. Supply it via
# the HACKATHON_API_KEY environment variable instead and rotate the committed
# key. The literal is kept only as a fallback so the script keeps running
# unchanged for existing users.
api_key = os.environ.get(
    "HACKATHON_API_KEY",
    "OwogAztgWRdPfT2wWe7Xevdw98tdJelatlk82K6bozw",
)
team_name = "Welch"
|
||||||
|
|
||||||
# Requête POST pour obtenir les données du jeu
|
# === Requête API ===
|
||||||
headers = {
|
headers = {
|
||||||
"x-api-key": api_key,
|
"x-api-key": api_key,
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json"
|
||||||
@ -19,23 +21,33 @@ payload = {
|
|||||||
"player_name": team_name
|
"player_name": team_name
|
||||||
}
|
}
|
||||||
|
|
||||||
# Simulation de la requête (à activer quand tu veux l’exécuter pour de vrai)
|
|
||||||
# Send the request. requests applies no timeout by default, so set one to
# avoid hanging forever on an unresponsive server, and fail fast on an HTTP
# error instead of hitting a confusing KeyError on the payload further down.
response = requests.post(api_url, headers=headers, json=payload, timeout=30)
response.raise_for_status()
response_data = response.json()

# === Passport processing ===
# The "passport" field is Base64 text that decodes to the bytes of an image.
passport_b64 = response_data["client_data"]["passport"]
image_bytes = base64.b64decode(passport_b64)

# Open the image in a context manager so it is closed once OCR is done.
# Change 'eng' to 'fra', 'deu', etc. to match the language of the passport.
with Image.open(io.BytesIO(image_bytes)) as image:
    extracted_text = pytesseract.image_to_string(image, lang='eng')

print("=== Texte extrait depuis le passeport ===\n")
print(extracted_text)
|
|
||||||
|
# === Profile processing ===
def _docx_xml_to_text(xml_content):
    """Return the plain text contained in a Word ``document.xml`` string.

    Paragraph ends (``</w:p>``) and explicit breaks (``<w:br .../>``) become
    newlines and tab characters (``<w:tab/>``) become tabs, so that text from
    separate paragraphs is not glued together once the tags are removed.
    XML entities such as ``&amp;`` are decoded after the tags are stripped, so
    escaped angle brackets in the text are never mistaken for markup.
    """
    text = re.sub(r"</w:p>|<w:br\b[^>]*/>", "\n", xml_content)
    # Only the attribute-less form is a tab character; <w:tab w:val=.../>
    # inside <w:tabs> is a tab-stop definition and must not produce output.
    text = text.replace("<w:tab/>", "\t")
    text = re.sub(r"<[^>]+>", "", text)
    return html.unescape(text).strip()


# The "profile" field is Base64 text that decodes to a zip archive; the code
# below reads it as a Word document whose body is stored in word/document.xml.
profile_b64 = response_data["client_data"]["profile"]
profile_bytes = base64.b64decode(profile_b64)
document_path = "word/document.xml"

# The context manager closes the archive; ZipFile.read raises KeyError when
# the member is missing, which replaces the separate namelist() lookup.
with zipfile.ZipFile(io.BytesIO(profile_bytes)) as zip_file:
    try:
        xml_content = zip_file.read(document_path).decode("utf-8", errors="ignore")
    except KeyError:
        xml_content = None

if xml_content is None:
    print("Le fichier 'word/document.xml' est introuvable dans l'archive du profil.")
else:
    text_only = _docx_xml_to_text(xml_content)
    print("\n=== Texte extrait depuis le profil (document Word) ===\n")
    print(text_only)
|
Reference in New Issue
Block a user