Spaces:
Running
Running
pierreguillou
committed on
Commit
•
e011df5
1
Parent(s):
fcb82c4
Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,12 @@ import zipfile
|
|
11 |
from pdf2image import convert_from_path
|
12 |
import google.generativeai as genai
|
13 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
def authenticate(username, password):
|
16 |
return username == os.getenv("HF_USERNAME") and password == os.getenv("HF_PASSWORD")
|
@@ -184,28 +190,10 @@ def process_pdf(pdf_file):
|
|
184 |
shutil.rmtree(temp_dir)
|
185 |
os.makedirs(output_dir, exist_ok=True)
|
186 |
|
187 |
-
## JSON of teh data to extract with descriptions
|
188 |
path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
|
189 |
|
190 |
try:
|
191 |
-
#
|
192 |
-
images = convert_from_path(pdf_file.name)
|
193 |
-
annotated_images = []
|
194 |
-
for i, img in enumerate(images):
|
195 |
-
temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
|
196 |
-
img.save(temp_img_path)
|
197 |
-
blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
|
198 |
-
annotated_images.append(annotated_image_path)
|
199 |
-
save_extracted_text(blocks, i + 1, output_dir)
|
200 |
-
|
201 |
-
# Create ZIP file
|
202 |
-
zip_path = os.path.join(temp_dir, "annotated_images.zip")
|
203 |
-
with zipfile.ZipFile(zip_path, 'w') as zipf:
|
204 |
-
for img_path in annotated_images:
|
205 |
-
zipf.write(img_path, os.path.basename(img_path))
|
206 |
-
|
207 |
-
# Get the text file
|
208 |
-
text_file_path = os.path.join(output_dir, 'extracted_text.txt')
|
209 |
|
210 |
# Process with Gemini
|
211 |
extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)
|
@@ -215,7 +203,12 @@ def process_pdf(pdf_file):
|
|
215 |
with open(json_path, 'w', encoding='utf-8') as f:
|
216 |
json.dump(extracted_data, f, ensure_ascii=False, indent=2)
|
217 |
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
except Exception as e:
|
221 |
raise gr.Error(f"Error processing PDF: {str(e)}")
|
@@ -237,28 +230,29 @@ demo = gr.Interface(
|
|
237 |
fn=process_pdf,
|
238 |
inputs=[
|
239 |
gr.File(
|
240 |
-
label="
|
241 |
file_types=[".pdf"],
|
242 |
type="filepath"
|
243 |
)
|
244 |
],
|
245 |
outputs=[
|
246 |
-
gr.File(label="
|
247 |
-
gr.File(label="
|
248 |
-
gr.File(label="
|
|
|
249 |
],
|
250 |
-
title="
|
251 |
description="""
|
252 |
-
|
253 |
-
1.
|
254 |
-
2.
|
255 |
-
3.
|
|
|
256 |
|
257 |
-
|
258 |
""",
|
259 |
-
#article="Created by [Your Name] - [Your GitHub/Profile Link]",
|
260 |
css=css,
|
261 |
-
examples=[],
|
262 |
cache_examples=False,
|
263 |
theme=gr.themes.Soft()
|
264 |
)
|
|
|
11 |
from pdf2image import convert_from_path
|
12 |
import google.generativeai as genai
|
13 |
import json
|
14 |
+
from docx import Document
|
15 |
+
from docx.shared import Pt, RGBColor, Inches
|
16 |
+
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
17 |
+
from docx.enum.section import WD_SECTION
|
18 |
+
from docx.oxml import OxmlElement
|
19 |
+
from docx.oxml.ns import qn
|
20 |
|
21 |
def authenticate(username, password):
|
22 |
return username == os.getenv("HF_USERNAME") and password == os.getenv("HF_PASSWORD")
|
|
|
190 |
shutil.rmtree(temp_dir)
|
191 |
os.makedirs(output_dir, exist_ok=True)
|
192 |
|
|
|
193 |
path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
|
194 |
|
195 |
try:
|
196 |
+
# [Code existant pour le traitement du PDF...]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
# Process with Gemini
|
199 |
extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)
|
|
|
203 |
with open(json_path, 'w', encoding='utf-8') as f:
|
204 |
json.dump(extracted_data, f, ensure_ascii=False, indent=2)
|
205 |
|
206 |
+
# Generate DOCX report
|
207 |
+
generator = RapportGenerator(json_path)
|
208 |
+
docx_path = os.path.join(temp_dir, "rapport_extraction.docx")
|
209 |
+
generator.generate_report()
|
210 |
+
|
211 |
+
return text_file_path, zip_path, json_path, docx_path
|
212 |
|
213 |
except Exception as e:
|
214 |
raise gr.Error(f"Error processing PDF: {str(e)}")
|
|
|
230 |
fn=process_pdf,
|
231 |
inputs=[
|
232 |
gr.File(
|
233 |
+
label="Télécharger un document PDF",
|
234 |
file_types=[".pdf"],
|
235 |
type="filepath"
|
236 |
)
|
237 |
],
|
238 |
outputs=[
|
239 |
+
gr.File(label="Texte extrait (TXT)"),
|
240 |
+
gr.File(label="Images annotées (ZIP)"),
|
241 |
+
gr.File(label="Données extraites (JSON)"),
|
242 |
+
gr.File(label="Rapport généré (DOCX)") # Nouvelle sortie
|
243 |
],
|
244 |
+
title="Extraction et analyse de texte PDF",
|
245 |
description="""
|
246 |
+
Téléchargez un document PDF pour :
|
247 |
+
1. Extraire le contenu textuel
|
248 |
+
2. Obtenir des images annotées montrant les blocs de texte détectés
|
249 |
+
3. Extraire des données structurées grâce à une analyse IA
|
250 |
+
4. Générer un rapport formaté au format DOCX
|
251 |
|
252 |
+
Prend en charge les documents multi-pages et les documents juridiques français.
|
253 |
""",
|
|
|
254 |
css=css,
|
255 |
+
examples=[],
|
256 |
cache_examples=False,
|
257 |
theme=gr.themes.Soft()
|
258 |
)
|