pierreguillou committed on
Commit
e011df5
1 Parent(s): fcb82c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -32
app.py CHANGED
@@ -11,6 +11,12 @@ import zipfile
11
  from pdf2image import convert_from_path
12
  import google.generativeai as genai
13
  import json
 
 
 
 
 
 
14
 
15
  def authenticate(username, password):
16
  return username == os.getenv("HF_USERNAME") and password == os.getenv("HF_PASSWORD")
@@ -184,28 +190,10 @@ def process_pdf(pdf_file):
184
  shutil.rmtree(temp_dir)
185
  os.makedirs(output_dir, exist_ok=True)
186
 
187
- ## JSON of teh data to extract with descriptions
188
  path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
189
 
190
  try:
191
- # Convert PDF to images and process
192
- images = convert_from_path(pdf_file.name)
193
- annotated_images = []
194
- for i, img in enumerate(images):
195
- temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
196
- img.save(temp_img_path)
197
- blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
198
- annotated_images.append(annotated_image_path)
199
- save_extracted_text(blocks, i + 1, output_dir)
200
-
201
- # Create ZIP file
202
- zip_path = os.path.join(temp_dir, "annotated_images.zip")
203
- with zipfile.ZipFile(zip_path, 'w') as zipf:
204
- for img_path in annotated_images:
205
- zipf.write(img_path, os.path.basename(img_path))
206
-
207
- # Get the text file
208
- text_file_path = os.path.join(output_dir, 'extracted_text.txt')
209
 
210
  # Process with Gemini
211
  extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)
@@ -215,7 +203,12 @@ def process_pdf(pdf_file):
215
  with open(json_path, 'w', encoding='utf-8') as f:
216
  json.dump(extracted_data, f, ensure_ascii=False, indent=2)
217
 
218
- return text_file_path, zip_path, json_path
 
 
 
 
 
219
 
220
  except Exception as e:
221
  raise gr.Error(f"Error processing PDF: {str(e)}")
@@ -237,28 +230,29 @@ demo = gr.Interface(
237
  fn=process_pdf,
238
  inputs=[
239
  gr.File(
240
- label="Upload PDF Document",
241
  file_types=[".pdf"],
242
  type="filepath"
243
  )
244
  ],
245
  outputs=[
246
- gr.File(label="Extracted Text (TXT)"),
247
- gr.File(label="Annotated Images (ZIP)"),
248
- gr.File(label="Extracted Data (JSON)")
 
249
  ],
250
- title="PDF Text Extraction and Analysis",
251
  description="""
252
- Upload a PDF document to:
253
- 1. Extract text content
254
- 2. Get annotated images showing detected text blocks
255
- 3. Extract structured data using AI analysis
 
256
 
257
- Supports multiple pages and French legal documents.
258
  """,
259
- #article="Created by [Your Name] - [Your GitHub/Profile Link]",
260
  css=css,
261
- examples=[], # Add example PDFs if you have any
262
  cache_examples=False,
263
  theme=gr.themes.Soft()
264
  )
 
11
  from pdf2image import convert_from_path
12
  import google.generativeai as genai
13
  import json
14
+ from docx import Document
15
+ from docx.shared import Pt, RGBColor, Inches
16
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
17
+ from docx.enum.section import WD_SECTION
18
+ from docx.oxml import OxmlElement
19
+ from docx.oxml.ns import qn
20
 
21
  def authenticate(username, password):
22
  return username == os.getenv("HF_USERNAME") and password == os.getenv("HF_PASSWORD")
 
190
  shutil.rmtree(temp_dir)
191
  os.makedirs(output_dir, exist_ok=True)
192
 
 
193
  path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
194
 
195
  try:
196
+ # [Code existant pour le traitement du PDF...]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  # Process with Gemini
199
  extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)
 
203
  with open(json_path, 'w', encoding='utf-8') as f:
204
  json.dump(extracted_data, f, ensure_ascii=False, indent=2)
205
 
206
+ # Generate DOCX report
207
+ generator = RapportGenerator(json_path)
208
+ docx_path = os.path.join(temp_dir, "rapport_extraction.docx")
209
+ generator.generate_report()
210
+
211
+ return text_file_path, zip_path, json_path, docx_path
212
 
213
  except Exception as e:
214
  raise gr.Error(f"Error processing PDF: {str(e)}")
 
230
  fn=process_pdf,
231
  inputs=[
232
  gr.File(
233
+ label="Télécharger un document PDF",
234
  file_types=[".pdf"],
235
  type="filepath"
236
  )
237
  ],
238
  outputs=[
239
+ gr.File(label="Texte extrait (TXT)"),
240
+ gr.File(label="Images annotées (ZIP)"),
241
+ gr.File(label="Données extraites (JSON)"),
242
+ gr.File(label="Rapport généré (DOCX)") # Nouvelle sortie
243
  ],
244
+ title="Extraction et analyse de texte PDF",
245
  description="""
246
+ Téléchargez un document PDF pour :
247
+ 1. Extraire le contenu textuel
248
+ 2. Obtenir des images annotées montrant les blocs de texte détectés
249
+ 3. Extraire des données structurées grâce à une analyse IA
250
+ 4. Générer un rapport formaté au format DOCX
251
 
252
+ Prend en charge les documents multi-pages et les documents juridiques français.
253
  """,
 
254
  css=css,
255
+ examples=[],
256
  cache_examples=False,
257
  theme=gr.themes.Soft()
258
  )