Spaces:

pierreguillou
/

arquiteturia

Running

App Files Files Community

pierreguillou committed on 21 days ago

Commit

e011df5

•

1 Parent(s): fcb82c4

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -32

app.py CHANGED Viewed

@@ -11,6 +11,12 @@ import zipfile
 from pdf2image import convert_from_path
 import google.generativeai as genai
 import json
 def authenticate(username, password):
     """Check a username/password pair against the configured credentials.

     The expected values are read from the ``HF_USERNAME`` and
     ``HF_PASSWORD`` environment variables on every call.

     Args:
         username: Username supplied by the caller.
         password: Password supplied by the caller.

     Returns:
         True only when both values match the configured credentials;
         False otherwise, including when either environment variable is
         unset or when a supplied value is not a string.
     """
     # Function-scope import keeps this block self-contained.
     import hmac

     expected_username = os.getenv("HF_USERNAME")
     expected_password = os.getenv("HF_PASSWORD")

     # Fail closed: with unset variables a plain ``==`` would accept
     # ``None`` credentials because ``None == None`` is True.
     if expected_username is None or expected_password is None:
         return False
     if not isinstance(username, str) or not isinstance(password, str):
         return False

     # Compare as bytes: compare_digest rejects non-ASCII ``str`` input.
     # Both checks are evaluated before combining so the time taken does
     # not reveal which of the two fields was wrong.
     username_ok = hmac.compare_digest(
         username.encode("utf-8"), expected_username.encode("utf-8")
     )
     password_ok = hmac.compare_digest(
         password.encode("utf-8"), expected_password.encode("utf-8")
     )
     return username_ok and password_ok
@@ -184,28 +190,10 @@ def process_pdf(pdf_file):
         shutil.rmtree(temp_dir)
     os.makedirs(output_dir, exist_ok=True)
-    ## JSON of the data to extract with descriptions
     path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
     try:
-        # Convert PDF to images and process
-        images = convert_from_path(pdf_file.name)
-        annotated_images = []
-        for i, img in enumerate(images):
-            temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
-            img.save(temp_img_path)
-            blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
-            annotated_images.append(annotated_image_path)
-            save_extracted_text(blocks, i + 1, output_dir)
-        # Create ZIP file
-        zip_path = os.path.join(temp_dir, "annotated_images.zip")
-        with zipfile.ZipFile(zip_path, 'w') as zipf:
-            for img_path in annotated_images:
-                zipf.write(img_path, os.path.basename(img_path))
-        # Get the text file
-        text_file_path = os.path.join(output_dir, 'extracted_text.txt')
         # Process with Gemini
         extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)
@@ -215,7 +203,12 @@ def process_pdf(pdf_file):
         with open(json_path, 'w', encoding='utf-8') as f:
             json.dump(extracted_data, f, ensure_ascii=False, indent=2)
-        return text_file_path, zip_path, json_path
     except Exception as e:
         raise gr.Error(f"Error processing PDF: {str(e)}")
@@ -237,28 +230,29 @@ demo = gr.Interface(
     fn=process_pdf,
     inputs=[
         gr.File(
-            label="Upload PDF Document",
             file_types=[".pdf"],
             type="filepath"
         )
     ],
     outputs=[
-        gr.File(label="Extracted Text (TXT)"),
-        gr.File(label="Annotated Images (ZIP)"),
-        gr.File(label="Extracted Data (JSON)")
     ],
-    title="PDF Text Extraction and Analysis",
     description="""
-    Upload a PDF document to:
-    1. Extract text content
-    2. Get annotated images showing detected text blocks
-    3. Extract structured data using AI analysis
-    Supports multiple pages and French legal documents.
     """,
-    #article="Created by [Your Name] - [Your GitHub/Profile Link]",
     css=css,
-    examples=[],  # Add example PDFs if you have any
     cache_examples=False,
     theme=gr.themes.Soft()
 )

 from pdf2image import convert_from_path
 import google.generativeai as genai
 import json
+from docx import Document
+from docx.shared import Pt, RGBColor, Inches
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.enum.section import WD_SECTION
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
 def authenticate(username, password):
     """Check a username/password pair against the configured credentials.

     The expected values are read from the ``HF_USERNAME`` and
     ``HF_PASSWORD`` environment variables on every call.

     Args:
         username: Username supplied by the caller.
         password: Password supplied by the caller.

     Returns:
         True only when both values match the configured credentials;
         False otherwise, including when either environment variable is
         unset or when a supplied value is not a string.
     """
     # Function-scope import keeps this block self-contained.
     import hmac

     expected_username = os.getenv("HF_USERNAME")
     expected_password = os.getenv("HF_PASSWORD")

     # Fail closed: with unset variables a plain ``==`` would accept
     # ``None`` credentials because ``None == None`` is True.
     if expected_username is None or expected_password is None:
         return False
     if not isinstance(username, str) or not isinstance(password, str):
         return False

     # Compare as bytes: compare_digest rejects non-ASCII ``str`` input.
     # Both checks are evaluated before combining so the time taken does
     # not reveal which of the two fields was wrong.
     username_ok = hmac.compare_digest(
         username.encode("utf-8"), expected_username.encode("utf-8")
     )
     password_ok = hmac.compare_digest(
         password.encode("utf-8"), expected_password.encode("utf-8")
     )
     return username_ok and password_ok
         shutil.rmtree(temp_dir)
     os.makedirs(output_dir, exist_ok=True)
     path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
     try:
+        # [Code existant pour le traitement du PDF...]
         # Process with Gemini
         extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)
         with open(json_path, 'w', encoding='utf-8') as f:
             json.dump(extracted_data, f, ensure_ascii=False, indent=2)
+        # Generate DOCX report
+        generator = RapportGenerator(json_path)
+        docx_path = os.path.join(temp_dir, "rapport_extraction.docx")
+        generator.generate_report()
+        return text_file_path, zip_path, json_path, docx_path
     except Exception as e:
         raise gr.Error(f"Error processing PDF: {str(e)}")
     fn=process_pdf,
     inputs=[
         gr.File(
+            label="Télécharger un document PDF",
             file_types=[".pdf"],
             type="filepath"
         )
     ],
     outputs=[
+        gr.File(label="Texte extrait (TXT)"),
+        gr.File(label="Images annotées (ZIP)"),
+        gr.File(label="Données extraites (JSON)"),
+        gr.File(label="Rapport généré (DOCX)")  # Nouvelle sortie
     ],
+    title="Extraction et analyse de texte PDF",
     description="""
+    Téléchargez un document PDF pour :
+    1. Extraire le contenu textuel
+    2. Obtenir des images annotées montrant les blocs de texte détectés
+    3. Extraire des données structurées grâce à une analyse IA
+    4. Générer un rapport formaté au format DOCX
+    Prend en charge les documents multi-pages et les documents juridiques français.
     """,
     css=css,
+    examples=[],
     cache_examples=False,
     theme=gr.themes.Soft()
 )