Spaces:

Nassiraaa
/

LLM-for-email-phone-gmail

Sleeping

App Files Files Community

Nassiraaa committed on Aug 1

Commit

6220a93

•

1 Parent(s): 8183a3e

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -41

app.py CHANGED Viewed

@@ -1,47 +1,28 @@
 import streamlit as st
 from doctr.models import ocr_predictor
 from doctr.io import DocumentFile
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 # Initialize DocTR OCR predictor
 ocr_model = ocr_predictor(pretrained=True)
-# Initialize the LLM model and tokenizer
-model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Phi-3-mini-4k-instruct",
-    device_map="auto",
-    torch_dtype="auto",
-    trust_remote_code=True,
-)
-tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
-# Define the text-generation pipeline
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-)
-generation_args = {
-    "max_new_tokens": 500,
-    "return_full_text": False,
-    "temperature": 0.0,
-    "do_sample": False,
-}
 # Streamlit application
 def main():
-    st.title('EMAIL,Phone,Location ')
-    # Uploader un fichier PDF
-    uploaded_file = st.file_uploader("Uploader un fichier PDF", type="pdf")
     if uploaded_file is not None:
-        # Charger le fichier PDF avec Doctr
         pdf_bytes = uploaded_file.read()
         doc = DocumentFile.from_pdf(pdf_bytes)
-        # Extraire le texte
         result = ocr_model(doc)
         text = ""
         for page in result.pages:
@@ -49,23 +30,27 @@ def main():
                 for line in block.lines:
                     for word in line.words:
                         text += word.value + " "
-                text += "\n"
-        # Préparer l'entrée pour le LLM
         messages = [
-            {"role": "system", "content": "Vous êtes un assistant IA utile."},
-            {"role": "user", "content": f"Extraire l'email, le numéro de téléphone et la localisation à partir du texte suivant :\n{text}"}
         ]
-        # Utiliser le LLM pour extraire les détails
-        output = pipe(messages, **generation_args)
-        generated_text = output[0]['generated_text']
-        # Afficher les informations extraites
-        st.header('Informations extraites')
-        st.write(generated_text)
 if __name__ == '__main__':
     main()

+import os
 import streamlit as st
 from doctr.models import ocr_predictor
 from doctr.io import DocumentFile
+from openai import OpenAI
 # Initialize DocTR OCR predictor
 ocr_model = ocr_predictor(pretrained=True)
+# Initialize the OpenAI client
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 # Streamlit application
 def main():
+    st.title('EMAIL, Phone, Location Extractor')
+    # Upload a PDF file
+    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
     if uploaded_file is not None:
+        # Load the PDF file with Doctr
         pdf_bytes = uploaded_file.read()
         doc = DocumentFile.from_pdf(pdf_bytes)
+        # Extract the text
         result = ocr_model(doc)
         text = ""
         for page in result.pages:
                 for line in block.lines:
                     for word in line.words:
                         text += word.value + " "
+                    text += "\n"
+        # Prepare the input for the LLM
         messages = [
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": f"Extract the email, phone number, and location from the following text:\n{text}"}
         ]
+        # Use OpenAI's GPT-3.5-turbo to extract the details
+        try:
+            chat_completion = client.chat.completions.create(
+                messages=messages,
+                model="gpt-3.5-turbo",
+            )
+            generated_text = chat_completion.choices[0].message.content
+            # Display the extracted information
+            st.header('Extracted Information')
+            st.write(generated_text)
+        except Exception as e:
+            st.error(f"An error occurred: {str(e)}")
 if __name__ == '__main__':
     main()