Nassiraaa committed on
Commit
6220a93
1 Parent(s): 8183a3e

Update app.py

Files changed (1)
  1. app.py +26 -41
app.py CHANGED
@@ -1,47 +1,28 @@
+ import os
  import streamlit as st
  from doctr.models import ocr_predictor
  from doctr.io import DocumentFile
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ from openai import OpenAI

  # Initialize DocTR OCR predictor
  ocr_model = ocr_predictor(pretrained=True)

- # Initialize the LLM model and tokenizer
- model = AutoModelForCausalLM.from_pretrained(
-     "microsoft/Phi-3-mini-4k-instruct",
-     device_map="auto",
-     torch_dtype="auto",
-     trust_remote_code=True,
- )
- tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
-
- # Define the text-generation pipeline
- pipe = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
- )
-
- generation_args = {
-     "max_new_tokens": 500,
-     "return_full_text": False,
-     "temperature": 0.0,
-     "do_sample": False,
- }
+ # Initialize the OpenAI client
+ client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

  # Streamlit application
  def main():
-     st.title('EMAIL,Phone,Location ')
+     st.title('EMAIL, Phone, Location Extractor')

-     # Uploader un fichier PDF
-     uploaded_file = st.file_uploader("Uploader un fichier PDF", type="pdf")
+     # Upload a PDF file
+     uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

      if uploaded_file is not None:
-         # Charger le fichier PDF avec Doctr
+         # Load the PDF file with Doctr
          pdf_bytes = uploaded_file.read()
          doc = DocumentFile.from_pdf(pdf_bytes)

-         # Extraire le texte
+         # Extract the text
          result = ocr_model(doc)
          text = ""
          for page in result.pages:
@@ -49,23 +30,27 @@ def main():
                  for line in block.lines:
                      for word in line.words:
                          text += word.value + " "
-                     text += "\n"
+                     text += "\n"

-
-
-         # Préparer l'entrée pour le LLM
+         # Prepare the input for the LLM
          messages = [
-             {"role": "system", "content": "Vous êtes un assistant IA utile."},
-             {"role": "user", "content": f"Extraire l'email, le numéro de téléphone et la localisation à partir du texte suivant :\n{text}"}
+             {"role": "system", "content": "You are a helpful AI assistant."},
+             {"role": "user", "content": f"Extract the email, phone number, and location from the following text:\n{text}"}
          ]

-         # Utiliser le LLM pour extraire les détails
-         output = pipe(messages, **generation_args)
-         generated_text = output[0]['generated_text']
-
-         # Afficher les informations extraites
-         st.header('Informations extraites')
-         st.write(generated_text)
+         # Use OpenAI's GPT-3.5-turbo to extract the details
+         try:
+             chat_completion = client.chat.completions.create(
+                 messages=messages,
+                 model="gpt-3.5-turbo",
+             )
+             generated_text = chat_completion.choices[0].message.content
+
+             # Display the extracted information
+             st.header('Extracted Information')
+             st.write(generated_text)
+         except Exception as e:
+             st.error(f"An error occurred: {str(e)}")

  if __name__ == '__main__':
      main()