# Nassiraaa: "Update app.py" (commit 6220a93, verified)
import os
import streamlit as st
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from openai import OpenAI
@st.cache_resource
def _load_ocr_model():
    """Build the pretrained docTR OCR predictor once and reuse it.

    Streamlit re-executes this script on every user interaction; caching
    the predictor as a resource avoids rebuilding the model on each rerun.
    """
    return ocr_predictor(pretrained=True)


# Initialize DocTR OCR predictor (shared across Streamlit reruns)
ocr_model = _load_ocr_model()

# Initialize the OpenAI client; the key is read from the OPENAI_API_KEY
# environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Streamlit application
def _ocr_result_to_text(result):
    """Flatten an OCR result into plain text, one OCR line per text line.

    Walks ``result.pages`` -> ``blocks`` -> ``lines`` -> ``words`` and joins
    each line's ``word.value`` strings with single spaces.
    """
    text_lines = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                text_lines.append(" ".join(word.value for word in line.words))
    return "\n".join(text_lines)


def main():
    """Run the Streamlit page: upload a PDF, OCR it, and extract contact details.

    The uploaded PDF is read into memory, passed through the module-level
    ``ocr_model``, and the recognized text is sent to the OpenAI chat
    completions API (via the module-level ``client``) with a prompt asking
    for the email, phone number, and location. The model's answer is shown
    on the page; API failures are reported with ``st.error``.
    """
    st.title('EMAIL, Phone, Location Extractor')

    # Upload a PDF file
    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
    if uploaded_file is None:
        # Nothing has been uploaded yet; there is nothing to process.
        return

    # Load the PDF file with Doctr and run OCR on it
    pdf_bytes = uploaded_file.read()
    doc = DocumentFile.from_pdf(pdf_bytes)
    result = ocr_model(doc)

    text = _ocr_result_to_text(result)
    if not text.strip():
        # Avoid a pointless API call when the OCR step found no text.
        st.warning("No text could be extracted from the uploaded PDF.")
        return

    # Prepare the input for the LLM
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": f"Extract the email, phone number, and location from the following text:\n{text}"},
    ]

    # Use OpenAI's GPT-3.5-turbo to extract the details. Only the API call
    # and response unpacking are guarded; this is the UI boundary, so any
    # failure is surfaced to the user instead of crashing the page.
    try:
        chat_completion = client.chat.completions.create(
            messages=messages,
            model="gpt-3.5-turbo",
        )
        generated_text = chat_completion.choices[0].message.content
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return

    # Display the extracted information
    st.header('Extracted Information')
    if generated_text:
        st.write(generated_text)
    else:
        # The message content can be empty/None; say so rather than
        # rendering a literal "None".
        st.warning("The model returned an empty response.")
# Script entry point: launch the app when this file is executed directly.
if __name__ == "__main__":
    main()