"""Streamlit app: OCR an uploaded PDF with docTR and ask an OpenAI chat model
to extract the email, phone number, and location from the recognized text."""

import os

import streamlit as st
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from openai import OpenAI

# Chat model used for the extraction request.
_MODEL_NAME = "gpt-3.5-turbo"

# Initialize DocTR OCR predictor.
# NOTE(review): this runs at module level, so it is rebuilt on every execution
# of the script; if the installed Streamlit supports `st.cache_resource`,
# consider caching it -- confirm the Streamlit version first.
ocr_model = ocr_predictor(pretrained=True)

# Initialize the OpenAI client; the key is read from the OPENAI_API_KEY
# environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def _collect_text(result) -> str:
    """Flatten an OCR result into plain text.

    Walks ``result.pages`` -> ``blocks`` -> ``lines`` -> ``words`` and emits
    every ``word.value`` followed by a single space, with a newline after each
    line (so each non-empty line ends with ``" \\n"``).
    """
    parts = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                parts.extend(word.value + " " for word in line.words)
                parts.append("\n")
    # One join instead of repeated `+=` on an ever-growing string.
    return "".join(parts)


def _build_messages(text: str) -> list:
    """Return the chat messages asking the model to extract contact details."""
    return [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {
            "role": "user",
            "content": (
                "Extract the email, phone number, and location from the "
                f"following text:\n{text}"
            ),
        },
    ]


def main() -> None:
    """Run the Streamlit page: upload a PDF, OCR it, and show the LLM output."""
    st.title('EMAIL, Phone, Location Extractor')

    # Upload a PDF file
    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
    if uploaded_file is None:
        return

    # Load the PDF file with docTR and run OCR on it
    pdf_bytes = uploaded_file.read()
    doc = DocumentFile.from_pdf(pdf_bytes)
    text = _collect_text(ocr_model(doc))

    # Nothing recognized: skip the API call rather than sending an empty prompt.
    if not text.strip():
        st.warning("No text could be extracted from the uploaded PDF.")
        return

    # Use the OpenAI chat model to extract the details. The broad catch is
    # deliberate: this is the UI boundary, and any failure is shown to the user.
    try:
        chat_completion = client.chat.completions.create(
            messages=_build_messages(text),
            model=_MODEL_NAME,
        )
        generated_text = chat_completion.choices[0].message.content

        # Display the extracted information
        st.header('Extracted Information')
        st.write(generated_text)
    except Exception as e:
        st.error(f"An error occurred: {e}")


if __name__ == '__main__':
    main()