# Spaces: Sleeping
import os | |
import streamlit as st | |
from doctr.models import ocr_predictor | |
from doctr.io import DocumentFile | |
from openai import OpenAI | |
@st.cache_resource
def _load_ocr_model():
    """Build the pretrained DocTR OCR predictor.

    Streamlit re-executes the whole script on every user interaction, so a
    bare module-level ``ocr_predictor(pretrained=True)`` would rebuild the
    model on each rerun. Wrapping the construction in ``st.cache_resource``
    makes reruns reuse the already-built predictor.

    Returns:
        The predictor object returned by ``ocr_predictor(pretrained=True)``.
    """
    return ocr_predictor(pretrained=True)


# Initialize DocTR OCR predictor (built once, then reused across reruns)
ocr_model = _load_ocr_model()

# Initialize the OpenAI client; the key is read from the OPENAI_API_KEY
# environment variable so that it never has to live in the source file.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Streamlit application | |
def _extract_text(result):
    """Flatten an OCR result into plain text, one OCR line per text line.

    Walks ``result.pages`` -> ``page.blocks`` -> ``block.lines`` ->
    ``line.words`` and joins the ``value`` of every word in a line with a
    single space; lines are joined with newlines.

    Args:
        result: The object returned by calling ``ocr_model`` on a document.

    Returns:
        The recognised text as a single string (empty if nothing was found).
    """
    text_lines = []
    for page in result.pages:
        for block in page.blocks:
            # str.join avoids re-copying the accumulated text for every word,
            # which is what repeated ``text += ...`` does.
            text_lines.extend(
                " ".join(word.value for word in line.words)
                for line in block.lines
            )
    return "\n".join(text_lines)


def main():
    """Run the Streamlit page: upload a PDF, OCR it, ask the LLM for contacts.

    Flow:
        1. Show the title and a PDF uploader; stop if nothing is uploaded yet.
        2. Run the uploaded bytes through DocTR and flatten the OCR result
           to plain text.
        3. Send that text to ``gpt-3.5-turbo`` with a request to extract the
           email, phone number, and location, and display the reply.

    Failures in the OCR step or in the OpenAI call are reported on the page
    with ``st.error`` instead of surfacing as a raw traceback.
    """
    st.title('EMAIL, Phone, Location Extractor')

    # Upload a PDF file
    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
    if uploaded_file is None:
        return

    # Load the PDF with DocTR and extract the text. The upload is untrusted
    # input (it may be corrupt or not really a PDF), so this is the UI
    # boundary where a broad catch is appropriate.
    try:
        doc = DocumentFile.from_pdf(uploaded_file.read())
        text = _extract_text(ocr_model(doc))
    except Exception as e:
        st.error(f"Could not read the uploaded PDF: {e}")
        return

    # Nothing recognised: do not spend an API call on an empty prompt.
    if not text.strip():
        st.warning("No text could be extracted from the uploaded PDF.")
        return

    # Prepare the input for the LLM
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": f"Extract the email, phone number, and location from the following text:\n{text}"},
    ]

    # Use OpenAI's GPT-3.5-turbo to extract the details
    try:
        chat_completion = client.chat.completions.create(
            messages=messages,
            model="gpt-3.5-turbo",
        )
        generated_text = chat_completion.choices[0].message.content
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return

    # Display the extracted information
    st.header('Extracted Information')
    st.write(generated_text)
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()