Hammad712 committed on
Commit
05847c9
·
verified ·
1 Parent(s): 86757af

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import json
4
+ import tempfile
5
+ from typing import List
6
+ from pydantic import BaseModel
7
+ from langchain_groq import ChatGroq
8
+ from langchain.document_loaders import PyPDFLoader
9
+
10
# Response schema that the model's JSON reply is validated against.
# NOTE: documented with "#" comments on purpose. A class docstring would be
# emitted as "description" by model_json_schema() and would therefore change
# the schema text that is embedded in the prompt.
class ExtractionResult(BaseModel):
    # Extracted answers, one string per entry.
    answers: List[str]
13
+
14
def get_llm(api_key: str):
    """Build the Groq chat client used for extraction.

    Args:
        api_key: Groq API key handed to the client.

    Returns:
        A ``ChatGroq`` instance configured with the
        ``llama-3.3-70b-versatile`` model, ``temperature=0`` and
        ``max_tokens=1024``.
    """
    settings = {
        "model": "llama-3.3-70b-versatile",
        "temperature": 0,
        "max_tokens": 1024,
        "api_key": api_key,
    }
    return ChatGroq(**settings)
22
+
23
def process_pdf(file) -> str:
    """Extract the text of an uploaded PDF.

    The upload is copied to a temporary ``.pdf`` file because the loader is
    given a filesystem path rather than a stream. The temporary file is
    always removed, including when reading the upload or parsing the PDF
    fails.

    Args:
        file: Binary file-like object exposing ``read()``.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines.

    Raises:
        Exception: Whatever ``file.read()`` or the PDF loader raises is
            propagated unchanged after the temporary file has been removed.
    """
    # delete=False: the file must outlive the ``with`` block, because it is
    # closed first and then reopened by path by the loader.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_path = tmp_file.name
    try:
        with tmp_file:
            tmp_file.write(file.read())
        pages = PyPDFLoader(tmp_path).load_and_split()
    finally:
        # Clean up on every path so failed uploads do not accumulate on disk.
        os.remove(tmp_path)

    return "\n".join(page.page_content for page in pages)
34
+
35
def build_prompt(all_page_content: str) -> str:
    """Assemble the single prompt string sent to the model.

    The prompt is an instruction section that embeds the JSON schema of
    ``ExtractionResult``, followed by a blank line and a request section
    that carries the document text.

    Args:
        all_page_content: Text extracted from the document.

    Returns:
        The instruction section and the request section joined by a blank
        line.
    """
    schema = json.dumps(ExtractionResult.model_json_schema(), indent=2)

    instructions = "".join((
        "You are a document analysis tool that extracts the options and correct answers from the provided document content. ",
        "The output must be a JSON object that strictly follows the schema: ",
        schema,
    ))
    request = "".join((
        "Please extract the correct answers and options (A, B, C, D, E) from the following document content:\n\n",
        all_page_content,
    ))

    return "\n\n".join((instructions, request))
48
+
49
def main():
    """Render the Streamlit page and run the extraction for an uploaded PDF.

    Flow: resolve the Groq API key (Streamlit secrets first, then the
    ``GROQ_API_KEY`` environment variable), build the model client, wait for
    a PDF upload, then extract its text, query the model in JSON mode and
    display the validated result. Any failure during processing is shown in
    the page instead of crashing the app.
    """
    st.title("PDF Answer Extraction App")
    st.write("Upload a PDF document to extract the correct answers and options.")

    # Accessing st.secrets raises (FileNotFoundError or a subclass of it,
    # depending on the Streamlit version) when no secrets.toml exists, which
    # would prevent the environment-variable fallback from ever being tried.
    try:
        api_key = st.secrets.get("GROQ_API_KEY")
    except FileNotFoundError:
        api_key = None
    api_key = api_key or os.getenv("GROQ_API_KEY")
    if not api_key:
        st.error("GROQ API key not found! Please set it in your environment or Streamlit secrets.")
        st.stop()

    llm = get_llm(api_key)

    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
    if uploaded_file is None:
        # Nothing uploaded yet; Streamlit reruns the script on upload.
        return

    with st.spinner("Processing the PDF..."):
        # Top-level UI boundary: report any failure to the user.
        try:
            all_page_content = process_pdf(uploaded_file)
            prompt = build_prompt(all_page_content)
            response = llm.invoke(prompt, response_format={"type": "json_object"})
            result = ExtractionResult.model_validate_json(response.content)

            st.success("Extraction complete!")
            st.json(result.model_dump())
        except Exception as e:
            st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()