# PDF-Insight-PRO / preprocessing.py
# jatinmehra's picture
# Initial
# 70da74c verified
import os
import PyPDF2
from groq import Groq
import streamlit as st
from collections import defaultdict
class Model:
    """
    Answers questions about PDF documents through the Groq chat API.

    The extracted text of every added document is kept in ``contexts``.
    For each question the texts are joined and sent as the system message,
    followed by the question as the user message. The most recent answer
    for each question is remembered in ``cache``.
    """

    def __init__(self):
        """
        Initializes the Model object and sets up the Groq client.

        The API key is read from Streamlit secrets first and from the
        ``GROQ_API_KEY`` environment variable as a fallback.

        Raises:
        - ValueError: If no API key can be found in either place.
        """
        try:
            api_key = st.secrets["GROQ_API_KEY"]
        except (KeyError, FileNotFoundError):
            # The secret (or the whole secrets file) is missing; fall back
            # to the environment instead of failing with an unrelated error.
            api_key = None
        api_key = api_key or os.getenv("GROQ_API_KEY")
        if not api_key:
            raise ValueError("GROQ_API_KEY environment variable is not set.")
        self.client = Groq(api_key=api_key)
        self.contexts = []  # One extracted text per added document
        self.cache = defaultdict(dict)  # query -> last answer and its inputs

    def extract_text_from_pdf(self, pdf_file) -> str:
        """
        Extracts text from a PDF file.

        Args:
        - pdf_file: The file-like object of the PDF.

        Returns:
        - text: The text of all pages concatenated in page order.

        Raises:
        - ValueError: If the PDF cannot be read or its text extracted.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # A page that yields no text is treated as an empty string so it
            # cannot break the concatenation.
            return "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )
        except Exception as e:
            raise ValueError(f"Error extracting text: {str(e)}") from e

    def generate_response(self, context, query, temperature, max_tokens, model):
        """
        Generates a response based on the given context and query.

        A cached answer is reused only when the query, the context and all
        generation settings are the same as for the cached request.

        Args:
        - context: The context for generating the response.
        - query: The query or question.
        - temperature: The sampling temperature for response generation.
        - max_tokens: The maximum number of tokens for the response.
        - model: The model ID to be used for generating the response.

        Returns:
        - response: The generated response, or a string starting with
          "API request failed:" if the request raised an exception.
        """
        request_params = (temperature, max_tokens, model)

        # Caching check: the settings are part of the match, otherwise an
        # answer produced by another model or temperature would be returned.
        cached = self.cache.get(query)
        if (
            cached
            and cached.get("context") == context
            and cached.get("params") == request_params
        ):
            return cached["response"]

        messages = [
            {"role": "system", "content": f"Context: {context}"},
            {"role": "user", "content": query},
        ]
        try:
            completion = self.client.chat.completions.create(
                model=model,  # Model ID
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            response = completion.choices[0].message.content
        except Exception as e:
            # Failures are reported as text and never cached, so the same
            # question can be retried.
            return f"API request failed: {str(e)}"

        # Store the entry in one assignment so it is never half-filled.
        self.cache[query] = {
            "context": context,
            "response": response,
            "params": request_params,
        }
        return response

    def add_to_context(self, file_path: str) -> None:
        """
        Reads a PDF file and appends its content to the context for generating responses.

        Args:
        - file_path: The path to the PDF file.

        Raises:
        - ValueError: If the file cannot be opened or its text extracted.
        """
        try:
            with open(file_path, "rb") as pdf_file:
                context = self.extract_text_from_pdf(pdf_file)
            self.contexts.append(context)
        except Exception as e:
            raise ValueError(f"Error processing PDF: {str(e)}") from e

    def remove_from_context(self, index: int) -> None:
        """
        Removes a document from the context based on its index.

        Args:
        - index: The zero-based index of the document to remove.

        Raises:
        - ValueError: If the index is negative or past the last document.
        """
        if not 0 <= index < len(self.contexts):
            raise ValueError("Invalid index for removing context.")
        self.contexts.pop(index)

    def get_combined_context(self) -> str:
        """
        Combines all contexts into a single context string.

        Returns:
        - combined_context: The document texts joined by newlines.
        """
        return "\n".join(self.contexts)

    def get_response(self, question: str, temperature: float, max_tokens: int, model: str):
        """
        Generates a response based on the given question and the current combined context.

        Args:
        - question: The user's question.
        - temperature: The sampling temperature for response generation.
        - max_tokens: The maximum number of tokens for the response.
        - model: The model ID to be used for generating the response.

        Returns:
        - response: The generated response or a prompt to upload a document.
        """
        if not self.contexts:
            return "Please upload a document."
        combined_context = self.get_combined_context()
        return self.generate_response(
            combined_context, question, temperature, max_tokens, model
        )

    def clear(self) -> None:
        """
        Clears the current context and all cached responses.
        """
        self.contexts = []
        self.cache.clear()