Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,17 +12,19 @@ from langchain.prompts import PromptTemplate
|
|
12 |
from langchain.chains import RetrievalQA
|
13 |
from langchain_groq import ChatGroq
|
14 |
|
15 |
-
|
16 |
class ChatbotModel:
|
17 |
def __init__(self):
|
|
|
18 |
os.environ["GROQ_API_KEY"] = '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'
|
19 |
|
|
|
20 |
self.embeddings = HuggingFaceEmbeddings(
|
21 |
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
22 |
model_kwargs={'device': 'cpu'},
|
23 |
encode_kwargs={'normalize_embeddings': True}
|
24 |
)
|
25 |
|
|
|
26 |
self.llm = ChatGroq(
|
27 |
model='llama3-70b-8192',
|
28 |
temperature=0.5,
|
@@ -31,14 +33,70 @@ class ChatbotModel:
|
|
31 |
max_retries=2,
|
32 |
)
|
33 |
|
|
|
34 |
self.memory = ConversationBufferMemory(memory_key="history", input_key="question")
|
35 |
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
self.QA_CHAIN_PROMPT = PromptTemplate(
|
39 |
input_variables=["history", "context", "question"],
|
40 |
template=self.template
|
41 |
)
|
|
|
42 |
self.db1 = None
|
43 |
self.qa_chain = None
|
44 |
|
@@ -51,13 +109,16 @@ class ChatbotModel:
|
|
51 |
return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])
|
52 |
|
53 |
def process_file(self, uploaded_file):
|
|
|
54 |
_, file_extension = os.path.splitext(uploaded_file.name)
|
55 |
file_extension = file_extension.lower()
|
56 |
|
|
|
57 |
with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
|
58 |
temp_file.write(uploaded_file.read())
|
59 |
temp_path = temp_file.name
|
60 |
|
|
|
61 |
if file_extension == '.pdf':
|
62 |
raw_text = self.ocr_pdf(temp_path, language='guj+eng')
|
63 |
elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
|
@@ -65,9 +126,11 @@ class ChatbotModel:
|
|
65 |
else:
|
66 |
return "Unsupported file format."
|
67 |
|
|
|
68 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
69 |
text_chunks = text_splitter.split_text(raw_text)
|
70 |
|
|
|
71 |
self.db1 = FAISS.from_documents(text_chunks, self.embeddings)
|
72 |
self.qa_chain = RetrievalQA.from_chain_type(
|
73 |
self.llm,
|
@@ -80,27 +143,26 @@ class ChatbotModel:
|
|
80 |
"memory": self.memory
|
81 |
}
|
82 |
)
|
83 |
-
|
84 |
return "File processed successfully!"
|
85 |
|
86 |
def get_response(self, user_input):
    """Answer a user question using the QA chain built by ``process_file``.

    Args:
        user_input: Question text entered by the user.

    Returns:
        The ``"result"`` entry of the chain output, or a short instruction
        message when no file has been processed yet or the question is blank.
    """
    if not self.qa_chain:
        return "Please upload and process a file before asking questions."
    if not (user_input or "").strip():
        # Blank or missing question: skip the LLM round-trip entirely.
        return "Please enter a question."
    # Chain.__call__ is deprecated in LangChain >= 0.1; invoke() accepts the
    # same input mapping and returns the same output mapping.
    response = self.qa_chain.invoke({"query": user_input})
    return response["result"]
|
91 |
|
92 |
-
|
93 |
# Shared ChatbotModel instance; the Gradio callbacks below call its methods.
chatbot = ChatbotModel()
|
94 |
|
95 |
-
|
96 |
def upload_and_process(file):
    """Gradio upload callback: hand *file* to the shared chatbot.

    Returns the status string produced by ``chatbot.process_file``.
    """
    status = chatbot.process_file(file)
    return status
|
98 |
|
99 |
-
|
100 |
def ask_question(question):
    """Gradio submit callback: pass *question* to the shared chatbot.

    Returns the text produced by ``chatbot.get_response``.
    """
    reply = chatbot.get_response(question)
    return reply
|
102 |
|
103 |
-
|
104 |
interface = gr.Blocks()
|
105 |
|
106 |
with interface:
|
@@ -115,7 +177,9 @@ with interface:
|
|
115 |
ask_btn = gr.Button("Submit")
|
116 |
answer = gr.Textbox(label="Answer")
|
117 |
|
|
|
118 |
upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
|
119 |
ask_btn.click(ask_question, inputs=question_box, outputs=answer)
|
120 |
|
|
|
121 |
interface.launch()
|
|
|
12 |
from langchain.chains import RetrievalQA
|
13 |
from langchain_groq import ChatGroq
|
14 |
|
|
|
15 |
class ChatbotModel:
|
16 |
def __init__(self):
|
17 |
+
# Initialize the environment variable for the GROQ API Key
|
18 |
os.environ["GROQ_API_KEY"] = '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'
|
19 |
|
20 |
+
# Initialize embeddings
|
21 |
self.embeddings = HuggingFaceEmbeddings(
|
22 |
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
23 |
model_kwargs={'device': 'cpu'},
|
24 |
encode_kwargs={'normalize_embeddings': True}
|
25 |
)
|
26 |
|
27 |
+
# Initialize the chat model
|
28 |
self.llm = ChatGroq(
|
29 |
model='llama3-70b-8192',
|
30 |
temperature=0.5,
|
|
|
33 |
max_retries=2,
|
34 |
)
|
35 |
|
36 |
+
# Initialize memory for conversation
|
37 |
self.memory = ConversationBufferMemory(memory_key="history", input_key="question")
|
38 |
|
39 |
+
# Create the QA chain prompt template
|
40 |
+
self.template = """You are an intelligent educational assistant specialized in handling queries about documents in both English and Gujarati languages. You have been provided with OCR-processed text from {document_type} that contains important educational information.
|
41 |
+
|
42 |
+
Core Responsibilities:
|
43 |
+
1. Language Processing:
|
44 |
+
- Identify the language of the user's query (English or Gujarati)
|
45 |
+
- Respond in the same language as the query
|
46 |
+
- If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
|
47 |
+
- For technical terms, provide both English and Gujarati versions when relevant
|
48 |
+
|
49 |
+
2. Document Understanding:
|
50 |
+
- Analyze the OCR-processed text from the uploaded {document_type}
|
51 |
+
- Account for potential OCR errors or misinterpretations
|
52 |
+
- Focus on extracting accurate information despite possible OCR imperfections
|
53 |
+
|
54 |
+
3. Response Guidelines:
|
55 |
+
- Provide direct, clear answers based solely on the document content
|
56 |
+
- If information is unclear due to OCR quality, mention this limitation
|
57 |
+
- For numerical data (dates, percentages, marks), double-check accuracy before responding
|
58 |
+
- If information is not found in the document, clearly state: "This information is not present in the uploaded document"
|
59 |
+
|
60 |
+
4. Educational Context:
|
61 |
+
- Maintain focus on educational queries related to the document content
|
62 |
+
- For admission-related queries, emphasize important deadlines and requirements
|
63 |
+
- For scholarship information, highlight eligibility criteria and application processes
|
64 |
+
- For course-related queries, provide detailed, accurate information from the document
|
65 |
+
|
66 |
+
5. Response Format:
|
67 |
+
- Structure responses clearly with relevant subpoints when necessary
|
68 |
+
- For complex information, break down the answer into digestible parts
|
69 |
+
- Include relevant reference points from the document when applicable
|
70 |
+
- Format numerical data and dates clearly
|
71 |
+
|
72 |
+
6. Quality Control:
|
73 |
+
- Verify that responses align with the document content
|
74 |
+
- Don't make assumptions beyond the provided information
|
75 |
+
- If multiple interpretations are possible due to OCR quality, mention all possibilities
|
76 |
+
- Maintain consistency in terminology throughout the conversation
|
77 |
+
|
78 |
+
Important Rules:
|
79 |
+
- Never make up information not present in the document
|
80 |
+
- Don't combine information from previous conversations or external knowledge
|
81 |
+
- Always indicate if certain parts of the document are unclear due to OCR quality
|
82 |
+
- Maintain professional tone while being accessible to students and parents
|
83 |
+
- If the query is out of scope of the uploaded document, politely redirect to relevant official sources
|
84 |
+
|
85 |
+
Context from uploaded document:
|
86 |
+
{context}
|
87 |
+
|
88 |
+
Chat History:
|
89 |
+
{history}
|
90 |
+
|
91 |
+
Current Question: {question}
|
92 |
+
Assistant: Let me provide a clear and accurate response based on the uploaded document content...
|
93 |
+
"""
|
94 |
|
95 |
self.QA_CHAIN_PROMPT = PromptTemplate(
|
96 |
input_variables=["history", "context", "question"],
|
97 |
template=self.template
|
98 |
)
|
99 |
+
|
100 |
self.db1 = None
|
101 |
self.qa_chain = None
|
102 |
|
|
|
109 |
return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])
|
110 |
|
111 |
def process_file(self, uploaded_file):
|
112 |
+
"""Process an uploaded file and initialize the QA chain."""
|
113 |
_, file_extension = os.path.splitext(uploaded_file.name)
|
114 |
file_extension = file_extension.lower()
|
115 |
|
116 |
+
# Temporarily save the file for processing
|
117 |
with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
|
118 |
temp_file.write(uploaded_file.read())
|
119 |
temp_path = temp_file.name
|
120 |
|
121 |
+
# OCR processing based on file type
|
122 |
if file_extension == '.pdf':
|
123 |
raw_text = self.ocr_pdf(temp_path, language='guj+eng')
|
124 |
elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
|
|
|
126 |
else:
|
127 |
return "Unsupported file format."
|
128 |
|
129 |
+
# Split text into chunks
|
130 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
131 |
text_chunks = text_splitter.split_text(raw_text)
|
132 |
|
133 |
+
# Create vector store and initialize QA chain
|
134 |
self.db1 = FAISS.from_documents(text_chunks, self.embeddings)
|
135 |
self.qa_chain = RetrievalQA.from_chain_type(
|
136 |
self.llm,
|
|
|
143 |
"memory": self.memory
|
144 |
}
|
145 |
)
|
|
|
146 |
return "File processed successfully!"
|
147 |
|
148 |
def get_response(self, user_input):
    """Generate a response to the user's question.

    Args:
        user_input: Question text entered by the user.

    Returns:
        The ``"result"`` entry of the QA chain output, or a short instruction
        message when no file has been processed yet or the question is blank.
    """
    if not self.qa_chain:
        return "Please upload and process a file before asking questions."
    if not (user_input or "").strip():
        # Blank or missing question: skip the LLM round-trip entirely.
        return "Please enter a question."
    # Chain.__call__ is deprecated in LangChain >= 0.1; invoke() accepts the
    # same input mapping and returns the same output mapping.
    response = self.qa_chain.invoke({"query": user_input})
    return response["result"]
|
154 |
|
155 |
+
# Initialize the chatbot
|
156 |
# Shared ChatbotModel instance; the Gradio callbacks below call its methods.
chatbot = ChatbotModel()
|
157 |
|
158 |
+
# Define Gradio interface functions
|
159 |
def upload_and_process(file):
    """Gradio upload callback: hand *file* to the shared chatbot.

    Returns the status string produced by ``chatbot.process_file``.
    """
    status = chatbot.process_file(file)
    return status
|
161 |
|
|
|
162 |
def ask_question(question):
    """Gradio submit callback: pass *question* to the shared chatbot.

    Returns the text produced by ``chatbot.get_response``.
    """
    reply = chatbot.get_response(question)
    return reply
|
164 |
|
165 |
+
# Set up Gradio interface
|
166 |
interface = gr.Blocks()
|
167 |
|
168 |
with interface:
|
|
|
177 |
ask_btn = gr.Button("Submit")
|
178 |
answer = gr.Textbox(label="Answer")
|
179 |
|
180 |
+
# Connect buttons to functions
|
181 |
upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
|
182 |
ask_btn.click(ask_question, inputs=question_box, outputs=answer)
|
183 |
|
184 |
+
# Launch Gradio interface
|
185 |
interface.launch()
|