KrishP-12 committed on
Commit
2de3e63
·
verified ·
1 Parent(s): aafc661

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -7
app.py CHANGED
@@ -12,17 +12,19 @@ from langchain.prompts import PromptTemplate
12
  from langchain.chains import RetrievalQA
13
  from langchain_groq import ChatGroq
14
 
15
-
16
  class ChatbotModel:
17
  def __init__(self):
 
18
  os.environ["GROQ_API_KEY"] = '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'
19
 
 
20
  self.embeddings = HuggingFaceEmbeddings(
21
  model_name="sentence-transformers/all-MiniLM-L6-v2",
22
  model_kwargs={'device': 'cpu'},
23
  encode_kwargs={'normalize_embeddings': True}
24
  )
25
 
 
26
  self.llm = ChatGroq(
27
  model='llama3-70b-8192',
28
  temperature=0.5,
@@ -31,14 +33,70 @@ class ChatbotModel:
31
  max_retries=2,
32
  )
33
 
 
34
  self.memory = ConversationBufferMemory(memory_key="history", input_key="question")
35
 
36
- self.template = """You are an intelligent assistant... (Rest of your prompt as is)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  self.QA_CHAIN_PROMPT = PromptTemplate(
39
  input_variables=["history", "context", "question"],
40
  template=self.template
41
  )
 
42
  self.db1 = None
43
  self.qa_chain = None
44
 
@@ -51,13 +109,16 @@ class ChatbotModel:
51
  return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])
52
 
53
  def process_file(self, uploaded_file):
 
54
  _, file_extension = os.path.splitext(uploaded_file.name)
55
  file_extension = file_extension.lower()
56
 
 
57
  with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
58
  temp_file.write(uploaded_file.read())
59
  temp_path = temp_file.name
60
 
 
61
  if file_extension == '.pdf':
62
  raw_text = self.ocr_pdf(temp_path, language='guj+eng')
63
  elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
@@ -65,9 +126,11 @@ class ChatbotModel:
65
  else:
66
  return "Unsupported file format."
67
 
 
68
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
69
  text_chunks = text_splitter.split_text(raw_text)
70
 
 
71
  self.db1 = FAISS.from_documents(text_chunks, self.embeddings)
72
  self.qa_chain = RetrievalQA.from_chain_type(
73
  self.llm,
@@ -80,27 +143,26 @@ class ChatbotModel:
80
  "memory": self.memory
81
  }
82
  )
83
-
84
  return "File processed successfully!"
85
 
86
  def get_response(self, user_input):
 
87
  if not self.qa_chain:
88
  return "Please upload and process a file before asking questions."
89
  response = self.qa_chain({"query": user_input})
90
  return response["result"]
91
 
92
-
93
  chatbot = ChatbotModel()
94
 
95
-
96
  def upload_and_process(file):
97
  return chatbot.process_file(file)
98
 
99
-
100
  def ask_question(question):
101
  return chatbot.get_response(question)
102
 
103
-
104
  interface = gr.Blocks()
105
 
106
  with interface:
@@ -115,7 +177,9 @@ with interface:
115
  ask_btn = gr.Button("Submit")
116
  answer = gr.Textbox(label="Answer")
117
 
 
118
  upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
119
  ask_btn.click(ask_question, inputs=question_box, outputs=answer)
120
 
 
121
  interface.launch()
 
12
  from langchain.chains import RetrievalQA
13
  from langchain_groq import ChatGroq
14
 
 
15
  class ChatbotModel:
16
  def __init__(self):
17
+ # Initialize the environment variable for the GROQ API Key
18
  os.environ["GROQ_API_KEY"] = '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'
19
 
20
+ # Initialize embeddings
21
  self.embeddings = HuggingFaceEmbeddings(
22
  model_name="sentence-transformers/all-MiniLM-L6-v2",
23
  model_kwargs={'device': 'cpu'},
24
  encode_kwargs={'normalize_embeddings': True}
25
  )
26
 
27
+ # Initialize the chat model
28
  self.llm = ChatGroq(
29
  model='llama3-70b-8192',
30
  temperature=0.5,
 
33
  max_retries=2,
34
  )
35
 
36
+ # Initialize memory for conversation
37
  self.memory = ConversationBufferMemory(memory_key="history", input_key="question")
38
 
39
+ # Create the QA chain prompt template
40
+ self.template = """You are an intelligent educational assistant specialized in handling queries about documents in both English and Gujarati languages. You have been provided with OCR-processed text from {document_type} that contains important educational information.
41
+
42
+ Core Responsibilities:
43
+ 1. Language Processing:
44
+ - Identify the language of the user's query (English or Gujarati)
45
+ - Respond in the same language as the query
46
+ - If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
47
+ - For technical terms, provide both English and Gujarati versions when relevant
48
+
49
+ 2. Document Understanding:
50
+ - Analyze the OCR-processed text from the uploaded {document_type}
51
+ - Account for potential OCR errors or misinterpretations
52
+ - Focus on extracting accurate information despite possible OCR imperfections
53
+
54
+ 3. Response Guidelines:
55
+ - Provide direct, clear answers based solely on the document content
56
+ - If information is unclear due to OCR quality, mention this limitation
57
+ - For numerical data (dates, percentages, marks), double-check accuracy before responding
58
+ - If information is not found in the document, clearly state: "This information is not present in the uploaded document"
59
+
60
+ 4. Educational Context:
61
+ - Maintain focus on educational queries related to the document content
62
+ - For admission-related queries, emphasize important deadlines and requirements
63
+ - For scholarship information, highlight eligibility criteria and application processes
64
+ - For course-related queries, provide detailed, accurate information from the document
65
+
66
+ 5. Response Format:
67
+ - Structure responses clearly with relevant subpoints when necessary
68
+ - For complex information, break down the answer into digestible parts
69
+ - Include relevant reference points from the document when applicable
70
+ - Format numerical data and dates clearly
71
+
72
+ 6. Quality Control:
73
+ - Verify that responses align with the document content
74
+ - Don't make assumptions beyond the provided information
75
+ - If multiple interpretations are possible due to OCR quality, mention all possibilities
76
+ - Maintain consistency in terminology throughout the conversation
77
+
78
+ Important Rules:
79
+ - Never make up information not present in the document
80
+ - Don't combine information from previous conversations or external knowledge
81
+ - Always indicate if certain parts of the document are unclear due to OCR quality
82
+ - Maintain professional tone while being accessible to students and parents
83
+ - If the query is out of scope of the uploaded document, politely redirect to relevant official sources
84
+
85
+ Context from uploaded document:
86
+ {context}
87
+
88
+ Chat History:
89
+ {history}
90
+
91
+ Current Question: {question}
92
+ Assistant: Let me provide a clear and accurate response based on the uploaded document content...
93
+ """
94
 
95
  self.QA_CHAIN_PROMPT = PromptTemplate(
96
  input_variables=["history", "context", "question"],
97
  template=self.template
98
  )
99
+
100
  self.db1 = None
101
  self.qa_chain = None
102
 
 
109
  return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])
110
 
111
  def process_file(self, uploaded_file):
112
+ """Process an uploaded file and initialize the QA chain."""
113
  _, file_extension = os.path.splitext(uploaded_file.name)
114
  file_extension = file_extension.lower()
115
 
116
+ # Temporarily save the file for processing
117
  with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
118
  temp_file.write(uploaded_file.read())
119
  temp_path = temp_file.name
120
 
121
+ # OCR processing based on file type
122
  if file_extension == '.pdf':
123
  raw_text = self.ocr_pdf(temp_path, language='guj+eng')
124
  elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
 
126
  else:
127
  return "Unsupported file format."
128
 
129
+ # Split text into chunks
130
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
131
  text_chunks = text_splitter.split_text(raw_text)
132
 
133
+ # Create vector store and initialize QA chain
134
  self.db1 = FAISS.from_documents(text_chunks, self.embeddings)
135
  self.qa_chain = RetrievalQA.from_chain_type(
136
  self.llm,
 
143
  "memory": self.memory
144
  }
145
  )
 
146
  return "File processed successfully!"
147
 
148
  def get_response(self, user_input):
149
+ """Generate response to the user input question."""
150
  if not self.qa_chain:
151
  return "Please upload and process a file before asking questions."
152
  response = self.qa_chain({"query": user_input})
153
  return response["result"]
154
 
155
+ # Initialize the chatbot
156
  chatbot = ChatbotModel()
157
 
158
+ # Define Gradio interface functions
159
  def upload_and_process(file):
160
  return chatbot.process_file(file)
161
 
 
162
  def ask_question(question):
163
  return chatbot.get_response(question)
164
 
165
+ # Set up Gradio interface
166
  interface = gr.Blocks()
167
 
168
  with interface:
 
177
  ask_btn = gr.Button("Submit")
178
  answer = gr.Textbox(label="Answer")
179
 
180
+ # Connect buttons to functions
181
  upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
182
  ask_btn.click(ask_question, inputs=question_box, outputs=answer)
183
 
184
+ # Launch Gradio interface
185
  interface.launch()