Anupam251272 committed on
Commit
5bf2fd4
·
verified ·
1 Parent(s): 181f4ef

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +347 -0
app.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pandas as pd
3
+ import networkx as nx
4
+ import pdfplumber
5
+ import gradio as gr
6
+ from transformers import pipeline, MBartTokenizer, MBartForConditionalGeneration
7
+ from sentence_transformers import SentenceTransformer
8
+ import re
9
+ from typing import List, Dict, Optional
10
+
11
class MultilingualAyurvedicRecommender:
    """Recommend Ayurvedic medicines from a remedies PDF with multilingual I/O.

    Pipeline: parse the PDF into (medicine, conditions, remedies) rows, build
    a graph linking each medicine to its conditions and remedies, embed the
    user's symptoms and the known conditions with a multilingual sentence
    encoder, and report the medicines attached to the closest conditions,
    translated into the requested output language.
    """

    # Checkpoint used for both the translation tokenizer and the model.
    _TRANSLATION_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"

    # Inclusive Unicode code point ranges of the scripts used by the
    # supported languages, mapped to the language code reported for them.
    _SCRIPT_RANGES = (
        (0x0900, 0x097F, "hi_IN"),  # Devanagari (shared by Hindi and Marathi)
        (0x0980, 0x09FF, "bn_IN"),  # Bengali
        (0x0A80, 0x0AFF, "gu_IN"),  # Gujarati
        (0x0B80, 0x0BFF, "ta_IN"),  # Tamil
        (0x0C00, 0x0C7F, "te_IN"),  # Telugu
    )

    # A page whose upper-cased text contains any of these is skipped.
    _SKIP_PAGE_MARKERS = ("INSTRUCTIONS", "INDEX", "FOREWORD")

    # A line mentioning any of these (case-insensitive) is taken as a condition.
    _CONDITION_KEYWORDS = (
        'pain', 'ache', 'fever', 'cold', 'cough', 'diabetes',
        'wounds', 'ulcer', 'skin', 'digestion', 'appetite',
    )

    # "Name (Other Name)" at the start of a line opens a new medicine entry.
    _MEDICINE_HEADER_RE = re.compile(r'^[A-Za-z\s]+\([A-Za-z\s]+\)')

    # A number followed by a dosage/frequency unit marks a remedy instruction.
    _DOSAGE_RE = re.compile(r'\d+(?:\s*(?:gm|ml|times|drops|days))')

    def __init__(self):
        """Load all models and create the empty knowledge graph."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize multilingual question-answering model.
        # NOTE(review): nothing in this class calls qa_model; it is kept only
        # so the attribute stays available to external callers.
        self.qa_model = pipeline(
            "question-answering",
            model="deepset/xlm-roberta-large-squad2",
            device=0 if self.device == "cuda" else -1
        )

        # Initialize multilingual sentence transformer
        self.similarity_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.similarity_model.to(self.device)

        # Initialize translation models
        print("Loading translation models...")
        # Let the checkpoint choose its own tokenizer class: the mBART-50
        # checkpoint needs the mBART-50 tokenizer (50-language code table),
        # not the original-mBART ``MBartTokenizer``.
        from transformers import AutoTokenizer
        self.translation_tokenizer = AutoTokenizer.from_pretrained(self._TRANSLATION_CHECKPOINT)
        self.translation_model = MBartForConditionalGeneration.from_pretrained(self._TRANSLATION_CHECKPOINT)
        self.translation_model.to(self.device)

        # Language codes for supported Indian languages
        self.language_codes = {
            "en_XX": "English",
            "hi_IN": "Hindi",
            "te_IN": "Telugu",
            "ta_IN": "Tamil",
            "mr_IN": "Marathi",
            "gu_IN": "Gujarati",
            "bn_IN": "Bengali"
        }

        self.G = nx.Graph()

    def detect_language(self, text: str) -> str:
        """
        Detect the language of input text from the script it is written in

        The dominant script (by character count) decides the result; ties go
        to the script seen first. Text with no letters from a known script is
        reported as English. Devanagari is always reported as Hindi, so
        Marathi input is not distinguished from Hindi.

        Args:
            text: Input text to detect language for

        Returns:
            str: Detected language code (e.g. "en_XX", "hi_IN")
        """
        counts: Dict[str, int] = {}
        for ch in text or "":
            if ch.isascii():
                if ch.isalpha():
                    counts["en_XX"] = counts.get("en_XX", 0) + 1
                continue
            code_point = ord(ch)
            for low, high, code in self._SCRIPT_RANGES:
                if low <= code_point <= high:
                    counts[code] = counts.get(code, 0) + 1
                    break

        if not counts:
            return "en_XX"
        return max(counts, key=counts.get)

    def translate_text(self, text: str, target_lang: str) -> str:
        """
        Translate text to target language

        Best effort: on any failure the error is printed and the input text
        is returned unchanged. Empty/blank text and text already detected as
        being in the target language are returned as-is without touching the
        translation model.

        Args:
            text: Text to translate
            target_lang: Target language code

        Returns:
            str: Translated text
        """
        if not text or not text.strip():
            return text

        try:
            source_lang = self.detect_language(text)

            if source_lang == target_lang:
                return text

            tokenizer = self.translation_tokenizer
            # The tokenizer must know the source language, otherwise it tags
            # every input with its default source language.
            tokenizer.src_lang = source_lang

            target_id = tokenizer.convert_tokens_to_ids(target_lang)
            if target_id is None or target_id == tokenizer.unk_token_id:
                raise ValueError(f"Unsupported target language: {target_lang}")

            inputs = tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=1024
            ).to(self.device)
            translated = self.translation_model.generate(
                **inputs,
                forced_bos_token_id=target_id,
                max_length=1024,
                num_beams=4,
                length_penalty=1.0
            )
            return tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Translation error: {e}")
            return text

    @staticmethod
    def _append_entry(data: Dict[str, List[str]], medicine: Optional[str],
                      conditions: List[str], remedies: List[str]) -> None:
        """Store one finished medicine entry; entries without conditions are dropped."""
        if medicine and conditions:
            data["Medicine"].append(medicine)
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the joined string is deterministic.
            data["Conditions"].append(';'.join(dict.fromkeys(conditions)))
            data["Remedies"].append(';'.join(remedies))

    def _parse_pages(self, page_texts: List[Optional[str]]) -> Dict[str, List[str]]:
        """Parse page texts into parallel Medicine/Conditions/Remedies lists.

        A medicine entry may continue across pages. Pages with no text
        (``None`` or empty) contribute nothing.
        """
        medicines_data: Dict[str, List[str]] = {
            "Medicine": [],
            "Conditions": [],
            "Remedies": []
        }
        current_medicine = None
        current_conditions: List[str] = []
        current_remedies: List[str] = []

        for text in page_texts:
            if not text:
                continue

            # Skip non-content pages
            upper_text = text.upper()
            if any(marker in upper_text for marker in self._SKIP_PAGE_MARKERS):
                continue

            for raw_line in text.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                # Detect medicine headers
                if self._MEDICINE_HEADER_RE.match(line):
                    self._append_entry(
                        medicines_data, current_medicine,
                        current_conditions, current_remedies
                    )
                    current_medicine = line.split('(')[0].strip()
                    current_conditions = []
                    current_remedies = []
                    continue

                if not current_medicine:
                    continue

                lowered = line.lower()
                if self._DOSAGE_RE.search(lowered):
                    # Detect remedy instructions
                    current_remedies.append(line)
                elif any(keyword in lowered for keyword in self._CONDITION_KEYWORDS):
                    # Detect conditions: keep only the part before the first ':'
                    current_conditions.append(line.split(':', 1)[0])

        # Add final medicine entry
        self._append_entry(
            medicines_data, current_medicine,
            current_conditions, current_remedies
        )
        return medicines_data

    def extract_from_pdf(self, pdf_path: str) -> pd.DataFrame:
        """
        Extract text from PDF and parse into structured format

        Args:
            pdf_path: Path to PDF file

        Returns:
            pd.DataFrame: Extracted medicine data with columns "Medicine",
            "Conditions" and "Remedies" (the latter two ';'-joined). An empty
            DataFrame is returned when the PDF cannot be processed.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # extract_text() can yield None for pages without text;
                # _parse_pages skips those instead of failing the whole file.
                page_texts = [page.extract_text() for page in pdf.pages]

            df = pd.DataFrame(self._parse_pages(page_texts))
            if df.empty:
                return df
            return df[df['Conditions'].str.len() > 0].drop_duplicates()

        except Exception as e:
            print(f"Error processing PDF: {e}")
            return pd.DataFrame()

    def build_knowledge_graph(self, df: pd.DataFrame) -> None:
        """
        Build knowledge graph from medicine data

        Replaces the current contents of ``self.G``. Each medicine node is
        linked to one node per condition and one node per remedy.

        Args:
            df: DataFrame containing medicine data
        """
        self.G.clear()

        for _, row in df.iterrows():
            medicine = row['Medicine']
            self.G.add_node(medicine, type='medicine')

            for condition in row['Conditions'].split(';'):
                condition = condition.strip()
                if condition:
                    self.G.add_node(condition, type='condition')
                    self.G.add_edge(medicine, condition)

            for remedy in row['Remedies'].split(';'):
                remedy = remedy.strip()
                if remedy:
                    self.G.add_node(remedy, type='remedy', info=remedy)
                    self.G.add_edge(medicine, remedy)

    def find_similar_conditions(self, symptoms: str, conditions: List[str],
                                threshold: float = 0.5) -> List[tuple]:
        """
        Find conditions similar to input symptoms

        Args:
            symptoms: Input symptoms text
            conditions: List of known conditions
            threshold: Minimum cosine similarity (exclusive) for a match

        Returns:
            List[tuple]: List of (condition, similarity_score) pairs, best
            match first. Empty when there are no symptoms or no conditions.
        """
        if not conditions or not symptoms or not symptoms.strip():
            return []

        symptoms_embedding = self.similarity_model.encode(symptoms, convert_to_tensor=True)
        conditions_embeddings = self.similarity_model.encode(conditions, convert_to_tensor=True)

        similarities = torch.nn.functional.cosine_similarity(
            symptoms_embedding.unsqueeze(0),
            conditions_embeddings,
            dim=1
        ).tolist()

        similar_conditions = [
            (condition, score)
            for condition, score in zip(conditions, similarities)
            if score > threshold
        ]

        return sorted(similar_conditions, key=lambda x: x[1], reverse=True)

    def recommend_medicines(self, symptoms: str, df: pd.DataFrame,
                            target_lang: str = "en_XX",
                            limit: Optional[int] = None) -> List[Dict]:
        """
        Recommend medicines based on symptoms with language support

        ``build_knowledge_graph`` must have been called with the same data
        beforehand; conditions missing from the graph are ignored.

        Args:
            symptoms: Input symptoms text
            df: DataFrame containing medicine data
            target_lang: Target language code
            limit: Keep only the best ``limit`` recommendations. Applied
                before translation so discarded entries are never translated.
                ``None`` keeps everything.

        Returns:
            List[Dict]: List of recommendations (keys 'medicine', 'condition',
            'confidence', 'remedies'), highest confidence first
        """
        if df is None or df.empty or 'Conditions' not in df.columns:
            return []

        english_symptoms = self.translate_text(symptoms, "en_XX")

        # Unique conditions in first-seen order (deterministic, unlike set()).
        all_conditions = list(dict.fromkeys(
            part.strip()
            for cell in df['Conditions'] if isinstance(cell, str)
            for part in cell.split(';') if part.strip()
        ))

        if not all_conditions:
            return []

        similar_conditions = self.find_similar_conditions(english_symptoms, all_conditions)

        matches = []
        for condition, confidence in similar_conditions:
            if condition not in self.G:
                continue
            # Walk the condition's neighbours instead of scanning every node.
            for medicine in self.G.neighbors(condition):
                if self.G.nodes[medicine].get('type') != 'medicine':
                    continue
                remedies = [
                    self.G.nodes[n].get('info', n)
                    for n in self.G.neighbors(medicine)
                    if self.G.nodes[n].get('type') == 'remedy'
                ]
                matches.append((medicine, condition, confidence, remedies))

        matches.sort(key=lambda match: match[2], reverse=True)
        if limit is not None:
            matches = matches[:limit]

        return [
            {
                'medicine': self.translate_text(medicine, target_lang),
                'condition': self.translate_text(condition, target_lang),
                'confidence': confidence,
                'remedies': [self.translate_text(remedy, target_lang) for remedy in remedies]
            }
            for medicine, condition, confidence, remedies in matches
        ]

    def process_file_and_recommend(
        self,
        file: Optional[object],
        symptoms: str,
        target_language: str = "English"
    ) -> str:
        """
        Process input file and return recommendations in specified language

        Args:
            file: Uploaded PDF file, either a path string or an object whose
                ``name`` attribute is the path
            symptoms: Input symptoms text
            target_language: Target language name

        Returns:
            str: Formatted recommendations text, or a message describing why
            no recommendations could be produced
        """
        try:
            wanted = (target_language or "English").lower()
            target_lang = next(
                (code for code, lang in self.language_codes.items()
                 if lang.lower() == wanted),
                "en_XX"
            )

            pdf_path = file if isinstance(file, str) else getattr(file, "name", None)
            if not pdf_path:
                return self.translate_text("Error: Please upload a PDF file.", target_lang)

            if not symptoms or not symptoms.strip():
                return self.translate_text("Please enter your symptoms.", target_lang)

            df = self.extract_from_pdf(pdf_path)
            if df.empty:
                return self.translate_text("Error: Could not extract data from the PDF file.", target_lang)

            self.build_knowledge_graph(df)
            recommendations = self.recommend_medicines(symptoms, df, target_lang, limit=5)

            if not recommendations:
                return self.translate_text("No matching recommendations found.", target_lang)

            # Translate the static labels once rather than once per entry.
            medicine_label = self.translate_text('Medicine', target_lang)
            condition_label = self.translate_text('Matching Condition', target_lang)
            confidence_label = self.translate_text('Confidence Score', target_lang)
            remedies_label = self.translate_text('Recommended Remedies', target_lang)

            output = [self.translate_text("Ayurvedic Medicine Recommendations:", target_lang)]

            for i, rec in enumerate(recommendations[:5], 1):
                output.extend([
                    f"\n{i}. {medicine_label}: {rec['medicine']}",
                    f" {condition_label}: {rec['condition']}",
                    f" {confidence_label}: {rec['confidence']:.2f}",
                    f" {remedies_label}:"
                ])
                output.extend([f" - {remedy}" for remedy in rec['remedies']])
                output.append("")

            return "\n".join(output)

        except Exception as e:
            # UI boundary: report the failure as text instead of raising.
            return f"Error: {str(e)}"
321
+
322
# Create and launch Gradio interface
def main():
    """Build the Gradio UI around a fresh recommender and launch it.

    The interface takes a PDF upload, a free-text symptoms box and an output
    language dropdown, and shows the formatted recommendations in a textbox.
    """
    recommender = MultilingualAyurvedicRecommender()

    # Input widgets, in the order the handler expects its arguments.
    pdf_input = gr.File(label="Upload Ayurvedic Home Remedies PDF")
    symptoms_input = gr.Textbox(
        label="Enter symptoms in any language (e.g., 'cold and fever' या 'सर्दी और बुखार' या 'జలుబు మరియు జ్వరం')"
    )
    language_input = gr.Dropdown(
        choices=list(recommender.language_codes.values()),
        label="Select output language",
        value="English"
    )
    result_box = gr.Textbox(label="Recommendations")

    interface = gr.Interface(
        fn=recommender.process_file_and_recommend,
        inputs=[pdf_input, symptoms_input, language_input],
        outputs=result_box,
        title="Multilingual Ayurvedic Medicine Recommender",
        description="Get Ayurvedic medicine recommendations in your preferred language. Enter symptoms in any language!"
    )

    interface.launch(share=True)


if __name__ == "__main__":
    main()