Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import pandas as pd
|
3 |
+
import networkx as nx
|
4 |
+
import pdfplumber
|
5 |
+
import gradio as gr
|
6 |
+
from transformers import pipeline, MBartTokenizer, MBartForConditionalGeneration
|
7 |
+
from sentence_transformers import SentenceTransformer
|
8 |
+
import re
|
9 |
+
from typing import List, Dict, Optional
|
10 |
+
|
11 |
+
class MultilingualAyurvedicRecommender:
    """Recommend Ayurvedic medicines for free-text symptoms in several languages.

    The recommender parses a remedies PDF into (medicine, conditions, remedies)
    rows, stores them in a graph, matches the user's symptoms against the known
    conditions with multilingual sentence embeddings, and translates input and
    output with an mBART-50 many-to-many model.
    """

    # mBART-50 language code -> display name for every supported language.
    _DEFAULT_LANGUAGE_CODES = {
        "en_XX": "English",
        "hi_IN": "Hindi",
        "te_IN": "Telugu",
        "ta_IN": "Tamil",
        "mr_IN": "Marathi",
        "gu_IN": "Gujarati",
        "bn_IN": "Bengali",
    }

    # Inclusive Unicode code-point ranges of the scripts we can recognise,
    # mapped to the language code we assume for that script.
    # NOTE: Hindi and Marathi share Devanagari; script alone cannot tell them
    # apart, so Devanagari text is treated as Hindi.
    _SCRIPT_RANGES = (
        (0x0900, 0x097F, "hi_IN"),  # Devanagari
        (0x0980, 0x09FF, "bn_IN"),  # Bengali
        (0x0A80, 0x0AFF, "gu_IN"),  # Gujarati
        (0x0B80, 0x0BFF, "ta_IN"),  # Tamil
        (0x0C00, 0x0C7F, "te_IN"),  # Telugu
    )

    _TRANSLATION_MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
    _RESULT_COLUMNS = ("Medicine", "Conditions", "Remedies")

    # Pages whose text contains one of these markers are treated as non-content.
    _SKIPPED_PAGE_MARKERS = ("INSTRUCTIONS", "INDEX", "FOREWORD")

    # A line mentioning one of these words is recorded as a condition.
    _CONDITION_KEYWORDS = (
        'pain', 'ache', 'fever', 'cold', 'cough', 'diabetes',
        'wounds', 'ulcer', 'skin', 'digestion', 'appetite',
    )

    # "Name (Other Name)" at the start of a line opens a new medicine entry.
    _MEDICINE_HEADER_RE = re.compile(r'^[A-Za-z\s]+\([A-Za-z\s]+\)')
    # A number followed by a dosage/frequency unit marks a remedy instruction.
    _DOSAGE_RE = re.compile(r'\d+(?:\s*(?:gm|ml|times|drops|days))')

    def __init__(self):
        # The mBART-50 model card loads this checkpoint with the mBART-50
        # tokenizer; it is imported locally so the file-level imports stay
        # untouched.
        from transformers import MBart50TokenizerFast

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize multilingual question-answering model
        # NOTE(review): no method of this class reads `qa_model`; it is kept
        # only so the attribute stays available to external callers.
        self.qa_model = pipeline(
            "question-answering",
            model="deepset/xlm-roberta-large-squad2",
            device=0 if self.device == "cuda" else -1
        )

        # Initialize multilingual sentence transformer
        self.similarity_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.similarity_model.to(self.device)

        # Initialize translation models
        print("Loading translation models...")
        self.translation_tokenizer = MBart50TokenizerFast.from_pretrained(self._TRANSLATION_MODEL_NAME)
        self.translation_model = MBartForConditionalGeneration.from_pretrained(self._TRANSLATION_MODEL_NAME)
        self.translation_model.to(self.device)

        # Language codes for supported Indian languages (copied so that
        # per-instance changes never leak into the class-level default).
        self.language_codes = dict(self._DEFAULT_LANGUAGE_CODES)

        self.G = nx.Graph()

    def detect_language(self, text: str) -> str:
        """
        Detect the language of input text from the script it is written in

        Letters are counted per script (ASCII Latin plus the Indic blocks in
        ``_SCRIPT_RANGES``) and the most frequent script wins. This is a
        deterministic heuristic; it does not use any model.

        Args:
            text: Input text to detect language for

        Returns:
            str: Detected language code ("en_XX" when no known script is found)
        """
        counts: Dict[str, int] = {}
        for ch in text or "":
            if 'a' <= ch.lower() <= 'z':
                counts["en_XX"] = counts.get("en_XX", 0) + 1
                continue
            code_point = ord(ch)
            for low, high, lang in self._SCRIPT_RANGES:
                if low <= code_point <= high:
                    counts[lang] = counts.get(lang, 0) + 1
                    break

        if not counts:
            return "en_XX"
        # Ties resolve to the script seen first (dict keeps insertion order).
        return max(counts, key=counts.get)

    def translate_text(self, text: str, target_lang: str) -> str:
        """
        Translate text to target language

        Translation is best-effort: on any failure the error is printed and
        the original text is returned unchanged.

        Args:
            text: Text to translate
            target_lang: Target language code

        Returns:
            str: Translated text
        """
        if not text or not text.strip():
            return text

        source_lang = self.detect_language(text)
        if source_lang == target_lang:
            return text

        try:
            # mBART-50 needs the source language set on the tokenizer so the
            # correct language token is added to the encoder input.
            self.translation_tokenizer.src_lang = source_lang
            inputs = self.translation_tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=1024,
            ).to(self.device)
            translated = self.translation_model.generate(
                **inputs,
                forced_bos_token_id=self.translation_tokenizer.lang_code_to_id[target_lang],
                max_length=1024,
                num_beams=4,
                length_penalty=1.0
            )
            return self.translation_tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Translation error: {e}")
            return text

    @staticmethod
    def _finish_entry(rows: List[Dict[str, str]], medicine: Optional[str],
                      conditions: List[str], remedies: List[str]) -> None:
        """Append a completed medicine entry to *rows* if it has any conditions."""
        if medicine and conditions:
            rows.append({
                "Medicine": medicine,
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so the joined string is the same on every run.
                "Conditions": ';'.join(dict.fromkeys(conditions)),
                "Remedies": ';'.join(remedies),
            })

    def _parse_pages(self, page_texts) -> pd.DataFrame:
        """Turn the text of each PDF page into one row per medicine.

        Args:
            page_texts: Iterable of page texts; ``None`` or empty entries
                (pages with no extractable text) are skipped.

        Returns:
            pd.DataFrame: Columns "Medicine", "Conditions", "Remedies", where
            the last two are ';'-joined strings.
        """
        rows: List[Dict[str, str]] = []
        current_medicine: Optional[str] = None
        current_conditions: List[str] = []
        current_remedies: List[str] = []

        for text in page_texts:
            if not text:
                continue

            # Skip non-content pages
            upper_text = text.upper()
            if any(marker in upper_text for marker in self._SKIPPED_PAGE_MARKERS):
                continue

            for raw_line in text.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                # Detect medicine headers; a new header closes the previous entry
                if self._MEDICINE_HEADER_RE.match(line):
                    self._finish_entry(rows, current_medicine, current_conditions, current_remedies)
                    current_medicine = line.split('(')[0].strip()
                    current_conditions = []
                    current_remedies = []
                    continue

                if not current_medicine:
                    continue

                lowered = line.lower()
                if self._DOSAGE_RE.search(lowered):
                    # Detect remedy instructions
                    current_remedies.append(line)
                elif any(keyword in lowered for keyword in self._CONDITION_KEYWORDS):
                    # Detect conditions; keep only the part before a colon
                    current_conditions.append(line.split(':')[0] if ':' in line else line)

        # Add final medicine entry
        self._finish_entry(rows, current_medicine, current_conditions, current_remedies)

        columns = list(self._RESULT_COLUMNS)
        if not rows:
            return pd.DataFrame(columns=columns)

        df = pd.DataFrame(rows, columns=columns)
        return df[df['Conditions'].str.len() > 0].drop_duplicates()

    def extract_from_pdf(self, pdf_path: str) -> pd.DataFrame:
        """
        Extract text from PDF and parse into structured format

        Args:
            pdf_path: Path to PDF file

        Returns:
            pd.DataFrame: Extracted medicine data (empty if the PDF could not
            be read or contained no recognisable entries)
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # A page may yield no text (e.g. an image-only page); such
                # pages are passed through and ignored by the parser.
                return self._parse_pages(page.extract_text() for page in pdf.pages)
        except Exception as e:
            print(f"Error processing PDF: {e}")
            return pd.DataFrame(columns=list(self._RESULT_COLUMNS))

    def build_knowledge_graph(self, df: pd.DataFrame) -> None:
        """
        Build knowledge graph from medicine data

        Any previous graph content is discarded. Each medicine node is linked
        to its condition nodes and to its remedy nodes.

        Args:
            df: DataFrame containing medicine data
        """
        self.G.clear()

        for _, row in df.iterrows():
            medicine = row['Medicine']
            self.G.add_node(medicine, type='medicine')

            for condition in (c.strip() for c in row['Conditions'].split(';')):
                if condition:
                    self.G.add_node(condition, type='condition')
                    self.G.add_edge(medicine, condition)

            for remedy in (r.strip() for r in row['Remedies'].split(';')):
                if remedy:
                    self.G.add_node(remedy, type='remedy', info=remedy)
                    self.G.add_edge(medicine, remedy)

    def find_similar_conditions(self, symptoms: str, conditions: List[str],
                                threshold: float = 0.5) -> List[tuple]:
        """
        Find conditions similar to input symptoms

        Args:
            symptoms: Input symptoms text
            conditions: List of known conditions
            threshold: Minimum cosine similarity (exclusive) for a match

        Returns:
            List[tuple]: List of (condition, similarity_score) pairs, best first
        """
        if not conditions:
            return []

        symptoms_embedding = self.similarity_model.encode(symptoms, convert_to_tensor=True)
        conditions_embeddings = self.similarity_model.encode(conditions, convert_to_tensor=True)

        similarities = torch.nn.functional.cosine_similarity(
            symptoms_embedding.unsqueeze(0),
            conditions_embeddings,
            dim=1
        ).tolist()

        similar_conditions = [
            (condition, score)
            for condition, score in zip(conditions, similarities)
            if score > threshold
        ]

        return sorted(similar_conditions, key=lambda pair: pair[1], reverse=True)

    def recommend_medicines(self, symptoms: str, df: pd.DataFrame, target_lang: str = "en_XX",
                            max_results: Optional[int] = None) -> List[Dict]:
        """
        Recommend medicines based on symptoms with language support

        Requires ``build_knowledge_graph`` to have been called with the same
        data, since medicines and remedies are looked up in the graph.

        Args:
            symptoms: Input symptoms text
            df: DataFrame containing medicine data
            target_lang: Target language code
            max_results: Optional cap on the number of recommendations; when
                set, only the best ``max_results`` entries are translated and
                returned

        Returns:
            List[Dict]: List of recommendations, highest confidence first
        """
        if df is None or df.empty or 'Conditions' not in df.columns:
            return []

        # Order-preserving de-duplication keeps results reproducible.
        all_conditions = list(dict.fromkeys(
            c.strip()
            for conditions_list in df['Conditions'].dropna().str.split(';')
            for c in conditions_list if c.strip()
        ))

        if not all_conditions:
            return []

        english_symptoms = self.translate_text(symptoms, "en_XX")
        similar_conditions = self.find_similar_conditions(english_symptoms, all_conditions)

        recommendations: List[Dict] = []
        # similar_conditions is sorted best-first, so stopping early at
        # max_results keeps exactly the top entries.
        for condition, confidence in similar_conditions:
            if condition not in self.G:
                continue

            medicines = [
                n for n in self.G.neighbors(condition)
                if self.G.nodes[n].get('type') == 'medicine'
            ]

            for medicine in medicines:
                if max_results is not None and len(recommendations) >= max_results:
                    return recommendations

                remedies = [
                    self.G.nodes[n]['info']
                    for n in self.G.neighbors(medicine)
                    if self.G.nodes[n].get('type') == 'remedy'
                ]

                recommendations.append({
                    'medicine': self.translate_text(medicine, target_lang),
                    'condition': self.translate_text(condition, target_lang),
                    'confidence': confidence,
                    'remedies': [self.translate_text(remedy, target_lang) for remedy in remedies]
                })

        return sorted(recommendations, key=lambda rec: rec['confidence'], reverse=True)

    def process_file_and_recommend(
        self,
        file: Optional[object],
        symptoms: str,
        target_language: str = "English"
    ) -> str:
        """
        Process input file and return recommendations in specified language

        Args:
            file: Uploaded PDF, either a filesystem path (str) or an object
                with a ``name`` attribute holding the path
            symptoms: Input symptoms text
            target_language: Target language name

        Returns:
            str: Formatted recommendations text, or an error message
        """
        try:
            wanted = (target_language or "English").lower()
            target_lang = next(
                (code for code, lang in self.language_codes.items()
                 if lang.lower() == wanted),
                "en_XX"
            )

            if file is None:
                return self.translate_text("Error: Please upload a PDF file.", target_lang)
            if not symptoms or not symptoms.strip():
                return self.translate_text("Error: Please enter your symptoms.", target_lang)

            # The upload may arrive as a plain path string or as a file-like
            # wrapper exposing the path via `.name`.
            pdf_path = file if isinstance(file, str) else file.name

            df = self.extract_from_pdf(pdf_path)
            if df.empty:
                return self.translate_text("Error: Could not extract data from the PDF file.", target_lang)

            self.build_knowledge_graph(df)
            recommendations = self.recommend_medicines(symptoms, df, target_lang, max_results=5)

            if not recommendations:
                return self.translate_text("No matching recommendations found.", target_lang)

            # Translate the fixed labels once instead of once per recommendation.
            medicine_label = self.translate_text('Medicine', target_lang)
            condition_label = self.translate_text('Matching Condition', target_lang)
            confidence_label = self.translate_text('Confidence Score', target_lang)
            remedies_label = self.translate_text('Recommended Remedies', target_lang)

            output = [self.translate_text("Ayurvedic Medicine Recommendations:", target_lang)]

            for i, rec in enumerate(recommendations[:5], 1):
                output.extend([
                    f"\n{i}. {medicine_label}: {rec['medicine']}",
                    f" {condition_label}: {rec['condition']}",
                    f" {confidence_label}: {rec['confidence']:.2f}",
                    f" {remedies_label}:"
                ])
                output.extend([f" - {remedy}" for remedy in rec['remedies']])
                output.append("")

            return "\n".join(output)

        except Exception as e:
            # UI boundary: report the failure to the user instead of crashing.
            return f"Error: {str(e)}"
|
321 |
+
|
322 |
+
# Create and launch Gradio interface
|
323 |
+
def main():
    """Build the Gradio UI around a recommender instance and start serving it."""
    app = MultilingualAyurvedicRecommender()

    # Input widgets, in the order the callback receives them.
    pdf_upload = gr.File(label="Upload Ayurvedic Home Remedies PDF")
    symptoms_box = gr.Textbox(
        label="Enter symptoms in any language (e.g., 'cold and fever' या 'सर्दी और बुखार' या 'జలుబు మరియు జ్వరం')"
    )
    language_picker = gr.Dropdown(
        choices=list(app.language_codes.values()),
        label="Select output language",
        value="English"
    )
    result_box = gr.Textbox(label="Recommendations")

    demo = gr.Interface(
        fn=app.process_file_and_recommend,
        inputs=[pdf_upload, symptoms_box, language_picker],
        outputs=result_box,
        title="Multilingual Ayurvedic Medicine Recommender",
        description="Get Ayurvedic medicine recommendations in your preferred language. Enter symptoms in any language!"
    )

    demo.launch(share=True)


# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|