"""Multilingual Ayurvedic medicine recommender with a Gradio front end.

The module parses a home-remedies PDF into medicine / condition / remedy
records, links them in a graph, matches free-text symptoms against the known
conditions by sentence-embedding similarity and renders the result in the
language selected by the user.
"""
import re
from typing import Dict, List, Optional

import gradio as gr
import networkx as nx
import pandas as pd
import pdfplumber
import torch
from sentence_transformers import SentenceTransformer
from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    MBartTokenizer,
    pipeline,
)

_TRANSLATION_MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
_DEFAULT_LANG = "en_XX"
_RECORD_COLUMNS = ["Medicine", "Conditions", "Remedies"]

# A medicine header looks like "Some Name (Other Name)".
_MEDICINE_HEADER_RE = re.compile(r'^[A-Za-z\s]+\([A-Za-z\s]+\)')
# A remedy line contains a number followed by a dosage or duration unit.
_REMEDY_RE = re.compile(r'\d+(?:\s*(?:gm|ml|times|drops|days))')
# Pages whose text contains one of these words are treated as front matter.
_SKIPPED_PAGE_HEADERS = ("INSTRUCTIONS", "INDEX", "FOREWORD")
# A line mentioning one of these words is recorded as a condition.
_CONDITION_KEYWORDS = (
    'pain', 'ache', 'fever', 'cold', 'cough', 'diabetes',
    'wounds', 'ulcer', 'skin', 'digestion', 'appetite',
)
# Unicode blocks of the supported scripts, mapped to an mBART language code.
# Devanagari is shared by Hindi and Marathi; it is reported as Hindi because
# the two cannot be told apart by script alone.
_SCRIPT_RANGES = (
    (0x0900, 0x097F, "hi_IN"),  # Devanagari
    (0x0980, 0x09FF, "bn_IN"),  # Bengali
    (0x0A80, 0x0AFF, "gu_IN"),  # Gujarati
    (0x0B80, 0x0BFF, "ta_IN"),  # Tamil
    (0x0C00, 0x0C7F, "te_IN"),  # Telugu
)


class MultilingualAyurvedicRecommender:
    """Recommend Ayurvedic medicines for symptoms given in several languages."""

    def __init__(self):
        """Load the models and create an empty knowledge graph."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize multilingual question-answering model
        # (kept as a public attribute; no method in this file calls it).
        self.qa_model = pipeline(
            "question-answering",
            model="deepset/xlm-roberta-large-squad2",
            device=0 if self.device == "cuda" else -1
        )

        # Initialize multilingual sentence transformer
        self.similarity_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.similarity_model.to(self.device)

        # Initialize translation models. The checkpoint is an mBART-50 model,
        # so it needs the mBART-50 tokenizer: the plain MBartTokenizer only
        # knows the original 25 language codes and lacks several of the codes
        # listed below.
        print("Loading translation models...")
        self.translation_tokenizer = MBart50TokenizerFast.from_pretrained(_TRANSLATION_MODEL_NAME)
        self.translation_model = MBartForConditionalGeneration.from_pretrained(_TRANSLATION_MODEL_NAME)
        self.translation_model.to(self.device)

        # Language codes for supported Indian languages
        self.language_codes = {
            "en_XX": "English",
            "hi_IN": "Hindi",
            "te_IN": "Telugu",
            "ta_IN": "Tamil",
            "mr_IN": "Marathi",
            "gu_IN": "Gujarati",
            "bn_IN": "Bengali"
        }

        self.G = nx.Graph()

    def detect_language(self, text: str) -> str:
        """Detect the language of input text from the script it is written in.

        Letters are counted per script and the most frequent script wins.
        Characters outside the known scripts (digits, punctuation, spaces)
        are ignored.

        Args:
            text: Input text to detect language for.

        Returns:
            A language code such as ``"hi_IN"``. ``"en_XX"`` is returned for
            Latin-script, empty or unrecognised text. Devanagari text is
            always reported as ``"hi_IN"``, so Marathi is never returned.
        """
        counts: Dict[str, int] = {}
        for char in text or "":
            if char.isascii():
                if char.isalpha():
                    counts[_DEFAULT_LANG] = counts.get(_DEFAULT_LANG, 0) + 1
                continue
            code_point = ord(char)
            for start, end, lang in _SCRIPT_RANGES:
                if start <= code_point <= end:
                    counts[lang] = counts.get(lang, 0) + 1
                    break

        if not counts:
            return _DEFAULT_LANG
        return max(counts, key=counts.get)

    def translate_text(self, text: str, target_lang: str) -> str:
        """Translate text to target language.

        Translation is best effort: on any failure the error is printed and
        the input is returned unchanged so the caller can still show it.

        Args:
            text: Text to translate.
            target_lang: Target language code (e.g. ``"hi_IN"``).

        Returns:
            The translated text, or ``text`` itself when it is blank, already
            in the target language, or could not be translated.
        """
        if not text or not text.strip():
            return text

        try:
            source_lang = self.detect_language(text)
            if source_lang == target_lang:
                return text

            tokenizer = self.translation_tokenizer
            # mBART prefixes the input with the source-language token; without
            # this every input would be encoded as the tokenizer's default.
            tokenizer.src_lang = source_lang
            inputs = tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(self.device)
            translated = self.translation_model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
                max_length=1024,
                num_beams=4,
                length_penalty=1.0
            )
            return tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Translation error: {e}")
            return text

    @staticmethod
    def _append_record(
        records: List[Dict[str, str]],
        medicine: Optional[str],
        conditions: List[str],
        remedies: List[str]
    ) -> None:
        """Append one medicine record if it has a name and a condition."""
        if medicine and conditions:
            records.append({
                "Medicine": medicine,
                # dict.fromkeys removes duplicates but keeps first-seen order.
                "Conditions": ';'.join(dict.fromkeys(conditions)),
                "Remedies": ';'.join(remedies),
            })

    def extract_from_pdf(self, pdf_path: str) -> pd.DataFrame:
        """Extract text from PDF and parse into structured format.

        Args:
            pdf_path: Path to PDF file.

        Returns:
            A DataFrame with the columns ``Medicine``, ``Conditions`` and
            ``Remedies``; the last two hold ``;``-separated strings. If the
            PDF cannot be processed the error is printed and an empty
            DataFrame is returned.
        """
        records: List[Dict[str, str]] = []
        current_medicine: Optional[str] = None
        current_conditions: List[str] = []
        current_remedies: List[str] = []

        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    # extract_text() may yield nothing for image-only pages.
                    text = page.extract_text() or ""

                    # Skip non-content pages
                    upper_text = text.upper()
                    if any(header in upper_text for header in _SKIPPED_PAGE_HEADERS):
                        continue

                    for raw_line in text.split('\n'):
                        line = raw_line.strip()
                        if not line:
                            continue

                        # Detect medicine headers
                        if _MEDICINE_HEADER_RE.match(line):
                            self._append_record(
                                records, current_medicine,
                                current_conditions, current_remedies
                            )
                            current_medicine = line.split('(')[0].strip()
                            current_conditions = []
                            current_remedies = []
                            continue

                        if not current_medicine:
                            continue

                        lowered = line.lower()
                        # Detect remedy instructions
                        if _REMEDY_RE.search(lowered):
                            current_remedies.append(line)
                        # Detect conditions
                        elif any(keyword in lowered for keyword in _CONDITION_KEYWORDS):
                            current_conditions.append(line.partition(':')[0])

            # Add final medicine entry
            self._append_record(
                records, current_medicine, current_conditions, current_remedies
            )

            if not records:
                return pd.DataFrame(columns=_RECORD_COLUMNS)

            df = pd.DataFrame(records, columns=_RECORD_COLUMNS)
            return df[df['Conditions'].str.len() > 0].drop_duplicates()
        except Exception as e:
            print(f"Error processing PDF: {e}")
            return pd.DataFrame()

    def build_knowledge_graph(self, df: pd.DataFrame) -> None:
        """Build knowledge graph from medicine data.

        The graph is cleared first. Every medicine becomes a node connected
        to one node per condition and one node per remedy.

        Args:
            df: DataFrame containing medicine data, in the format returned by
                ``extract_from_pdf``.
        """
        self.G.clear()

        for _, row in df.iterrows():
            medicine = row['Medicine']
            self.G.add_node(medicine, type='medicine')

            for condition in row['Conditions'].split(';'):
                condition = condition.strip()
                if condition:
                    self.G.add_node(condition, type='condition')
                    self.G.add_edge(medicine, condition)

            for remedy in row['Remedies'].split(';'):
                remedy = remedy.strip()
                if remedy:
                    self.G.add_node(remedy, type='remedy', info=remedy)
                    self.G.add_edge(medicine, remedy)

    def find_similar_conditions(
        self,
        symptoms: str,
        conditions: List[str],
        threshold: float = 0.5
    ) -> List[tuple]:
        """Find conditions similar to input symptoms.

        Args:
            symptoms: Input symptoms text.
            conditions: List of known conditions.
            threshold: Cosine similarity a condition must exceed to be kept.

        Returns:
            List of ``(condition, similarity_score)`` pairs, best match first.
            Empty when ``conditions`` is empty.
        """
        if not conditions:
            return []

        symptoms_embedding = self.similarity_model.encode(symptoms, convert_to_tensor=True)
        conditions_embeddings = self.similarity_model.encode(conditions, convert_to_tensor=True)

        similarities = torch.nn.functional.cosine_similarity(
            symptoms_embedding.unsqueeze(0),
            conditions_embeddings,
            dim=1
        )

        similar_conditions = [
            (condition, score)
            for condition, score in zip(conditions, similarities.tolist())
            if score > threshold
        ]
        return sorted(similar_conditions, key=lambda pair: pair[1], reverse=True)

    def recommend_medicines(
        self,
        symptoms: str,
        df: pd.DataFrame,
        target_lang: str = "en_XX"
    ) -> List[Dict]:
        """Recommend medicines based on symptoms with language support.

        Medicines are looked up in ``self.G``, so ``build_knowledge_graph``
        must have been called with the same data beforehand.

        Args:
            symptoms: Input symptoms text.
            df: DataFrame containing medicine data.
            target_lang: Target language code.

        Returns:
            List of dicts with the keys ``medicine``, ``condition``,
            ``confidence`` and ``remedies``, highest confidence first.
        """
        english_symptoms = self.translate_text(symptoms, "en_XX")

        # De-duplicate while keeping first-seen order so results are stable.
        all_conditions = list(dict.fromkeys(
            c.strip()
            for conditions_list in df['Conditions'].dropna().str.split(';')
            for c in conditions_list
            if c.strip()
        ))
        if not all_conditions:
            return []

        similar_conditions = self.find_similar_conditions(english_symptoms, all_conditions)

        recommendations = []
        for condition, confidence in similar_conditions:
            if condition not in self.G:
                continue

            medicines = [
                n for n in self.G.neighbors(condition)
                if self.G.nodes[n].get('type') == 'medicine'
            ]
            if not medicines:
                continue

            # The condition text is the same for all of its medicines.
            translated_condition = self.translate_text(condition, target_lang)
            for medicine in medicines:
                remedies = [
                    self.G.nodes[n]['info']
                    for n in self.G.neighbors(medicine)
                    if self.G.nodes[n].get('type') == 'remedy'
                ]
                recommendations.append({
                    'medicine': self.translate_text(medicine, target_lang),
                    'condition': translated_condition,
                    'confidence': confidence,
                    'remedies': [self.translate_text(remedy, target_lang) for remedy in remedies]
                })

        return sorted(recommendations, key=lambda rec: rec['confidence'], reverse=True)

    def process_file_and_recommend(
        self,
        file: gr.File,
        symptoms: str,
        target_language: str = "English"
    ) -> str:
        """Process input file and return recommendations in specified language.

        Args:
            file: Uploaded PDF file; either an object with a ``name``
                attribute holding the path, or the path itself.
            symptoms: Input symptoms text.
            target_language: Target language name (e.g. ``"Hindi"``). Unknown
                names fall back to English.

        Returns:
            Formatted text with up to five recommendations, or a message
            explaining why none could be produced.
        """
        try:
            requested = (target_language or "").lower()
            target_lang = next(
                (code for code, lang in self.language_codes.items()
                 if lang.lower() == requested),
                "en_XX"
            )

            if file is None:
                return self.translate_text("Error: Please upload a PDF file.", target_lang)
            if not symptoms or not symptoms.strip():
                return self.translate_text("Error: Please enter your symptoms.", target_lang)

            pdf_path = getattr(file, "name", file)
            df = self.extract_from_pdf(pdf_path)
            if df.empty:
                return self.translate_text("Error: Could not extract data from the PDF file.", target_lang)

            self.build_knowledge_graph(df)
            recommendations = self.recommend_medicines(symptoms, df, target_lang)
            if not recommendations:
                return self.translate_text("No matching recommendations found.", target_lang)

            # Translate the static labels once rather than per recommendation.
            medicine_label = self.translate_text('Medicine', target_lang)
            condition_label = self.translate_text('Matching Condition', target_lang)
            confidence_label = self.translate_text('Confidence Score', target_lang)
            remedies_label = self.translate_text('Recommended Remedies', target_lang)

            output = [self.translate_text("Ayurvedic Medicine Recommendations:", target_lang)]
            for i, rec in enumerate(recommendations[:5], 1):
                output.extend([
                    f"\n{i}. {medicine_label}: {rec['medicine']}",
                    f" {condition_label}: {rec['condition']}",
                    f" {confidence_label}: {rec['confidence']:.2f}",
                    f" {remedies_label}:"
                ])
                output.extend([f" - {remedy}" for remedy in rec['remedies']])
                output.append("")

            return "\n".join(output)
        except Exception as e:
            return f"Error: {str(e)}"


# Create and launch Gradio interface
def main():
    """Build the Gradio interface and launch it with a public share link."""
    recommender = MultilingualAyurvedicRecommender()

    interface = gr.Interface(
        fn=recommender.process_file_and_recommend,
        inputs=[
            gr.File(label="Upload Ayurvedic Home Remedies PDF"),
            gr.Textbox(
                label="Enter symptoms in any language (e.g., 'cold and fever' या 'सर्दी और बुखार' या 'జలుబు మరియు జ్వరం')"
            ),
            gr.Dropdown(
                choices=list(recommender.language_codes.values()),
                label="Select output language",
                value="English"
            )
        ],
        outputs=gr.Textbox(label="Recommendations"),
        title="Multilingual Ayurvedic Medicine Recommender",
        description="Get Ayurvedic medicine recommendations in your preferred language. Enter symptoms in any language!"
    )
    interface.launch(share=True)


if __name__ == "__main__":
    main()