import os
import streamlit as st
import torch
import numpy as np
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import tavily

# Evaluation metrics libraries
import nltk
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from textstat import flesch_reading_ease, flesch_kincaid_grade

# word_tokenize requires the NLTK "punkt" tokenizer data; fetch it once if missing
nltk.download("punkt", quiet=True)


class AdvancedRAGChatbot:
    def __init__(self,
                 tavily_api_key: str,
                 embedding_model: str = "BAAI/bge-large-en-v1.5",
                 llm_model: str = "llama-3.3-70b-versatile",
                 temperature: float = 0.7):
        """Initialize the Advanced RAG Chatbot with enhanced evaluation metrics."""
        os.environ["TAVILY_API_KEY"] = tavily_api_key

        # Tavily client for live web search
        self.tavily_client = tavily.TavilyClient(tavily_api_key)

        # NLP components
        self.embeddings = self._configure_embeddings(embedding_model)
        self.semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.sentiment_analyzer = pipeline("sentiment-analysis")
        self.ner_pipeline = pipeline("ner", aggregation_strategy="simple")

        # Evaluation components
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
        )

        # Language model configuration
        self.llm = self._configure_llm(llm_model, temperature)

        # Conversation memory
        self.memory = ConversationBufferMemory(
            memory_key="chat_history", return_messages=True
        )

    def _calculate_comprehensive_metrics(self, query: str, response: str,
                                         web_sources: List[Dict]) -> Dict[str, Any]:
        """Calculate comprehensive evaluation metrics for a generated response."""
        metrics = {}

        # Readability metrics
        metrics["flesch_reading_ease"] = flesch_reading_ease(response)
        metrics["flesch_kincaid_grade"] = flesch_kincaid_grade(response)

        # Length metrics (token counts)
        metrics["query_length"] = len(word_tokenize(query))
        metrics["response_length"] = len(word_tokenize(response))

        # BLEU score: compare the response against each web source.
        # Note: without a smoothing function, short overlaps often score 0.
        reference_texts = [word_tokenize(source.get("content", "")) for source in web_sources]
        candidate_tokens = word_tokenize(response)

        bleu_scores = []
        for ref in reference_texts:
            try:
                bleu_scores.append(sentence_bleu([ref], candidate_tokens))
            except Exception:
                # Skip empty or otherwise unusable references
                pass
        metrics["average_bleu_score"] = float(np.mean(bleu_scores)) if bleu_scores else 0.0

        # ROUGE scores against the concatenated web sources
        reference_text = " ".join(source.get("content", "") for source in web_sources)
        rouge_scores = self.rouge_scorer.score(reference_text, response)
        metrics["rouge_scores"] = {
            "rouge1": rouge_scores["rouge1"].fmeasure,
            "rouge2": rouge_scores["rouge2"].fmeasure,
            "rougeL": rouge_scores["rougeL"].fmeasure,
        }

        # Semantic similarity between the response and each web source
        try:
            web_source_embeddings = self.semantic_model.encode(
                [source.get("content", "") for source in web_sources]
            )
            response_embedding = self.semantic_model.encode([response])[0]
            semantic_similarities = cosine_similarity([response_embedding], web_source_embeddings)[0]
            metrics["semantic_similarity"] = {
                "mean": float(np.mean(semantic_similarities)),
                "max": float(np.max(semantic_similarities)),
                "min": float(np.min(semantic_similarities)),
            }
        except Exception as e:
            st.warning(f"Semantic similarity calculation error: {e}")
            metrics["semantic_similarity"] = {"mean": 0.0, "max": 0.0, "min": 0.0}

        return metrics
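
    # --- Helper configuration methods ---
    # __init__ and process_query call _configure_embeddings, _configure_llm, and
    # _tavily_web_search, but those methods are not shown in this excerpt. Below is
    # a minimal, hedged sketch of plausible implementations based only on the
    # imports above (HuggingFaceBgeEmbeddings, ChatGroq, tavily.TavilyClient);
    # the original project may configure these differently.

    def _configure_embeddings(self, embedding_model: str) -> HuggingFaceBgeEmbeddings:
        """Sketch: build a BGE embedding model (assumes normalized embeddings)."""
        return HuggingFaceBgeEmbeddings(
            model_name=embedding_model,
            encode_kwargs={"normalize_embeddings": True},
        )

    def _configure_llm(self, llm_model: str, temperature: float) -> ChatGroq:
        """Sketch: build the Groq chat model (assumes GROQ_API_KEY is set in the environment)."""
        return ChatGroq(model=llm_model, temperature=temperature)

    def _tavily_web_search(self, query: str, max_results: int = 5) -> List[Dict]:
        """Sketch: run a Tavily search and return its list of result dicts."""
        try:
            search_response = self.tavily_client.search(query, max_results=max_results)
            return search_response.get("results", [])
        except Exception as e:
            st.warning(f"Web search error: {e}")
            return []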
    def process_query(self, query: str) -> Dict[str, Any]:
        """Process the user query with comprehensive evaluation."""
        # Web search
        web_results = self._tavily_web_search(query)

        # Prepare context from the web search results
        context = "\n\n".join(
            f"Title: {result.get('title', 'N/A')}\nContent: {result.get('content', '')}"
            for result in web_results
        )

        # NLP analysis: query embedding, sentiment, and named entities
        query_embedding = self.semantic_model.encode([query])[0]
        sentiment_result = self.sentiment_analyzer(query)[0]

        # Safe NER processing
        try:
            entities = self.ner_pipeline(query)
        except Exception as e:
            st.warning(f"NER processing error: {e}")
            entities = []

        # Prompt grounded in the web search context
        full_prompt = f"""
        Use the following web search results to answer the question precisely:

        Web Search Context:
        {context}

        Question: {query}

        Provide a comprehensive answer based on the web search results.
        """

        # Generate the response
        response = self.llm.invoke(full_prompt)
        response_content = response.content

        # Calculate comprehensive metrics
        evaluation_metrics = self._calculate_comprehensive_metrics(
            query, response_content, web_results
        )

        return {
            "response": response_content,
            "web_sources": web_results,
            "semantic_similarity": query_embedding.tolist(),  # raw query embedding vector
            "sentiment": sentiment_result,
            "named_entities": entities,
            "evaluation_metrics": evaluation_metrics,
        }


def main():
    # [Previous main function code remains the same]

    # New section: display the comprehensive evaluation metrics
    with col2:
        st.header("Response & Metrics")

        if submit_button and user_input:
            with st.spinner("Searching web and processing query..."):
                try:
                    response = chatbot.process_query(user_input)

                    # Existing response display code...

                    # Comprehensive metrics display
                    st.markdown("### 📊 Comprehensive Evaluation Metrics")

                    # Readability metrics
                    col_read1, col_read2 = st.columns(2)
                    with col_read1:
                        st.metric(
                            "Flesch Reading Ease",
                            f"{response['evaluation_metrics']['flesch_reading_ease']:.2f}",
                            help="Higher scores indicate easier readability",
                        )
                    with col_read2:
                        st.metric(
                            "Flesch-Kincaid Grade",
                            f"{response['evaluation_metrics']['flesch_kincaid_grade']:.2f}",
                            help="US grade level required to understand the text",
                        )

                    # Length and BLEU metrics
                    col_len1, col_len2, col_len3 = st.columns(3)
                    with col_len1:
                        st.metric("Query Length", response['evaluation_metrics']['query_length'])
                    with col_len2:
                        st.metric("Response Length", response['evaluation_metrics']['response_length'])
                    with col_len3:
                        st.metric(
                            "BLEU Score",
                            f"{response['evaluation_metrics']['average_bleu_score']:.4f}",
                            help="Measures similarity to reference texts",
                        )

                    # ROUGE scores
                    st.markdown("#### 📈 ROUGE Scores")
                    rouge_metrics = response['evaluation_metrics']['rouge_scores']
                    col_rouge1, col_rouge2, col_rouge3 = st.columns(3)
                    with col_rouge1:
                        st.metric("ROUGE-1", f"{rouge_metrics['rouge1']:.4f}")
                    with col_rouge2:
                        st.metric("ROUGE-2", f"{rouge_metrics['rouge2']:.4f}")
                    with col_rouge3:
                        st.metric("ROUGE-L", f"{rouge_metrics['rougeL']:.4f}")

                    # Semantic similarity
                    st.markdown("#### 🔍 Semantic Similarity")
                    sem_sim = response['evaluation_metrics']['semantic_similarity']
                    col_sem1, col_sem2, col_sem3 = st.columns(3)
                    with col_sem1:
                        st.metric("Mean Similarity", f"{sem_sim['mean']:.4f}")
                    with col_sem2:
                        st.metric("Max Similarity", f"{sem_sim['max']:.4f}")
                    with col_sem3:
                        st.metric("Min Similarity", f"{sem_sim['min']:.4f}")

                except Exception as e:
                    st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
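
# ---------------------------------------------------------------------------
# Note on the elided setup in main(): the metrics section above assumes that the
# unchanged "previous" main() code has already created `chatbot`, the `col1`/`col2`
# layout, and the `user_input`/`submit_button` form. A hypothetical sketch of that
# scaffolding (illustrative only; the original setup code is not shown in this excerpt):
#
#     st.set_page_config(page_title="Advanced RAG Chatbot", layout="wide")
#     tavily_api_key = st.sidebar.text_input("Tavily API Key", type="password")
#     chatbot = AdvancedRAGChatbot(tavily_api_key=tavily_api_key)
#     col1, col2 = st.columns(2)
#     with col1:
#         user_input = st.text_area("Ask a question")
#         submit_button = st.button("Submit")
# ---------------------------------------------------------------------------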