# First gen
# Install the necessary libraries
# !pip install transformers
# !pip install sentence-transformers
# !pip install torch
# !pip install requests
# !pip install bs4
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import torch

# Step 1: Load Models for Summarization and Similarity
model_name = "facebook/bart-large-cnn"  # Summarization model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Summarization pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Sentence similarity model
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Shared settings for every outgoing HTTP request made by this script.
_REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}
_REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever


# Step 2: Define Retrieval Evaluator
def evaluate_retrieval(query, retrieved_docs, relevance_threshold=0.5):
    """Label each retrieved document as relevant or not for the query.

    Relevance is the cosine similarity between the sentence embedding of
    the query and the sentence embedding of each document.

    Args:
        query: The user query string.
        retrieved_docs: List of document strings to score.
        relevance_threshold: Similarity strictly above this value counts
            as relevant (adjustable).

    Returns:
        A list, parallel to ``retrieved_docs``, of 'Correct' / 'Incorrect'.
        An empty input yields an empty list.
    """
    if not retrieved_docs:
        return []

    query_embedding = similarity_model.encode(query, convert_to_tensor=True)
    doc_embeddings = similarity_model.encode(retrieved_docs, convert_to_tensor=True)

    # One batched call: the result has shape (1, n_docs), row 0 holds the
    # similarity of the query against every document.
    similarities = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0].tolist()

    return [
        'Correct' if sim > relevance_threshold else 'Incorrect'
        for sim in similarities
    ]


# Step 3: Knowledge Refinement (Decompose-then-Recompose)
def decompose_then_recompose(retrieved_docs):
    """Refine documents by summarizing the key information of each one.

    Args:
        retrieved_docs: List of document texts.

    Returns:
        A list with one summary per non-blank input document. Blank
        documents are skipped because there is nothing to summarize.
    """
    refined_knowledge = []
    for doc in retrieved_docs:
        if not doc or not doc.strip():
            continue
        # truncation=True clips inputs longer than the model's maximum
        # input length instead of failing inside the model.
        summary = summarizer(
            doc,
            max_length=50,
            min_length=20,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        refined_knowledge.append(summary)
    return refined_knowledge


def _resolve_result_link(href):
    """Return the external URL a search-result anchor points to, or None."""
    if not href:
        return None
    try:
        parsed = urlparse(href)
        # Result links may be wrapped as "/url?q=<target>&..."; unwrap them
        # so the caller gets a URL that can actually be fetched.
        if parsed.path == "/url":
            targets = parse_qs(parsed.query).get("q")
            if not targets:
                return None
            href = targets[0]
            parsed = urlparse(href)
        host = parsed.hostname
    except ValueError:
        # urlparse rejects some malformed hrefs (e.g. broken IPv6 literals).
        return None

    if parsed.scheme not in ("http", "https") or not host:
        return None
    # Skip the search engine's own navigation/account/support links.
    if host == "google.com" or host.endswith(".google.com"):
        return None
    return href


def _fetch_page_text(url):
    """Download a page and return its visible text ('' if the fetch fails)."""
    try:
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: one unreachable page should not abort the workflow.
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    # Collapse all runs of whitespace so the summarizer sees plain prose.
    return " ".join(soup.get_text(separator=" ").split())


# Step 4: Web Search for External Knowledge
def web_search(query, max_results=5):
    """Search the web and return URLs of candidate external sources.

    Used when the retrieved documents are not relevant to the query.

    Args:
        query: The search query string.
        max_results: Maximum number of URLs to return.

    Returns:
        Up to ``max_results`` distinct absolute http(s) URLs, in the order
        they appear on the results page.

    Raises:
        requests.RequestException: If the search request itself fails.
    """
    # Passing the query via params lets requests URL-encode it properly
    # (a manual replace of spaces breaks on '&', '#', '+', non-ASCII, ...).
    response = requests.get(
        "https://www.google.com/search",
        params={'q': query},
        headers=_REQUEST_HEADERS,
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract URLs from search results (simplified)
    links = []
    seen = set()
    for item in soup.find_all('a'):
        link = _resolve_result_link(item.get('href'))
        if link is None or link in seen:
            continue
        seen.add(link)
        links.append(link)
        if len(links) >= max_results:
            break
    return links


# Step 5: Generate Final Output
def generate_final_output(query, refined_knowledge):
    """Generate the final summary from the refined knowledge.

    Args:
        query: The user query (currently not used by the summarizer; kept
            for interface compatibility).
        refined_knowledge: List of summary strings to combine.

    Returns:
        The final summary text, or an empty string when there is no
        knowledge to summarize.
    """
    combined_knowledge = " ".join(refined_knowledge).strip()
    if not combined_knowledge:
        return ""
    return summarizer(
        combined_knowledge,
        max_length=100,
        min_length=50,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']


# Step 6: CRAG Workflow Integration
def crag_workflow(query, retrieved_docs):
    """Run the full CRAG workflow and return a summary for the query.

    Relevant retrieved documents are refined directly. When none of them
    is relevant, the workflow falls back to a web search, downloads the
    text of the result pages and refines that text instead.

    Args:
        query: The user query string.
        retrieved_docs: List of candidate document strings.

    Returns:
        The final summary text ('' if no usable knowledge was found).

    Raises:
        requests.RequestException: If the fallback web search fails.
    """
    # Step 1: Evaluate retrieval
    relevance_scores = evaluate_retrieval(query, retrieved_docs)
    relevant_docs = [
        doc
        for doc, score in zip(retrieved_docs, relevance_scores)
        if score == 'Correct'
    ]

    if relevant_docs:
        # Step 2: Decompose-then-Recompose for correct documents
        refined_knowledge = decompose_then_recompose(relevant_docs)
    else:
        # Step 3: Web search if retrieval is incorrect. web_search returns
        # URLs, so fetch each page's text before summarizing; summarizing
        # the URL strings themselves would be meaningless.
        web_results = web_search(query)
        page_texts = [_fetch_page_text(url) for url in web_results]
        refined_knowledge = decompose_then_recompose(
            [text for text in page_texts if text]
        )

    # Step 4: Generate final output
    return generate_final_output(query, refined_knowledge)


# Example Usage
if __name__ == "__main__":
    # Example query and retrieved documents
    query = "What are the latest advancements in renewable energy?"
    retrieved_docs = [
        "Renewable energy is becoming increasingly important in today's world...",
        "Solar energy has seen significant advancements in the past decade...",
        "Wind energy technology is rapidly evolving, with new innovations expected soon...",
    ]

    # Perform the CRAG workflow
    final_summary = crag_workflow(query, retrieved_docs)
    print("Final Summary:", final_summary)