"""Goose AI Assistant: a small retrieval-augmented chatbot served with Gradio.

At startup the script downloads a text file, splits it into chunks, embeds
the chunks into a Chroma collection, and then serves a Gradio UI. Each
question is answered by retrieving similar chunks and passing them, together
with the question, to a Groq-hosted chat model.
"""

import functools
import io
import os
import re
from typing import List, Optional

import gradio as gr
import numpy as np
import pytesseract
import requests
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from PIL import Image
from sentence_transformers import SentenceTransformer

# Ensure the Tesseract OCR path is set correctly
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# The API key is read from the environment variable named "groq".
GROQ_API_KEY = os.getenv("groq")

# NOTE: "/resolve/main/" serves the raw file content; the "/blob/main/" form
# returns the Hub's HTML file viewer page instead of the text itself.
TEXT_FILE_URL = (
    "https://huggingface.co/spaces/Luciferalive/goosev9/resolve/main/extracted_text.txt"
)
# Upper bound (seconds) for the download so startup cannot hang indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
COLLECTION_NAME = "insurance_cosine"
CHAT_MODEL_NAME = "llama3-8b-8192"

# System prompt; {docs} and {question} are filled in by answer_query().
# The wording is unchanged from the original script; line breaks are laid out
# one guideline per line.
PROMPT_TEMPLATE = """
### [INST] Instruction:
You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT.
When responding to queries, please keep the following guidelines in mind:
- When someone says hi, or small talk, only respond in a sentence.
- Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
- Always maintain a positive, friendly, and encouraging tone in your interactions with users.
- Strictly write crisp and clear answers, don't write unnecessary stuff.
- Only answer the asked question, don't hallucinate or print any pre-information.
- After providing the answer, always ask for any other help needed in the next paragraph.
- Writing in bullet format is our top preference.
Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT.
Adapt your communication style to best suit each user's needs and preferences.

### Docs: {docs}
### Question: {question}
"""

# Vector store built by create_vector_store(); reused by load_vector_store()
# so that queries hit the collection that was actually populated at startup.
_vector_store: Optional[Chroma] = None


def preprocess_text(text):
    """Normalize *text* to lowercase ASCII words separated by single spaces.

    Newlines and non-ASCII runs become spaces, punctuation (anything that is
    neither a word character nor whitespace) is removed, and whitespace is
    collapsed. Returns an empty string if normalization fails (for example
    when *text* is not a string).
    """
    try:
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    except Exception as e:
        print("Failed to preprocess text:", e)
        return ""


def fetch_text_file_from_huggingface_space() -> str:
    """Download the knowledge-base text file and return its content.

    Returns an empty string when the request fails, times out, or the server
    responds with an error status.
    """
    try:
        response = requests.get(TEXT_FILE_URL, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to download the text file: {e}")
        return ""
    print("Successfully downloaded the text file")
    return response.text


@functools.lru_cache(maxsize=1)
def _get_embeddings() -> SentenceTransformerEmbeddings:
    """Return the shared embedding function, constructing it only once."""
    return SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)


def create_vector_store(text_content):
    """Split *text_content* into chunks and index them in a Chroma collection.

    Returns the populated vector store, or ``None`` when splitting produced
    no chunks. The store is also remembered for load_vector_store().
    """
    global _vector_store

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text_content)
    if not texts:
        print("No text chunks created.")
        return None

    _vector_store = Chroma.from_texts(
        texts, _get_embeddings(), collection_name=COLLECTION_NAME
    )
    print("Vector DB Successfully Created!")
    return _vector_store


def load_vector_store():
    """Return the vector store to query, or ``None`` if it cannot be opened.

    Prefers the store built by create_vector_store() in this process; if none
    exists yet, falls back to opening a Chroma handle on the same collection
    name, as the original script did.
    """
    if _vector_store is not None:
        print("Vector DB Successfully Loaded!")
        return _vector_store

    try:
        db = Chroma(embedding_function=_get_embeddings(), collection_name=COLLECTION_NAME)
        print("Vector DB Successfully Loaded!")
        return db
    except Exception as e:
        print("Failed to load Vector DB:", e)
        return None


def answer_query(query):
    """Answer *query* using retrieved document chunks and the Groq chat model.

    Returns the model's answer with surrounding whitespace stripped, or
    ``None`` when the vector store is unavailable, no documents are
    retrieved, or any step raises.
    """
    try:
        vector_store = load_vector_store()
        if vector_store is None:
            return None

        docs = vector_store.similarity_search(query)
        print(f"\n\nDocuments retrieved: {len(docs)}")
        if not docs:
            print("No documents match the query.")
            return None

        all_docs_content = " ".join(doc.page_content for doc in docs)

        client = Groq(api_key=GROQ_API_KEY)
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": PROMPT_TEMPLATE.format(docs=all_docs_content, question=query),
                },
                {
                    "role": "user",
                    "content": query,
                },
            ],
            model=CHAT_MODEL_NAME,
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        # Top-level boundary for the query path: report and signal "no answer".
        print("An error occurred while getting the answer: ", str(e))
        return None


def process_query(query):
    """Gradio callback: turn a user question into the text shown in the UI.

    Blank questions are rejected up front so that no retrieval or model call
    is made for them.
    """
    if not query or not query.strip():
        return "Please enter a question."

    try:
        response = answer_query(query)
    except Exception as e:
        print("An error occurred while getting the answer: ", str(e))
        return "An error occurred: " + str(e)

    if response:
        return "Answer: " + response
    return "No answer found."


# Set up the Gradio interface
def launch_assistant():
    """Build the vector store from the downloaded text and start the Gradio UI.

    Returns without launching when the download yields no text or the vector
    store cannot be created.
    """
    text_content = fetch_text_file_from_huggingface_space()
    if not text_content.strip():
        print("No text content fetched.")
        return

    vector_store = create_vector_store(text_content)
    if vector_store is None:
        print("Failed to create Vector DB.")
        return

    iface = gr.Interface(
        fn=process_query,
        inputs=gr.Textbox(lines=7, label="Enter your question"),
        outputs="text",
        title="Goose AI Assistant",
        description="Ask a question and get an answer from the AI assistant.",
    )
    iface.launch()


# Start the app as soon as the file runs, matching the original script.
launch_assistant()