Spaces:
Sleeping
Sleeping
import os | |
import io | |
import re | |
import numpy as np | |
import pytesseract | |
from PIL import Image | |
from typing import List | |
from sentence_transformers import SentenceTransformer | |
from langchain_community.vectorstores import Chroma | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.embeddings import SentenceTransformerEmbeddings | |
from groq import Groq | |
import gradio as gr | |
import requests | |
# Tell pytesseract which Tesseract executable to invoke.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Groq API key, read from the environment variable named "groq" (None when unset).
GROQ_API_KEY = os.environ.get("groq")
def preprocess_text(text):
    """Normalise *text* into lowercase ASCII words separated by single spaces.

    Line breaks become spaces, runs of non-ASCII characters become a single
    space, the text is lowercased, every character that is neither a word
    character nor whitespace is dropped, and whitespace runs are collapsed.

    Returns:
        The cleaned string, or "" if any step raises (for example when
        *text* is not a string); the error is printed rather than raised.
    """
    try:
        flattened = text
        for line_break in ('\n', '\r'):
            flattened = flattened.replace(line_break, ' ')
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', flattened)
        without_punctuation = re.sub(r'[^\w\s]', '', ascii_only.lower())
        collapsed = re.sub(r'\s+', ' ', without_punctuation)
        return collapsed.strip()
    except Exception as e:
        print("Failed to preprocess text:", e)
        return ""
def fetch_text_file_from_huggingface_space():
    """Download ``extracted_text.txt`` from the Luciferalive/goosev9 Space repo.

    Returns:
        The file's text content, or "" when the download fails for any
        reason (the error is printed, not raised).
    """
    # "/resolve/main/" serves the file itself. The "/blob/main/" form is the
    # HTML file-viewer page, whose markup would otherwise be chunked and
    # embedded as if it were the corpus.
    url = "https://huggingface.co/spaces/Luciferalive/goosev9/resolve/main/extracted_text.txt"
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        text_content = response.text
    except Exception as e:
        print(f"Failed to download the text file: {e}")
        return ""
    print("Successfully downloaded the text file")
    return text_content
def create_vector_store(text_content):
    """Chunk *text_content* and index the chunks in a Chroma collection.

    The text is split into chunks of up to 1000 characters with a
    200-character overlap, embedded with the all-mpnet-base-v2 sentence
    transformer, and stored in the "insurance_cosine" collection.

    Returns:
        The Chroma vector store, or None when splitting yields no chunks.
    """
    embedding_model = SentenceTransformerEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_text(text_content)

    if len(chunks) == 0:
        print("No text chunks created.")
        return None

    store = Chroma.from_texts(
        chunks,
        embedding_model,
        collection_name="insurance_cosine",
    )
    print("Vector DB Successfully Created!")
    return store
# Handle returned by the first successful load_vector_store() call. Kept so
# that later calls do not rebuild the embedding model and the Chroma client.
_VECTOR_STORE_HANDLE = None


def load_vector_store():
    """Return a Chroma handle for the "insurance_cosine" collection.

    The handle is built on the first successful call and reused afterwards;
    constructing the sentence-transformer embeddings is far too expensive
    to repeat for every query. A failed attempt is not cached, so the next
    call tries again.

    NOTE(review): no persist directory is passed, so this presumably relies
    on Chroma exposing the collection that create_vector_store populated
    earlier in the same process -- confirm against the installed version.

    Returns:
        The Chroma vector store, or None if the embeddings or the store
        could not be constructed (the error is printed).
    """
    global _VECTOR_STORE_HANDLE
    if _VECTOR_STORE_HANDLE is not None:
        return _VECTOR_STORE_HANDLE

    try:
        # Built inside the try so a model-loading failure also yields None.
        embeddings = SentenceTransformerEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )
        db = Chroma(embedding_function=embeddings, collection_name="insurance_cosine")
    except Exception as e:
        print("Failed to load Vector DB:", e)
        return None

    print("Vector DB Successfully Loaded!")
    _VECTOR_STORE_HANDLE = db
    return db
# System prompt sent to the model; {docs} and {question} are filled per query.
_GOOSE_PROMPT_TEMPLATE = """
### [INST] Instruction:
You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
- When someone says hi, or small talk, only respond in a sentence.
- Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
- Always maintain a positive, friendly, and encouraging tone in your interactions with users.
- Strictly write crisp and clear answers, don't write unnecessary stuff.
- Only answer the asked question, don't hallucinate or print any pre-information.
- After providing the answer, always ask for any other help needed in the next paragraph.
- Writing in bullet format is our top preference.
Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
### Docs: {docs}
### Question: {question}
"""


def answer_query(query):
    """Answer *query* from the retrieved documents using the Groq chat API.

    Looks up the documents most similar to *query* in the vector store,
    joins their text with spaces, places it in the system prompt, and asks
    the "llama3-8b-8192" model for a completion.

    Returns:
        The model's reply with surrounding whitespace stripped, or None
        when the vector store is unavailable, nothing is retrieved, or any
        step raises (the error is printed, not propagated).
    """
    try:
        store = load_vector_store()
        if not store:
            return None

        matches = store.similarity_search(query)
        print(f"\n\nDocuments retrieved: {len(matches)}")
        if not matches:
            print("No documents match the query.")
            return None

        context = " ".join(match.page_content for match in matches)
        groq_client = Groq(api_key=GROQ_API_KEY)
        system_message = {
            "role": "system",
            "content": _GOOSE_PROMPT_TEMPLATE.format(docs=context, question=query),
        }
        user_message = {"role": "user", "content": query}
        completion = groq_client.chat.completions.create(
            messages=[system_message, user_message],
            model="llama3-8b-8192",
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        print("An error occurred while getting the answer: ", str(e))
        return None
def process_query(query):
    """Gradio callback: turn *query* into the text shown to the user.

    Returns:
        "Answer: <reply>" when answer_query yields a non-empty reply,
        "No answer found." when it yields nothing, and
        "An error occurred: <message>" if an exception is raised.
    """
    try:
        reply = answer_query(query)
        return ("Answer: " + reply) if reply else "No answer found."
    except Exception as e:
        print("An error occurred while getting the answer: ", str(e))
        return "An error occurred: " + str(e)
# Build the vector index, then serve the Gradio interface
def launch_assistant():
    """Fetch the corpus, index it, and launch the Gradio question box.

    Returns early, after printing a message, when the fetched text is
    blank or when the vector store could not be created.
    """
    corpus = fetch_text_file_from_huggingface_space()
    if not corpus.strip():
        print("No text content fetched.")
        return

    if not create_vector_store(corpus):
        print("Failed to create Vector DB.")
        return

    question_box = gr.Textbox(lines=7, label="Enter your question")
    gr.Interface(
        fn=process_query,
        inputs=question_box,
        outputs="text",
        title="Goose AI Assistant",
        description="Ask a question and get an answer from the AI assistant.",
    ).launch()


launch_assistant()