Spaces:

FridayMaster
/

CHATBOT1

Sleeping

File size: 2,328 Bytes

import os
import pandas as pd
import PyPDF2
import spacy
import faiss
from sentence_transformers import SentenceTransformer
import torch
import gradio as gr

# Load and preprocess PDF text
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, joined with a
        newline between pages. Pages that yield no text contribute an
        empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps a page with no extractable text from breaking the
        # join. The newline separator stops the last word of one page from
        # fusing with the first word of the next.
        return "\n".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

# Path to your PDF file
pdf_path = 'FridayMaster/UBANTUMANUAL/Getting Started with Ubuntu 16.04.pdf'

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Number of whitespace-separated words per retrieval passage.
CHUNK_WORDS = 200


def _split_into_chunks(text, words_per_chunk=CHUNK_WORDS):
    """Split *text* into consecutive passages of at most *words_per_chunk* words.

    Returns an empty list when *text* contains no words.
    """
    words = text.split()
    return [
        ' '.join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]


# Convert the text to a DataFrame with one row per passage.
# A single row holding the whole manual would make the nearest-neighbour
# search further down degenerate: with one indexed vector, every query
# returns the entire document. If the PDF produced no words, fall back to
# the single-row frame so the rest of the script still has a row to embed.
df = pd.DataFrame({'text': _split_into_chunks(pdf_text) or [pdf_text]})

# Load the custom embedding model
class CustomEmbeddingModel:
    """Small wrapper that holds a SentenceTransformer and embeds text with it."""

    def __init__(self, model_name):
        """Build the SentenceTransformer identified by *model_name*."""
        transformer = SentenceTransformer(model_name)
        self.model = transformer

    def embed_text(self, text):
        """Encode *text* with the wrapped model, requesting a tensor result."""
        encoded = self.model.encode(text, convert_to_tensor=True)
        return encoded

# Name of the model handed to CustomEmbeddingModel; replace with your model name
EMBEDDING_MODEL_NAME = 'distilbert-base-uncased'
embedding_model = CustomEmbeddingModel(EMBEDDING_MODEL_NAME)

# Load Spacy model for preprocessing
SPACY_PIPELINE_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_PIPELINE_NAME)

def preprocess_text(text):
    """Return the lower-cased lemmas of the alphabetic tokens in *text*.

    The text is run through the module-level spaCy pipeline `nlp`; tokens
    that are not purely alphabetic are dropped and the remaining lemmas
    are joined with single spaces.
    """
    lemmas = []
    for tok in nlp(text):
        if not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_.lower())
    return ' '.join(lemmas)

# Normalise every row of text in place, then embed the normalised text
df['text'] = df['text'].map(preprocess_text)
df['text_embeddings'] = df['text'].apply(embedding_model.embed_text)

# Create a FAISS index
# Stack the per-row embedding tensors into a single 2-D matrix (one row per
# DataFrame row).
embeddings = torch.stack(df['text_embeddings'].tolist())
# Size the index from the data itself instead of assuming a fixed width; the
# previously created, never-used 768-dimensional index has been dropped.
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
# .cpu() is required because Tensor.numpy() raises for tensors that live on
# a CUDA device; .float() guarantees the float32 dtype FAISS expects.
faiss_index.add(embeddings.detach().cpu().float().numpy())

# Function to generate a response
def generate_response(prompt):
    """Return the indexed text whose embedding is closest to *prompt*.

    Args:
        prompt: The user's query string.

    Returns:
        The 'text' value of the DataFrame row nearest to the query in the
        FAISS index, or an empty string when *prompt* is empty/blank or the
        index returns no neighbour.
    """
    # Nothing meaningful can be retrieved for an empty or whitespace query.
    if not prompt or not prompt.strip():
        return ""

    # The indexed text was embedded after preprocess_text(), so the query is
    # put through the same normalisation. If that strips everything (e.g. a
    # query made only of digits or punctuation), embed the raw prompt instead.
    query_text = preprocess_text(prompt) or prompt
    query_embedding = embedding_model.embed_text(query_text).unsqueeze(0)

    # .cpu() is required because Tensor.numpy() raises for CUDA tensors;
    # .float() guarantees the float32 dtype FAISS expects.
    query_vector = query_embedding.detach().cpu().float().numpy()
    _distances, indices = faiss_index.search(query_vector, k=1)

    best = int(indices[0][0])
    # FAISS uses -1 as the label when it has no neighbour to report;
    # df.iloc[-1] would silently return the last row in that case.
    if best < 0:
        return ""
    return df.iloc[best]['text']

# Gradio interface: one text box for the query, one for the answer
query_box = gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu...")
answer_box = gr.Textbox(label="Response")

iface = gr.Interface(
    fn=generate_response,
    inputs=query_box,
    outputs=answer_box,
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual.",
)

# Start the web UI only when this file is executed as a script
if __name__ == "__main__":
    iface.launch()