Spaces:
Sleeping
Sleeping
File size: 2,328 Bytes
93452e4 3d00632 8918a3e 3d00632 93452e4 4c4e926 3d00632 e4261d6 3d00632 9042b33 93452e4 3d00632 9042b33 3d00632 1842c48 3d00632 8918a3e 3d00632 8918a3e 3d70771 e4261d6 3d00632 7dbc572 3d00632 8918a3e 3d00632 8918a3e 1842c48 3d00632 1842c48 8918a3e 3d70771 3d00632 3d70771 3d00632 67be4ed 3d00632 97c8253 3d70771 3d00632 7dbc572 e4261d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os
import pandas as pd
import PyPDF2
import spacy
import faiss
from sentence_transformers import SentenceTransformer
import torch
import gradio as gr
# Load and preprocess PDF text
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, concatenated with
        no separator between pages.

    Raises:
        OSError: If the file at ``pdf_path`` cannot be opened.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # The join must run while the file is still open, because it is what
        # actually pulls the pages through the reader.
        # ``or ""`` is defensive: a page whose extraction yields nothing
        # (None or an empty string) contributes no text instead of raising
        # TypeError during concatenation.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Path to your PDF file
pdf_path = 'FridayMaster/UBANTUMANUAL/Getting Started with Ubuntu 16.04.pdf'

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Number of whitespace-separated words stored in each DataFrame row.
# NOTE(review): 200 is a conservative guess meant to stay inside the
# encoder's input window -- tune for the model actually used.
CHUNK_WORDS = 200


def _split_into_chunks(text, max_words=CHUNK_WORDS):
    """Split ``text`` into consecutive passages of at most ``max_words`` words.

    Args:
        text: The string to split; words are delimited by any whitespace.
        max_words: Maximum number of words per passage (must be >= 1).

    Returns:
        A list of passages with words re-joined by single spaces. When
        ``text`` contains no words the list is ``[text]``, so the result
        always has at least one element.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")
    words = text.split()
    chunks = [
        ' '.join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]
    # Fall back to a single row so the DataFrame is never empty.
    return chunks or [text]


# Convert the text to a DataFrame with one row per passage. Keeping the whole
# manual in a single row would give the search index exactly one vector, so
# every query would get the entire document back.
df = pd.DataFrame({'text': _split_into_chunks(pdf_text)})
# Load the custom embedding model
class CustomEmbeddingModel:
    """Thin wrapper around a ``SentenceTransformer`` encoder."""

    def __init__(self, model_name):
        """Load the encoder identified by ``model_name`` into ``self.model``."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def embed_text(self, text):
        """Encode ``text`` with the wrapped model.

        The model's ``encode`` method is called with
        ``convert_to_tensor=True`` and its result is returned unchanged.
        """
        encode = self.model.encode
        return encode(text, convert_to_tensor=True)
# Checkpoint name handed to CustomEmbeddingModel.
_EMBEDDING_MODEL_NAME = 'distilbert-base-uncased'  # Replace with your model name
embedding_model = CustomEmbeddingModel(_EMBEDDING_MODEL_NAME)

# Load Spacy model for preprocessing
_SPACY_PIPELINE_NAME = "en_core_web_sm"
nlp = spacy.load(_SPACY_PIPELINE_NAME)
def preprocess_text(text):
    """Reduce ``text`` to a space-separated string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    whose ``is_alpha`` flag is false are discarded; for the remaining tokens
    the lemma is lowercased and kept in document order.
    """
    lemmas = []
    for tok in nlp(text):
        if not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_.lower())
    return ' '.join(lemmas)
# Apply preprocessing and embedding: each row's text is replaced by its
# normalised form, then encoded into one tensor per row.
df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(embedding_model.embed_text)

# NOTE(review): `index` is never read anywhere in this file; the index that is
# actually populated and searched is `faiss_index` below. Kept only so that
# any external reference to the name still resolves.
index = faiss.IndexFlatL2(768)  # Assuming embeddings are 768-dimensional

# Create a FAISS index sized from the real embedding width rather than a
# hard-coded constant.
embeddings = torch.stack(df['text_embeddings'].tolist())
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
# Tensor.numpy() only works on CPU tensors that do not require grad, and the
# encoder may return tensors on another device; move/detach first and hand
# FAISS the float32 array it expects.
faiss_index.add(embeddings.detach().cpu().numpy().astype('float32', copy=False))
# Function to generate a response
def generate_response(prompt):
    """Return the indexed text that is closest to ``prompt``.

    The prompt is normalised with ``preprocess_text`` (the same treatment the
    indexed text received), embedded, and looked up in ``faiss_index`` for
    its single nearest neighbour.

    Args:
        prompt: The user's query string.

    Returns:
        The ``'text'`` value of the best-matching row of ``df``, or a short
        explanatory message when the prompt is empty or the index yields no
        match.
    """
    if not prompt or not prompt.strip():
        return "Please enter a question about the Ubuntu manual."

    # The indexed rows were embedded after preprocessing, so the query goes
    # through the same normalisation. If that strips every token (e.g. the
    # query is only digits or punctuation), fall back to the raw prompt.
    query_text = preprocess_text(prompt) or prompt
    query_embedding = embedding_model.embed_text(query_text).unsqueeze(0)
    # .numpy() requires a CPU tensor; FAISS expects float32 input.
    query_vector = query_embedding.detach().cpu().numpy().astype('float32', copy=False)

    _, indices = faiss_index.search(query_vector, k=1)
    best_row = int(indices[0][0])
    # FAISS reports -1 when it has no neighbour to return; without this check
    # df.iloc[-1] would silently pick the last row.
    if best_row < 0:
        return "No matching passage was found."
    return df.iloc[best_row]['text']
# Gradio interface: one text box for the query, one for the answer, wired to
# generate_response.
_query_box = gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu...")
_answer_box = gr.Textbox(label="Response")

iface = gr.Interface(
    fn=generate_response,
    inputs=_query_box,
    outputs=_answer_box,
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual.",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|