"""Retrieval chatbot over a PDF manual.

Pipeline: extract the text of every PDF page, normalize it with spaCy,
embed each page with a sentence-transformers model, store the vectors in a
FAISS L2 index, and answer a query with the text of the nearest page through
a Gradio text interface.
"""
import faiss
import gradio as gr
import pandas as pd
import PyPDF2
import spacy
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer

PDF_PATH = 'Getting_Started_with_Ubuntu_16.04.pdf'  # Replace with your PDF path
EMBEDDING_MODEL_NAME = 'distilbert-base-uncased'  # Replace with your model name


# Load and preprocess PDF text
def extract_pages_from_pdf(pdf_path):
    """Return the extracted text of each page of the PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one string per page; a page that yields no text
        contributes an empty string.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` guards against a falsy extraction result (e.g. None) so
        # callers can always treat every entry as a string.
        return [page.extract_text() or "" for page in pdf_reader.pages]


def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of the PDF concatenated into one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order with no separator.
    """
    return "".join(extract_pages_from_pdf(pdf_path))


# Extract text from the PDF (read the file once, keep both views of it)
pdf_pages = extract_pages_from_pdf(PDF_PATH)
pdf_text = "".join(pdf_pages)

# One row per non-empty page: indexing the whole manual as a single row would
# make every nearest-neighbour search return the entire document.
df = pd.DataFrame({'text': [page for page in pdf_pages if page.strip()]})
if df.empty:
    raise ValueError(f"No extractable text found in {PDF_PATH!r}")


# Load the custom embedding model
class CustomEmbeddingModel:
    """Thin wrapper around a SentenceTransformer model."""

    def __init__(self, model_name):
        """Load the sentence-transformers model called *model_name*."""
        self.model = SentenceTransformer(model_name)

    def embed_text(self, text):
        """Encode *text* and return the embedding as a torch tensor."""
        return self.model.encode(text, convert_to_tensor=True)


embedding_model = CustomEmbeddingModel(EMBEDDING_MODEL_NAME)

# Load Spacy model for preprocessing
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    """Return *text* as space-joined, lower-cased lemmas of alphabetic tokens."""
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if token.is_alpha]
    return ' '.join(tokens)


def _to_faiss_array(tensor):
    """Convert a torch tensor to a CPU, float32, contiguous NumPy array.

    The encoder may return tensors on an accelerator device, and
    ``Tensor.numpy()`` only works for CPU tensors.
    """
    return tensor.detach().cpu().float().contiguous().numpy()


# Apply preprocessing and embedding. The raw page text stays in `text` (it is
# what gets shown to the user); the normalized form used for embedding goes in
# `clean_text`.
df['clean_text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['clean_text'].apply(embedding_model.embed_text)

# Create a FAISS index sized from the actual embedding dimension
embeddings = torch.stack(df['text_embeddings'].tolist())
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
faiss_index.add(_to_faiss_array(embeddings))


# Function to generate a response
def generate_response(prompt):
    """Return the text of the indexed page closest to *prompt*.

    Args:
        prompt: The user's query string.

    Returns:
        The raw text of the nearest page by L2 distance between embeddings,
        or a short message when the prompt is empty or nothing matches.
    """
    if not prompt or not prompt.strip():
        return "Please enter a question."

    # Normalize the query the same way as the indexed pages; fall back to the
    # raw prompt when normalization leaves nothing (e.g. "16.04").
    query = preprocess_text(prompt) or prompt
    query_embedding = embedding_model.embed_text(query).unsqueeze(0)
    _, indices = faiss_index.search(_to_faiss_array(query_embedding), 1)

    best = int(indices[0][0])
    if best < 0:
        # A negative label means no neighbour was found; indexing the frame
        # with it would silently return the last row instead.
        return "No matching passage found."
    return df.iloc[best]['text']


# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu..."),
    outputs=gr.Textbox(label="Response"),
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual.",
)

if __name__ == "__main__":
    iface.launch()