# Ubuntu manual chatbot: extracts text from a PDF, embeds it, indexes the
# embeddings with FAISS, and serves nearest-match answers through Gradio.
import os
import pandas as pd
import PyPDF2
import spacy
import faiss
from sentence_transformers import SentenceTransformer
import torch
import gradio as gr

# Load and preprocess PDF text
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        One string holding the extracted text of all pages in page order,
        with no separator inserted between pages. A page that yields no
        text contributes an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract inside the ``with`` block so the file is still open while
        # the pages are read. ``or ""`` tolerates a page whose extraction
        # yields None instead of a string, which would otherwise make the
        # concatenation raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # A single join avoids repeatedly rebuilding a growing string.
    return "".join(page_texts)

# Path to your PDF file
pdf_path = 'FridayMaster/UBANTUMANUAL/Getting Started with Ubuntu 16.04.pdf'

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Upper bound on the number of words stored in one searchable passage.
CHUNK_WORDS = 200


def _split_into_chunks(text, max_words=CHUNK_WORDS):
    """Split ``text`` into consecutive passages of at most ``max_words`` words.

    Words are whitespace-delimited; each passage is re-joined with single
    spaces. Returns an empty list when ``text`` contains no words.

    Raises:
        ValueError: If ``max_words`` is not positive.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    words = text.split()
    return [
        ' '.join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]


# Convert the text to a DataFrame with one row per passage. Keeping the whole
# manual in a single row would leave the nearest-neighbour search with only
# one candidate, so every query would return the entire document.
# Fall back to a single row with the raw text when no words were extracted so
# the frame is never empty.
df = pd.DataFrame({'text': _split_into_chunks(pdf_text) or [pdf_text]})

# Load the custom embedding model
class CustomEmbeddingModel:
    """Thin wrapper that exposes a ``SentenceTransformer`` as ``embed_text``."""

    def __init__(self, model_name):
        """Load the encoder identified by ``model_name`` into ``self.model``."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def embed_text(self, text):
        """Encode ``text`` with the wrapped model.

        The call is forwarded to ``self.model.encode`` with
        ``convert_to_tensor=True`` and its result is returned unchanged.
        """
        embedding = self.model.encode(text, convert_to_tensor=True)
        return embedding

# NOTE(review): 'distilbert-base-uncased' looks like a plain transformer
# checkpoint rather than a sentence-embedding model — confirm it is the
# intended encoder.
embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name

# Load Spacy model for preprocessing.
# Only lemmas and the lexical ``is_alpha`` flag are read from the parsed
# tokens in this file, so the dependency parser and entity recognizer are
# switched off to avoid running components whose output is never used.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess_text(text):
    """Reduce ``text`` to a space-separated string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    whose ``is_alpha`` flag is false are dropped; every remaining token
    contributes its lowercased lemma.
    """
    lemmas = []
    for token in nlp(text):
        if not token.is_alpha:
            continue
        lemmas.append(token.lemma_.lower())
    return ' '.join(lemmas)

# Apply preprocessing and embedding.
# NOTE: this overwrites the raw text, so anything read back from df['text']
# later is the lemmatized, lowercase form rather than the original wording.
df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(embedding_model.embed_text)

# Create a FAISS index sized from the embeddings actually produced, instead
# of assuming a fixed dimensionality.
embeddings = torch.stack(df['text_embeddings'].tolist())
# Tensor.numpy() only works for CPU tensors, and the encoder may have placed
# its output on a GPU; the index is fed float32 values.
embedding_matrix = embeddings.detach().cpu().numpy().astype('float32')
faiss_index = faiss.IndexFlatL2(embedding_matrix.shape[1])
faiss_index.add(embedding_matrix)

# Function to generate a response
def generate_response(prompt):
    """Return the indexed passage closest to ``prompt``.

    The prompt is normalized with ``preprocess_text`` (the same normalization
    applied to the indexed text), embedded with ``embedding_model``, and
    matched against ``faiss_index`` with a nearest-neighbour search (k=1).

    Args:
        prompt: The user's query string.

    Returns:
        The ``'text'`` value of the closest row in ``df``, or a short notice
        when the prompt is blank or the search returns no match.
    """
    if not prompt or not prompt.strip():
        return "Please enter a query."

    # Query-time text must go through the same normalization as the indexed
    # text so both are embedded from comparable input. If normalization
    # removes everything (e.g. a digits-only prompt), embed the raw prompt.
    query_text = preprocess_text(prompt) or prompt
    query_embedding = embedding_model.embed_text(query_text).unsqueeze(0)
    # Tensor.numpy() only works for CPU tensors; the encoder may return a
    # GPU tensor. The search is fed float32 values.
    query_matrix = query_embedding.detach().cpu().numpy().astype('float32')
    _, indices = faiss_index.search(query_matrix, k=1)

    best_row = int(indices[0][0])
    if best_row < 0:
        # A negative label means the search had no neighbour to report;
        # df.iloc[-1] would silently return the last row instead.
        return "No matching passage found."
    return df.iloc[best_row]['text']

# Gradio interface
# Input and output widgets, built up front to keep the Interface call short.
_query_box = gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu...")
_answer_box = gr.Textbox(label="Response")

# Wire generate_response to a one-textbox-in, one-textbox-out interface.
iface = gr.Interface(
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual.",
    fn=generate_response,
    inputs=_query_box,
    outputs=_answer_box,
)

if __name__ == "__main__":
    # Launch the interface only when this file is executed as a script.
    iface.launch()