FridayMaster committed on
Commit
929a283
1 Parent(s): aca97ad

Update app.py

Browse files
Files changed (1)
  1. app.py +98 -70
app.py CHANGED
@@ -1,78 +1,106 @@
- import os
- import pandas as pd
- import PyPDF2
- import spacy
+ import gradio as gr
  import faiss
+ import numpy as np
+ import openai
  from sentence_transformers import SentenceTransformer
- import torch
- import gradio as gr
+ import nltk
+ from nltk.tokenize import sent_tokenize
+ from transformers import AutoTokenizer, AutoModelForCausalLM
 
- # Load and preprocess PDF text
- def extract_text_from_pdf(pdf_path):
-     text = ""
-     with open(pdf_path, 'rb') as pdf_file:
-         pdf_reader = PyPDF2.PdfReader(pdf_file)
-         for page_num in range(len(pdf_reader.pages)):
-             page = pdf_reader.pages[page_num]
-             text += page.extract_text()
-     return text
-
- # Path to your PDF file
- pdf_path = 'FridayMaster/UBANTUMANUAL/Getting Started with Ubuntu 16.04.pdf'
-
- # Extract text from the PDF
- pdf_text = extract_text_from_pdf(pdf_path)
-
- # Convert the text to a DataFrame
- df = pd.DataFrame({'text': [pdf_text]})
-
- # Load the custom embedding model
- class CustomEmbeddingModel:
-     def __init__(self, model_name):
-         self.model = SentenceTransformer(model_name)
-
-     def embed_text(self, text):
-         return self.model.encode(text, convert_to_tensor=True)
-
- embedding_model = CustomEmbeddingModel('distilbert-base-uncased') # Replace with your model name
-
- # Load Spacy model for preprocessing
- nlp = spacy.load("en_core_web_sm")
-
- def preprocess_text(text):
-     doc = nlp(text)
-     tokens = [token.lemma_.lower() for token in doc if token.is_alpha]
-     return ' '.join(tokens)
-
- # Apply preprocessing and embedding
- df['text'] = df['text'].apply(preprocess_text)
- df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
-
- # Create a FAISS index
- index = faiss.IndexFlatL2(768) # Assuming embeddings are 768-dimensional
- embeddings = torch.stack(df['text_embeddings'].tolist())
- faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
- faiss_index.add(embeddings.numpy())
-
- # Function to generate a response
- def generate_response(prompt):
-     query_embedding = embedding_model.embed_text(prompt).unsqueeze(0)
-     distances, indices = faiss_index.search(query_embedding.numpy(), k=1)
-     response = df.iloc[indices[0][0]]['text']
-     return response
-
- # Gradio interface
+ # Load the Ubuntu manual from a .txt file
+ with open("/content/ubuntu_manual.txt", "r", encoding="utf-8") as file:
+     full_text = file.read()
+
+ # sent_tokenize needs the punkt tokenizer data
+ nltk.download("punkt", quiet=True)
+
+ # Function to chunk the text into smaller pieces (chunk_size is in words)
+ def chunk_text(text, chunk_size=500):  # Larger chunks
+     sentences = sent_tokenize(text)
+     chunks = []
+     current_chunk = []
+     current_len = 0  # word count of the current chunk
+
+     for sentence in sentences:
+         sentence_len = len(sentence.split())
+         if current_len + sentence_len <= chunk_size:
+             current_chunk.append(sentence)
+             current_len += sentence_len
+         else:
+             chunks.append(" ".join(current_chunk))
+             current_chunk = [sentence]
+             current_len = sentence_len
+
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+
+     return chunks
+
+ # Apply chunking to the entire text
+ manual_chunks = chunk_text(full_text, chunk_size=500)
+
+ # Load your FAISS index
+ index = faiss.read_index("path/to/your/faiss_index.bin")
+
+ # Load your embedding model
+ embedding_model = SentenceTransformer('your_embedding_model_name')
+
+ # OpenAI API key
+ openai.api_key = 'your-openai-api-key'
+
+ # Generator model and tokenizer; an assumption, since the script calls
+ # generator_model/generator_tokenizer without defining them anywhere
+ generator_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ generator_model = AutoModelForCausalLM.from_pretrained("gpt2")
+
+ # Function to create embeddings (float32, as FAISS expects)
+ def embed_text(text_list):
+     return np.array(embedding_model.encode(text_list), dtype=np.float32)
+
+ # Function to retrieve relevant chunks for a user query
+ def retrieve_chunks(query, k=5):
+     query_embedding = embed_text([query])
+
+     # Search the FAISS index
+     distances, indices = index.search(query_embedding, k=k)
+
+     # Debugging: print out the distances and indices
+     print("Distances:", distances)
+     print("Indices:", indices)
+
+     # Check if indices are valid
+     if len(indices[0]) == 0:
+         return []
+
+     # Keep indices that are in bounds (FAISS pads with -1 when it
+     # finds fewer than k results)
+     valid_indices = [i for i in indices[0] if 0 <= i < len(manual_chunks)]
+     if not valid_indices:
+         return []
+
+     # Retrieve relevant chunks
+     relevant_chunks = [manual_chunks[i] for i in valid_indices]
+     return relevant_chunks
+
+ # Function to truncate long inputs to the generator's context window
+ def truncate_input(text, max_length=512):
+     tokens = generator_tokenizer.encode(text, truncation=True, max_length=max_length, return_tensors="pt")
+     return tokens
+
+ # Function to perform RAG: retrieve chunks and generate a response
+ def rag_response(query, k=5, max_new_tokens=150):
+     # Step 1: Retrieve relevant chunks
+     relevant_chunks = retrieve_chunks(query, k=k)
+
+     if not relevant_chunks:
+         return "Sorry, I couldn't find relevant information."
+
+     # Step 2: Combine the query with retrieved chunks
+     augmented_input = query + "\n" + "\n".join(relevant_chunks)
+
+     # Truncate and encode the input
+     inputs = truncate_input(augmented_input)
+
+     # Generate response
+     outputs = generator_model.generate(inputs, max_new_tokens=max_new_tokens)
+     generated_text = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     return generated_text
+
+ # Gradio Interface
  iface = gr.Interface(
-     fn=generate_response,
-     inputs=gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu..."),
-     outputs=gr.Textbox(label="Response"),
-     title="Ubuntu Manual Chatbot",
-     description="Ask questions about the Ubuntu manual."
+     fn=rag_response,
+     inputs="text",
+     outputs="text",
+     title="RAG Chatbot with FAISS and GPT-3.5",
+     description="Ask me anything!"
  )
 
  if __name__ == "__main__":
      iface.launch()
 
-
-
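
Note: the updated app.py expects a prebuilt index at path/to/your/faiss_index.bin, but the commit never shows how that file is produced. Below is a minimal sketch of one way to build it from the same chunks and embedding model; the helper file name, the IndexFlatL2 choice, and the placeholder chunk list are assumptions of this example, not part of the commit.

# build_index.py -- hypothetical helper, not part of this commit
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('your_embedding_model_name')

# In app.py these would be the chunks produced by chunk_text(full_text)
manual_chunks = ["first chunk of the manual...", "second chunk..."]

# Embed all chunks as float32, the dtype FAISS expects
chunk_embeddings = np.array(embedding_model.encode(manual_chunks), dtype=np.float32)

# Exact L2 index sized to the embedding dimensionality
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(chunk_embeddings)

# Write the index to the path that app.py passes to faiss.read_index
faiss.write_index(index, "path/to/your/faiss_index.bin")

With the index written once, app.py can start without re-embedding the manual on every launch.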
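The interface title mentions GPT-3.5 and the script configures openai.api_key, yet generation actually runs through a local transformers model. If calling GPT-3.5 was the intent, one possible variant is sketched below, assuming the pre-1.0 openai Python client; the model name, prompt layout, and system message are choices of this example, not from the commit.

import openai

openai.api_key = 'your-openai-api-key'

def rag_response_gpt35(query, relevant_chunks, max_tokens=150):
    # Put the retrieved manual excerpts ahead of the user question
    context = "\n".join(relevant_chunks)
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Answer using only the provided Ubuntu manual excerpts."},
            {"role": "user", "content": context + "\n\nQuestion: " + query},
        ],
        max_tokens=max_tokens,
    )
    return completion.choices[0].message["content"]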