FridayMaster committed (verified)
Commit 8918a3e · 1 Parent(s): 3854eb9

Update app.py

Files changed (1)
  app.py  +14 -23
app.py CHANGED
@@ -1,11 +1,9 @@
 import pandas as pd
-import PyPDF2  # For PDF extraction
+import PyPDF2
 import spacy
-from langchain.chains import ConversationalRetrievalChain
-from langchain.llms import OpenAI
-from langchain.vectorstores import FAISS
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sentence_transformers import SentenceTransformer, util
 import torch
-from transformers import AutoTokenizer, AutoModel
 import gradio as gr
 
 # Load and preprocess PDF text
@@ -27,14 +25,10 @@ df = pd.DataFrame({'text': [pdf_text]})
 # Load the custom embedding model
 class CustomEmbeddingModel:
     def __init__(self, model_name):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = AutoModel.from_pretrained(model_name)
+        self.model = SentenceTransformer(model_name)
 
     def embed_text(self, text):
-        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-        with torch.no_grad():
-            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
-        return embeddings[0].numpy()
+        return self.model.encode(text, convert_to_tensor=True)
 
 embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
 
@@ -43,27 +37,24 @@ nlp = spacy.load("en_core_web_sm")
 
 def preprocess_text(text):
     doc = nlp(text)
-    tokens = [token.lemma_.lower() for token in doc if token.text.lower() not in stopwords.words('english') and token.is_alpha]
+    tokens = [token.lemma_.lower() for token in doc if token.is_alpha]
     return ' '.join(tokens)
 
 # Apply preprocessing and embedding
 df['text'] = df['text'].apply(preprocess_text)
 df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
 
-# Create FAISS vector store
-documents = df['text'].tolist()
-embeddings = df['text_embeddings'].tolist()
-vector_store = FAISS.from_documents(documents, embeddings)
-
-# Create LangChain model and chain
-llm_model = OpenAI('gpt-3.5-turbo')  # You can replace this with a different LLM if desired
-retriever = vector_store.as_retriever()
-chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
+# Create a FAISS index
+index = faiss.IndexFlatL2(768)  # Assuming embeddings are 768-dimensional
+embeddings = torch.stack(df['text_embeddings'].tolist())
+faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
+faiss_index.add(embeddings.numpy())
 
 # Function to generate a response
 def generate_response(prompt):
-    result = chain({"query": prompt})
-    response = result["result"]
+    query_embedding = embedding_model.embed_text(prompt).unsqueeze(0)
+    distances, indices = faiss_index.search(query_embedding.numpy(), k=1)
+    response = df.iloc[indices[0][0]]['text']
    return response
 
 # Gradio interface
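The import changes swap the LangChain retrieval stack for sentence-transformers, but the code added later in this diff calls `faiss.IndexFlatL2` without ever importing `faiss`, and `TfidfVectorizer` and `util` are imported but never used in the hunks shown. A minimal import header matching what the new code actually touches might look like this (a sketch, assuming the `faiss-cpu` package is installed):

```python
# Sketch: imports the new app.py actually exercises. `faiss` is missing from
# the committed diff even though faiss.IndexFlatL2 is called below; the unused
# TfidfVectorizer/util imports are dropped here.
import pandas as pd
import PyPDF2                # PDF text extraction
import spacy                 # lemmatization in preprocess_text
import torch
import faiss                 # required by the index-building code below
import gradio as gr
from sentence_transformers import SentenceTransformer
```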
 
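One thing to note about the new wrapper: `'distilbert-base-uncased'` is a plain Hugging Face checkpoint rather than a native sentence-transformers model, so `SentenceTransformer` wraps it with a default mean-pooling head (and logs a warning), yielding 768-dimensional vectors. Also, `encode(..., convert_to_tensor=True)` returns a tensor on the model's device, so the later `.numpy()` calls would fail on a GPU machine. A quick sanity check, as a sketch:

```python
# Sketch: confirm the wrapped model's output shape and move the tensor to CPU
# so downstream .numpy() calls are safe regardless of device.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-uncased')  # wrapped with mean pooling
emb = model.encode("hello world", convert_to_tensor=True).cpu()
print(emb.shape)  # torch.Size([768]) -- consistent with IndexFlatL2(768) below
```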
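The index-building hunk creates two `IndexFlatL2` objects; the first, hard-coded to 768 dimensions, is never used again, and `embeddings.shape[1]` already supplies the dimension for the second. FAISS also expects a contiguous `float32` array. A sketch of the same step with the dead line removed (assuming `df['text_embeddings']` holds CPU tensors, as in the commit):

```python
# Sketch: build a single FAISS index, taking the dimension from the embeddings.
import faiss
import numpy as np
import torch

embeddings = torch.stack(df['text_embeddings'].tolist()).cpu().numpy()
embeddings = np.ascontiguousarray(embeddings, dtype='float32')  # FAISS wants float32

faiss_index = faiss.IndexFlatL2(embeddings.shape[1])  # one index is enough
faiss_index.add(embeddings)
```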
 
 
 
 
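Two details in the new retrieval path are worth flagging: the corpus is lemmatized by `preprocess_text` before embedding while the raw prompt is embedded as-is, so query and document vectors come from slightly different text; and `query_embedding.numpy()` again assumes a CPU tensor. The sketch below applies the same preprocessing to the query and wires the function into a minimal Gradio app; the committed interface code is truncated in this view, so the `gr.Interface` call here is an assumption, not the author's code.

```python
# Sketch: query path mirroring the corpus preprocessing, with CPU-safe tensor
# handling. preprocess_text, embedding_model, faiss_index and df are the
# objects defined earlier in app.py.
import gradio as gr

def generate_response(prompt):
    cleaned = preprocess_text(prompt)  # mirror the corpus preprocessing
    query_embedding = embedding_model.embed_text(cleaned).cpu().unsqueeze(0)
    distances, indices = faiss_index.search(query_embedding.numpy(), k=1)
    return df.iloc[indices[0][0]]['text']  # nearest stored chunk

# Hypothetical minimal interface; replace with the app's actual Gradio setup.
demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
demo.launch()
```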