Update app.py
app.py
CHANGED
```diff
@@ -1,11 +1,9 @@
 import pandas as pd
-import PyPDF2
+import PyPDF2
 import spacy
-from langchain.llms import OpenAI
-from langchain.chains import ConversationalRetrievalChain
-from langchain.vectorstores import FAISS
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sentence_transformers import SentenceTransformer, util
 import torch
-from transformers import AutoTokenizer, AutoModel
 import gradio as gr
 
 # Load and preprocess PDF text
```
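The import changes tell the story: the LangChain stack (`OpenAI`, `ConversationalRetrievalChain`, the `FAISS` vector store wrapper) and the raw `transformers` loading are dropped in favor of `sentence-transformers`. (`TfidfVectorizer` and `util` are added but don't appear in any hunk shown.) A minimal sketch of what the swap means for the embedding step; the manual mean pooling mirrors the removed code, while sentence-transformers performs an equivalent, attention-mask-weighted pooling internally:

```python
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

text = "PDF question answering demo"

# Removed approach: tokenize, run a forward pass, mean-pool the token states.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
with torch.no_grad():
    manual = model(**inputs).last_hidden_state.mean(dim=1)[0]

# New approach: one call; sentence-transformers adds a mean-pooling head
# automatically when handed a plain checkpoint like this one.
st_model = SentenceTransformer('distilbert-base-uncased')
auto = st_model.encode(text, convert_to_tensor=True)

print(manual.shape, auto.shape)  # both torch.Size([768]) for DistilBERT
```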
```diff
@@ -27,14 +25,10 @@ df = pd.DataFrame({'text': [pdf_text]})
 # Load the custom embedding model
 class CustomEmbeddingModel:
     def __init__(self, model_name):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = AutoModel.from_pretrained(model_name)
+        self.model = SentenceTransformer(model_name)
 
     def embed_text(self, text):
-        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True)
-        with torch.no_grad():
-            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
-        return embeddings[0].numpy()
+        return self.model.encode(text, convert_to_tensor=True)
 
 embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
 
```
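With `SentenceTransformer` doing the pooling, the wrapper class collapses to two one-liners. Note that `distilbert-base-uncased` is not a sentence-similarity checkpoint; sentence-transformers wraps it with mean pooling and prints a warning, and a purpose-trained model usually retrieves noticeably better. A hypothetical usage sketch (the MiniLM name is a suggested alternative, not what the commit ships):

```python
from sentence_transformers import SentenceTransformer

class CustomEmbeddingModel:
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)

    def embed_text(self, text):
        return self.model.encode(text, convert_to_tensor=True)

# 'all-MiniLM-L6-v2' is a suggestion, not what the commit uses.
embedder = CustomEmbeddingModel('sentence-transformers/all-MiniLM-L6-v2')
vec = embedder.embed_text("What does the PDF say about pricing?")
print(vec.shape)  # torch.Size([384]); distilbert-base-uncased gives 768
```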
```diff
@@ -43,27 +37,24 @@ nlp = spacy.load("en_core_web_sm")
 
 def preprocess_text(text):
     doc = nlp(text)
-    tokens = [token.lemma_.lower() for token in doc if token.is_alpha]
+    tokens = [token.lemma_.lower() for token in doc if token.is_alpha]
     return ' '.join(tokens)
 
 # Apply preprocessing and embedding
 df['text'] = df['text'].apply(preprocess_text)
 df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
 
-# Create FAISS vector store
-texts = df['text'].tolist()
-embeddings = df['text_embeddings'].tolist()
-vector_store = FAISS.from_embeddings(list(zip(texts, embeddings)), embedding_model)
-
-# Create LangChain model and chain
-llm_model = OpenAI('gpt-3.5-turbo')  # You can replace this with a different LLM if desired
-retriever = vector_store.as_retriever()
-chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
+# Create a FAISS index
+index = faiss.IndexFlatL2(768)  # Assuming embeddings are 768-dimensional
+embeddings = torch.stack(df['text_embeddings'].tolist())
+faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
+faiss_index.add(embeddings.numpy())
 
 # Function to generate a response
 def generate_response(prompt):
-    result = chain({'question': prompt, 'chat_history': []})
-    response = result['answer']
+    query_embedding = embedding_model.embed_text(prompt).unsqueeze(0)
+    distances, indices = faiss_index.search(query_embedding.numpy(), k=1)
+    response = df.iloc[indices[0][0]]['text']
     return response
 
 # Gradio interface
```
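Two loose ends in the new indexing code: `faiss` is never imported in any hunk shown (the Space would need it installed, e.g. via the `faiss-cpu` package), and the first `index = faiss.IndexFlatL2(768)` is dead code, since `faiss_index` is immediately rebuilt from `embeddings.shape[1]`. A minimal corrected sketch of the same step, assuming `df` and `embedding_model` from the code above and CPU tensors (FAISS expects `float32`, which `encode` returns by default):

```python
import faiss  # missing from the new imports; pip install faiss-cpu
import torch

# Stack the per-row embedding tensors and size the L2 index from the data.
embeddings = torch.stack(df['text_embeddings'].tolist()).cpu()
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
faiss_index.add(embeddings.numpy())

def generate_response(prompt):
    # Embed the query and return the single nearest stored text.
    query_embedding = embedding_model.embed_text(prompt).cpu().unsqueeze(0)
    distances, indices = faiss_index.search(query_embedding.numpy(), k=1)
    return df.iloc[indices[0][0]]['text']
```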
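The diff ends at the `# Gradio interface` comment, so the UI wiring itself isn't visible. For a one-text-in, one-text-out function like `generate_response`, a typical minimal hookup looks like the following; this is an illustration, not the commit's actual interface code:

```python
import gradio as gr

# Hypothetical wiring for the generate_response function defined above.
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Ask about the PDF"),
    outputs=gr.Textbox(label="Closest passage"),
    title="PDF Q&A",
)

if __name__ == "__main__":
    demo.launch()
```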