Pedro Gengo committed on
Commit
1b69bf3
1 Parent(s): da6d819

Adding app

Browse files
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import gradio as gr
3
+ import numpy as np
4
+ import pandas as pd
5
+ import torch.nn.functional as F
6
+ from sentence_transformers import SentenceTransformer
7
+
8
# Embedding width: used to size the FAISS index and to truncate query vectors.
DIM = 768
# Sentence encoder used for queries. trust_remote_code=True lets the model
# repository run its own Python code at load time.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# `read_csv` has no `index` keyword (passing one raises TypeError); the
# parameter that controls the row index is `index_col`.
# NOTE(review): on_bad_lines="skip" silently drops malformed rows, and search
# hits are mapped back to rows by position — confirm data/embeddings.npy was
# built from exactly the rows that survive the parsing and filtering below.
papers_df = pd.read_csv(
    "data/cvpr2024_papers_with_details.csv",
    index_col=None,
    on_bad_lines="skip",
)
# Keep only papers that have both a summary and a PDF path.
papers_df = papers_df[~papers_df["summary"].isna() & ~papers_df["pdf_path"].isna()]

with open("data/embeddings.npy", "rb") as f:
    embeddings = np.load(f)

# Flat L2 index over the precomputed embeddings.
index = faiss.IndexFlatL2(DIM)
index.add(embeddings)
19
+
20
+
21
def encode_query(query):
    """Embed a single query string for lookup in the FAISS index.

    The sentence embedding is layer-normalised over its feature axis,
    truncated to the first ``DIM`` components, and then L2-normalised.

    Args:
        query: Text to embed.

    Returns:
        A 2-D tensor holding one L2-normalised row for the query.
    """
    query_embeddings = model.encode([query], convert_to_tensor=True)
    # Each step must transform the *query* tensor. Previously these three
    # lines operated on the module-level `embeddings` array, which discarded
    # the encoded query and handed a NumPy array to torch functions.
    query_embeddings = F.layer_norm(
        query_embeddings, normalized_shape=(query_embeddings.shape[1],)
    )
    query_embeddings = query_embeddings[:, :DIM]
    query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
    return query_embeddings
27
+
28
def search_nearest_papers(query, k=5):
    """Return the papers whose embeddings are closest to ``query``.

    Args:
        query: Free-text search string.
        k: Maximum number of papers to return. UI sliders may deliver this
            as a float, so it is coerced to ``int``.

    Returns:
        A DataFrame with the ``title``, ``summary`` and ``pdf_path`` columns
        of the matching rows, nearest first. It may hold fewer than ``k``
        rows when the index contains fewer than ``k`` vectors.
    """
    k = int(k)
    query_embeddings = encode_query(query)
    # Detach and move to CPU first: Tensor.numpy() raises for tensors that
    # live on another device or that track gradients.
    query_vectors = query_embeddings.detach().cpu().numpy()
    _, neighbor_ids = index.search(query_vectors, k)
    hits = neighbor_ids[0]
    # FAISS pads missing neighbours with -1; without this filter `iloc[-1]`
    # would silently return the last paper for each padded slot.
    hits = hits[hits >= 0]
    return papers_df.iloc[hits][["title", "summary", "pdf_path"]]
32
+
33
# Gradio UI: a text box for the query and a slider for the number of results,
# rendered as a dataframe of matching papers.
demo = gr.Interface(
    search_nearest_papers,
    [
        "text",
        # `gr.inputs.Slider(..., default=...)` is the legacy namespace and was
        # removed in Gradio 4; the top-level component takes `value=` instead.
        # step=1 keeps the result count integral.
        gr.Slider(minimum=1, maximum=10, value=5, step=1),
    ],
    "dataframe",
)

if __name__ == "__main__":
    demo.launch()
data/cvpr2024_papers_with_details.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:939ae59573f4807a026ca5eeaa9bafc6daa3cbb6a7fb9b1f163991c6ede3ac09
3
+ size 3200
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ faiss-cpu
2
+ sentence-transformers
3
+ einops
4
+ pandas
5
+ numpy