A-Roucher committed on
Commit
c9fa165
1 Parent(s): d3b59ef

fix: change requirements

Browse files
Files changed (2) hide show
  1. app.py +14 -15
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,19 +1,18 @@
1
  import streamlit as st
2
  from sentence_transformers import SentenceTransformer
3
  import datasets
4
-
5
- x = st.slider('Select a value')
6
- st.write(x, 'squared is', x * x)
7
 
8
  st.sidebar.text_input("Type your quote here")
9
 
10
  dataset = datasets.load_dataset('A-Roucher/english_historical_quotes', download_mode="force_redownload")
11
 
12
- dataset = dataset['train']
13
 
14
  model_name = "sentence-transformers/all-MiniLM-L6-v2" # BAAI/bge-small-en-v1.5" # "Cohere/Cohere-embed-english-light-v3.0" # "sentence-transformers/all-MiniLM-L6-v2"
15
-
16
  encoder = SentenceTransformer(model_name)
 
17
  embeddings = encoder.encode(
18
  dataset["quote"],
19
  batch_size=4,
@@ -22,8 +21,8 @@ embeddings = encoder.encode(
22
  normalize_embeddings=True,
23
  )
24
 
25
- dataset_embeddings = datasets.Dataset.from_dict({"embeddings": embeddings})
26
- dataset_embeddings.add_faiss_index(column="embeddings")
27
 
28
  # dataset_embeddings.save_faiss_index('embeddings', 'output/index_alone.faiss')
29
 
@@ -36,17 +35,17 @@ sentence_embedding = encoder.encode([sentence])
36
  # scores, samples = dataset_embeddings.search(
37
  # sentence_embedding, k=10
38
  # )
39
-
 
40
  from sentence_transformers.util import semantic_search
41
 
42
- # hits = semantic_search(sentence_embedding, dataset_embeddings[:, :], top_k=5)
43
- author_indexes = list(range(1000))
44
- hits = semantic_search(sentence_embedding, dataset_embeddings[author_indexes, :], top_k=5)
45
- st.write(hits)
46
- list_hits = [author_indexes[i['corpus_id']] for i in hits[0]]
47
- st.write(dataset_embeddings.select([12676, 4967, 2612, 8884, 4797]))
48
-
49
 
 
 
 
 
50
 
51
  # sentence_embedding = model.encode([sentence])
52
  # scores, sample_indexes = QUOTES_INDEX.search(
 
1
  import streamlit as st
2
  from sentence_transformers import SentenceTransformer
3
  import datasets
4
+ import faiss
5
+ import torch
 
6
 
7
  st.sidebar.text_input("Type your quote here")
8
 
9
  dataset = datasets.load_dataset('A-Roucher/english_historical_quotes', download_mode="force_redownload")
10
 
11
+ dataset = datasets.Dataset.from_dict(dataset['train'][:100])
12
 
13
  model_name = "sentence-transformers/all-MiniLM-L6-v2" # BAAI/bge-small-en-v1.5" # "Cohere/Cohere-embed-english-light-v3.0" # "sentence-transformers/all-MiniLM-L6-v2"
 
14
  encoder = SentenceTransformer(model_name)
15
+
16
  embeddings = encoder.encode(
17
  dataset["quote"],
18
  batch_size=4,
 
21
  normalize_embeddings=True,
22
  )
23
 
24
+ # dataset_embeddings = datasets.Dataset.from_dict({"embeddings": embeddings})
25
+ # dataset_embeddings.add_faiss_index(column="embeddings")
26
 
27
  # dataset_embeddings.save_faiss_index('embeddings', 'output/index_alone.faiss')
28
 
 
35
  # scores, samples = dataset_embeddings.search(
36
  # sentence_embedding, k=10
37
  # )
38
+ sentence_embedding_tensor = torch.Tensor(sentence_embedding)
39
+ dataset_embeddings_tensor = torch.Tensor(embeddings)
40
  from sentence_transformers.util import semantic_search
41
 
42
+ author_indexes = list(range(10))
43
+ hits = semantic_search(sentence_embedding_tensor, dataset_embeddings_tensor[author_indexes, :], top_k=5)
 
 
 
 
 
44
 
45
+ list_hits = [author_indexes[i['corpus_id']] for i in hits[0]]
46
+ print(list_hits)
47
+ print(dataset)
48
+ st.write(dataset.select(list_hits))
49
 
50
  # sentence_embedding = model.encode([sentence])
51
  # scores, sample_indexes = QUOTES_INDEX.search(
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
- datasets==2.5.2
 
2
  sentence_transformers==2.2.2
3
  streamlit==1.28.1
 
1
+ datasets==2.14.6
2
+ faiss-cpu==1.7.3
3
  sentence_transformers==2.2.2
4
  streamlit==1.28.1