taskswithcode committed on
Commit
9dabfa9
·
1 Parent(s): 580bfe6

Added files

Browse files
Files changed (3) hide show
  1. app.py +56 -166
  2. sim_app_examples.json +5 -0
  3. sim_app_models.json +134 -0
app.py CHANGED
@@ -1,161 +1,29 @@
1
  import time
 
2
  import streamlit as st
3
  import string
4
  from io import StringIO
5
  import pdb
6
  import json
7
- from twc_embeddings import HFModel,SimCSEModel,SGPTModel
8
  import torch
9
 
10
 
11
  MAX_INPUT = 100
12
 
 
 
 
 
 
 
 
 
 
13
 
14
  from transformers import BertTokenizer, BertForMaskedLM
15
 
16
- model_names = [
17
-
18
- { "name":"sentence-transformers/all-MiniLM-L6-v2",
19
- "model":"sentence-transformers/all-MiniLM-L6-v2",
20
- "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
21
- "orig_author_url":"https://github.com/UKPLab",
22
- "orig_author":"Ubiquitous Knowledge Processing Lab",
23
- "sota_info": {
24
- "task":"Over 3.8 million downloads from huggingface",
25
- "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
26
- },
27
- "paper_url":"https://arxiv.org/abs/1908.10084",
28
- "mark":True,
29
- "class":"HFModel"},
30
- { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
31
- "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
32
- "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
33
- "orig_author_url":"https://github.com/UKPLab",
34
- "orig_author":"Ubiquitous Knowledge Processing Lab",
35
- "sota_info": {
36
- "task":"Over 2 million downloads from huggingface",
37
- "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
38
- },
39
- "paper_url":"https://arxiv.org/abs/1908.10084",
40
- "mark":True,
41
- "class":"HFModel"},
42
- { "name":"sentence-transformers/bert-base-nli-mean-tokens",
43
- "model":"sentence-transformers/bert-base-nli-mean-tokens",
44
- "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
45
- "orig_author_url":"https://github.com/UKPLab",
46
- "orig_author":"Ubiquitous Knowledge Processing Lab",
47
- "sota_info": {
48
- "task":"Over 700,000 downloads from huggingface",
49
- "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
50
- },
51
- "paper_url":"https://arxiv.org/abs/1908.10084",
52
- "mark":True,
53
- "class":"HFModel"},
54
- { "name":"sentence-transformers/all-mpnet-base-v2",
55
- "model":"sentence-transformers/all-mpnet-base-v2",
56
- "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
57
- "orig_author_url":"https://github.com/UKPLab",
58
- "orig_author":"Ubiquitous Knowledge Processing Lab",
59
- "sota_info": {
60
- "task":"Over 500,000 downloads from huggingface",
61
- "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
62
- },
63
- "paper_url":"https://arxiv.org/abs/1908.10084",
64
- "mark":True,
65
- "class":"HFModel"},
66
- { "name":"sentence-transformers/all-MiniLM-L12-v2",
67
- "model":"sentence-transformers/all-MiniLM-L12-v2",
68
- "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
69
- "orig_author_url":"https://github.com/UKPLab",
70
- "orig_author":"Ubiquitous Knowledge Processing Lab",
71
- "sota_info": {
72
- "task":"Over 500,000 downloads from huggingface",
73
- "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
74
- },
75
- "paper_url":"https://arxiv.org/abs/1908.10084",
76
- "mark":True,
77
- "class":"HFModel"},
78
-
79
- { "name":"SGPT-125M",
80
- "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
81
- "fork_url":"https://github.com/taskswithcode/sgpt",
82
- "orig_author_url":"https://github.com/Muennighoff",
83
- "orig_author":"Niklas Muennighoff",
84
- "sota_info": {
85
- "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
86
- "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic",
87
- },
88
- "paper_url":"https://arxiv.org/abs/2202.08904v5",
89
- "mark":True,
90
- "class":"SGPTModel"},
91
- { "name":"SGPT-1.3B",
92
- "model": "Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit",
93
- "fork_url":"https://github.com/taskswithcode/sgpt",
94
- "orig_author_url":"https://github.com/Muennighoff",
95
- "orig_author":"Niklas Muennighoff",
96
- "sota_info": {
97
- "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
98
- "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic",
99
- },
100
- "paper_url":"https://arxiv.org/abs/2202.08904v5",
101
- "Note":"If this large model takes too long or fails to load , try this ",
102
- "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
103
- "mark":True,
104
- "class":"SGPTModel"},
105
- { "name":"SGPT-5.8B",
106
- "model": "Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit" ,
107
- "fork_url":"https://github.com/taskswithcode/sgpt",
108
- "orig_author_url":"https://github.com/Muennighoff",
109
- "orig_author":"Niklas Muennighoff",
110
- "Note":"If this large model takes too long or fails to load , try this ",
111
- "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
112
- "sota_info": {
113
- "task":"#1 in multiple information retrieval & search tasks",
114
- "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic",
115
- },
116
- "paper_url":"https://arxiv.org/abs/2202.08904v5",
117
- "mark":True,
118
- "class":"SGPTModel"},
119
-
120
- { "name":"SIMCSE-large" ,
121
- "model":"princeton-nlp/sup-simcse-roberta-large",
122
- "fork_url":"https://github.com/taskswithcode/SimCSE",
123
- "orig_author_url":"https://github.com/princeton-nlp",
124
- "orig_author":"Princeton Natural Language Processing",
125
- "Note":"If this large model takes too long or fails to load , try this ",
126
- "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
127
- "sota_info": {
128
- "task":"Within top 10 in multiple semantic textual similarity tasks",
129
- "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
130
- },
131
- "paper_url":"https://arxiv.org/abs/2104.08821v4",
132
- "mark":True,
133
- "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
134
-
135
- { "name":"SIMCSE-base" ,
136
- "model":"princeton-nlp/sup-simcse-roberta-base",
137
- "fork_url":"https://github.com/taskswithcode/SimCSE",
138
- "orig_author_url":"https://github.com/princeton-nlp",
139
- "orig_author":"Princeton Natural Language Processing",
140
- "sota_info": {
141
- "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
142
- "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
143
- },
144
- "paper_url":"https://arxiv.org/abs/2104.08821v4",
145
- "mark":True,
146
- "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
147
-
148
-
149
- ]
150
-
151
-
152
-
153
-
154
-
155
- example_file_names = {
156
- "Machine learning terms (30+ phrases)": "small_test.txt",
157
- "Customer feedback mixed with noise (50+ sentences)":"larger_test.txt"
158
- }
159
 
160
  view_count_file = "view_count.txt"
161
 
@@ -177,12 +45,12 @@ def get_views():
177
 
178
 
179
 
180
- def construct_model_info_for_display():
181
  options_arr = []
182
  markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b></div>"
183
  for node in model_names:
184
  options_arr .append(node["name"])
185
- if (node["mark"] == True):
186
  markdown_str += f"<div style=\"font-size:16px; color: #5f5f5f; text-align: left\">&nbsp;•&nbsp;Model:&nbsp;<a href=\'{node['paper_url']}\' target='_blank'>{node['name']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Code released by:&nbsp;<a href=\'{node['orig_author_url']}\' target='_blank'>{node['orig_author']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Model info:&nbsp;<a href=\'{node['sota_info']['sota_link']}\' target='_blank'>{node['sota_info']['task']}</a></div>"
187
  if ("Note" in node):
188
  markdown_str += f"<div style=\"font-size:16px; color: #a91212; text-align: left\">&nbsp;&nbsp;&nbsp;&nbsp;{node['Note']}<a href=\'{node['alt_url']}\' target='_blank'>link</a></div>"
@@ -194,7 +62,7 @@ def construct_model_info_for_display():
194
  return options_arr,markdown_str
195
 
196
 
197
- st.set_page_config(page_title='TWC - Compare popular/state-of-the-art models for Sentence Similarity task', page_icon="logo.jpg", layout='centered', initial_sidebar_state='auto',
198
  menu_items={
199
  'About': 'This app was created by taskswithcode. http://taskswithcode.com'
200
 
@@ -206,7 +74,7 @@ with col:
206
 
207
 
208
  @st.experimental_memo
209
- def load_model(model_name):
210
  try:
211
  ret_model = None
212
  for node in model_names:
@@ -235,18 +103,18 @@ def uncached_compute_similarity(sentences,_model,model_name,main_index):
235
  #st.success("Similarity computation complete")
236
  return results
237
 
238
- def get_model_info(model_name):
239
  for node in model_names:
240
  if (model_name == node["name"]):
241
  return node
242
 
243
- def run_test(model_name,sentences,display_area,main_index,user_uploaded):
244
  display_area.text("Loading model:" + model_name)
245
- model_info = get_model_info(model_name)
246
  if ("Note" in model_info):
247
  fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
248
  display_area.write(fail_link)
249
- model = load_model(model_name)
250
  display_area.text("Model " + model_name + " load complete")
251
  try:
252
  if (user_uploaded):
@@ -266,13 +134,20 @@ def run_test(model_name,sentences,display_area,main_index,user_uploaded):
266
 
267
 
268
 
269
- def display_results(orig_sentences,main_index,results,response_info):
270
  main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
271
- main_sent += "<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Results sorted by cosine distance. Closest(1) to furthest(-1) away from main sentence</div>"
272
- main_sent += f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><b>Main sentence:</b>&nbsp;&nbsp;{orig_sentences[main_index]}</div>"
 
 
 
273
  body_sent = []
274
  download_data = {}
 
275
  for key in results:
 
 
 
276
  index = orig_sentences.index(key) + 1
277
  body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{index}]&nbsp;{key}&nbsp;&nbsp;&nbsp;<b>{results[key]:.2f}</b></div>")
278
  download_data[key] = f"{results[key]:.2f}"
@@ -287,9 +162,15 @@ def init_session():
287
  st.session_state["main_index"] = 1
288
  st.session_state["file_name"] = "default"
289
 
290
- def main():
291
  init_session()
292
- st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for Sentence Similarity task</h5>", unsafe_allow_html=True)
 
 
 
 
 
 
293
  st.markdown(f"<div style='color: #9f9f9f; text-align: right'>views:&nbsp;{get_views()}</div>", unsafe_allow_html=True)
294
 
295
 
@@ -298,17 +179,23 @@ def main():
298
 
299
  with st.form('twc_form'):
300
 
301
- uploaded_file = st.file_uploader("Step 1. Upload text file(one sentence in a line) or choose an example text file below", type=".txt")
 
 
 
302
 
303
- selected_file_index = st.selectbox(label='Example files ',
304
  options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
305
  st.write("")
306
- options_arr,markdown_str = construct_model_info_for_display()
307
  selection_label = 'Step 2. Select Model'
308
  selected_model = st.selectbox(label=selection_label,
309
  options = options_arr, index=0, key = "twc_model")
310
  st.write("")
311
- main_index = st.number_input('Step 3. Enter index of sentence in file to make it the main sentence',value=1,min_value = 1)
 
 
 
312
  st.write("")
313
  submit_button = st.form_submit_button('Run')
314
 
@@ -321,8 +208,8 @@ def main():
321
  st.session_state["file_name"] = uploaded_file.name
322
  sentences = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
323
  else:
324
- st.session_state["file_name"] = example_file_names[selected_file_index]
325
- sentences = open(example_file_names[selected_file_index]).read()
326
  sentences = sentences.split("\n")[:-1]
327
  if (len(sentences) < main_index):
328
  main_index = len(sentences)
@@ -332,12 +219,12 @@ def main():
332
  sentences = sentences[:MAX_INPUT]
333
  st.session_state["model_name"] = selected_model
334
  st.session_state["main_index"] = main_index
335
- results = run_test(selected_model,sentences,display_area,main_index - 1,(uploaded_file is not None))
336
  display_area.empty()
337
  with display_area.container():
338
  device = 'GPU' if torch.cuda.is_available() else 'CPU'
339
  response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
340
- display_results(sentences,main_index - 1,results,response_info)
341
  #st.json(results)
342
  st.download_button(
343
  label="Download results as json",
@@ -359,5 +246,8 @@ def main():
359
 
360
 
361
  if __name__ == "__main__":
362
- main()
 
 
 
363
 
 
1
  import time
2
+ import sys
3
  import streamlit as st
4
  import string
5
  from io import StringIO
6
  import pdb
7
  import json
8
+ from twc_embeddings import HFModel,SimCSEModel,SGPTModel,CausalLMModel,SGPTQnAModel
9
  import torch
10
 
11
 
12
  MAX_INPUT = 100
13
 
14
# Application-mode identifiers. One of these string values is passed to
# app_main as app_mode and is used as the lookup key into use_case below.
SEM_SIMILARITY="1"
DOC_RETRIEVAL="2"
CLUSTERING="3"


# Human-readable description of each mode. Keyed by the constants above
# (rather than repeated "1"/"2"/"3" literals) so the identifiers and the
# table cannot drift apart; the resulting dict is unchanged.
use_case = {
    SEM_SIMILARITY: "Finding similar phrases/sentences",
    DOC_RETRIEVAL: "Retrieving semantically matching information to a query. It may not be a factual match",
    CLUSTERING: "Clustering",
}
20
+
21
+
22
+
23
 
24
  from transformers import BertTokenizer, BertForMaskedLM
25
 
26
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  view_count_file = "view_count.txt"
29
 
 
45
 
46
 
47
 
48
+ def construct_model_info_for_display(model_names):
49
  options_arr = []
50
  markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b></div>"
51
  for node in model_names:
52
  options_arr .append(node["name"])
53
+ if (node["mark"] == "True"):
54
  markdown_str += f"<div style=\"font-size:16px; color: #5f5f5f; text-align: left\">&nbsp;•&nbsp;Model:&nbsp;<a href=\'{node['paper_url']}\' target='_blank'>{node['name']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Code released by:&nbsp;<a href=\'{node['orig_author_url']}\' target='_blank'>{node['orig_author']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Model info:&nbsp;<a href=\'{node['sota_info']['sota_link']}\' target='_blank'>{node['sota_info']['task']}</a></div>"
55
  if ("Note" in node):
56
  markdown_str += f"<div style=\"font-size:16px; color: #a91212; text-align: left\">&nbsp;&nbsp;&nbsp;&nbsp;{node['Note']}<a href=\'{node['alt_url']}\' target='_blank'>link</a></div>"
 
62
  return options_arr,markdown_str
63
 
64
 
65
+ st.set_page_config(page_title='TWC - Compare popular/state-of-the-art models for tasks using sentence embeddings', page_icon="logo.jpg", layout='centered', initial_sidebar_state='auto',
66
  menu_items={
67
  'About': 'This app was created by taskswithcode. http://taskswithcode.com'
68
 
 
74
 
75
 
76
  @st.experimental_memo
77
+ def load_model(model_name,model_names):
78
  try:
79
  ret_model = None
80
  for node in model_names:
 
103
  #st.success("Similarity computation complete")
104
  return results
105
 
106
def get_model_info(model_names,model_name):
    """Return the entry of ``model_names`` whose ``"name"`` equals ``model_name``.

    Args:
        model_names: Iterable of model-description entries; each entry is
            indexed with the key ``"name"``.
        model_name: The name to look up.

    Returns:
        The first entry whose ``"name"`` matches, or ``None`` when no entry
        matches (including when ``model_names`` is empty).
    """
    for node in model_names:
        if model_name == node["name"]:
            return node
    # State the not-found result explicitly instead of falling off the end,
    # so callers can see they must handle a missing model.
    return None
110
 
111
+ def run_test(model_names,model_name,sentences,display_area,main_index,user_uploaded):
112
  display_area.text("Loading model:" + model_name)
113
+ model_info = get_model_info(model_names,model_name)
114
  if ("Note" in model_info):
115
  fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
116
  display_area.write(fail_link)
117
+ model = load_model(model_name,model_names)
118
  display_area.text("Model " + model_name + " load complete")
119
  try:
120
  if (user_uploaded):
 
134
 
135
 
136
 
137
+ def display_results(orig_sentences,main_index,results,response_info,app_mode):
138
  main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
139
+ score_text = "cosine_distance" if app_mode == "similarity" else "cosine_distance/score"
140
+ pivot_name = "main sentence" if app_mode == "similarity" else "query"
141
+ main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Results sorted by {score_text}. Closest to furthest away from {pivot_name}</div>"
142
+ pivot_name = pivot_name[0].upper() + pivot_name[1:]
143
+ main_sent += f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><b>{pivot_name}:</b>&nbsp;&nbsp;{orig_sentences[main_index]}</div>"
144
  body_sent = []
145
  download_data = {}
146
+ first = True
147
  for key in results:
148
+ if (app_mode == DOC_RETRIEVAL and first):
149
+ first = False
150
+ continue
151
  index = orig_sentences.index(key) + 1
152
  body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{index}]&nbsp;{key}&nbsp;&nbsp;&nbsp;<b>{results[key]:.2f}</b></div>")
153
  download_data[key] = f"{results[key]:.2f}"
 
162
  st.session_state["main_index"] = 1
163
  st.session_state["file_name"] = "default"
164
 
165
+ def app_main(app_mode,example_files,model_name_files):
166
  init_session()
167
+ with open(example_files) as fp:
168
+ example_file_names = json.load(fp)
169
+ with open(model_name_files) as fp:
170
+ model_names = json.load(fp)
171
+ curr_use_case = use_case[app_mode].split(".")[0]
172
+ st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for tasks using sentence embeddings</h5>", unsafe_allow_html=True)
173
+ st.markdown(f"<div style='color: #4f4f4f; text-align: left'>Use cases for sentence embeddings<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['1']}<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['2']}<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['3']}<br/><i>This app illustrates <b>'{curr_use_case}'</b> use case</i></div>", unsafe_allow_html=True)
174
  st.markdown(f"<div style='color: #9f9f9f; text-align: right'>views:&nbsp;{get_views()}</div>", unsafe_allow_html=True)
175
 
176
 
 
179
 
180
  with st.form('twc_form'):
181
 
182
+ step1_line = "Step 1. Upload text file(one sentence in a line) or choose an example text file below"
183
+ if (app_mode == DOC_RETRIEVAL):
184
+ step1_line += ". The first line is treated as the query"
185
+ uploaded_file = st.file_uploader(step1_line, type=".txt")
186
 
187
+ selected_file_index = st.selectbox(label=f'Example files ({len(example_file_names)})',
188
  options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
189
  st.write("")
190
+ options_arr,markdown_str = construct_model_info_for_display(model_names)
191
  selection_label = 'Step 2. Select Model'
192
  selected_model = st.selectbox(label=selection_label,
193
  options = options_arr, index=0, key = "twc_model")
194
  st.write("")
195
+ if (app_mode == "similarity"):
196
+ main_index = st.number_input('Step 3. Enter index of sentence in file to make it the main sentence',value=1,min_value = 1)
197
+ else:
198
+ main_index = 1
199
  st.write("")
200
  submit_button = st.form_submit_button('Run')
201
 
 
208
  st.session_state["file_name"] = uploaded_file.name
209
  sentences = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
210
  else:
211
+ st.session_state["file_name"] = example_file_names[selected_file_index]["name"]
212
+ sentences = open(example_file_names[selected_file_index]["name"]).read()
213
  sentences = sentences.split("\n")[:-1]
214
  if (len(sentences) < main_index):
215
  main_index = len(sentences)
 
219
  sentences = sentences[:MAX_INPUT]
220
  st.session_state["model_name"] = selected_model
221
  st.session_state["main_index"] = main_index
222
+ results = run_test(model_names,selected_model,sentences,display_area,main_index - 1,(uploaded_file is not None))
223
  display_area.empty()
224
  with display_area.container():
225
  device = 'GPU' if torch.cuda.is_available() else 'CPU'
226
  response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
227
+ display_results(sentences,main_index - 1,results,response_info,app_mode)
228
  #st.json(results)
229
  st.download_button(
230
  label="Download results as json",
 
246
 
247
 
248
  if __name__ == "__main__":
249
+ #print("command line input:",len(sys.argv),str(sys.argv))
250
+ #app_main(sys.argv[1],sys.argv[2],sys.argv[3])
251
+ app_main("1","sim_app_examples.json","sim_app_models.json")
252
+ #app_main("2","doc_app_examples.json","doc_app_models.json")
253
 
sim_app_examples.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "Machine learning terms (phrases test)": {"name":"small_test.txt"},
3
+ "Customer feedback mixed with noise":{"name":"larger_test.txt"},
4
+ "Movie reviews": {"name":"imdb_sent.txt"}
5
+ }
sim_app_models.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+
3
+ { "name":"sentence-transformers/all-MiniLM-L6-v2",
4
+ "model":"sentence-transformers/all-MiniLM-L6-v2",
5
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
6
+ "orig_author_url":"https://github.com/UKPLab",
7
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
8
+ "sota_info": {
9
+ "task":"Over 3.8 million downloads from huggingface",
10
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
11
+ },
12
+ "paper_url":"https://arxiv.org/abs/1908.10084",
13
+ "mark":"True",
14
+ "class":"HFModel"},
15
+ { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
16
+ "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
17
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
18
+ "orig_author_url":"https://github.com/UKPLab",
19
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
20
+ "sota_info": {
21
+ "task":"Over 2 million downloads from huggingface",
22
+ "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
23
+ },
24
+ "paper_url":"https://arxiv.org/abs/1908.10084",
25
+ "mark":"True",
26
+ "class":"HFModel"},
27
+ { "name":"sentence-transformers/bert-base-nli-mean-tokens",
28
+ "model":"sentence-transformers/bert-base-nli-mean-tokens",
29
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
30
+ "orig_author_url":"https://github.com/UKPLab",
31
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
32
+ "sota_info": {
33
+ "task":"Over 700,000 downloads from huggingface",
34
+ "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
35
+ },
36
+ "paper_url":"https://arxiv.org/abs/1908.10084",
37
+ "mark":"True",
38
+ "class":"HFModel"},
39
+ { "name":"sentence-transformers/all-mpnet-base-v2",
40
+ "model":"sentence-transformers/all-mpnet-base-v2",
41
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
42
+ "orig_author_url":"https://github.com/UKPLab",
43
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
44
+ "sota_info": {
45
+ "task":"Over 500,000 downloads from huggingface",
46
+ "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
47
+ },
48
+ "paper_url":"https://arxiv.org/abs/1908.10084",
49
+ "mark":"True",
50
+ "class":"HFModel"},
51
+ { "name":"sentence-transformers/all-MiniLM-L12-v2",
52
+ "model":"sentence-transformers/all-MiniLM-L12-v2",
53
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
54
+ "orig_author_url":"https://github.com/UKPLab",
55
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
56
+ "sota_info": {
57
+ "task":"Over 500,000 downloads from huggingface",
58
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
59
+ },
60
+ "paper_url":"https://arxiv.org/abs/1908.10084",
61
+ "mark":"True",
62
+ "class":"HFModel"},
63
+
64
+ { "name":"SGPT-125M",
65
+ "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
66
+ "fork_url":"https://github.com/taskswithcode/sgpt",
67
+ "orig_author_url":"https://github.com/Muennighoff",
68
+ "orig_author":"Niklas Muennighoff",
69
+ "sota_info": {
70
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
71
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
72
+ },
73
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
74
+ "mark":"True",
75
+ "class":"SGPTModel"},
76
+ { "name":"SGPT-1.3B",
77
+ "model": "Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit",
78
+ "fork_url":"https://github.com/taskswithcode/sgpt",
79
+ "orig_author_url":"https://github.com/Muennighoff",
80
+ "orig_author":"Niklas Muennighoff",
81
+ "sota_info": {
82
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
83
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
84
+ },
85
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
86
+ "Note":"If this large model takes too long or fails to load , try this ",
87
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
88
+ "mark":"True",
89
+ "class":"SGPTModel"},
90
+ { "name":"SGPT-5.8B",
91
+ "model": "Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit" ,
92
+ "fork_url":"https://github.com/taskswithcode/sgpt",
93
+ "orig_author_url":"https://github.com/Muennighoff",
94
+ "orig_author":"Niklas Muennighoff",
95
+ "Note":"If this large model takes too long or fails to load , try this ",
96
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
97
+ "sota_info": {
98
+ "task":"#1 in multiple information retrieval & search tasks",
99
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
100
+ },
101
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
102
+ "mark":"True",
103
+ "class":"SGPTModel"},
104
+
105
+ { "name":"SIMCSE-large" ,
106
+ "model":"princeton-nlp/sup-simcse-roberta-large",
107
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
108
+ "orig_author_url":"https://github.com/princeton-nlp",
109
+ "orig_author":"Princeton Natural Language Processing",
110
+ "Note":"If this large model takes too long or fails to load , try this ",
111
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
112
+ "sota_info": {
113
+ "task":"Within top 10 in multiple semantic textual similarity tasks",
114
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
115
+ },
116
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
117
+ "mark":"True",
118
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
119
+
120
+ { "name":"SIMCSE-base" ,
121
+ "model":"princeton-nlp/sup-simcse-roberta-base",
122
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
123
+ "orig_author_url":"https://github.com/princeton-nlp",
124
+ "orig_author":"Princeton Natural Language Processing",
125
+ "sota_info": {
126
+ "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
127
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
128
+ },
129
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
130
+ "mark":"True",
131
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
132
+
133
+
134
+ ]