felix committed on
Commit
01a5a51
·
1 Parent(s): 55d3f7a

improvements

Browse files
Files changed (2) hide show
  1. Addr-Test.xlsx +0 -0
  2. app.py +15 -4
Addr-Test.xlsx DELETED
Binary file (11 kB)
 
app.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
  import numpy as np
5
  import torch
6
  from transformers import AlbertTokenizer, AlbertModel
 
7
  from sklearn.metrics.pairwise import cosine_similarity
8
  from io import BytesIO
9
 
@@ -12,6 +13,14 @@ model_size='base'
12
  tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
13
  model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
14
 
 
 
 
 
 
 
 
 
15
  def get_embedding(input_text):
16
  encoded_input = tokenizer(input_text, return_tensors='pt')
17
  input_ids = encoded_input.input_ids
@@ -31,7 +40,7 @@ def get_embedding(input_text):
31
  #sentence_embedding = output.last_hidden_state[0][0]
32
  return sentence_embedding.tolist()
33
 
34
- st. set_page_config(layout="wide")
35
  st.title('Upload the Address Dataset')
36
 
37
  st.markdown('Upload an Excel file to view the data in a table.')
@@ -92,7 +101,7 @@ if uploaded_file is not None:
92
  end = num_items
93
  stop_iter = True
94
 
95
- data_caqh.iloc[start:end, embedding_col_index] = data_caqh.iloc[start:end, full_addr_col_index].apply(get_embedding)
96
 
97
  progress_bar.progress(value=progress, text=f"CAQH embeddings: {(i + 1) * step_size} processed out of {num_items}")
98
 
@@ -123,7 +132,8 @@ if uploaded_file is not None:
123
  end = num_items
124
  stop_iter = True
125
 
126
- data_ndb.iloc[start:end, embedding_col_index] = data_ndb.iloc[start:end, full_addr_col_index].apply(get_embedding)
 
127
 
128
  progress_bar.progress(value=progress, text=f"NDB embeddings: {(i + 1) * step_size} processed out of {num_items}")
129
 
@@ -142,10 +152,11 @@ if uploaded_file is not None:
142
  if sim > max_similarity:
143
  max_similarity = sim
144
  matched_row = ndb_row
145
- if max_similarity >= 0.98:
146
  data_caqh.at[i, 'matched-addr'] = matched_row['full-addr']
147
  data_caqh.at[i, 'similarity-score'] = max_similarity
148
  else:
 
149
  data_caqh.at[i, 'matched-addr'] = 'No Matches'
150
 
151
  progress = i / num_items
 
4
  import numpy as np
5
  import torch
6
  from transformers import AlbertTokenizer, AlbertModel
7
+ from sentence_transformers import SentenceTransformer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  from io import BytesIO
10
 
 
13
  tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
14
  model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
15
 
16
+ model_sbert = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')
17
+ # for regular BERT 0.98
18
+ similarity_threshold = 0.9
19
+
20
+ def get_sbert_embedding(input_text):
21
+ embedding = model_sbert.encode(input_text)
22
+ return embedding.tolist()
23
+
24
  def get_embedding(input_text):
25
  encoded_input = tokenizer(input_text, return_tensors='pt')
26
  input_ids = encoded_input.input_ids
 
40
  #sentence_embedding = output.last_hidden_state[0][0]
41
  return sentence_embedding.tolist()
42
 
43
+ st.set_page_config(layout="wide")
44
  st.title('Upload the Address Dataset')
45
 
46
  st.markdown('Upload an Excel file to view the data in a table.')
 
101
  end = num_items
102
  stop_iter = True
103
 
104
+ data_caqh.iloc[start:end, embedding_col_index] = data_caqh.iloc[start:end, full_addr_col_index].apply(get_sbert_embedding)
105
 
106
  progress_bar.progress(value=progress, text=f"CAQH embeddings: {(i + 1) * step_size} processed out of {num_items}")
107
 
 
132
  end = num_items
133
  stop_iter = True
134
 
135
+ # or get_embedding
136
+ data_ndb.iloc[start:end, embedding_col_index] = data_ndb.iloc[start:end, full_addr_col_index].apply(get_sbert_embedding)
137
 
138
  progress_bar.progress(value=progress, text=f"NDB embeddings: {(i + 1) * step_size} processed out of {num_items}")
139
 
 
152
  if sim > max_similarity:
153
  max_similarity = sim
154
  matched_row = ndb_row
155
+ if max_similarity >= similarity_threshold:
156
  data_caqh.at[i, 'matched-addr'] = matched_row['full-addr']
157
  data_caqh.at[i, 'similarity-score'] = max_similarity
158
  else:
159
+ print(f"max similarity was {max_similarity}")
160
  data_caqh.at[i, 'matched-addr'] = 'No Matches'
161
 
162
  progress = i / num_items