DanichOne committed on
Commit
b12d155
·
verified ·
1 Parent(s): 28ff1e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -11
app.py CHANGED
@@ -13,7 +13,6 @@ import string
13
  import unicodedata
14
  import time
15
 
16
-
17
  nltk.download('punkt')
18
  POST_ID = 0
19
  REFERENDUM_TYPE = "referendums_v2"
@@ -99,7 +98,7 @@ def get_proposals():
99
  def get_embeddings():
100
  global df_emb
101
  for i in range(len(df)):
102
- df_emb.loc[i] = [model.encode(markdn_2_str(df.iloc[i]['content']))]
103
 
104
 
105
  def update_proposals():
@@ -117,6 +116,7 @@ def update_proposals():
117
  POST_ID += 1
118
  else:
119
  print('proposals updated at {t}'.format(t=time.strftime("%H:%M:%S", time.localtime())))
 
120
  event.set()
121
  flag = False
122
 
@@ -125,15 +125,17 @@ def update_embeddings():
125
  global df_emb
126
  while True:
127
  event.wait()
128
- print(POST_ID)
129
- print(len(df))
130
 
131
  if len(df) != len(df_emb):
132
  id_to_add = [x + len(df_emb) for x in range(len(df) - len(df_emb))]
133
  for i in id_to_add:
134
- df_emb.loc[i] = [model.encode(markdn_2_str(df.iloc[i]['content']))]
 
 
135
  else:
136
  event.clear()
 
 
137
 
138
 
139
  def run_periodically():
@@ -144,7 +146,7 @@ def run_periodically():
144
  def compare_proposals(prop, count):
145
  query_emb = model.encode(markdn_2_str(prop))
146
  new_df = pd.DataFrame(columns=['sim1'])
147
- new_df['sim1'] = df_emb.apply(lambda row: dot_product(row[0], query_emb), axis=1)
148
  best_match = np.argsort(-new_df['sim1'])[0:count]
149
  res = [df.iloc[x]['content'] for x in best_match]
150
  stat = [df.iloc[x]['status'] for x in best_match]
@@ -163,12 +165,22 @@ if __name__ == '__main__':
163
  print('model downloaded')
164
 
165
  df = pd.DataFrame(columns=['content', 'status', 'ksm'])
166
- df_emb = pd.DataFrame(columns=['content'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- print('proposal collection start')
169
- get_proposals()
170
- print('proposals collected, embeddings calculation start')
171
- get_embeddings()
172
 
173
  POST_ID = len(df)
174
 
 
13
  import unicodedata
14
  import time
15
 
 
16
  nltk.download('punkt')
17
  POST_ID = 0
18
  REFERENDUM_TYPE = "referendums_v2"
 
98
  def get_embeddings():
99
  global df_emb
100
  for i in range(len(df)):
101
+ df_emb.loc[i] = model.encode(markdn_2_str(df.iloc[i]['content']))
102
 
103
 
104
  def update_proposals():
 
116
  POST_ID += 1
117
  else:
118
  print('proposals updated at {t}'.format(t=time.strftime("%H:%M:%S", time.localtime())))
119
+ df.to_excel('df.xlsx', index=False)
120
  event.set()
121
  flag = False
122
 
 
125
  global df_emb
126
  while True:
127
  event.wait()
 
 
128
 
129
  if len(df) != len(df_emb):
130
  id_to_add = [x + len(df_emb) for x in range(len(df) - len(df_emb))]
131
  for i in id_to_add:
132
+ print(model.encode(markdn_2_str(df.iloc[i])))
133
+ print(len(model.encode(markdn_2_str(df.iloc[i]))))
134
+ df_emb.loc[i] = model.encode(markdn_2_str(df.iloc[i]))
135
  else:
136
  event.clear()
137
+ df_emb.to_csv('df_emb.csv', index=False)
138
+
139
 
140
 
141
  def run_periodically():
 
146
  def compare_proposals(prop, count):
147
  query_emb = model.encode(markdn_2_str(prop))
148
  new_df = pd.DataFrame(columns=['sim1'])
149
+ new_df['sim1'] = df_emb.apply(lambda row: dot_product(row, query_emb), axis=1)
150
  best_match = np.argsort(-new_df['sim1'])[0:count]
151
  res = [df.iloc[x]['content'] for x in best_match]
152
  stat = [df.iloc[x]['status'] for x in best_match]
 
165
  print('model downloaded')
166
 
167
  df = pd.DataFrame(columns=['content', 'status', 'ksm'])
168
+ df_emb = pd.DataFrame(columns=range(768))
169
+ print(df_emb)
170
+
171
+
172
+ df = pd.read_excel('df.xlsx')
173
+ df_emb = pd.read_csv('df_emb.csv')
174
+ df = df.loc[:, ['content', 'status', 'ksm']]
175
+
176
+ # get_proposals()
177
+ # get_embeddings()
178
+
179
+ # df.to_excel('df.xlsx', index=False)
180
+ # df_emb.to_csv('df_emb.csv', index=False)
181
+
182
+
183
 
 
 
 
 
184
 
185
  POST_ID = len(df)
186