cmulgy committed on
Commit e0e609c
1 Parent(s): 8be0131
Files changed (2)
  1. arxiv_agent.py +119 -63
  2. utils.py +26 -9
arxiv_agent.py CHANGED
@@ -4,8 +4,11 @@ import json
 import time
 import datetime
 from xml.etree import ElementTree
-
+from huggingface_hub import CommitScheduler
+from huggingface_hub import HfApi
+from pathlib import Path
 import requests
+from datasets import load_dataset_builder
 import warnings
 warnings.filterwarnings("ignore")
 os.environ['KMP_DUPLICATE_LIB_OK']='True'
@@ -13,6 +16,24 @@ from utils import *
 import thread6
 MAX_DAILY_PAPER = 200
 DAY_TIME = 60 * 60 * 24
+DAY_TIME_MIN = 60 * 24
+DATA_REPO_ID = "cmulgy/ArxivCopilot_data"
+READ_WRITE_TOKEN = os.environ['READ_WRITE']
+api = HfApi(token = READ_WRITE_TOKEN)
+
+DATASET_DIR = Path(".")
+DATASET_DIR.mkdir(parents=True, exist_ok=True)
+from huggingface_hub import hf_hub_download
+
+
+scheduler = CommitScheduler(
+    repo_id=DATA_REPO_ID,
+    repo_type="dataset",
+    folder_path=DATASET_DIR,
+    path_in_repo=".",
+    hf_api = api,
+    every = DAY_TIME_MIN,
+)
 
 def feedback_thought(input_ls): # preload
     agent, query, ansA, ansB, feedbackA, feedbackB = input_ls
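Note: `CommitScheduler` starts a background thread that periodically snapshots `folder_path` and commits it to the Hub; with `every = DAY_TIME_MIN` (1440 minutes) that is roughly one commit per day. Every local write to a file inside that folder should therefore be coordinated through `scheduler.lock`, which is what the hunks below do. A minimal sketch of the same pattern, assuming a dataset repo you can write to ("your-org/your-data" is a placeholder) and a token already configured for `huggingface_hub`:

    from pathlib import Path
    from huggingface_hub import CommitScheduler

    folder = Path("data")
    folder.mkdir(parents=True, exist_ok=True)

    # Background thread commits the folder's contents every 10 minutes.
    scheduler = CommitScheduler(
        repo_id="your-org/your-data",  # placeholder repo
        repo_type="dataset",
        folder_path=folder,
        path_in_repo=".",
        every=10,
    )

    # Hold the lock for any write the scheduler might upload mid-commit.
    with scheduler.lock:
        (folder / "example.json").write_text("{}")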
@@ -39,8 +60,9 @@ def feedback_thought(input_ls): # preload
     json_data[date][query]["feedbackA"] = feedbackA
     json_data[date][query]["answerB"] = (ansB)
     json_data[date][query]["feedbackB"] = feedbackB
-    with open(filename,"w") as f:
-        json.dump(json_data,f)
+    with scheduler.lock:
+        with open(filename,"w") as f:
+            json.dump(json_data,f)
 
     preferred_ans = ""
     if feedbackA == 1:
@@ -71,12 +93,12 @@ def feedback_thought(input_ls): # preload
         agent.thought_embedding[date] = [get_bert_embedding([tem_thought])[0]]
     else:
         agent.thought_embedding[date].append(get_bert_embedding([tem_thought])[0])
+    with scheduler.lock:
+        with open(filename_thought,"w") as f:
+            json.dump(json_data_thought,f)
 
-    with open(filename_thought,"w") as f:
-        json.dump(json_data_thought,f)
-
-    with open(agent.thought_embedding_path, "wb") as f:
-        pickle.dump(agent.thought_embedding, f)
+        with open(agent.thought_embedding_path, "wb") as f:
+            pickle.dump(agent.thought_embedding, f)
 
     # return "Give feedback successfully!"
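The lock-then-write sequence above recurs throughout this commit. A hypothetical helper (not part of this repo) that captures it once, so call sites cannot forget the lock:

    import json

    def locked_json_dump(scheduler, path, data):
        # Serialize under the scheduler's lock so a background commit
        # never uploads a half-written file.
        with scheduler.lock:
            with open(path, "w") as f:
                json.dump(data, f)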
 
@@ -96,7 +118,7 @@ def dailyDownload(agent_ls):
 
         json_file = agent.dataset_path
 
-        update_file=update_json_file(json_file, data_collector)
+        update_file=update_json_file(json_file, data_collector, scheduler)
 
         time_chunks_embed={}
 
@@ -105,43 +127,53 @@ def dailyDownload(agent_ls):
             papers = data[date]['abstract']
             papers_embedding=get_bert_embedding(papers)
             time_chunks_embed[date.strftime("%m/%d/%Y")] = papers_embedding
-        update_paper_file=update_pickle_file(agent.embedding_path,time_chunks_embed)
+        update_paper_file=update_pickle_file(agent.embedding_path,time_chunks_embed, scheduler)
         agent.paper = update_file
         agent.paper_embedding = update_paper_file
         print("Today is " + agent.newest_day.strftime("%m/%d/%Y"))
 
 def dailySave(agent_ls):
     agent = agent_ls[0]
+
+
     while True:
         time.sleep(DAY_TIME)
-        with open(agent.trend_idea_path, "w") as f_:
-            json.dump(agent.trend_idea, f_)
-
-        with open(agent.thought_path, "w") as f_:
-            json.dump(agent.thought, f_)
-
-        with open(agent.thought_embedding_path, "wb") as f:
-            pickle.dump(agent.thought_embedding, f)
-
-        with open(agent.profile_path,"w") as f:
-            json.dump(agent.profile,f)
+        with scheduler.lock:
+            with open(agent.trend_idea_path, "w") as f_:
+                json.dump(agent.trend_idea, f_)
+
+            with open(agent.thought_path, "w") as f_:
+                json.dump(agent.thought, f_)
 
+            with open(agent.thought_embedding_path, "wb") as f:
+                pickle.dump(agent.thought_embedding, f)
+
+            with open(agent.profile_path,"w") as f:
+                json.dump(agent.profile,f)
+            with open(agent.comment_path,"w") as f:
+                json.dump(agent.comment,f)
 
 class ArxivAgent:
     def __init__(self):
 
-        self.dataset_path = "./dataset/paper.json"
-        self.thought_path = "./dataset/thought.json"
-        self.trend_idea_path = "./dataset/trend_idea.json"
-        self.profile_path = "./dataset/profile.json"
-
-        self.embedding_path = "./dataset/paper_embedding.pkl"
-        self.thought_embedding_path = './dataset/thought_embedding.pkl'
-
-        self.feedback_path = 'dataset/feedback.json'
+        self.dataset_path = DATASET_DIR / "dataset/paper.json"
+        self.thought_path = DATASET_DIR / "dataset/thought.json"
+        self.trend_idea_path = DATASET_DIR / "dataset/trend_idea.json"
+        self.profile_path = DATASET_DIR / "dataset/profile.json"
+        self.comment_path = DATASET_DIR / "dataset/comment.json"
+
+        self.embedding_path = DATASET_DIR / "dataset/paper_embedding.pkl"
+        self.thought_embedding_path = DATASET_DIR / "dataset/thought_embedding.pkl"
+
+        self.feedback_path = DATASET_DIR / "dataset/feedback.json"
        self.today = datetime.datetime.now().strftime("%m/%d/%Y")
 
         self.newest_day = ""
+
+
+        # import pdb
+        # pdb.set_trace()
+
         self.load_cache()
 
         self.download()
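Note: the string paths become `pathlib.Path` objects rooted at `DATASET_DIR`, the same folder the scheduler commits, so everything the agent persists is picked up by the daily push. Joining with a multi-segment string is valid `pathlib` usage; a small sketch (the `dataset/` subfolder is otherwise materialized by `hf_hub_download(..., local_dir=".")` in `load_cache`):

    from pathlib import Path

    DATASET_DIR = Path(".")
    # "/" accepts multi-segment strings: this resolves to dataset/paper.json.
    paper_path = DATASET_DIR / "dataset/paper.json"
    # open() does not create parent directories; make them explicitly.
    paper_path.parent.mkdir(parents=True, exist_ok=True)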
@@ -315,15 +347,21 @@ class ArxivAgent:
             data_collector.append(data)
 
         json_file = self.dataset_path
-        if not os.path.exists(json_file):
+
+
+        try:
+            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/paper.json", local_dir = ".", repo_type="dataset")
+        except:
             with open(json_file,'w')as a:
-                print("create " + json_file)
+                print(json_file)
 
-        update_file=update_json_file(json_file, data_collector)
+        update_file=update_json_file(json_file, data_collector, scheduler)
 
-        if not os.path.exists(self.embedding_path):
+        try:
+            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/paper_embedding.pkl", local_dir = ".", repo_type="dataset")
+        except:
             with open(self.embedding_path,'wb')as a:
-                print("create " + self.embedding_path)
+                print(self.embedding_path)
         time_chunks_embed={}
 
         for data in data_collector:
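The `try/except` around `hf_hub_download` is a download-or-create bootstrap: fetch the previously committed file from the Hub, and fall back to creating an empty local file on the first run. A sketch with the bare `except:` narrowed, reusing the module's `DATA_REPO_ID` (a missing repo would raise `RepositoryNotFoundError`, which the original bare `except:` also swallows, along with network errors):

    import os
    from huggingface_hub import hf_hub_download
    from huggingface_hub.utils import EntryNotFoundError

    try:
        hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/paper.json",
                        local_dir=".", repo_type="dataset")
    except EntryNotFoundError:
        # First run: the file does not exist on the Hub yet.
        os.makedirs("dataset", exist_ok=True)
        open("dataset/paper.json", "w").close()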
@@ -331,75 +369,87 @@ class ArxivAgent:
             papers = data[date]['abstract']
             papers_embedding=get_bert_embedding(papers)
             time_chunks_embed[date.strftime("%m/%d/%Y")] = papers_embedding
-        update_paper_file=update_pickle_file(self.embedding_path,time_chunks_embed)
+        update_paper_file=update_pickle_file(self.embedding_path,time_chunks_embed, scheduler)
         self.paper = update_file
         self.paper_embedding = update_paper_file
 
 
 
     def load_cache(self):
-        filename = self.feedback_path
 
-        if os.path.exists(filename):
+
+        filename = self.feedback_path
+        try:
+            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/feedback.json", local_dir = ".", repo_type="dataset")
             with open(filename,"rb") as f:
                 content = f.read()
                 if not content:
                     m = {}
                 else:
                     m = json.loads(content)
-        else:
+        except:
             with open(filename, mode='w', encoding='utf-8') as ff:
                 m = {}
         self.feedback = m.copy()
 
         filename = self.trend_idea_path
 
-        if os.path.exists(filename):
+        # if os.path.exists(filename):
+        try:
+            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/trend_idea.json", local_dir = ".", repo_type="dataset")
             with open(filename,"rb") as f:
                 content = f.read()
                 if not content:
                     m = {}
                 else:
                     m = json.loads(content)
-        else:
+        except:
             with open(filename, mode='w', encoding='utf-8') as ff:
                 m = {}
         self.trend_idea = m.copy()
 
+
         filename = self.profile_path
-        if os.path.exists(filename):
+        # if os.path.exists(filename):
+        try:
+            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/profile.json", local_dir = ".", repo_type="dataset")
             with open(filename,"rb") as f:
                 content = f.read()
                 if not content:
                     m = {}
                 else:
                     m = json.loads(content)
-        else:
+        except:
             with open(filename, mode='w', encoding='utf-8') as ff:
                 m = {}
         self.profile = m.copy()
 
+
         filename = self.thought_path
         filename_emb = self.thought_embedding_path
-        if os.path.exists(filename):
+        # if os.path.exists(filename):
+        try:
+            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/thought.json", local_dir = ".", repo_type="dataset")
             with open(filename,"rb") as f:
                 content = f.read()
                 if not content:
                     m = {}
                 else:
                     m = json.loads(content)
-        else:
+        except:
             with open(filename, mode='w', encoding='utf-8') as ff:
                 m = {}
 
-        if os.path.exists(filename_emb):
+        # if os.path.exists(filename_emb):
+        try:
+            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/thought_embedding.pkl", local_dir = ".", repo_type="dataset")
             with open(filename_emb,"rb") as f:
                 content = f.read()
                 if not content:
                     m_emb = {}
                 else:
                     m_emb = pickle.loads(content)
-        else:
+        except:
             with open(filename_emb, mode='w', encoding='utf-8') as ff:
                 m_emb = {}
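`load_cache` now repeats the same fetch-or-init block six times. A hypothetical consolidation (not in this commit) that preserves the behavior; it reuses `DATA_REPO_ID` and `hf_hub_download` from the module top:

    import json

    def _load_json_or_init(repo_filename, local_path):
        # Fetch the last committed copy; on any failure (first run,
        # file missing on the Hub) create an empty local file instead.
        try:
            hf_hub_download(repo_id=DATA_REPO_ID, filename=repo_filename,
                            local_dir=".", repo_type="dataset")
            with open(local_path, "rb") as f:
                content = f.read()
            return json.loads(content) if content else {}
        except Exception:
            open(local_path, "w", encoding="utf-8").close()
            return {}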
 
@@ -407,6 +457,23 @@ class ArxivAgent:
         self.thought_embedding = m_emb.copy()
 
 
+        filename = self.comment_path
+        # if os.path.exists(filename):
+        try:
+            hf_hub_download(repo_id=DATA_REPO_ID, filename="dataset/comment.json", local_dir = ".", repo_type="dataset")
+
+            with open(filename,"r") as f:
+                content = f.read()
+                if not content:
+                    m = {}
+                else:
+                    m = json.loads(content)
+        except:
+            with open(filename, mode='w', encoding='utf-8') as ff:
+                m = {}
+
+
+        self.comment = m.copy()
 
 
 
@@ -421,27 +488,16 @@ class ArxivAgent:
     def update_comment(self, comment):
         date = datetime.datetime.now().strftime("%m/%d/%Y")
 
-        filename = 'dataset/comment.json'
-        if os.path.exists(filename):
-            with open(filename,"r") as f:
-                content = f.read()
-                if not content:
-                    m = {}
-                else:
-                    m = json.loads(content)
-        else:
-            with open(filename, mode='w', encoding='utf-8') as ff:
-                m = {}
-
+
 
-        json_data = m.copy()
+        json_data = self.comment
 
         if date not in json_data:
             json_data[date] = [comment]
         else: json_data[date].append(comment)
-
-        with open(filename,"w") as f:
-            json.dump(json_data,f)
+        # with scheduler.lock:
+        #     with open(filename,"w") as f:
+        #         json.dump(json_data,f)
         return "Thanks for your comment!"
 
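Note: `update_comment` no longer touches disk at all. It appends to the in-memory `self.comment` dict that `load_cache` now populates from `dataset/comment.json`, and persistence is deferred to the `dailySave` loop, which writes `agent.comment_path` under `scheduler.lock` ahead of the next scheduled commit; the commented-out block preserves the immediate-write alternative.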
 
 
 
utils.py CHANGED
@@ -275,14 +275,14 @@ def summarize_research_field(profile, keywords, dataset,data_embedding):
         content = completion.choices[0].message["content"]
         content_l.append(content)
     return content_l, retrieve_paper
-def update_json_file(filename,data_all):
+def update_json_file(filename,data_all, scheduler):
     with open(filename,"r") as f:
         content = f.read()
         if not content:
             m = {}
         else:
             m = json.loads(content)
-
+
     json_data = m.copy()
 
     # update papers in each keywords
@@ -296,11 +296,12 @@ def update_json_file(filename,data_all):
             papers['ch_abs']=copy.deepcopy(papers['abstract'])
             # print(papers.published)
         json_data[time] = papers
-    with open(filename,"w") as f_:
-        json.dump(json_data,f_)
+    with scheduler.lock:
+        with open(filename,"w") as f_:
+            json.dump(json_data,f_)
     return json_data
 
-def update_pickle_file(filename, data_all):
+def update_pickle_file(filename, data_all, scheduler):
 
     # if os.path.exists(filename):
     #     with open(filename,"rb") as f:
@@ -311,8 +312,23 @@ def update_pickle_file(filename, data_all):
     #         m = {}
     #     else:
     #         m = json.load(content)
-    with open(filename, "rb") as file:
-        m = pickle.load(file)
+
+    # if os.path.exists(filename):
+    with open(filename,"rb") as f:
+        content = f.read()
+        if not content:
+            m = {}
+        else:
+            m = pickle.loads(content)
+    # else:
+    #     with open(filename, mode='w', encoding='utf-8') as ff:
+    #         m = {}
+    # if os.path.exists(filename):
+    #     with open(filename, "rb") as file:
+    #         m = pickle.load(file)
+    # else:
+    #     m = {}
+
     # json_data = m.copy()
     # else:
     #     with open(filename, mode='wb', encoding='utf-8') as ff:
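The rewritten read tolerates a zero-byte file (as created by the bootstrap paths in `arxiv_agent.py`): `pickle.load(file)` raises `EOFError` on an empty file, while checking the raw bytes first allows an empty-dict fallback. Standalone sketch of the pattern (the filename is a placeholder):

    import pickle

    with open("paper_embedding.pkl", "rb") as f:  # placeholder path
        content = f.read()
    # A freshly created cache file is empty; fall back to an empty dict
    # instead of letting pickle raise EOFError.
    m = pickle.loads(content) if content else {}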
@@ -325,8 +341,9 @@ def update_pickle_file(filename, data_all):
     for time in data_all.keys():
         embeddings = data_all[time]
         pickle_data[time] =embeddings
-    with open(filename, "wb") as f:
-        pickle.dump(pickle_data, f)
+    with scheduler.lock:
+        with open(filename, "wb") as f:
+            pickle.dump(pickle_data, f)
 
     return pickle_data
 def json_to_md(filename):
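Because both helpers now receive the scheduler as an explicit argument instead of importing a module-level global, anything exposing a `.lock` context manager satisfies the parameter, which keeps them usable outside the app. A sketch of exercising `update_json_file` with a stub (assumes a `paper.json` already exists in the working directory):

    import threading
    from types import SimpleNamespace
    from utils import update_json_file

    # A plain threading.Lock stands in for CommitScheduler.lock.
    stub = SimpleNamespace(lock=threading.Lock())
    json_data = update_json_file("paper.json", [], stub)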
 