rbiswasfc committed on
Commit
0889949
·
1 Parent(s): 9afacec
Files changed (3) hide show
  1. app copy.py +0 -134
  2. app.py +19 -117
  3. main.py +89 -29
app copy.py DELETED
@@ -1,134 +0,0 @@
1
- import base64
2
- import os
3
- from collections import defaultdict
4
- from datetime import date, datetime, timedelta
5
- from io import BytesIO
6
-
7
- import dotenv
8
- from datasets import load_dataset
9
- from dateutil.parser import parse
10
- from dateutil.tz import tzutc
11
- from fasthtml.common import *
12
- from huggingface_hub import login, whoami
13
-
14
- dotenv.load_dotenv()
15
-
16
- style = Style("""
17
- .grid { margin-bottom: 1rem; }
18
- .card { display: flex; flex-direction: column; }
19
- .card img { margin-bottom: 0.5rem; }
20
- .card h5 { margin: 0; font-size: 0.9rem; line-height: 1.2; }
21
- .card a { color: inherit; text-decoration: none; }
22
- .card a:hover { text-decoration: underline; }
23
- """)
24
-
25
- app, rt = fast_app(html_style=(style,))
26
-
27
- login(token=os.environ.get("HF_TOKEN"))
28
-
29
- hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
30
- HF_REPO_ID = f"{hf_user}/zotero-articles"
31
-
32
- abstract_ds = load_dataset(HF_REPO_ID, "abstracts", split="train")
33
- article_ds = load_dataset(HF_REPO_ID, "articles", split="train")
34
-
35
- image_ds = load_dataset(HF_REPO_ID, "images", split="train")
36
- image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
37
-
38
-
39
- def parse_date(date_string):
40
- try:
41
- return parse(date_string).astimezone(tzutc()).date()
42
- except ValueError:
43
- return date.today()
44
-
45
-
46
- def get_week_start(date_obj):
47
- return date_obj - timedelta(days=date_obj.weekday())
48
-
49
-
50
- week2articles = defaultdict(list)
51
- for article in article_ds:
52
- date_added = parse_date(article["date_added"])
53
- week_start = get_week_start(date_added)
54
- week2articles[week_start].append(article["arxiv_id"])
55
-
56
- weeks = sorted(week2articles.keys(), reverse=True)
57
-
58
-
59
- def get_article_details(arxiv_id):
60
- article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
61
- abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
62
- image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
63
- return article, abstract, image
64
-
65
-
66
- def generate_week_content(current_week):
67
- week_index = weeks.index(current_week)
68
- prev_week = weeks[week_index + 1] if week_index < len(weeks) - 1 else None
69
- next_week = weeks[week_index - 1] if week_index > 0 else None
70
-
71
- nav_buttons = Group(
72
- Button(
73
- "← Previous Week",
74
- hx_get=f"/week/{prev_week}" if prev_week else "#",
75
- hx_target="#content",
76
- hx_swap="innerHTML",
77
- disabled=not prev_week,
78
- ),
79
- Button(
80
- "Next Week →",
81
- hx_get=f"/week/{next_week}" if next_week else "#",
82
- hx_target="#content",
83
- hx_swap="innerHTML",
84
- disabled=not next_week,
85
- ),
86
- )
87
-
88
- articles = week2articles[current_week]
89
- article_cards = []
90
- for arxiv_id in articles:
91
- article, abstract, image = get_article_details(arxiv_id)
92
- article_title = article["contents"][0].get("paper_title", "article") if article["contents"] else "article"
93
-
94
- card_content = [H5(A(article_title, href=f"https://arxiv.org/abs/{arxiv_id}", target="_blank"))]
95
-
96
- if image:
97
- pil_image = image[0]["image"]
98
- img_byte_arr = BytesIO()
99
- pil_image.save(img_byte_arr, format="JPEG")
100
- img_byte_arr = img_byte_arr.getvalue()
101
- image_url = f"data:image/jpeg;base64,{base64.b64encode(img_byte_arr).decode('utf-8')}"
102
- card_content.insert(
103
- 1, Img(src=image_url, alt="Article image", style="max-width: 100%; height: auto; margin-bottom: 15px;")
104
- )
105
-
106
- article_cards.append(Card(*card_content, cls="mb-4"))
107
-
108
- grid = Grid(*article_cards, style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 1rem;")
109
-
110
- week_end = current_week + timedelta(days=6)
111
- return Div(
112
- nav_buttons,
113
- H3(f"Week of {current_week.strftime('%B %d')} - {week_end.strftime('%B %d, %Y')} ({len(articles)} articles)"),
114
- grid,
115
- nav_buttons,
116
- id="content",
117
- )
118
-
119
-
120
- @rt("/")
121
- def get():
122
- return Titled("AnswerAI Zotero Weekly", generate_week_content(weeks[0]))
123
-
124
-
125
- @rt("/week/{date}")
126
- def get(date: str):
127
- try:
128
- current_week = datetime.strptime(date, "%Y-%m-%d").date()
129
- return generate_week_content(current_week)
130
- except Exception as e:
131
- return Div(f"Error displaying articles: {str(e)}")
132
-
133
-
134
- serve()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,134 +1,36 @@
1
- import base64
2
  import os
3
- from collections import defaultdict
4
- from datetime import date, datetime, timedelta
5
- from io import BytesIO
6
 
7
  import dotenv
8
- from datasets import load_dataset
9
- from dateutil.parser import parse
10
- from dateutil.tz import tzutc
11
  from fasthtml.common import *
12
- from huggingface_hub import login, whoami
13
 
14
  dotenv.load_dotenv()
15
-
16
- style = Style("""
17
- .grid { margin-bottom: 1rem; }
18
- .card { display: flex; flex-direction: column; }
19
- .card img { margin-bottom: 0.5rem; }
20
- .card h5 { margin: 0; font-size: 0.9rem; line-height: 1.2; }
21
- .card a { color: inherit; text-decoration: none; }
22
- .card a:hover { text-decoration: underline; }
23
- """)
24
-
25
- app, rt = fast_app(html_style=(style,))
26
-
27
  login(token=os.environ.get("HF_TOKEN"))
 
28
 
29
  hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
30
- HF_REPO_ID = f"{hf_user}/zotero-articles"
31
-
32
- abstract_ds = load_dataset(HF_REPO_ID, "abstracts", split="train")
33
- article_ds = load_dataset(HF_REPO_ID, "articles", split="train")
34
-
35
- image_ds = load_dataset(HF_REPO_ID, "images", split="train")
36
- image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
37
-
38
-
39
- def parse_date(date_string):
40
- try:
41
- return parse(date_string).astimezone(tzutc()).date()
42
- except ValueError:
43
- return date.today()
44
-
45
-
46
- def get_week_start(date_obj):
47
- return date_obj - timedelta(days=date_obj.weekday())
48
-
49
-
50
- week2articles = defaultdict(list)
51
- for article in article_ds:
52
- date_added = parse_date(article["date_added"])
53
- week_start = get_week_start(date_added)
54
- week2articles[week_start].append(article["arxiv_id"])
55
-
56
- weeks = sorted(week2articles.keys(), reverse=True)
57
-
58
-
59
- def get_article_details(arxiv_id):
60
- article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
61
- abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
62
- image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
63
- return article, abstract, image
64
-
65
 
66
- def generate_week_content(current_week):
67
- week_index = weeks.index(current_week)
68
- prev_week = weeks[week_index + 1] if week_index < len(weeks) - 1 else None
69
- next_week = weeks[week_index - 1] if week_index > 0 else None
70
-
71
- nav_buttons = Group(
72
- Button(
73
- "← Previous Week",
74
- hx_get=f"/week/{prev_week}" if prev_week else "#",
75
- hx_target="#content",
76
- hx_swap="innerHTML",
77
- disabled=not prev_week,
78
- ),
79
- Button(
80
- "Next Week →",
81
- hx_get=f"/week/{next_week}" if next_week else "#",
82
- hx_target="#content",
83
- hx_swap="innerHTML",
84
- disabled=not next_week,
85
- ),
86
- )
87
-
88
- articles = week2articles[current_week]
89
- article_cards = []
90
- for arxiv_id in articles:
91
- article, abstract, image = get_article_details(arxiv_id)
92
- article_title = article["contents"][0].get("paper_title", "article") if article["contents"] else "article"
93
-
94
- card_content = [H5(A(article_title, href=f"https://arxiv.org/abs/{arxiv_id}", target="_blank"))]
95
-
96
- if image:
97
- pil_image = image[0]["image"]
98
- img_byte_arr = BytesIO()
99
- pil_image.save(img_byte_arr, format="JPEG")
100
- img_byte_arr = img_byte_arr.getvalue()
101
- image_url = f"data:image/jpeg;base64,{base64.b64encode(img_byte_arr).decode('utf-8')}"
102
- card_content.insert(
103
- 0, Img(src=image_url, alt="Article image", style="max-width: 100%; height: auto; margin-bottom: 15px;")
104
- )
105
-
106
- article_cards.append(Card(*card_content, cls="mb-4"))
107
-
108
- grid = Grid(*article_cards, style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 1rem;")
109
-
110
- week_end = current_week + timedelta(days=6)
111
- return Div(
112
- nav_buttons,
113
- H3(f"Week of {current_week.strftime('%B %d')} - {week_end.strftime('%B %d, %Y')} ({len(articles)} articles)"),
114
- grid,
115
- nav_buttons,
116
- id="content",
117
- )
118
 
119
 
120
  @rt("/")
121
  def get():
122
- return Titled("AnswerAI Zotero Weekly", generate_week_content(weeks[0]))
123
-
124
-
125
- @rt("/week/{date}")
126
- def get(date: str):
127
- try:
128
- current_week = datetime.strptime(date, "%Y-%m-%d").date()
129
- return generate_week_content(current_week)
130
- except Exception as e:
131
- return Div(f"Error displaying articles: {str(e)}")
 
 
 
 
132
 
133
 
134
  serve()
 
 
1
  import os
 
 
 
2
 
3
  import dotenv
 
 
 
4
  from fasthtml.common import *
5
+ from huggingface_hub import HfApi, login, whoami
6
 
7
  dotenv.load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
8
  login(token=os.environ.get("HF_TOKEN"))
9
+ api = HfApi()
10
 
11
  hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
12
+ HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
13
+ HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ app, rt = fast_app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  @rt("/")
19
  def get():
20
+ info = api.dataset_info(HF_REPO_ID_TXT)
21
+ text_last_modified = info.last_modified.strftime("%d-%b-%y at %H:%M:%S")
22
+
23
+ info = api.dataset_info(HF_REPO_ID_IMG)
24
+ img_last_modified = info.last_modified.strftime("%d-%b-%y at %H:%M:%S")
25
+
26
+ return Titled(
27
+ "Zotero Refresh Pipeline",
28
+ Div(
29
+ H3("Status"),
30
+ P(f"{HF_REPO_ID_TXT} : {text_last_modified} (last updated)"),
31
+ P(f"{HF_REPO_ID_IMG}: {img_last_modified} (last updated)"),
32
+ ),
33
+ )
34
 
35
 
36
  serve()
main.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import re
 
3
  import time
4
 
5
  import dotenv
@@ -9,17 +10,19 @@ import requests
9
  import schedule
10
  import srsly
11
  from bs4 import BeautifulSoup
12
- from datasets import Dataset, Image, load_dataset
13
- from huggingface_hub import create_repo, login, whoami
14
  from PIL import Image as PILImage
15
  from retry import retry
16
  from tqdm.auto import tqdm
17
 
18
  dotenv.load_dotenv()
19
  login(token=os.environ.get("HF_TOKEN"))
 
20
 
21
  hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
22
- HF_REPO_ID = f"{hf_user}/zotero-articles"
 
23
 
24
 
25
  ########################################################
@@ -66,7 +69,7 @@ def get_zotero_items(debug=False):
66
  print(f"# items fetched {len(items)}")
67
 
68
  if debug:
69
- if len(items) > 500:
70
  break
71
 
72
  return items
@@ -103,11 +106,18 @@ def get_arxiv_items(items):
103
  if arxiv_id in visited:
104
  continue
105
 
 
 
 
 
106
  arxiv_items.append(
107
  {
108
  "arxiv_id": arxiv_id,
109
  "arxiv_url": arxiv_url,
 
 
110
  "pdf_url": pdf_url,
 
111
  "added_by": item["meta"]["createdByUser"]["username"],
112
  "date_added": data.get("dateAdded", ""),
113
  }
@@ -129,10 +139,10 @@ def fetch_arxiv_htmls(arxiv_items):
129
  for item in tqdm(arxiv_items):
130
  html = fetch_arxiv_html(item["arxiv_id"])
131
  if html:
132
- item["raw_html"] = html
133
  else:
134
  print(f"failed to fetch html for {item['arxiv_id']}")
135
- item["raw_html"] = "Error"
136
 
137
  return arxiv_items
138
 
@@ -326,7 +336,7 @@ def download_arxiv_pdf(arxiv_id):
326
  raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
327
 
328
 
329
- def pdf_to_jpegs(pdf_content, output_folder):
330
  # Create output folder if it doesn't exist
331
  os.makedirs(output_folder, exist_ok=True)
332
 
@@ -345,6 +355,9 @@ def pdf_to_jpegs(pdf_content, output_folder):
345
  pix.save(image_path)
346
  # print(f"Saved {image_path}")
347
 
 
 
 
348
  doc.close()
349
 
350
 
@@ -392,8 +405,6 @@ def create_hf_image_dataset(base_dir):
392
  "image": [d["image"] for d in data],
393
  "arxiv_id": [d["arxiv_id"] for d in data],
394
  "page_number": [d["page_number"] for d in data],
395
- "width": [d["width"] for d in data],
396
- "height": [d["height"] for d in data],
397
  }
398
  )
399
 
@@ -409,9 +420,17 @@ def create_hf_image_dataset(base_dir):
409
 
410
 
411
  def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
412
- repo_id = HF_REPO_ID
 
 
 
 
 
 
 
 
413
  create_repo(
414
- repo_id=repo_id,
415
  token=os.environ.get("HF_TOKEN"),
416
  private=True,
417
  repo_type="dataset",
@@ -421,23 +440,44 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
421
  # upload image dataset
422
  try:
423
  img_ds = create_hf_image_dataset("data/arxiv_images")
424
- img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
 
 
426
  # push id_to_abstract
427
  abstract_ds = Dataset.from_pandas(abstract_df)
428
- abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
429
 
430
  # push arxiv_items
431
  arxiv_ds = Dataset.from_pandas(contents_df)
432
- arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
433
 
434
  # push processed_arxiv_ids
435
  processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
436
  processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
437
- processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
438
  except Exception as e:
439
  print(e)
440
 
 
 
 
 
 
 
441
 
442
  ########################################################
443
  ### MAIN
@@ -445,21 +485,20 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
445
 
446
 
447
  def main():
448
- items = get_zotero_items(debug=True)
 
 
449
  print(f"# of items fetched from zotero: {len(items)}")
450
  arxiv_items = get_arxiv_items(items)
451
  print(f"# of arxiv papers: {len(arxiv_items)}")
452
 
453
  # get already processed arxiv ids from HF
454
  try:
455
- existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
456
  except Exception as e:
457
  print(e)
458
- try:
459
- existing_arxiv_ids = srsly.read_json("data/processed_arxiv_ids.json")
460
- except Exception as e:
461
- print(e)
462
- existing_arxiv_ids = []
463
  existing_arxiv_ids = set(existing_arxiv_ids)
464
  print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
465
 
@@ -468,15 +507,27 @@ def main():
468
  arxiv_items = fetch_arxiv_htmls(arxiv_items)
469
  print(f"# of new arxiv items: {len(arxiv_items)}")
470
 
 
 
 
 
471
  processed_arxiv_ids = set()
 
 
 
 
 
 
 
 
 
472
  for item in arxiv_items:
473
  # download images --
474
  save_arxiv_article_images(item["arxiv_id"])
475
 
476
  # parse html
477
  try:
478
- item["contents"] = parse_html_content(item["raw_html"])
479
- processed_arxiv_ids.add(item["arxiv_id"])
480
  except Exception as e:
481
  print(f"Failed to parse html for {item['arxiv_id']}: {e}")
482
  item["contents"] = []
@@ -484,12 +535,21 @@ def main():
484
  if len(item["contents"]) == 0:
485
  print("Extracting from pdf...")
486
  md_content = get_pdf_text(item["arxiv_id"]) # fix this
 
 
487
  if md_content:
488
  item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
489
- processed_arxiv_ids.add(item["arxiv_id"])
490
  else:
491
  item["contents"] = []
492
 
 
 
 
 
 
 
 
 
493
  # save contents ---
494
  processed_arxiv_ids = list(processed_arxiv_ids)
495
  print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
@@ -507,7 +567,7 @@ def main():
507
 
508
  # add to existing dataset
509
  try:
510
- old_abstract_df = load_dataset(HF_REPO_ID, "abstracts")["train"].to_pandas()
511
  except Exception as e:
512
  print(e)
513
  old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
@@ -520,7 +580,7 @@ def main():
520
  contents_df = pd.DataFrame(arxiv_items)
521
  print(contents_df.head())
522
  try:
523
- old_contents_df = load_dataset(HF_REPO_ID, "articles")["train"].to_pandas()
524
  except Exception as e:
525
  print(e)
526
  old_contents_df = pd.DataFrame(columns=contents_df.columns)
@@ -531,7 +591,7 @@ def main():
531
  contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
532
 
533
  # upload to hf
534
- processed_arxiv_ids = list(set(processed_arxiv_ids + list(processed_arxiv_ids)))
535
  upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)
536
 
537
  # save as local copy
@@ -545,7 +605,7 @@ def schedule_periodic_task():
545
  """
546
  Schedule the main task to run at the user-defined frequency
547
  """
548
- main() # run once initially
549
 
550
  frequency = "daily" # TODO: env
551
  if frequency == "hourly":
 
1
  import os
2
  import re
3
+ import shutil
4
  import time
5
 
6
  import dotenv
 
10
  import schedule
11
  import srsly
12
  from bs4 import BeautifulSoup
13
+ from datasets import Dataset, Image, concatenate_datasets, load_dataset
14
+ from huggingface_hub import HfApi, create_repo, login, whoami
15
  from PIL import Image as PILImage
16
  from retry import retry
17
  from tqdm.auto import tqdm
18
 
19
  dotenv.load_dotenv()
20
  login(token=os.environ.get("HF_TOKEN"))
21
+ api = HfApi()
22
 
23
  hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
24
+ HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
25
+ HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
26
 
27
 
28
  ########################################################
 
69
  print(f"# items fetched {len(items)}")
70
 
71
  if debug:
72
+ if len(items) > 1600:
73
  break
74
 
75
  return items
 
106
  if arxiv_id in visited:
107
  continue
108
 
109
+ authors = []
110
+ for author in data.get("creators", []):
111
+ authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")
112
+
113
  arxiv_items.append(
114
  {
115
  "arxiv_id": arxiv_id,
116
  "arxiv_url": arxiv_url,
117
+ "title": data.get("title", ""),
118
+ "authors": authors,
119
  "pdf_url": pdf_url,
120
+ "date_published": data.get("date", ""),
121
  "added_by": item["meta"]["createdByUser"]["username"],
122
  "date_added": data.get("dateAdded", ""),
123
  }
 
139
  for item in tqdm(arxiv_items):
140
  html = fetch_arxiv_html(item["arxiv_id"])
141
  if html:
142
+ item["raw_content"] = html
143
  else:
144
  print(f"failed to fetch html for {item['arxiv_id']}")
145
+ item["raw_content"] = "Error"
146
 
147
  return arxiv_items
148
 
 
336
  raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
337
 
338
 
339
+ def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
340
  # Create output folder if it doesn't exist
341
  os.makedirs(output_folder, exist_ok=True)
342
 
 
355
  pix.save(image_path)
356
  # print(f"Saved {image_path}")
357
 
358
+ if page_num >= max_pages:
359
+ break
360
+
361
  doc.close()
362
 
363
 
 
405
  "image": [d["image"] for d in data],
406
  "arxiv_id": [d["arxiv_id"] for d in data],
407
  "page_number": [d["page_number"] for d in data],
 
 
408
  }
409
  )
410
 
 
420
 
421
 
422
  def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
423
+ # repo_id = HF_REPO_ID
424
+ create_repo(
425
+ repo_id=HF_REPO_ID_TXT,
426
+ token=os.environ.get("HF_TOKEN"),
427
+ private=True,
428
+ repo_type="dataset",
429
+ exist_ok=True,
430
+ )
431
+
432
  create_repo(
433
+ repo_id=HF_REPO_ID_IMG,
434
  token=os.environ.get("HF_TOKEN"),
435
  private=True,
436
  repo_type="dataset",
 
440
  # upload image dataset
441
  try:
442
  img_ds = create_hf_image_dataset("data/arxiv_images")
443
+ try:
444
+ old_img_ds = load_dataset(HF_REPO_ID_IMG, "images")["train"]
445
+ img_ds = concatenate_datasets([old_img_ds, img_ds])
446
+ except Exception as e:
447
+ print(e)
448
+ img_ds.push_to_hub(HF_REPO_ID_IMG, "images", token=os.environ.get("HF_TOKEN"))
449
+ except Exception as e:
450
+ print(e)
451
+
452
+ # upload first pages only
453
+ try:
454
+ img_ds = img_ds.filter(lambda x: x["page_number"] == 1)
455
+ img_ds.push_to_hub(HF_REPO_ID_IMG, "images_first_page", token=os.environ.get("HF_TOKEN"))
456
+ except Exception as e:
457
+ print(e)
458
 
459
+ try:
460
  # push id_to_abstract
461
  abstract_ds = Dataset.from_pandas(abstract_df)
462
+ abstract_ds.push_to_hub(HF_REPO_ID_TXT, "abstracts", token=os.environ.get("HF_TOKEN"))
463
 
464
  # push arxiv_items
465
  arxiv_ds = Dataset.from_pandas(contents_df)
466
+ arxiv_ds.push_to_hub(HF_REPO_ID_TXT, "articles", token=os.environ.get("HF_TOKEN"))
467
 
468
  # push processed_arxiv_ids
469
  processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
470
  processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
471
+ processed_arxiv_ids_ds.push_to_hub(HF_REPO_ID_TXT, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
472
  except Exception as e:
473
  print(e)
474
 
475
+ # trigger refresh of connected datasets
476
+ print("==" * 40)
477
+ print("Triggering refresh of connected datasets")
478
+ api.restart_space(repo_id="answerdotai/zotero-weekly")
479
+ print("==" * 40)
480
+
481
 
482
  ########################################################
483
  ### MAIN
 
485
 
486
 
487
  def main():
488
+ # items = get_zotero_items(debug=True)
489
+ items = get_zotero_items(debug=False)
490
+
491
  print(f"# of items fetched from zotero: {len(items)}")
492
  arxiv_items = get_arxiv_items(items)
493
  print(f"# of arxiv papers: {len(arxiv_items)}")
494
 
495
  # get already processed arxiv ids from HF
496
  try:
497
+ existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
498
  except Exception as e:
499
  print(e)
500
+ existing_arxiv_ids = []
501
+
 
 
 
502
  existing_arxiv_ids = set(existing_arxiv_ids)
503
  print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
504
 
 
507
  arxiv_items = fetch_arxiv_htmls(arxiv_items)
508
  print(f"# of new arxiv items: {len(arxiv_items)}")
509
 
510
+ if len(arxiv_items) == 0:
511
+ print("No new arxiv items to process")
512
+ return
513
+
514
  processed_arxiv_ids = set()
515
+ pbar = tqdm(range(len(arxiv_items)))
516
+
517
+ # remove "data" directory if it exists
518
+ if os.path.exists("data"):
519
+ try:
520
+ shutil.rmtree("data")
521
+ except Exception as e:
522
+ print(e)
523
+
524
  for item in arxiv_items:
525
  # download images --
526
  save_arxiv_article_images(item["arxiv_id"])
527
 
528
  # parse html
529
  try:
530
+ item["contents"] = parse_html_content(item["raw_content"])
 
531
  except Exception as e:
532
  print(f"Failed to parse html for {item['arxiv_id']}: {e}")
533
  item["contents"] = []
 
535
  if len(item["contents"]) == 0:
536
  print("Extracting from pdf...")
537
  md_content = get_pdf_text(item["arxiv_id"]) # fix this
538
+ item["raw_content"] = md_content
539
+
540
  if md_content:
541
  item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
 
542
  else:
543
  item["contents"] = []
544
 
545
+ if len(item["contents"]) > 0:
546
+ processed_arxiv_ids.add(item["arxiv_id"])
547
+ if len(item["authors"]) == 0:
548
+ item["authors"] = [] # ["unknown"]
549
+ item["title"] = item["contents"][0]["paper_title"]
550
+ pbar.update(1)
551
+ pbar.close()
552
+
553
  # save contents ---
554
  processed_arxiv_ids = list(processed_arxiv_ids)
555
  print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
 
567
 
568
  # add to existing dataset
569
  try:
570
+ old_abstract_df = load_dataset(HF_REPO_ID_TXT, "abstracts")["train"].to_pandas()
571
  except Exception as e:
572
  print(e)
573
  old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
 
580
  contents_df = pd.DataFrame(arxiv_items)
581
  print(contents_df.head())
582
  try:
583
+ old_contents_df = load_dataset(HF_REPO_ID_TXT, "articles")["train"].to_pandas()
584
  except Exception as e:
585
  print(e)
586
  old_contents_df = pd.DataFrame(columns=contents_df.columns)
 
591
  contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
592
 
593
  # upload to hf
594
+ processed_arxiv_ids = list(set(processed_arxiv_ids + list(existing_arxiv_ids)))
595
  upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)
596
 
597
  # save as local copy
 
605
  """
606
  Schedule the main task to run at the user-defined frequency
607
  """
608
+ # main() # run once initially
609
 
610
  frequency = "daily" # TODO: env
611
  if frequency == "hourly":