rbiswasfc committed
Commit 3aba82e
1 Parent(s): a67a3ad
Files changed (3)
  1. .gitignore +2 -1
  2. app.py +17 -8
  3. main.py +28 -9
.gitignore CHANGED
@@ -2,4 +2,5 @@
 *.json
 data
 .ipynb_checkpoints
-__pycache__
+__pycache__
+.sesskey
app.py CHANGED
@@ -27,14 +27,13 @@ app, rt = fast_app(html_style=(style,))
 login(token=os.environ.get("HF_TOKEN"))
 
 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-article-texts"
-HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-article-images"
+HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
+HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
 
 abstract_ds = load_dataset(HF_REPO_ID_TXT, "abstracts", split="train")
 article_ds = load_dataset(HF_REPO_ID_TXT, "articles", split="train")
 
-image_ds = load_dataset(HF_REPO_ID_IMG, "images", split="train")
-image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
+image_ds = load_dataset(HF_REPO_ID_IMG, "images_first_page", split="train")
 
 
 def parse_date(date_string):
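With this change app.py consumes the pre-filtered images_first_page config that main.py now publishes (see the upload_to_hf hunk below), instead of pulling every page image and filtering at startup. A minimal fallback sketch, not part of this commit, for the case where the new config has not been pushed yet; treating ValueError as the unknown-config signal is an assumption about load_dataset:

from datasets import load_dataset

try:
    image_ds = load_dataset(HF_REPO_ID_IMG, "images_first_page", split="train")
except ValueError:
    # fall back to the full image dump and filter locally, as the app did before
    image_ds = load_dataset(HF_REPO_ID_IMG, "images", split="train")
    image_ds = image_ds.filter(lambda x: x["page_number"] == 1)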
@@ -56,11 +55,21 @@ for article in article_ds:
 
 weeks = sorted(week2articles.keys(), reverse=True)
 
+arxiv2article = {article["arxiv_id"]: article for article in article_ds}
+arxiv2abstract = {abstract["arxiv_id"]: abstract for abstract in abstract_ds}
+arxiv2image = {image["arxiv_id"]: image for image in image_ds}
+
+# def get_article_details(arxiv_id):
+#     article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
+#     abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+#     image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+#     return article, abstract, image
+
 
 def get_article_details(arxiv_id):
-    article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
-    abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-    image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+    article = arxiv2article.get(arxiv_id, {})
+    abstract = arxiv2abstract.get(arxiv_id, {})
+    image = arxiv2image.get(arxiv_id, {})
     return article, abstract, image
 
 
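Swapping the per-request Dataset.filter calls for dicts built once at startup is the main win here: filter scans every row and materializes a new Arrow table on each call, while a dict lookup is O(1). An illustrative sketch of the two access patterns (synthetic data, not from the repo):

from datasets import Dataset

ds = Dataset.from_dict({
    "arxiv_id": [f"2406.{i:05d}" for i in range(10_000)],
    "title": [f"paper {i}" for i in range(10_000)],
})

# before: O(n) scan on every request, plus a new dataset object each call
row = ds.filter(lambda x: x["arxiv_id"] == "2406.00042")[0]

# after: O(n) once at startup, then O(1) per request
index = {r["arxiv_id"]: r for r in ds}
row = index.get("2406.00042", {})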
 
@@ -103,7 +112,7 @@ def generate_week_content(current_week):
     ]
 
     if image:
-        pil_image = image[0]["image"]
+        pil_image = image["image"]  # image[0]["image"]
         img_byte_arr = BytesIO()
         pil_image.save(img_byte_arr, format="JPEG")
         img_byte_arr = img_byte_arr.getvalue()
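The indexing change follows from the hunk above: image is now a plain dict from arxiv2image rather than a one-row filtered dataset, so the row is accessed directly instead of via image[0]. For context, JPEG bytes produced like this are typically embedded in the page as a base64 data URI; a sketch of that pattern (the helper name is hypothetical; only the BytesIO serialization appears in the commit):

import base64
from io import BytesIO

def image_to_data_uri(pil_image):
    # JPEG cannot store an alpha channel, so normalize to RGB first
    buf = BytesIO()
    pil_image.convert("RGB").save(buf, format="JPEG")
    return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("ascii")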
 
main.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import re
+import shutil
 import time
 
 import dotenv
@@ -19,8 +20,8 @@ dotenv.load_dotenv()
 login(token=os.environ.get("HF_TOKEN"))
 
 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-article-texts"
-HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-article-images"
+HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
+HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
 
 
 ########################################################
@@ -67,7 +68,7 @@ def get_zotero_items(debug=False):
         print(f"# items fetched {len(items)}")
 
         if debug:
-            if len(items) > 1500:
+            if len(items) > 1600:
                 break
 
     return items
@@ -334,7 +335,7 @@ def download_arxiv_pdf(arxiv_id):
         raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
 
 
-def pdf_to_jpegs(pdf_content, output_folder):
+def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
     # Create output folder if it doesn't exist
     os.makedirs(output_folder, exist_ok=True)
 
@@ -353,6 +354,9 @@ def pdf_to_jpegs(pdf_content, output_folder):
         pix.save(image_path)
         # print(f"Saved {image_path}")
 
+        if page_num >= max_pages:
+            break
+
     doc.close()
 
 
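One boundary note on the new guard: the loop header is outside this hunk, but if page_num is a zero-based index, testing page_num >= max_pages after the page has been saved lets up to max_pages + 1 images through. A sketch of the containing loop with the check hoisted before the render, assuming the usual PyMuPDF pattern and a PyMuPDF version whose Pixmap.save writes JPEG:

import os

import fitz  # PyMuPDF

def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
    os.makedirs(output_folder, exist_ok=True)
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    for page_num, page in enumerate(doc):
        if page_num >= max_pages:  # checked before rendering: at most max_pages images
            break
        pix = page.get_pixmap()
        pix.save(os.path.join(output_folder, f"page_{page_num + 1}.jpg"))
    doc.close()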
 
@@ -444,6 +448,13 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
     except Exception as e:
         print(e)
 
+    # upload first pages only
+    try:
+        img_ds = img_ds.filter(lambda x: x["page_number"] == 1)
+        img_ds.push_to_hub(HF_REPO_ID_IMG, "images_first_page", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)
+
     try:
         # push id_to_abstract
         abstract_ds = Dataset.from_pandas(abstract_df)
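This is the producer side of the images_first_page config that app.py loads above; filtering before push_to_hub keeps the uploaded artifact to one image per article. A quick sanity check, not in the commit, that the pushed config round-trips:

from datasets import load_dataset

ds = load_dataset(HF_REPO_ID_IMG, "images_first_page", split="train")
assert all(page == 1 for page in ds["page_number"])
print(f"{len(ds)} first-page images available")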
@@ -479,11 +490,8 @@ def main():
         existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
     except Exception as e:
         print(e)
-        try:
-            existing_arxiv_ids = srsly.read_json("data/processed_arxiv_ids.json")
-        except Exception as e:
-            print(e)
-            existing_arxiv_ids = []
+        existing_arxiv_ids = []
+
     existing_arxiv_ids = set(existing_arxiv_ids)
     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
 
@@ -492,9 +500,20 @@ def main():
     arxiv_items = fetch_arxiv_htmls(arxiv_items)
     print(f"# of new arxiv items: {len(arxiv_items)}")
 
+    if len(arxiv_items) == 0:
+        print("No new arxiv items to process")
+        return
+
     processed_arxiv_ids = set()
     pbar = tqdm(range(len(arxiv_items)))
 
+    # remove "data" directory if it exists
+    if os.path.exists("data"):
+        try:
+            shutil.rmtree("data")
+        except Exception as e:
+            print(e)
+
     for item in arxiv_items:
         # download images --
         save_arxiv_article_images(item["arxiv_id"])
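If logging the failure is not needed, the guarded removal collapses to the stdlib's built-in suppression; an equivalent one-liner (errors are swallowed silently, unlike the version above):

shutil.rmtree("data", ignore_errors=True)  # also a no-op if "data" does not exist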
 