rbiswasfc committed on
Commit
747d03a
1 Parent(s): 6c3b726
Files changed (2)
  1. main.py +51 -19
  2. requirements.txt +2 -1
main.py CHANGED
@@ -9,7 +9,7 @@ import requests
 import schedule
 import srsly
 from bs4 import BeautifulSoup
-from datasets import Dataset, Image, load_dataset
+from datasets import Dataset, Image, concatenate_datasets, load_dataset
 from huggingface_hub import create_repo, login, whoami
 from PIL import Image as PILImage
 from retry import retry
@@ -19,7 +19,8 @@ dotenv.load_dotenv()
 login(token=os.environ.get("HF_TOKEN"))
 
 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID = f"{hf_user}/zotero-articles"
+HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-article-texts"
+HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-article-images"
 
 
 ########################################################
@@ -66,7 +67,7 @@ def get_zotero_items(debug=False):
         print(f"# items fetched {len(items)}")
 
         if debug:
-            if len(items) > 500:
+            if len(items) > 1500:
                 break
 
     return items
@@ -103,11 +104,18 @@ def get_arxiv_items(items):
         if arxiv_id in visited:
             continue
 
+        authors = []
+        for author in data.get("creators", []):
+            authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")
+
         arxiv_items.append(
             {
                 "arxiv_id": arxiv_id,
                 "arxiv_url": arxiv_url,
+                "title": data.get("title", ""),
+                "authors": authors,
                 "pdf_url": pdf_url,
+                "date_published": data.get("date", ""),
                 "added_by": item["meta"]["createdByUser"]["username"],
                 "date_added": data.get("dateAdded", ""),
             }
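The author extraction above assumes the Zotero API item schema, where `data["creators"]` is a list of dicts with optional `firstName` and `lastName` keys. A minimal illustration of the shape it expects (the sample record is invented):

```python
# Invented Zotero-style payload, for illustration only.
data = {
    "creators": [
        {"creatorType": "author", "firstName": "Ada", "lastName": "Lovelace"},
        {"creatorType": "author", "lastName": "Bourbaki"},  # firstName missing
    ]
}

authors = []
for author in data.get("creators", []):
    authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")

print(authors)  # ['Ada Lovelace', ' Bourbaki']
```

Entries with a missing name field keep a stray leading or trailing space; joining only the non-empty parts with `" ".join(...)` would avoid that.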
@@ -129,10 +137,10 @@ def fetch_arxiv_htmls(arxiv_items):
     for item in tqdm(arxiv_items):
         html = fetch_arxiv_html(item["arxiv_id"])
         if html:
-            item["raw_html"] = html
+            item["raw_content"] = html
         else:
             print(f"failed to fetch html for {item['arxiv_id']}")
-            item["raw_html"] = "Error"
+            item["raw_content"] = "Error"
 
     return arxiv_items
 
@@ -392,8 +400,6 @@ def create_hf_image_dataset(base_dir):
             "image": [d["image"] for d in data],
             "arxiv_id": [d["arxiv_id"] for d in data],
             "page_number": [d["page_number"] for d in data],
-            "width": [d["width"] for d in data],
-            "height": [d["height"] for d in data],
         }
     )
 
@@ -409,9 +415,17 @@ def create_hf_image_dataset(base_dir):
 
 
 def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
-    repo_id = HF_REPO_ID
+    # repo_id = HF_REPO_ID
+    create_repo(
+        repo_id=HF_REPO_ID_TXT,
+        token=os.environ.get("HF_TOKEN"),
+        private=True,
+        repo_type="dataset",
+        exist_ok=True,
+    )
+
     create_repo(
-        repo_id=repo_id,
+        repo_id=HF_REPO_ID_IMG,
         token=os.environ.get("HF_TOKEN"),
         private=True,
         repo_type="dataset",
@@ -421,20 +435,28 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
     # upload image dataset
     try:
         img_ds = create_hf_image_dataset("data/arxiv_images")
-        img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
+        try:
+            old_img_ds = load_dataset(HF_REPO_ID_IMG, "images")["train"]
+            img_ds = concatenate_datasets([old_img_ds, img_ds])
+        except Exception as e:
+            print(e)
+        img_ds.push_to_hub(HF_REPO_ID_IMG, "images", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)
 
+    try:
         # push id_to_abstract
         abstract_ds = Dataset.from_pandas(abstract_df)
-        abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
+        abstract_ds.push_to_hub(HF_REPO_ID_TXT, "abstracts", token=os.environ.get("HF_TOKEN"))
 
         # push arxiv_items
         arxiv_ds = Dataset.from_pandas(contents_df)
-        arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
+        arxiv_ds.push_to_hub(HF_REPO_ID_TXT, "articles", token=os.environ.get("HF_TOKEN"))
 
         # push processed_arxiv_ids
         processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
         processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-        processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
+        processed_arxiv_ids_ds.push_to_hub(HF_REPO_ID_TXT, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
     except Exception as e:
         print(e)
 
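`Dataset.push_to_hub` replaces the data of a config rather than appending to it, so the image upload now loads the existing `images` split and concatenates before pushing. The same load-concatenate-push pattern in isolation (the helper name is mine, not part of the commit):

```python
from datasets import Dataset, concatenate_datasets, load_dataset

def push_with_append(ds: Dataset, repo_id: str, config: str, token: str) -> None:
    # push_to_hub overwrites the config, so merge with the existing split first
    try:
        old = load_dataset(repo_id, config, token=token)["train"]
        ds = concatenate_datasets([old, ds])
    except Exception:
        pass  # first upload: nothing to merge yet
    ds.push_to_hub(repo_id, config, token=token)
```

Plain concatenation is append-only: re-running on already-uploaded items would duplicate rows, which the pipeline avoids by skipping arxiv ids already recorded in the `processed_arxiv_ids` config.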
@@ -454,7 +476,7 @@ def main():
 
     # get already processed arxiv ids from HF
     try:
-        existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
+        existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
     except Exception as e:
         print(e)
         try:
@@ -471,14 +493,15 @@ def main():
     print(f"# of new arxiv items: {len(arxiv_items)}")
 
     processed_arxiv_ids = set()
+    pbar = tqdm(range(len(arxiv_items)))
+
     for item in arxiv_items:
         # download images --
         save_arxiv_article_images(item["arxiv_id"])
 
         # parse html
         try:
-            item["contents"] = parse_html_content(item["raw_html"])
-            processed_arxiv_ids.add(item["arxiv_id"])
+            item["contents"] = parse_html_content(item["raw_content"])
         except Exception as e:
             print(f"Failed to parse html for {item['arxiv_id']}: {e}")
             item["contents"] = []
@@ -486,12 +509,21 @@ def main():
         if len(item["contents"]) == 0:
             print("Extracting from pdf...")
             md_content = get_pdf_text(item["arxiv_id"]) # fix this
+            item["raw_content"] = md_content
+
             if md_content:
                 item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
-                processed_arxiv_ids.add(item["arxiv_id"])
             else:
                 item["contents"] = []
 
+        if len(item["contents"]) > 0:
+            processed_arxiv_ids.add(item["arxiv_id"])
+            if len(item["authors"]) == 0:
+                item["authors"] = []  # ["unknown"]
+                item["title"] = item["contents"][0]["paper_title"]
+        pbar.update(1)
+    pbar.close()
+
     # save contents ---
     processed_arxiv_ids = list(processed_arxiv_ids)
     print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
@@ -509,7 +541,7 @@ def main():
 
     # add to existing dataset
     try:
-        old_abstract_df = load_dataset(HF_REPO_ID, "abstracts")["train"].to_pandas()
+        old_abstract_df = load_dataset(HF_REPO_ID_TXT, "abstracts")["train"].to_pandas()
     except Exception as e:
         print(e)
         old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
@@ -522,7 +554,7 @@ def main():
     contents_df = pd.DataFrame(arxiv_items)
     print(contents_df.head())
     try:
-        old_contents_df = load_dataset(HF_REPO_ID, "articles")["train"].to_pandas()
+        old_contents_df = load_dataset(HF_REPO_ID_TXT, "articles")["train"].to_pandas()
     except Exception as e:
         print(e)
         old_contents_df = pd.DataFrame(columns=contents_df.columns)
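After this change the pipeline writes to two private dataset repos instead of one: the text configs (`abstracts`, `articles`, `processed_arxiv_ids`) go to `*-article-texts` and the page images go to `*-article-images`. A sketch of reading them back, assuming the same `hf_user` and an authenticated session (`your-username` is a placeholder):

```python
from datasets import load_dataset

hf_user = "your-username"  # placeholder; the repos are private, so a token is required
articles = load_dataset(f"{hf_user}/zotero-answer-ai-article-texts", "articles")["train"]
images = load_dataset(f"{hf_user}/zotero-answer-ai-article-images", "images")["train"]
print(articles[0]["arxiv_id"], images[0]["page_number"])
```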
requirements.txt CHANGED
@@ -12,4 +12,5 @@ retry
 pandas
 datasets
 PyMuPDF
-pillow
+pillow
+tqdm