- .gitignore +2 -1
- app.py +17 -8
- main.py +28 -9
.gitignore
CHANGED
@@ -2,4 +2,5 @@
 *.json
 data
 .ipynb_checkpoints
-__pycache__
+__pycache__
+.sesskey
app.py
CHANGED
@@ -27,14 +27,13 @@ app, rt = fast_app(html_style=(style,))
 login(token=os.environ.get("HF_TOKEN"))

 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-
-HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-
+HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
+HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"

 abstract_ds = load_dataset(HF_REPO_ID_TXT, "abstracts", split="train")
 article_ds = load_dataset(HF_REPO_ID_TXT, "articles", split="train")

-image_ds = load_dataset(HF_REPO_ID_IMG, "
-image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
+image_ds = load_dataset(HF_REPO_ID_IMG, "images_first_page", split="train")


 def parse_date(date_string):
@@ -56,11 +55,21 @@ for article in article_ds:

 weeks = sorted(week2articles.keys(), reverse=True)

+arxiv2article = {article["arxiv_id"]: article for article in article_ds}
+arxiv2abstract = {abstract["arxiv_id"]: abstract for abstract in abstract_ds}
+arxiv2image = {image["arxiv_id"]: image for image in image_ds}
+
+# def get_article_details(arxiv_id):
+#     article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
+#     abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+#     image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+#     return article, abstract, image
+

 def get_article_details(arxiv_id):
-    article =
-    abstract =
-    image =
+    article = arxiv2article.get(arxiv_id, {})
+    abstract = arxiv2abstract.get(arxiv_id, {})
+    image = arxiv2image.get(arxiv_id, {})
     return article, abstract, image


@@ -103,7 +112,7 @@ def generate_week_content(current_week):
     ]

     if image:
-        pil_image = image[0]["image"]
+        pil_image = image["image"]  # image[0]["image"]
         img_byte_arr = BytesIO()
         pil_image.save(img_byte_arr, format="JPEG")
         img_byte_arr = img_byte_arr.getvalue()
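Side note on the app.py change: it replaces per-call Dataset.filter scans inside get_article_details with lookup dictionaries built once at startup, so each page render becomes a constant-time dict access. A minimal sketch of the pattern, using a toy in-memory list instead of the real Hub datasets (the sample records are illustrative):

# Sketch of the lookup-dict pattern introduced above (toy records, not the
# real zotero-answer-ai datasets).
articles = [
    {"arxiv_id": "2401.00001", "title": "Paper A"},
    {"arxiv_id": "2401.00002", "title": "Paper B"},
]

# Build the index once at startup ...
arxiv2article = {a["arxiv_id"]: a for a in articles}

def get_article_details(arxiv_id):
    # ... then each call is a dict lookup instead of a full filter() scan,
    # with an empty dict as the fallback, matching the diff above.
    return arxiv2article.get(arxiv_id, {})

print(get_article_details("2401.00001")["title"])  # Paper A
print(get_article_details("9999.99999"))           # {}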
main.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 import re
+import shutil
 import time

 import dotenv
@@ -19,8 +20,8 @@ dotenv.load_dotenv()
 login(token=os.environ.get("HF_TOKEN"))

 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-
-HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-
+HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
+HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"


 ########################################################
@@ -67,7 +68,7 @@ def get_zotero_items(debug=False):
        print(f"# items fetched {len(items)}")

        if debug:
-            if len(items) >
+            if len(items) > 1600:
                break

    return items
@@ -334,7 +335,7 @@ def download_arxiv_pdf(arxiv_id):
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")


-def pdf_to_jpegs(pdf_content, output_folder):
+def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

@@ -353,6 +354,9 @@ def pdf_to_jpegs(pdf_content, output_folder):
        pix.save(image_path)
        # print(f"Saved {image_path}")

+        if page_num >= max_pages:
+            break
+
    doc.close()


@@ -444,6 +448,13 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
    except Exception as e:
        print(e)

+    # upload first pages only
+    try:
+        img_ds = img_ds.filter(lambda x: x["page_number"] == 1)
+        img_ds.push_to_hub(HF_REPO_ID_IMG, "images_first_page", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)
+
    try:
        # push id_to_abstract
        abstract_ds = Dataset.from_pandas(abstract_df)
@@ -479,11 +490,8 @@ def main():
        existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
    except Exception as e:
        print(e)
-
-
-    except Exception as e:
-        print(e)
-        existing_arxiv_ids = []
+        existing_arxiv_ids = []
+
    existing_arxiv_ids = set(existing_arxiv_ids)
    print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")

@@ -492,9 +500,20 @@ def main():
    arxiv_items = fetch_arxiv_htmls(arxiv_items)
    print(f"# of new arxiv items: {len(arxiv_items)}")

+    if len(arxiv_items) == 0:
+        print("No new arxiv items to process")
+        return
+
    processed_arxiv_ids = set()
    pbar = tqdm(range(len(arxiv_items)))

+    # remove "data" directory if it exists
+    if os.path.exists("data"):
+        try:
+            shutil.rmtree("data")
+        except Exception as e:
+            print(e)
+
    for item in arxiv_items:
        # download images --
        save_arxiv_article_images(item["arxiv_id"])
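Side note on the main.py change to pdf_to_jpegs: the new max_pages argument caps how many pages are rendered per PDF. A minimal sketch of that capped loop, assuming PyMuPDF (fitz) is what the script already uses for doc, pix, and page_num; the output file naming is illustrative, and JPEG output via pix.save needs a reasonably recent PyMuPDF:

import os

import fitz  # PyMuPDF; assumed here since the diff references doc, pix and page_num

def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    doc = fitz.open(stream=pdf_content, filetype="pdf")
    for page_num in range(len(doc)):
        pix = doc[page_num].get_pixmap()
        pix.save(os.path.join(output_folder, f"page_{page_num + 1}.jpg"))

        # Stop once the page index reaches max_pages, mirroring the change above.
        if page_num >= max_pages:
            break

    doc.close()

The companion upload_to_hf change then filters the resulting image dataset to page_number == 1 before pushing it as the images_first_page config, which is the config app.py now loads directly.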