import dataclasses
from multiprocessing import cpu_count
import tqdm.auto
import requests
import streamlit as st
import pandas as pd
from datasets import Dataset, load_dataset
from paperswithcode import PapersWithCodeClient


@dataclasses.dataclass
class PaperInfo:
    """Metadata for a single daily paper."""

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    num_comments: int


def get_df(start_date: str | None = None, end_date: str | None = None) -> pd.DataFrame:
    """
    Load the initial dataset as a Pandas dataframe.

    One can optionally specify a start_date and end_date to only include data between these dates.
    """
    df = pd.merge(
        left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
        right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
        on="arxiv_id",
    )
    df = df[::-1].reset_index(drop=True)

    paper_info = []
    for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
        info = PaperInfo(
            **row,
            paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
        )
        paper_info.append(info)
    df = pd.DataFrame([dataclasses.asdict(info) for info in paper_info])

    # set date as index
    df = df.set_index("date")
    df.index = pd.to_datetime(df.index)

    if start_date is not None and end_date is not None:
        # only include data between start_date and end_date
        df = df[(df.index >= start_date) & (df.index <= end_date)]

    return df
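

# Example (hedged sketch, not from the original Space): get_df can be called with an
# explicit window; the dates below are placeholders rather than values from the source.
#
#   january = get_df(start_date="2024-01-01", end_date="2024-01-31")
#   print(january[["title", "upvotes", "num_comments"]].head())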


def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
    """
    Get the GitHub URL for a paper.
    """
    repo_url = ""
    try:
        # get paper ID
        results = client.paper_list(q=paper_title).results
        paper_id = results[0].id

        # get paper
        paper = client.paper_get(paper_id=paper_id)

        # get repositories and keep the official one, if any
        repositories = client.paper_repository_list(paper_id=paper.id).results
        for repo in repositories:
            if repo.is_official:
                repo_url = repo.url
    except Exception:
        # no match found or API error: fall back to an empty string
        pass

    return repo_url
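

# Example (hedged sketch, not from the original Space): looking up the official
# repository for a single paper; the title below is only an illustration, and an
# empty string comes back when Papers with Code lists no official repo for it.
#
#   client = PapersWithCodeClient()
#   print(get_github_url(client, "Attention Is All You Need") or "no official repository")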


def add_metadata_batch(batch, client: PapersWithCodeClient):
    """
    Add metadata to a batch of papers.
    """
    # get GitHub URLs for all papers in the batch
    github_urls = []
    for paper_title in batch["title"]:
        github_url = get_github_url(client, paper_title)
        github_urls.append(github_url)

    # overwrite the GitHub links
    batch["github"] = github_urls

    return batch


def add_hf_assets(batch):
    """
    Add Hugging Face assets to a batch of papers.
    """
    num_spaces = []
    num_models = []
    num_datasets = []
    for arxiv_id in batch["arxiv_id"]:
        if arxiv_id != "":
            response = requests.get(f"https://huggingface.co/api/arxiv/{arxiv_id}/repos")
            result = response.json()
            num_spaces_example = len(result["spaces"])
            num_models_example = len(result["models"])
            num_datasets_example = len(result["datasets"])
        else:
            num_spaces_example = 0
            num_models_example = 0
            num_datasets_example = 0
        num_spaces.append(num_spaces_example)
        num_models.append(num_models_example)
        num_datasets.append(num_datasets_example)

    batch["num_models"] = num_models
    batch["num_datasets"] = num_datasets
    batch["num_spaces"] = num_spaces

    return batch
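

# Example (hedged sketch, not from the original Space): enriching a one-element batch
# by hand; the arXiv id is only an illustration, and the counts depend on the "models",
# "datasets" and "spaces" lists returned by the arxiv/{arxiv_id}/repos endpoint used above.
#
#   batch = add_hf_assets({"arxiv_id": ["1706.03762"]})
#   print(batch["num_models"], batch["num_datasets"], batch["num_spaces"])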


def check_hf_mention(batch):
    """
    Check if a paper mentions Hugging Face in the README of its GitHub repository.
    """
    hf_mentions = []
    for github_url in batch["github"]:
        hf_mention = 0
        if github_url != "":
            # fetch the raw README text from GitHub
            owner = github_url.split("/")[-2]
            repo = github_url.split("/")[-1]
            branch = "main"
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
            response = requests.get(url)
            if response.status_code != 200:
                # try the master branch as a second attempt
                branch = "master"
                url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
                response = requests.get(url)
            if response.status_code == 200:
                text = response.text
                if "huggingface" in text.lower() or "hugging face" in text.lower():
                    hf_mention = 1
        hf_mentions.append(hf_mention)

    # store the Hugging Face mention flags
    batch["hf_mention"] = hf_mentions

    return batch


def process_data(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Load the dataset and enrich it with metadata.
    """
    # step 1. load as a Hugging Face dataset
    df = get_df(start_date, end_date)
    dataset = Dataset.from_pandas(df)

    # step 2. enrich using the Papers with Code API
    dataset = dataset.map(
        add_metadata_batch,
        batched=True,
        batch_size=4,
        num_proc=cpu_count(),
        fn_kwargs={"client": PapersWithCodeClient()},
    )

    # step 3. enrich using the Hugging Face API
    dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())

    # step 4. check if Hugging Face is mentioned in the README
    dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())

    # return as a Pandas dataframe, making sure that the date is set as index
    dataframe = dataset.to_pandas()
    dataframe = dataframe.set_index("date")
    dataframe.index = pd.to_datetime(dataframe.index)

    return dataframe


def get_data() -> pd.DataFrame:
    """
    Load the pre-processed data and extend it with any days that are not yet covered.
    """
    # step 1: load pre-processed data
    df = load_dataset("nielsr/daily-papers-enriched", split="train").to_pandas()
    df = df.set_index("date")
    df = df.sort_index()
    df.index = pd.to_datetime(df.index)

    # step 2: check how much extra data we need to process
    latest_day = df.iloc[-1].name.strftime("%Y-%m-%d")
    today = pd.Timestamp.today().strftime("%Y-%m-%d")
    print("Latest day:", latest_day)
    print("Today:", today)

    # step 3: process the missing data
    if latest_day < today:
        print(f"Processing data from {latest_day} to {today}")
        new_df = process_data(start_date=latest_day, end_date=today)
        print("Original df:", df.head())
        print("New df:", new_df.head())
        df = pd.concat([df, new_df])
        df = df.sort_index()

    return df
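

if __name__ == "__main__":
    # Hedged sketch, not part of the original Space: running this module directly
    # refreshes the enriched dataframe and prints a quick summary; the Streamlit app
    # that consumes get_data() is assumed to live elsewhere.
    daily_papers = get_data()
    print(f"Loaded {len(daily_papers)} papers ({daily_papers.index.min().date()} to {daily_papers.index.max().date()})")
    print(daily_papers.tail())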