Improve data processing
- app.py +10 -13
- load_dataframe.py +39 -13
app.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 
-
+from load_dataframe import get_data
 
 
 def aggregated_data(df, aggregation_level="week"):
@@ -25,17 +25,11 @@ def aggregated_data(df, aggregation_level="week"):
 
     # Calculate the growth rate
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
-
-    print("Type of growth rate:", growth_rate)
-    print("Growth rate:", type(growth_rate))
-
-    # growth_rate = growth_rate.dropna()
-
-    print("Growht rate after removing nan:", growth_rate)
+    growth_rate = growth_rate.replace([float('inf'), float('-inf')], pd.NA).dropna()
 
     # Display the average growth rate as a big number
     average_growth_rate = growth_rate.mean()
-    st.metric(label=f"{aggregation_level.capitalize()}ly
+    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=f"{average_growth_rate:.2f}%")
 
     # Create the plot
     plt.figure(figsize=(12, 6))
@@ -109,12 +103,15 @@ def main():
     selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])
 
     # TODO use this instead
-
-
+    df = get_data()
+
+    print(df.head())
+
+    # df = pd.read_csv('daily_papers_enriched (3).csv')
     df = df.drop(['Unnamed: 0'], axis=1) if 'Unnamed: 0' in df.columns else df
     # Use date as index
-    df = df.set_index('date')
-    df.index = pd.to_datetime(df.index)
+    # df = df.set_index('date')
+    # df.index = pd.to_datetime(df.index)
     df = df.sort_index()
 
     if selection == "Daily/weekly/monthly data":
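For context on the growth-rate cleanup above, a minimal self-contained sketch (not part of the commit): pct_change() yields NaN for the first period and inf whenever the previous value is zero, so the added replace(...).dropna() step is what keeps growth_rate.mean() finite. The series below is made-up data standing in for percentage_papers_with_artifacts.

import pandas as pd

# Made-up stand-in for percentage_papers_with_artifacts
percentage_papers_with_artifacts = pd.Series([0.0, 10.0, 12.0, 12.0])

growth_rate = percentage_papers_with_artifacts.pct_change() * 100
print(growth_rate.tolist())  # [nan, inf, 20.0, 0.0] -- NaN for the first row, inf after a zero

# Same cleanup as in the diff: drop NaN/inf entries before averaging
growth_rate = growth_rate.replace([float('inf'), float('-inf')], pd.NA).dropna()
print(growth_rate.mean())  # 10.0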
load_dataframe.py CHANGED
@@ -20,7 +20,7 @@ class PaperInfo:
     num_comments: int
 
 
-def get_df() -> pd.DataFrame:
+def get_df(start_date: str, end_date: str) -> pd.DataFrame:
     """
     Load the initial dataset as a Pandas dataframe.
     """
@@ -39,7 +39,16 @@ def get_df() -> pd.DataFrame:
             paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
         )
         paper_info.append(info)
-
+
+    df = pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
+
+    # set date as index
+    df = df.set_index('date')
+    df.index = pd.to_datetime(df.index)
+    # only include data between start_date and end_date
+    df = df[(df.index >= start_date) & (df.index <= end_date)]
+
+    return df
 
 
 def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
@@ -152,21 +161,15 @@ def check_hf_mention(batch):
     return batch
 
 
-
-def get_data() -> pd.DataFrame:
+def process_data(start_date: str, end_date: str) -> pd.DataFrame:
     """
     Load the dataset and enrich it with metadata.
     """
-    # step 1. load as
-    df = get_df()
-    df['date'] = pd.to_datetime(df['date'])
-
-    # step 2. enrich using PapersWithCode API
+    # step 1. load as HF dataset
+    df = get_df(start_date, end_date)
     dataset = Dataset.from_pandas(df)
 
-    #
-    # dataset = dataset.select(range(10))
-
+    # step 2. enrich using PapersWithCode API
     dataset = dataset.map(add_metadata_batch, batched=True, batch_size=4, num_proc=cpu_count(), fn_kwargs={"client": PapersWithCodeClient()})
 
     # step 3. enrich using Hugging Face API
@@ -184,4 +187,27 @@ def get_data() -> pd.DataFrame:
     print("First few rows of the dataset:")
     print(dataframe.head())
 
-    return dataframe
+    return dataframe
+
+
+@st.cache_data
+def get_data() -> pd.DataFrame:
+
+    # step 1: load pre-processed data
+    df = load_dataset("nielsr/daily-papers-enriched", split="train").to_pandas()
+    df = df.set_index('date')
+    df = df.sort_index()
+    df.index = pd.to_datetime(df.index)
+
+    # step 2: check how much extra data we need to process
+    latest_day = df.iloc[-1].name.strftime('%d-%m-%Y')
+    today = pd.Timestamp.today().strftime('%d-%m-%Y')
+
+    # step 3: process the missing data
+    if latest_day < today:
+        print(f"Processing data from {latest_day} to {today}")
+        new_df = process_data(start_date=latest_day, end_date=today)
+        new_df = new_df[new_df.index > latest_day]
+        df = pd.concat([df, new_df])
+
+    return df
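To make the new get_data() flow easier to follow, a toy self-contained sketch (not part of the commit) of the incremental-update idea: keep the already-enriched rows, find the latest stored day, enrich only the days after it, and append. fake_process_data is a hypothetical stand-in for process_data(), which in the real code calls the PapersWithCode and Hugging Face APIs.

import pandas as pd

def fake_process_data(start_date: str, end_date: str) -> pd.DataFrame:
    # Hypothetical stand-in: one row per day, instead of the real enrichment pipeline
    dates = pd.date_range(start_date, end_date, freq="D")
    return pd.DataFrame({"num_papers": range(len(dates))}, index=dates)

# Pretend this is the pre-processed dataset loaded from the Hub
df = fake_process_data("2024-01-01", "2024-01-05")

latest_day = df.index.max()
today = pd.Timestamp("2024-01-08")

if latest_day < today:
    new_df = fake_process_data(latest_day.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d"))
    new_df = new_df[new_df.index > latest_day]  # drop the overlapping day to avoid duplicates
    df = pd.concat([df, new_df])

print(df.index.min(), df.index.max())  # 2024-01-01 00:00:00 2024-01-08 00:00:00

The toy compares Timestamps directly rather than the '%d-%m-%Y' strings used in the diff. In the app itself, the @st.cache_data decorator means Streamlit memoizes get_data()'s result, so this top-up does not re-run on every script rerun.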