Improve data processing
- app.py +10 -13
- load_dataframe.py +39 -13
app.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 
-
+from load_dataframe import get_data
 
 
 def aggregated_data(df, aggregation_level="week"):
@@ -25,17 +25,11 @@ def aggregated_data(df, aggregation_level="week"):
 
     # Calculate the growth rate
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
-
-    print("Type of growth rate:", growth_rate)
-    print("Growth rate:", type(growth_rate))
-
-    # growth_rate = growth_rate.dropna()
-
-    print("Growht rate after removing nan:", growth_rate)
+    growth_rate = growth_rate.replace([float('inf'), float('-inf')], pd.NA).dropna()
 
     # Display the average growth rate as a big number
     average_growth_rate = growth_rate.mean()
-    st.metric(label=f"{aggregation_level.capitalize()}ly
+    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=f"{average_growth_rate:.2f}%")
 
     # Create the plot
     plt.figure(figsize=(12, 6))
@@ -109,12 +103,15 @@ def main():
     selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])
 
     # TODO use this instead
-
-
+    df = get_data()
+
+    print(df.head())
+
+    # df = pd.read_csv('daily_papers_enriched (3).csv')
     df = df.drop(['Unnamed: 0'], axis=1) if 'Unnamed: 0' in df.columns else df
     # Use date as index
-    df = df.set_index('date')
-    df.index = pd.to_datetime(df.index)
+    # df = df.set_index('date')
+    # df.index = pd.to_datetime(df.index)
     df = df.sort_index()
 
     if selection == "Daily/weekly/monthly data":
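For context on the growth-rate cleanup above, a minimal self-contained sketch (not part of the commit): pct_change() yields NaN for the first period and inf whenever the previous value is zero, so the added replace(...).dropna() step is what keeps growth_rate.mean() finite. The series below is made-up data standing in for percentage_papers_with_artifacts.

import pandas as pd

# Made-up stand-in for percentage_papers_with_artifacts
percentage_papers_with_artifacts = pd.Series([0.0, 10.0, 12.0, 12.0])

growth_rate = percentage_papers_with_artifacts.pct_change() * 100
print(growth_rate.tolist())  # [nan, inf, 20.0, 0.0] -- NaN for the first row, inf after a zero

# Same cleanup as in the diff: drop NaN/inf entries before averaging
growth_rate = growth_rate.replace([float('inf'), float('-inf')], pd.NA).dropna()
print(growth_rate.mean())  # 10.0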
load_dataframe.py CHANGED
@@ -20,7 +20,7 @@ class PaperInfo:
     num_comments: int
 
 
-def get_df() -> pd.DataFrame:
+def get_df(start_date: str, end_date: str) -> pd.DataFrame:
     """
     Load the initial dataset as a Pandas dataframe.
     """
@@ -39,7 +39,16 @@ def get_df() -> pd.DataFrame:
             paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
         )
         paper_info.append(info)
-
+
+    df = pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
+
+    # set date as index
+    df = df.set_index('date')
+    df.index = pd.to_datetime(df.index)
+    # only include data between start_date and end_date
+    df = df[(df.index >= start_date) & (df.index <= end_date)]
+
+    return df
 
 
 def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
@@ -152,21 +161,15 @@ def check_hf_mention(batch):
     return batch
 
 
-
-def get_data() -> pd.DataFrame:
+def process_data(start_date: str, end_date: str) -> pd.DataFrame:
     """
     Load the dataset and enrich it with metadata.
     """
-    # step 1. load as
-    df = get_df()
-    df['date'] = pd.to_datetime(df['date'])
-
-    # step 2. enrich using PapersWithCode API
+    # step 1. load as HF dataset
+    df = get_df(start_date, end_date)
     dataset = Dataset.from_pandas(df)
 
-    #
-    # dataset = dataset.select(range(10))
-
+    # step 2. enrich using PapersWithCode API
     dataset = dataset.map(add_metadata_batch, batched=True, batch_size=4, num_proc=cpu_count(), fn_kwargs={"client": PapersWithCodeClient()})
 
     # step 3. enrich using Hugging Face API
@@ -184,4 +187,27 @@ def get_data() -> pd.DataFrame:
     print("First few rows of the dataset:")
     print(dataframe.head())
 
-    return dataframe
+    return dataframe
+
+
+@st.cache_data
+def get_data() -> pd.DataFrame:
+
+    # step 1: load pre-processed data
+    df = load_dataset("nielsr/daily-papers-enriched", split="train").to_pandas()
+    df = df.set_index('date')
+    df = df.sort_index()
+    df.index = pd.to_datetime(df.index)
+
+    # step 2: check how much extra data we need to process
+    latest_day = df.iloc[-1].name.strftime('%d-%m-%Y')
+    today = pd.Timestamp.today().strftime('%d-%m-%Y')
+
+    # step 3: process the missing data
+    if latest_day < today:
+        print(f"Processing data from {latest_day} to {today}")
+        new_df = process_data(start_date=latest_day, end_date=today)
+        new_df = new_df[new_df.index > latest_day]
+        df = pd.concat([df, new_df])
+
+    return df
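To make the new get_data() flow easier to follow, a toy self-contained sketch (not part of the commit) of the incremental-update idea: keep the already-enriched rows, find the latest stored day, enrich only the days after it, and append. fake_process_data is a hypothetical stand-in for process_data(), which in the real code calls the PapersWithCode and Hugging Face APIs.

import pandas as pd

def fake_process_data(start_date: str, end_date: str) -> pd.DataFrame:
    # Hypothetical stand-in: one row per day, instead of the real enrichment pipeline
    dates = pd.date_range(start_date, end_date, freq="D")
    return pd.DataFrame({"num_papers": range(len(dates))}, index=dates)

# Pretend this is the pre-processed dataset loaded from the Hub
df = fake_process_data("2024-01-01", "2024-01-05")

latest_day = df.index.max()
today = pd.Timestamp("2024-01-08")

if latest_day < today:
    new_df = fake_process_data(latest_day.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d"))
    new_df = new_df[new_df.index > latest_day]  # drop the overlapping day to avoid duplicates
    df = pd.concat([df, new_df])

print(df.index.min(), df.index.max())  # 2024-01-01 00:00:00 2024-01-08 00:00:00

The toy compares Timestamps directly rather than the '%d-%m-%Y' strings used in the diff. In the app itself, the @st.cache_data decorator means Streamlit memoizes get_data()'s result, so this top-up does not re-run on every script rerun.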