nielsr committed
Commit: 1396667
Parent: 9325c4d

Improve data processing

Files changed (2):
  1. app.py +10 -13
  2. load_dataframe.py +39 -13

app.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 
-# from load_dataframe import get_data
+from load_dataframe import get_data
 
 
 def aggregated_data(df, aggregation_level="week"):
@@ -25,17 +25,11 @@ def aggregated_data(df, aggregation_level="week"):
 
     # Calculate the growth rate
     growth_rate = percentage_papers_with_artifacts.pct_change() * 100
-
-    print("Type of growth rate:", growth_rate)
-    print("Growth rate:", type(growth_rate))
-
-    # growth_rate = growth_rate.dropna()
-
-    print("Growht rate after removing nan:", growth_rate)
+    growth_rate = growth_rate.replace([float('inf'), float('-inf')], pd.NA).dropna()
 
     # Display the average growth rate as a big number
     average_growth_rate = growth_rate.mean()
-    st.metric(label=f"{aggregation_level.capitalize()}ly average Growth Rate", value=f"{average_growth_rate:.2f}%")
+    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=f"{average_growth_rate:.2f}%")
 
     # Create the plot
     plt.figure(figsize=(12, 6))
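
Note on the hunk above: `pct_change()` yields `NaN` for the first period and `±inf` whenever the previous value is zero, either of which would poison `growth_rate.mean()`, so the new line replaces the ad-hoc debug prints with actual cleaning. A minimal standalone sketch with made-up numbers:

    import pandas as pd

    # Week 1 has 0% papers with artifacts, so week 2's pct_change()
    # divides by zero and yields inf; week 1 itself is NaN.
    pct = pd.Series([0.0, 20.0, 25.0])
    growth_rate = pct.pct_change() * 100  # [NaN, inf, 25.0]
    growth_rate = growth_rate.replace([float('inf'), float('-inf')], pd.NA).dropna()
    print(growth_rate.mean())  # 25.0 rather than inf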
 
@@ -109,12 +103,15 @@ def main():
     selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])
 
     # TODO use this instead
-    # df = get_data()
-    df = pd.read_csv('daily_papers_enriched (3).csv')
+    df = get_data()
+
+    print(df.head())
+
+    # df = pd.read_csv('daily_papers_enriched (3).csv')
     df = df.drop(['Unnamed: 0'], axis=1) if 'Unnamed: 0' in df.columns else df
     # Use date as index
-    df = df.set_index('date')
-    df.index = pd.to_datetime(df.index)
+    # df = df.set_index('date')
+    # df.index = pd.to_datetime(df.index)
     df = df.sort_index()
 
     if selection == "Daily/weekly/monthly data":
load_dataframe.py CHANGED
@@ -20,7 +20,7 @@ class PaperInfo:
     num_comments: int
 
 
-def get_df() -> pd.DataFrame:
+def get_df(start_date: str, end_date: str) -> pd.DataFrame:
     """
     Load the initial dataset as a Pandas dataframe.
     """
@@ -39,7 +39,16 @@ def get_df() -> pd.DataFrame:
             paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
         )
         paper_info.append(info)
-    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
+
+    df = pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
+
+    # set date as index
+    df = df.set_index('date')
+    df.index = pd.to_datetime(df.index)
+    # only include data between start_date and end_date
+    df = df[(df.index >= start_date) & (df.index <= end_date)]
+
+    return df
 
 
 def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
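
Note on `get_df`: comparing a `DatetimeIndex` against plain strings works because pandas coerces the bounds to timestamps, though day-first strings such as '%d-%m-%Y' can be parsed month-first; ISO 'YYYY-MM-DD' bounds avoid the ambiguity. On a sorted index, an inclusive `.loc` label slice is equivalent. A small sketch with hypothetical data:

    import pandas as pd

    # Hypothetical frame shaped like get_df's output, indexed by date.
    df = pd.DataFrame(
        {"title": ["a", "b", "c"]},
        index=pd.to_datetime(["2024-05-01", "2024-05-02", "2024-05-03"]),
    )

    # Boolean mask, as in the commit: string bounds are coerced to Timestamps.
    subset = df[(df.index >= "2024-05-01") & (df.index <= "2024-05-02")]

    # Equivalent label slice on a sorted DatetimeIndex (both ends inclusive).
    assert subset.equals(df.loc["2024-05-01":"2024-05-02"])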
 
@@ -152,21 +161,15 @@ def check_hf_mention(batch):
     return batch
 
 
-@st.cache_data
-def get_data() -> pd.DataFrame:
+def process_data(start_date: str, end_date: str) -> pd.DataFrame:
     """
     Load the dataset and enrich it with metadata.
     """
-    # step 1. load as Pandas dataframe
-    df = get_df()
-    df['date'] = pd.to_datetime(df['date'])
-
-    # step 2. enrich using PapersWithCode API
+    # step 1. load as HF dataset
+    df = get_df(start_date, end_date)
     dataset = Dataset.from_pandas(df)
 
-    # TODO remove
-    # dataset = dataset.select(range(10))
-
+    # step 2. enrich using PapersWithCode API
     dataset = dataset.map(add_metadata_batch, batched=True, batch_size=4, num_proc=cpu_count(), fn_kwargs={"client": PapersWithCodeClient()})
 
     # step 3. enrich using Hugging Face API
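
For context on the `dataset.map(...)` call kept above: with `batched=True`, Hugging Face `datasets` passes the mapper a dict of column names to lists and expects the (possibly extended) batch back, which is the shape `add_metadata_batch` has to follow. A minimal sketch with a hypothetical enrichment column:

    from datasets import Dataset

    def add_title_length(batch):
        # batch maps each column name to a list of values for this batch
        batch["title_length"] = [len(t) for t in batch["title"]]
        return batch

    ds = Dataset.from_dict({"title": ["Paper A", "Paper B"]})
    # batch_size=4 mirrors the commit; num_proc would add worker processes
    ds = ds.map(add_title_length, batched=True, batch_size=4)
    print(ds[0])  # {'title': 'Paper A', 'title_length': 7}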
 
@@ -184,4 +187,27 @@
     print("First few rows of the dataset:")
     print(dataframe.head())
 
-    return dataframe
+    return dataframe
+
+
+@st.cache_data
+def get_data() -> pd.DataFrame:
+
+    # step 1: load pre-processed data
+    df = load_dataset("nielsr/daily-papers-enriched", split="train").to_pandas()
+    df = df.set_index('date')
+    df = df.sort_index()
+    df.index = pd.to_datetime(df.index)
+
+    # step 2: check how much extra data we need to process
+    latest_day = df.iloc[-1].name.strftime('%d-%m-%Y')
+    today = pd.Timestamp.today().strftime('%d-%m-%Y')
+
+    # step 3: process the missing data
+    if latest_day < today:
+        print(f"Processing data from {latest_day} to {today}")
+        new_df = process_data(start_date=latest_day, end_date=today)
+        new_df = new_df[new_df.index > latest_day]
+        df = pd.concat([df, new_df])
+
+    return df
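
One caveat in the new `get_data`: `latest_day < today` compares '%d-%m-%Y' strings lexicographically, so the day field dominates the comparison, e.g. '31-05-2024' sorts after '01-06-2024' and the refresh branch is skipped even though a day is missing. A sketch of the same freshness check on timestamps (assuming `df` carries the sorted `DatetimeIndex` built above):

    import pandas as pd

    # Day-first strings misorder across month boundaries:
    assert not ('31-05-2024' < '01-06-2024')  # May 31 wrongly counts as later

    def needs_update(df: pd.DataFrame) -> bool:
        """True when the sorted DatetimeIndex of df ends before today."""
        latest_day = df.index.max()
        today = pd.Timestamp.today().normalize()
        return latest_day < today

Passing ISO bounds such as `str(df.index.max().date())` into `process_data` would also sidestep the day-first parsing ambiguity flagged at `get_df`.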