davanstrien HF staff committed on
Commit
976f652
1 Parent(s): ad38c8f

improvements

Browse files
Files changed (1) hide show
  1. app.py +66 -18
app.py CHANGED
@@ -40,7 +40,7 @@ def format_row_for_model(row):
40
  int2label = {0: "new_dataset", 1: "not_new_dataset"}
41
 
42
 
43
- def get_predictions(data: list[dict], model=None, batch_size=32):
44
  if model is None:
45
  model = load_model()
46
  predictions = []
@@ -65,8 +65,8 @@ def create_markdown(row):
65
  updated = updated.strftime("%Y-%m-%d")
66
  broad_category = row["broad_category"]
67
  category = row["category"]
68
- return f""" <h1> {title} </h1> updated: {updated}
69
- | category: {broad_category} | subcategory: {category} |
70
  \n\n{abstract}
71
  \n\n [Hugging Face Papers page]({hub_paper_url})
72
  """
@@ -87,34 +87,82 @@ def prepare_data():
87
  return df
88
 
89
 
90
- all_possible_arxiv_categories = prepare_data().category.unique().tolist()
91
- broad_categories = prepare_data().broad_category.unique().tolist()
92
 
93
 
94
- def create_markdown_summary(categories=broad_categories, all_categories=None):
95
  df = prepare_data()
96
- if categories is not None:
 
 
 
 
97
  df = df[df["broad_category"].isin(categories)]
98
- return "\n\n".join(df["markdown"].tolist())
 
 
 
 
 
 
99
 
100
 
101
  scheduler = BackgroundScheduler()
102
  scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
103
  scheduler.start()
104
 
 
 
 
 
 
 
 
 
105
  with gr.Blocks() as demo:
106
- gr.Markdown("## New Datasets in Machine Learning")
107
  gr.Markdown(
108
- "This Space attempts to show new papers on arXiv that are *likely* to be papers"
109
- " introducing new datasets. \n\n"
110
- )
111
- broad_categories = gr.Dropdown(
112
- choices=broad_categories,
113
- label="Categories",
114
- multiselect=True,
115
- value=broad_categories,
116
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  results = gr.Markdown(create_markdown_summary())
118
- broad_categories.change(create_markdown_summary, broad_categories, results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  demo.launch()
 
40
  int2label = {0: "new_dataset", 1: "not_new_dataset"}
41
 
42
 
43
+ def get_predictions(data: list[dict], model=None, batch_size=64):
44
  if model is None:
45
  model = load_model()
46
  predictions = []
 
65
  updated = updated.strftime("%Y-%m-%d")
66
  broad_category = row["broad_category"]
67
  category = row["category"]
68
+ return f""" <h2> {title} </h2> Updated: {updated}
69
+ | Category: {broad_category} | Subcategory: {category} |
70
  \n\n{abstract}
71
  \n\n [Hugging Face Papers page]({hub_paper_url})
72
  """
 
87
  return df
88
 
89
 
90
+ all_possible_arxiv_categories = sorted(prepare_data().category.unique().tolist())
91
+ broad_categories = sorted(prepare_data().broad_category.unique().tolist())
92
 
93
 
94
+ def create_markdown_summary(categories=None, new_only=True, narrow_categories=None):
95
  df = prepare_data()
96
+ if new_only:
97
+ df = df[df["prediction"] == "new_dataset"]
98
+ if narrow_categories is not None:
99
+ df = df[df["category"].isin(narrow_categories)]
100
+ if categories is not None and not narrow_categories:
101
  df = df[df["broad_category"].isin(categories)]
102
+ number_of_results = len(df)
103
+ results = (
104
+ "<h1 style='text-align: center'> arXiv papers related to datasets</h1> \n\n"
105
+ )
106
+ results += f"Number of results: {number_of_results}\n\n"
107
+ results += "\n\n<br>".join(df["markdown"].tolist())
108
+ return results
109
 
110
 
111
  scheduler = BackgroundScheduler()
112
  scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
113
  scheduler.start()
114
 
115
+ description = """This Space shows recent papers on arXiv that are *likely* to be papers introducing new datasets related to machine learning. \n\n
116
+ The Space works by:
117
+ - searching for papers on arXiv with the term `dataset` in the title + "machine learning" in the abstract
118
+ - passing the abstract and title of the papers to a machine learning model that predicts if the paper is introducing a new dataset or not
119
+
120
+ This Space is a WIP in progress. The model is not perfect, and the search query is not perfect. If you have suggestions for how to improve this Space, please open a Discussion.\n\n"""
121
+
122
+
123
  with gr.Blocks() as demo:
 
124
  gr.Markdown(
125
+ "<h1 style='text-align: center'> &#x2728;New Datasets in Machine Learning "
126
+ " &#x2728; </h1>"
 
 
 
 
 
 
127
  )
128
+ gr.Markdown(description)
129
+ with gr.Row():
130
+ broad_categories = gr.Dropdown(
131
+ choices=broad_categories,
132
+ label="Broad arXiv Category",
133
+ multiselect=True,
134
+ value="cs",
135
+ size="sm",
136
+ )
137
+ with gr.Accordion("Advanced Options", open=False):
138
+ gr.Markdown(
139
+ "Narrow by arXiv categories. **Note** this will take precedence over the"
140
+ " broad category selection."
141
+ )
142
+ narrow_categories = gr.Dropdown(
143
+ choices=all_possible_arxiv_categories,
144
+ value=None,
145
+ multiselect=True,
146
+ label="Narrow arXiv Category",
147
+ )
148
+ gr.ClearButton(narrow_categories, "Clear Narrow Categories", size="sm")
149
+ with gr.Row():
150
+ new_only = gr.Checkbox(True, label="New Datasets Only", size="sm")
151
  results = gr.Markdown(create_markdown_summary())
152
+ broad_categories.change(
153
+ create_markdown_summary,
154
+ inputs=[broad_categories, new_only, narrow_categories],
155
+ outputs=results,
156
+ )
157
+ narrow_categories.change(
158
+ create_markdown_summary,
159
+ inputs=[broad_categories, new_only, narrow_categories],
160
+ outputs=results,
161
+ )
162
+ new_only.select(
163
+ create_markdown_summary,
164
+ [broad_categories, new_only, narrow_categories],
165
+ results,
166
+ )
167
 
168
  demo.launch()