MilesCranmer committed on
Commit
fea9443
1 Parent(s): bb76c1f

Plotting of pareto front

Browse files
Files changed (1) hide show
  1. gui/app.py +94 -32
gui/app.py CHANGED
@@ -1,9 +1,14 @@
1
  import gradio as gr
2
  import numpy as np
 
3
  import pandas as pd
 
4
  import multiprocessing as mp
 
 
5
  import tempfile
6
- from typing import Optional
 
7
 
8
  empty_df = pd.DataFrame(
9
  {
@@ -18,7 +23,7 @@ test_equations = [
18
  ]
19
 
20
 
21
- def generate_data(s: str, num_points: int, noise_level: float):
22
  x = np.linspace(0, 10, num_points)
23
  for (k, v) in {
24
  "sin": "np.sin",
@@ -30,7 +35,8 @@ def generate_data(s: str, num_points: int, noise_level: float):
30
  }.items():
31
  s = s.replace(k, v)
32
  y = eval(s)
33
- noise = np.random.normal(0, noise_level, y.shape)
 
34
  y_noisy = y + noise
35
  return pd.DataFrame({"x": x}), y_noisy
36
 
@@ -41,6 +47,7 @@ def _greet_dispatch(
41
  test_equation,
42
  num_points,
43
  noise_level,
 
44
  niterations,
45
  maxsize,
46
  binary_operators,
@@ -74,32 +81,56 @@ def _greet_dispatch(
74
  y = np.array(df[col_to_fit])
75
  X = df.drop([col_to_fit], axis=1)
76
  else:
77
- # X, y = generate_data(block["test_equation"], block["num_points"], block["noise_level"])
78
- X, y = generate_data(test_equation, num_points, noise_level)
79
-
80
- queue = mp.Queue()
81
- process = mp.Process(
82
- target=greet,
83
- kwargs=dict(
84
- X=X,
85
- y=y,
86
- queue=queue,
87
- niterations=niterations,
88
- maxsize=maxsize,
89
- binary_operators=binary_operators,
90
- unary_operators=unary_operators,
91
- seed=seed,
92
- ),
93
- )
94
- process.start()
95
- output = queue.get()
96
- process.join()
97
- return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
 
100
  def greet(
101
  *,
102
- queue: mp.Queue,
103
  X,
104
  y,
105
  niterations: int,
@@ -107,6 +138,7 @@ def greet(
107
  binary_operators: list,
108
  unary_operators: list,
109
  seed: int,
 
110
  ):
111
  import pysr
112
 
@@ -121,13 +153,10 @@ def greet(
121
  procs=0,
122
  deterministic=True,
123
  random_state=seed,
 
124
  )
125
  model.fit(X, y)
126
 
127
- df = model.equations_[["complexity", "loss", "equation"]]
128
- # Convert all columns to string type:
129
- queue.put(df)
130
-
131
  return 0
132
 
133
 
@@ -154,6 +183,7 @@ def _data_layout():
154
  step=1,
155
  )
156
  noise_level = gr.Slider(minimum=0, maximum=1, value=0.1, label="Noise Level")
 
157
  with gr.Tab("Upload Data"):
158
  file_input = gr.File(label="Upload a CSV File")
159
  gr.Markdown(
@@ -165,6 +195,7 @@ def _data_layout():
165
  test_equation=test_equation,
166
  num_points=num_points,
167
  noise_level=noise_level,
 
168
  example_plot=example_plot,
169
  )
170
 
@@ -233,6 +264,7 @@ def main():
233
  blocks = {**blocks, **_settings_layout()}
234
 
235
  with gr.Column():
 
236
  blocks["df"] = gr.Dataframe(
237
  headers=["complexity", "loss", "equation"],
238
  datatype=["number", "number", "str"],
@@ -249,6 +281,7 @@ def main():
249
  "test_equation",
250
  "num_points",
251
  "noise_level",
 
252
  "niterations",
253
  "maxsize",
254
  "binary_operators",
@@ -256,7 +289,7 @@ def main():
256
  "seed",
257
  ]
258
  ],
259
- outputs=[blocks["df"]],
260
  )
261
 
262
  # Any update to the equation choice will trigger a replot:
@@ -264,18 +297,47 @@ def main():
264
  blocks["test_equation"],
265
  blocks["num_points"],
266
  blocks["noise_level"],
 
267
  ]
268
  for eqn_component in eqn_components:
269
  eqn_component.change(replot, eqn_components, blocks["example_plot"])
270
 
 
 
 
 
 
 
 
271
  demo.launch(debug=True)
272
 
273
 
274
- def replot(test_equation, num_points, noise_level):
275
- X, y = generate_data(test_equation, num_points, noise_level)
276
  df = pd.DataFrame({"x": X["x"], "y": y})
277
  return df
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
  if __name__ == "__main__":
281
  main()
 
1
  import gradio as gr
2
  import numpy as np
3
+ import os
4
  import pandas as pd
5
+ import time
6
  import multiprocessing as mp
7
+ from matplotlib import pyplot as plt
8
+ plt.ioff()
9
  import tempfile
10
+ from typing import Optional, Union
11
+ from pathlib import Path
12
 
13
  empty_df = pd.DataFrame(
14
  {
 
23
  ]
24
 
25
 
26
+ def generate_data(s: str, num_points: int, noise_level: float, data_seed: int):
27
  x = np.linspace(0, 10, num_points)
28
  for (k, v) in {
29
  "sin": "np.sin",
 
35
  }.items():
36
  s = s.replace(k, v)
37
  y = eval(s)
38
+ rstate = np.random.RandomState(data_seed)
39
+ noise = rstate.normal(0, noise_level, y.shape)
40
  y_noisy = y + noise
41
  return pd.DataFrame({"x": x}), y_noisy
42
 
 
47
  test_equation,
48
  num_points,
49
  noise_level,
50
+ data_seed,
51
  niterations,
52
  maxsize,
53
  binary_operators,
 
81
  y = np.array(df[col_to_fit])
82
  X = df.drop([col_to_fit], axis=1)
83
  else:
84
+ X, y = generate_data(test_equation, num_points, noise_level, data_seed)
85
+
86
+ with tempfile.TemporaryDirectory() as tmpdirname:
87
+ base = Path(tmpdirname)
88
+ equation_file = base / "hall_of_fame.csv"
89
+ equation_file_bkup = base / "hall_of_fame.csv.bkup"
90
+ process = mp.Process(
91
+ target=greet,
92
+ kwargs=dict(
93
+ X=X,
94
+ y=y,
95
+ niterations=niterations,
96
+ maxsize=maxsize,
97
+ binary_operators=binary_operators,
98
+ unary_operators=unary_operators,
99
+ seed=seed,
100
+ equation_file=equation_file,
101
+ ),
102
+ )
103
+ process.start()
104
+ while process.is_alive():
105
+ if equation_file_bkup.exists():
106
+ try:
107
+ # First, copy the backup file to a separate copy file
108
+ equation_file_copy = base / "hall_of_fame_copy.csv"
109
+ os.system(f"cp {equation_file_bkup} {equation_file_copy}")
110
+ df = pd.read_csv(equation_file_copy)
111
+ # Keep only the Pareto front: each more complex expression must
112
+ # have a strictly lower loss. Otherwise remove those rows.
113
+ # TODO: Not sure why this occurs; could be the result of a late copy?
114
+ df.sort_values("Complexity", ascending=True, inplace=True)
115
+ df.reset_index(inplace=True)
116
+ bad_idx = []
117
+ min_loss = None
118
+ for i in df.index:
119
+ if min_loss is None or df.loc[i, "Loss"] < min_loss:
120
+ min_loss = float(df.loc[i, "Loss"])
121
+ else:
122
+ bad_idx.append(i)
123
+ df.drop(index=bad_idx, inplace=True)
124
+ yield df[["Complexity", "Loss", "Equation"]]
125
+ except pd.errors.EmptyDataError:
126
+ pass
127
+ time.sleep(1)
128
+
129
+ process.join()
130
 
131
 
132
  def greet(
133
  *,
 
134
  X,
135
  y,
136
  niterations: int,
 
138
  binary_operators: list,
139
  unary_operators: list,
140
  seed: int,
141
+ equation_file: Union[str, Path],
142
  ):
143
  import pysr
144
 
 
153
  procs=0,
154
  deterministic=True,
155
  random_state=seed,
156
+ equation_file=equation_file,
157
  )
158
  model.fit(X, y)
159
 
 
 
 
 
160
  return 0
161
 
162
 
 
183
  step=1,
184
  )
185
  noise_level = gr.Slider(minimum=0, maximum=1, value=0.1, label="Noise Level")
186
+ data_seed = gr.Number(value=0, label="Random Seed")
187
  with gr.Tab("Upload Data"):
188
  file_input = gr.File(label="Upload a CSV File")
189
  gr.Markdown(
 
195
  test_equation=test_equation,
196
  num_points=num_points,
197
  noise_level=noise_level,
198
+ data_seed=data_seed,
199
  example_plot=example_plot,
200
  )
201
 
 
264
  blocks = {**blocks, **_settings_layout()}
265
 
266
  with gr.Column():
267
+ blocks["pareto"] = gr.Plot()
268
  blocks["df"] = gr.Dataframe(
269
  headers=["complexity", "loss", "equation"],
270
  datatype=["number", "number", "str"],
 
281
  "test_equation",
282
  "num_points",
283
  "noise_level",
284
+ "data_seed",
285
  "niterations",
286
  "maxsize",
287
  "binary_operators",
 
289
  "seed",
290
  ]
291
  ],
292
+ outputs=blocks["df"],
293
  )
294
 
295
  # Any update to the equation choice will trigger a replot:
 
297
  blocks["test_equation"],
298
  blocks["num_points"],
299
  blocks["noise_level"],
300
+ blocks["data_seed"],
301
  ]
302
  for eqn_component in eqn_components:
303
  eqn_component.change(replot, eqn_components, blocks["example_plot"])
304
 
305
+ # Update plot when dataframe is updated:
306
+ blocks["df"].change(
307
+ replot_pareto,
308
+ inputs=[blocks["df"], blocks["maxsize"]],
309
+ outputs=[blocks["pareto"]],
310
+ )
311
+
312
  demo.launch(debug=True)
313
 
314
 
315
def replot(test_equation, num_points, noise_level, data_seed):
    """Build the table shown in the example-data preview.

    Forwards all four arguments to ``generate_data`` and packs the result
    into a two-column DataFrame: ``x`` is the ``"x"`` column of the returned
    feature frame and ``y`` is the returned target array.
    """
    features, targets = generate_data(
        test_equation, num_points, noise_level, data_seed
    )
    preview = pd.DataFrame({"x": features["x"], "y": targets})
    return preview
319
 
320
def replot_pareto(df, maxsize):
    """Draw loss against complexity on log-log axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Equation table. Data is drawn only when the frame is non-empty and
        has an ``'Equation'`` column; in that case the ``'Complexity'`` and
        ``'Loss'`` columns are plotted. Otherwise only the axis labels are
        set and the otherwise empty figure is returned.
    maxsize : number
        Upper complexity bound; the x-axis spans ``1`` to ``maxsize + 1``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure. It is removed from pyplot's global figure registry
        before being returned, so repeated calls do not accumulate open
        figures; the object itself stays valid for rendering.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        ax.set_xlabel('Complexity', fontsize=14)
        ax.set_ylabel('Loss', fontsize=14)
        if len(df) == 0 or 'Equation' not in df.columns:
            return fig

        ax.loglog(df['Complexity'], df['Loss'], marker='o', linestyle='-', color='b')
        ax.set_xlim(1, maxsize + 1)

        # Snap the y-range outward to the enclosing powers of 2. log2 of a
        # zero or negative loss yields -inf/NaN, so silence the numpy
        # warnings here and validate the results before using them.
        with np.errstate(divide='ignore', invalid='ignore'):
            ytop = 2 ** np.ceil(np.log2(df['Loss'].max()))
            ybottom = 2 ** np.floor(np.log2(df['Loss'].min() + 1e-20))
        limits_usable = (
            np.isfinite(ytop) and np.isfinite(ybottom) and ytop > 0 and ybottom > 0
        )
        if limits_usable:
            if ytop <= ybottom:
                # All losses fall on one exact power of 2: widen the range
                # by one octave so the limits are not degenerate.
                ytop = 2 * ybottom
            ax.set_ylim(ybottom, ytop)
        # Otherwise leave matplotlib's autoscaled limits: a log axis cannot
        # take non-positive bounds and NaN bounds make set_ylim raise.

        ax.grid(True, which="both", ls="--", linewidth=0.5)
        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.tick_params(axis='both', which='minor', labelsize=10)
        # Lay out last so the final tick-label sizes are accounted for.
        fig.tight_layout()

        return fig
    finally:
        # plt.subplots registers the figure with pyplot, which keeps it
        # alive until closed. Closing only drops that registration.
        plt.close(fig)
341
 
342
  if __name__ == "__main__":
343
  main()