nesticot committed on
Commit
06cd542
·
verified ·
1 Parent(s): 8dab285

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +735 -596
app.py CHANGED
@@ -1,596 +1,735 @@
1
- import polars as pl
2
- import api_scraper
3
- mlb_scrape = api_scraper.MLB_Scrape()
4
-
5
- from stuff_model import *
6
- from shiny import App, reactive, ui, render
7
- from shiny.ui import h2, tags
8
- from api_scraper import MLB_Scrape
9
- import datetime
10
- from stuff_model import feature_engineering as fe
11
- from stuff_model import stuff_apply
12
- from pytabulator import TableOptions, Tabulator, output_tabulator, render_tabulator, theme
13
# Select the Tabulator "site" theme for every table rendered by this app.
theme.tabulator_site()

# Shared MLB API client used by the server callbacks.
scraper = MLB_Scrape()

# Pre-aggregated 2024 pitch data; the spring numbers are compared against it.
df_year_old_group = pl.read_parquet('pitch_data_agg_2024.parquet')

# pitcher_id -> pitcher_name for every pitcher present in the 2024 aggregate.
# Built from (id, name) row tuples; a later row for the same id wins.
pitcher_old_dict = dict(
    df_year_old_group.select(['pitcher_id', 'pitcher_name']).iter_rows()
)
19
-
20
-
21
-
22
-
23
# --- Page layout -----------------------------------------------------------
# The page is one card: a header, a row with the description / refresh button
# next to the author credits, and a tab set with one Tabulator output per tab.

_INTRO_MARKDOWN = """This app generates a table which shows the 2025 Spring Training data.

* Differences are calculated based on 2024 regular season data
* If 2024 data does not exist for pitcher, 2023 Data is used
* If no difference exists, the pitch is labelled as a new pitch"""

# Left column: description plus the button whose clicks trigger the tables.
_intro_column = ui.column(
    4,
    ui.markdown(_INTRO_MARKDOWN),
    ui.input_action_button(
        "refresh",
        "Refresh Data",
        class_="btn-primary",
        width="100%",
    ),
)

# Right column: author credit, data source and Patreon link.
_credits_column = ui.column(
    3,
    ui.div(
        "By: ",
        ui.tags.a("@TJStats", href="https://x.com/TJStats", target="_blank"),
    ),
    ui.tags.p("Data: MLB"),
    ui.tags.p(
        ui.tags.a(
            "Support me on Patreon for more baseball content",
            href="https://www.patreon.com/TJ_Stats",
            target="_blank",
        )
    ),
)

# One tab per table; the output ids match the render functions in `server`.
_table_tabs = ui.navset_tab(
    ui.nav("All Pitches", output_tabulator("table_all")),
    ui.nav("Daily Pitches", output_tabulator("table_daily")),
    ui.nav("tjStuff+", output_tabulator("table_tjstuff")),
)

app_ui = ui.page_fluid(
    ui.card(
        ui.card_header("2025 Spring Training Pitch Data App"),
        ui.row(_intro_column, _credits_column),
        _table_tabs,
    )
)
74
-
75
# Location of the 2025 spring-training pitch-level data set.
_SPRING_PARQUET = "hf://datasets/TJStatsApps/mlb_data/data/mlb_pitch_data_2025_spring.parquet"

# Metrics compared against the 2024 aggregate. After the left join with
# suffix "_old", the 2024 value of metric ``m`` is found in ``m + "_old"``.
_METRICS = [
    "start_speed",
    "max_start_speed",
    "ivb",
    "hb",
    "release_pos_z",
    "release_pos_x",
    "extension",
    "tj_stuff_plus",
]

# Fractions (0-1) that are rendered as "12.3%" strings.
_PERCENT_COLS = ["pitch_percent", "rhh_percent", "lhh_percent"]


def _signed(value):
    """Format a number with an explicit sign and one decimal, e.g. ``+1.2``."""
    return f"{value:+.1f}"


def _percent(value):
    """Format an already-scaled percentage with one decimal, e.g. ``12.3%``."""
    return f"{value:.1f}%"


def _col(title, field, width, **extra):
    """Build one Tabulator column definition with a text header filter."""
    return {"title": title, "field": field, "width": width, "headerFilter": "input", **extra}


def _load_spring_stuff():
    """Load the spring data, append today's games and score every pitch.

    Returns:
        A ``(df_spring, df_spring_stuff)`` tuple: the raw pitch rows sorted by
        ``game_date`` (newest first) and the same rows after
        ``fe.feature_engineering`` and ``stuff_apply.stuff_apply``.

    NOTE(review): every table calls this on each refresh, so the download,
    schedule scrape and model run happen once per table. Sharing the result
    between tables would avoid the repeated work.
    """
    df_spring = pl.read_parquet(_SPRING_PARQUET)

    # The "current" date is the local clock shifted back 8 hours.
    # NOTE(review): presumably a timezone / late-game allowance - confirm.
    date = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
    print(datetime.datetime.now())

    # Spring-training ('S') MLB (sport_id 1) games scheduled for that date.
    schedule = scraper.get_schedule(year_input=[date.year], sport_id=[1], game_type=['S'])
    game_list_input = schedule.filter(pl.col('date') == date)['game_id']

    df_today = scraper.get_data_df(scraper.get_data(game_list_input))
    df_spring = pl.concat([df_spring, df_today]).sort('game_date', descending=True)

    df_spring_stuff = stuff_apply.stuff_apply(fe.feature_engineering(df_spring))
    return df_spring, df_spring_stuff


def _summarise(df_spring_stuff, extra_keys=()):
    """Aggregate pitches per pitcher / pitch type and join the 2024 aggregate.

    Args:
        df_spring_stuff: Pitch-level frame returned by ``_load_spring_stuff``.
        extra_keys: Additional grouping columns (e.g. ``game_id`` and
            ``game_date`` for the per-game table). They are also used for
            the per-pitcher total that the usage fractions divide by.

    Returns:
        One row per group with the mean metrics, usage fractions, the
        ``*_old`` columns from ``df_year_old_group``, a ``new_pitch`` flag and
        a ``<metric>_diff`` column (null when there is no 2024 value).
    """
    total_keys = ["pitcher_id", *extra_keys]
    group_keys = ["pitcher_id", "pitcher_name", "pitch_type", *extra_keys]

    # Denominator for the usage fractions: all pitches thrown by the pitcher
    # (within the extra keys, when given).
    totals = df_spring_stuff.group_by(total_keys).agg(
        pl.col("start_speed").count().alias("pitcher_total")
    )

    grouped = df_spring_stuff.group_by(group_keys).agg([
        pl.col('start_speed').count().alias('count'),
        pl.col('start_speed').mean().alias('start_speed'),
        pl.col('start_speed').max().alias('max_start_speed'),
        pl.col('ivb').mean().alias('ivb'),
        pl.col('hb').mean().alias('hb'),
        pl.col('release_pos_z').mean().alias('release_pos_z'),
        pl.col('release_pos_x').mean().alias('release_pos_x'),
        pl.col('extension').mean().alias('extension'),
        pl.col('tj_stuff_plus').mean().alias('tj_stuff_plus'),
        # Fix: the original counted 'L' batters as rhh_count and 'R' batters
        # as lhh_count, so the RHH% / LHH% columns were swapped.
        pl.col('start_speed').filter(pl.col('batter_hand') == 'R').count().alias('rhh_count'),
        pl.col('start_speed').filter(pl.col('batter_hand') == 'L').count().alias('lhh_count'),
    ])

    grouped = grouped.join(totals, on=total_keys, how="left").with_columns([
        (pl.col("count") / pl.col("pitcher_total")).alias("pitch_percent"),
        (pl.col("rhh_count") / pl.col("pitcher_total")).alias("rhh_percent"),
        (pl.col("lhh_count") / pl.col("pitcher_total")).alias("lhh_percent"),
    ])

    merged = grouped.join(
        df_year_old_group, on=['pitcher_id', 'pitch_type'], how='left', suffix='_old'
    )

    merged = merged.with_columns(
        pl.col('pitcher_id').is_in(df_year_old_group['pitcher_id']).alias('exists_in_old')
    )

    # A pitch is "new" when the pitcher appears in the 2024 aggregate but this
    # pitch type has no 2024 row; otherwise the flag is left null.
    merged = merged.with_columns(
        pl.when(pl.col('start_speed_old').is_null() & pl.col('exists_in_old'))
        .then(pl.lit(True))
        .otherwise(pl.lit(None))
        .alias("new_pitch")
    )

    # Spring value minus 2024 value. Null propagates when there is no 2024
    # value, which replaces the former 10000 sentinel.
    return merged.with_columns([
        (pl.col(metric) - pl.col(metric + "_old")).alias(metric + "_diff")
        for metric in _METRICS
    ])


def _value_with_diff(metric):
    """Expression for the two-line cell text: the value, then ``(+diff)``.

    When there is no 2024 value the second line is a lone tab, so the cell
    keeps its two-line height without showing a difference.
    """
    value = pl.col(metric).round(1).cast(pl.Utf8)
    diff = pl.col(metric + "_diff").round(1).map_elements(_signed, return_dtype=pl.Utf8)
    return (
        pl.when(pl.col(metric + "_old").is_null())
        .then(value + '\n\t')
        .otherwise(value + "\n(" + diff + ")")
        .alias(metric + "_formatted")
    )


def _format_percent_and_sort(merged):
    """Add ``*_formatted`` percentage strings and sort by pitcher and count."""
    return merged.with_columns([
        (pl.col(col) * 100)
        .round(1)
        .map_elements(_percent, return_dtype=pl.Utf8)
        .alias(col + "_formatted")
        for col in _PERCENT_COLS
    ]).sort(['pitcher_id', 'count'], descending=True)


def _comparison_columns(daily=False):
    """Column layout shared by the "All Pitches" and "Daily Pitches" tables."""
    columns = [
        _col("Pitcher Name", "pitcher_name", 250, frozen=True),
        _col("Team", "pitcher_team", 100, frozen=True),
        _col("Pitch Type", "pitch_type", 125, frozen=True),
        _col("New Pitch?", "new_pitch", 125, frozen=False),
    ]
    if daily:
        columns.append(_col("Date", "game_date", 100, frozen=True))
        columns.append(_col("Pitches", "count", 100))
    else:
        columns.append(_col("Pitches", "count", 100, contextMenu=True))
    columns += [
        _col("Pitch%", "pitch_percent_formatted", 100),
        _col("RHH%", "rhh_percent_formatted", 100),
        _col("LHH%", "lhh_percent_formatted", 100),
        _col("Velocity", "start_speed_formatted", 100, formatter="textarea"),
        _col("Max Velo", "max_start_speed_formatted", 100, formatter="textarea"),
        _col("iVB", "ivb_formatted", 100, formatter="textarea"),
        _col("HB", "hb_formatted", 100, formatter="textarea"),
        _col("RelH", "release_pos_z_formatted", 100, formatter="textarea"),
        _col("RelS", "release_pos_x_formatted", 100, formatter="textarea"),
        _col("Extension", "extension_formatted", 125, formatter="textarea"),
        _col("tjStuff+", "tj_stuff_plus_formatted", 100, formatter="textarea"),
    ]
    return columns


def _tjstuff_columns():
    """Column layout of the "tjStuff+" table (2024 value and delta shown separately)."""
    return [
        _col("Pitcher Name", "pitcher_name", 250, frozen=True),
        _col("Team", "pitcher_team", 90, frozen=True),
        _col("Pitch Type", "pitch_type", 125, frozen=True),
        _col("New?", "new_pitch", 125, frozen=False),
        _col("Pitches", "count", 100),
        _col("Pitch%", "pitch_percent_formatted", 100),
        _col("RHH%", "rhh_percent_formatted", 90),
        _col("LHH%", "lhh_percent_formatted", 90),
        _col("Velocity", "start_speed_formatted", 100, formatter="textarea"),
        _col("Max Velo", "max_start_speed_formatted", 100, formatter="textarea"),
        _col("iVB", "ivb_formatted", 80, formatter="textarea"),
        _col("HB", "hb_formatted", 80, formatter="textarea"),
        _col("RelH", "release_pos_z_formatted", 80, formatter="textarea"),
        _col("RelS", "release_pos_x_formatted", 80, formatter="textarea"),
        _col("Extension", "extension_formatted", 125, formatter="textarea"),
        _col("tjStuff+", "tj_stuff_plus_formatted", 100, formatter="textarea"),
        _col("2024 tjStuff+", "tj_stuff_plus_old", 100, formatter="textarea"),
        _col("Δ", "tj_stuff_plus_diff", 100, formatter="textarea"),
    ]


def _to_table(merged, df_spring, columns):
    """Convert to pandas, attach each pitcher's team and wrap in a Tabulator."""
    df_plot = merged.to_pandas()

    # pitcher_id -> pitcher_team taken from the raw pitch rows; when a pitcher
    # has several rows the last one in df_spring wins.
    team_dict = dict(zip(df_spring['pitcher_id'], df_spring['pitcher_team']))
    df_plot['pitcher_team'] = df_plot['pitcher_id'].map(team_dict)

    return Tabulator(
        df_plot,
        table_options=TableOptions(
            height=750,
            columns=columns,
        ),
    )


def server(input, output, session):
    """Shiny server function: registers the three pitch tables.

    Each table is rendered only in response to the "refresh" action button
    and is built from the spring data plus the games of the current date.
    """

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_all():
        """Spring totals per pitcher and pitch type, with 2024 differences."""
        df_spring, df_spring_stuff = _load_spring_stuff()
        df_merge = _summarise(df_spring_stuff)
        df_merge = df_merge.with_columns([_value_with_diff(m) for m in _METRICS])
        df_merge = _format_percent_and_sort(df_merge)
        return _to_table(df_merge, df_spring, _comparison_columns())

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_daily():
        """Same as ``table_all`` but split per game (``game_id`` / ``game_date``)."""
        df_spring, df_spring_stuff = _load_spring_stuff()
        df_merge = _summarise(df_spring_stuff, extra_keys=('game_id', 'game_date'))
        df_merge = df_merge.with_columns([_value_with_diff(m) for m in _METRICS])
        df_merge = _format_percent_and_sort(df_merge)
        return _to_table(df_merge, df_spring, _comparison_columns(daily=True))

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_tjstuff():
        """Spring totals with the 2024 tjStuff+ and its delta in their own columns."""
        df_spring, df_spring_stuff = _load_spring_stuff()
        df_merge = _summarise(df_spring_stuff)

        # Metric cells show only the spring value in this table.
        df_merge = df_merge.with_columns([
            pl.col(m).round(1).cast(pl.Utf8).alias(m + "_formatted")
            for m in _METRICS
        ])

        # The 2024 tjStuff+ and the delta become display strings in place.
        df_merge = df_merge.with_columns([
            pl.col("tj_stuff_plus_old").round(1).cast(pl.Utf8).alias("tj_stuff_plus_old"),
            pl.col("tj_stuff_plus_diff")
            .round(1)
            .map_elements(_signed, return_dtype=pl.Utf8)
            .alias("tj_stuff_plus_diff"),
        ])

        df_merge = _format_percent_and_sort(df_merge)
        return _to_table(df_merge, df_spring, _tjstuff_columns())
596
# Application object: binds the page layout to the server callbacks.
app = App(app_ui, server)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import polars as pl
2
+ import api_scraper
3
+ mlb_scrape = api_scraper.MLB_Scrape()
4
+
5
+ from stuff_model import *
6
+ from shiny import App, reactive, ui, render
7
+ from shiny.ui import h2, tags
8
+ from api_scraper import MLB_Scrape
9
+ import datetime
10
+ from stuff_model import feature_engineering as fe
11
+ from stuff_model import stuff_apply
12
+ from pytabulator import TableOptions, Tabulator, output_tabulator, render_tabulator, theme
13
# Select the Tabulator "site" theme for every table rendered by this app.
theme.tabulator_site()

# Shared MLB API client used by the server callbacks.
scraper = MLB_Scrape()

# Pre-aggregated 2024 pitch data; the spring numbers are compared against it.
df_year_old_group = pl.read_parquet('pitch_data_agg_2024.parquet')

# pitcher_id -> pitcher_name for every pitcher present in the 2024 aggregate.
# Built from (id, name) row tuples; a later row for the same id wins.
pitcher_old_dict = dict(
    df_year_old_group.select(['pitcher_id', 'pitcher_name']).iter_rows()
)
19
+
20
+
21
+
22
+
23
# --- Page layout -----------------------------------------------------------
# The page is one card: a header, a row with the description / refresh button
# next to the author credits, and a tab set with one Tabulator output per tab.

_INTRO_MARKDOWN = """This app generates a table which shows the 2025 Spring Training data.

* Differences are calculated based on 2024 regular season data
* If 2024 data does not exist for pitcher, 2023 Data is used
* If no difference exists, the pitch is labelled as a new pitch"""

# Left column: description plus the button whose clicks trigger the tables.
_intro_column = ui.column(
    4,
    ui.markdown(_INTRO_MARKDOWN),
    ui.input_action_button(
        "refresh",
        "Refresh Data",
        class_="btn-primary",
        width="100%",
    ),
)

# Right column: author credit, data source and Patreon link.
_credits_column = ui.column(
    3,
    ui.div(
        "By: ",
        ui.tags.a("@TJStats", href="https://x.com/TJStats", target="_blank"),
    ),
    ui.tags.p("Data: MLB"),
    ui.tags.p(
        ui.tags.a(
            "Support me on Patreon for more baseball content",
            href="https://www.patreon.com/TJ_Stats",
            target="_blank",
        )
    ),
)

# One tab per table; each output id names the render function expected in
# `server` for that tab.
_table_tabs = ui.navset_tab(
    ui.nav("All Pitches", output_tabulator("table_all")),
    ui.nav("Daily Pitches", output_tabulator("table_daily")),
    ui.nav("tjStuff+", output_tabulator("table_tjstuff")),
    ui.nav("tjStuff+ Summary", output_tabulator("table_stuff_all")),
)

app_ui = ui.page_fluid(
    ui.card(
        ui.card_header("2025 Spring Training Pitch Data App"),
        ui.row(_intro_column, _credits_column),
        _table_tabs,
    )
)
78
+
79
+ def server(input, output, session):
80
+ @output
81
+ @render_tabulator
82
+ @reactive.event(input.refresh)
83
+ def table_all():
84
+
85
+ import polars as pl
86
+ df_spring = pl.read_parquet(f"hf://datasets/TJStatsApps/mlb_data/data/mlb_pitch_data_2025_spring.parquet")
87
+
88
+
89
+ date = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
90
+ print(datetime.datetime.now())
91
+ date_str = date.strftime('%Y-%m-%d')
92
+ # Initialize the scraper
93
+
94
+
95
+ game_list_input = (scraper.get_schedule(year_input=[int(date_str[0:4])], sport_id=[1], game_type=['S'])
96
+ .filter(pl.col('date') == date)['game_id'])
97
+
98
+ data = scraper.get_data(game_list_input)
99
+ df = scraper.get_data_df(data)
100
+
101
+ df_spring = pl.concat([df_spring, df]).sort('game_date', descending=True)
102
+
103
+
104
+
105
+ # df_year_old = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_mlb,df_aaa,df_a,df_afl])))
106
+ # df_year_2old = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_mlb_2023])))
107
+ df_spring_stuff = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_spring])))
108
+
109
+
110
+
111
+ import polars as pl
112
+
113
+ # Compute total pitches for each pitcher
114
+ df_pitcher_totals = df_spring_stuff.group_by("pitcher_id").agg(
115
+ pl.col("start_speed").count().alias("pitcher_total")
116
+ )
117
+
118
+ df_spring_group = df_spring_stuff.group_by(['pitcher_id', 'pitcher_name', 'pitch_type']).agg([
119
+ pl.col('start_speed').count().alias('count'),
120
+ pl.col('start_speed').mean().alias('start_speed'),
121
+ pl.col('start_speed').max().alias('max_start_speed'),
122
+ pl.col('ivb').mean().alias('ivb'),
123
+ pl.col('hb').mean().alias('hb'),
124
+ pl.col('release_pos_z').mean().alias('release_pos_z'),
125
+ pl.col('release_pos_x').mean().alias('release_pos_x'),
126
+ pl.col('extension').mean().alias('extension'),
127
+ pl.col('tj_stuff_plus').mean().alias('tj_stuff_plus'),
128
+ (pl.col('start_speed').filter(pl.col('batter_hand')=='L').count()).alias('rhh_count'),
129
+ (pl.col('start_speed').filter(pl.col('batter_hand')=='R').count()).alias('lhh_count')
130
+ ])
131
+
132
+ # Join total pitches per pitcher to the grouped DataFrame on pitcher_id
133
+ df_spring_group = df_spring_group.join(df_pitcher_totals, on="pitcher_id", how="left")
134
+
135
+ # Now calculate the pitch percent for each pitcher/pitch_type combination
136
+ df_spring_group = df_spring_group.with_columns(
137
+ (pl.col("count") / pl.col("pitcher_total")).alias("pitch_percent")
138
+ )
139
+
140
+ # Optionally, if you want the percentage of left/right-handed batters within the group:
141
+ df_spring_group = df_spring_group.with_columns([
142
+ (pl.col("rhh_count") / pl.col("pitcher_total")).alias("rhh_percent"),
143
+ (pl.col("lhh_count") / pl.col("pitcher_total")).alias("lhh_percent")
144
+ ])
145
+
146
+ df_merge = df_spring_group.join(df_year_old_group,on=['pitcher_id','pitch_type'],how='left',suffix='_old')
147
+
148
+
149
+ df_merge = df_merge.with_columns(
150
+ pl.col('pitcher_id').is_in(df_year_old_group['pitcher_id']).alias('exists_in_old')
151
+ )
152
+
153
+ df_merge = df_merge.with_columns(
154
+ pl.when(pl.col('start_speed_old').is_null() & pl.col('exists_in_old'))
155
+ .then(pl.lit(True))
156
+ .otherwise(pl.lit(None))
157
+ .alias("new_pitch")
158
+ )
159
+
160
+ import polars as pl
161
+
162
+ # Define the columns to subtract
163
+ cols_to_subtract = [
164
+ ("start_speed", "start_speed_old"),
165
+ ("max_start_speed", "max_start_speed_old"),
166
+ ("ivb", "ivb_old"),
167
+ ("hb", "hb_old"),
168
+ ("release_pos_z", "release_pos_z_old"),
169
+ ("release_pos_x", "release_pos_x_old"),
170
+ ("extension", "extension_old"),
171
+ ("tj_stuff_plus", "tj_stuff_plus_old")
172
+ ]
173
+
174
+ df_merge = df_merge.with_columns([
175
+ # Step 1: Create _diff columns with the default value (e.g., 80) if old is null
176
+ pl.when(pl.col(old).is_null())
177
+ .then(pl.lit(10000)) # If old is null, assign 80 as the default
178
+ .otherwise(pl.col(new) - pl.col(old)) # Otherwise subtract old from new
179
+ .alias(new + "_diff")
180
+ for new, old in cols_to_subtract
181
+ ])
182
+
183
+ # Step 2: Format the columns with (value (+diff)) - exclude brackets if diff is 80
184
+ df_merge = df_merge.with_columns([
185
+ pl.when(pl.col(new + "_diff").eq(10000)) # If diff is 80, no need to include brackets
186
+ .then(pl.col(new).round(1).cast(pl.Utf8)+'\n\t') # Just return the new value as string
187
+ .otherwise(
188
+ pl.col(new).round(1).cast(pl.Utf8) +
189
+ "\n(" +
190
+ pl.col(new + "_diff").round(1)
191
+ .map_elements(lambda x: f"{x:+.1f}") +
192
+ ")"
193
+ ).alias(new + "_formatted")
194
+ for new, _ in cols_to_subtract
195
+ ])
196
+
197
+
198
+
199
+
200
+
201
+
202
+ percent_cols = ['pitch_percent', 'rhh_percent', 'lhh_percent']
203
+
204
+ df_merge = df_merge.with_columns([
205
+ (pl.col(col) * 100) # Convert to percentage
206
+ .round(1) # Round to 1 decimal
207
+ .map_elements(lambda x: f"{x:.1f}%") # Format as string with '%'
208
+ .alias(col + "_formatted")
209
+ for col in percent_cols
210
+ ]).sort(['pitcher_id','count'],descending=True)
211
+
212
+
213
+ columns = [
214
+ { "title": "Pitcher Name", "field": "pitcher_name", "width": 250, "headerFilter":"input" ,"frozen":True,},
215
+ { "title": "Team", "field": "pitcher_team", "width": 100, "headerFilter":"input" ,"frozen":True,},
216
+ { "title": "Pitch Type", "field": "pitch_type", "width": 125, "headerFilter":"input" ,"frozen":True,},
217
+ { "title": "New Pitch?", "field": "new_pitch", "width": 125, "headerFilter":"input" ,"frozen":False,},
218
+ { "title": "Pitches", "field": "count", "width": 100 , "headerFilter":"input","contextMenu":True},
219
+ { "title": "Pitch%", "field": "pitch_percent_formatted", "width": 100, "headerFilter":"input"},
220
+ { "title": "RHH%", "field": "rhh_percent_formatted", "width": 100, "headerFilter":"input"},
221
+ { "title": "LHH%", "field": "lhh_percent_formatted", "width": 100, "headerFilter":"input"},
222
+ { "title": "Velocity", "field": "start_speed_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
223
+ { "title": "Max Velo", "field": "max_start_speed_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
224
+ { "title": "iVB", "field": "ivb_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
225
+ { "title": "HB", "field": "hb_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
226
+ { "title": "RelH", "field": "release_pos_z_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
227
+ { "title": "RelS", "field": "release_pos_x_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
228
+ { "title": "Extension", "field": "extension_formatted", "width": 125, "headerFilter":"input", "formatter":"textarea" },
229
+ { "title": "tjStuff+", "field": "tj_stuff_plus_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" }
230
+ ]
231
+
232
+
233
+ df_plot = df_merge.to_pandas()
234
+
235
+ team_dict = dict(zip(df_spring['pitcher_id'],df_spring['pitcher_team']))
236
+ df_plot['pitcher_team'] = df_plot['pitcher_id'].map(team_dict)
237
+
238
+
239
+
240
+ return Tabulator(
241
+ df_plot,
242
+
243
+ table_options=TableOptions(
244
+ height=750,
245
+
246
+ columns=columns,
247
+ )
248
+ )
249
+
250
+
251
@output
@render_tabulator
@reactive.event(input.refresh)
def table_daily():
    """Build the per-game pitch table (one row per pitcher / pitch type / game).

    Reads the stored spring-training pitch data, appends the pitches scraped
    for the current schedule day, scores them with the stuff model, aggregates
    them per pitcher, pitch type and game, and compares every metric with the
    module-level baseline frame ``df_year_old_group``.

    Returns:
        Tabulator: table widget whose metric cells read ``"value\\n(+diff)"``
        when a baseline row exists and ``"value\\n\\t"`` otherwise.
    """
    import datetime

    import polars as pl

    df_spring = pl.read_parquet("hf://datasets/TJStatsApps/mlb_data/data/mlb_pitch_data_2025_spring.parquet")

    # The clock is shifted back 8 hours before the date is taken.
    # NOTE(review): presumably this approximates a US time zone on a UTC
    # server so late games still count as "today" - confirm.
    today = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
    print(datetime.datetime.now())

    # Spring-training ('S') games scheduled for the shifted date.
    game_list_input = (
        scraper.get_schedule(year_input=[today.year], sport_id=[1], game_type=['S'])
        .filter(pl.col('date') == today)['game_id']
    )
    df_today = scraper.get_data_df(scraper.get_data(game_list_input))

    df_spring = pl.concat([df_spring, df_today]).sort('game_date', descending=True)
    df_spring_stuff = stuff_apply.stuff_apply(fe.feature_engineering(df_spring))

    # Total pitches per pitcher per game: the denominator of every percentage.
    game_keys = ['pitcher_id', 'game_id', 'game_date']
    df_pitcher_totals = df_spring_stuff.group_by(game_keys).agg(
        pl.col('start_speed').count().alias('pitcher_total')
    )

    velo = pl.col('start_speed')
    df_spring_group = df_spring_stuff.group_by(
        ['pitcher_id', 'pitcher_name', 'pitch_type', 'game_id', 'game_date']
    ).agg([
        velo.count().alias('count'),
        velo.mean().alias('start_speed'),
        velo.max().alias('max_start_speed'),
        pl.col('ivb').mean().alias('ivb'),
        pl.col('hb').mean().alias('hb'),
        pl.col('release_pos_z').mean().alias('release_pos_z'),
        pl.col('release_pos_x').mean().alias('release_pos_x'),
        pl.col('extension').mean().alias('extension'),
        pl.col('tj_stuff_plus').mean().alias('tj_stuff_plus'),
        # Fix: the two hand filters were swapped, so the "RHH%" column showed
        # pitches thrown to left-handed batters and vice versa.
        velo.filter(pl.col('batter_hand') == 'R').count().alias('rhh_count'),
        velo.filter(pl.col('batter_hand') == 'L').count().alias('lhh_count'),
    ])

    df_spring_group = df_spring_group.join(df_pitcher_totals, on=game_keys, how='left')

    # NOTE: the hand splits are divided by the pitcher's total pitch count in
    # the game, not by the pitches thrown to that batter side.
    df_spring_group = df_spring_group.with_columns([
        (pl.col('count') / pl.col('pitcher_total')).alias('pitch_percent'),
        (pl.col('rhh_count') / pl.col('pitcher_total')).alias('rhh_percent'),
        (pl.col('lhh_count') / pl.col('pitcher_total')).alias('lhh_percent'),
    ])

    # Baseline columns that collide with spring columns receive the "_old" suffix.
    df_merge = df_spring_group.join(
        df_year_old_group, on=['pitcher_id', 'pitch_type'], how='left', suffix='_old'
    )

    df_merge = df_merge.with_columns(
        pl.col('pitcher_id').is_in(df_year_old_group['pitcher_id']).alias('exists_in_old')
    )

    # A pitch is "new" when the pitcher has baseline rows but none for this
    # pitch type; every other row stays null.
    df_merge = df_merge.with_columns(
        pl.when(pl.col('start_speed_old').is_null() & pl.col('exists_in_old'))
        .then(pl.lit(True))
        .otherwise(pl.lit(None))
        .alias('new_pitch')
    )

    metric_cols = [
        'start_speed',
        'max_start_speed',
        'ivb',
        'hb',
        'release_pos_z',
        'release_pos_x',
        'extension',
        'tj_stuff_plus',
    ]

    # Spring value minus baseline; null when there is no baseline row. This
    # replaces the former magic sentinel (10000) with a real missing value.
    df_merge = df_merge.with_columns([
        (pl.col(m) - pl.col(m + '_old')).alias(m + '_diff')
        for m in metric_cols
    ])

    # Two-line cell text: the value, then the signed difference in brackets.
    # Without a baseline the second line is just a tab so row heights match.
    df_merge = df_merge.with_columns([
        pl.when(pl.col(m + '_old').is_null())
        .then(pl.col(m).round(1).cast(pl.Utf8) + '\n\t')
        .otherwise(
            pl.col(m).round(1).cast(pl.Utf8)
            + "\n("
            + pl.col(m + '_diff').round(1).map_elements(
                lambda x: f"{x:+.1f}", return_dtype=pl.Utf8
            )
            + ")"
        )
        .alias(m + '_formatted')
        for m in metric_cols
    ])

    percent_cols = ['pitch_percent', 'rhh_percent', 'lhh_percent']

    df_merge = df_merge.with_columns([
        (pl.col(c) * 100)
        .round(1)
        .map_elements(lambda x: f"{x:.1f}%", return_dtype=pl.Utf8)
        .alias(c + '_formatted')
        for c in percent_cols
    ]).sort(['pitcher_id', 'count'], descending=True)

    columns = [
        {"title": "Pitcher Name", "field": "pitcher_name", "width": 250, "headerFilter": "input", "frozen": True},
        {"title": "Team", "field": "pitcher_team", "width": 100, "headerFilter": "input", "frozen": True},
        {"title": "Pitch Type", "field": "pitch_type", "width": 125, "headerFilter": "input", "frozen": True},
        {"title": "New Pitch?", "field": "new_pitch", "width": 125, "headerFilter": "input", "frozen": False},
        {"title": "Date", "field": "game_date", "width": 100, "headerFilter": "input", "frozen": True},
        {"title": "Pitches", "field": "count", "width": 100, "headerFilter": "input"},
        {"title": "Pitch%", "field": "pitch_percent_formatted", "width": 100, "headerFilter": "input"},
        {"title": "RHH%", "field": "rhh_percent_formatted", "width": 100, "headerFilter": "input"},
        {"title": "LHH%", "field": "lhh_percent_formatted", "width": 100, "headerFilter": "input"},
        {"title": "Velocity", "field": "start_speed_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "Max Velo", "field": "max_start_speed_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "iVB", "field": "ivb_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "HB", "field": "hb_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "RelH", "field": "release_pos_z_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "RelS", "field": "release_pos_x_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "Extension", "field": "extension_formatted", "width": 125, "headerFilter": "input", "formatter": "textarea"},
        {"title": "tjStuff+", "field": "tj_stuff_plus_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
    ]

    df_plot = df_merge.to_pandas()

    # Team is looked up from the raw pitch rows and attached on the pandas side.
    team_dict = dict(zip(df_spring['pitcher_id'], df_spring['pitcher_team']))
    df_plot['pitcher_team'] = df_plot['pitcher_id'].map(team_dict)

    return Tabulator(
        df_plot,
        table_options=TableOptions(
            height=750,
            columns=columns,
        ),
    )
424
+
425
@output
@render_tabulator
@reactive.event(input.refresh)
def table_tjstuff():
    """Build the season-to-date pitch table with tjStuff+ baseline and delta.

    Reads the stored spring-training pitch data, appends the pitches scraped
    for the current schedule day, scores them with the stuff model, aggregates
    them per pitcher and pitch type, and joins the module-level baseline frame
    ``df_year_old_group``. Metric cells show the plain rounded value; the
    baseline tjStuff+ and its signed difference get their own columns.

    Returns:
        Tabulator: table widget with one row per pitcher / pitch type.
    """
    import datetime

    import polars as pl

    df_spring = pl.read_parquet("hf://datasets/TJStatsApps/mlb_data/data/mlb_pitch_data_2025_spring.parquet")

    # The clock is shifted back 8 hours before the date is taken.
    # NOTE(review): presumably this approximates a US time zone on a UTC
    # server so late games still count as "today" - confirm.
    today = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
    print(datetime.datetime.now())

    # Spring-training ('S') games scheduled for the shifted date.
    game_list_input = (
        scraper.get_schedule(year_input=[today.year], sport_id=[1], game_type=['S'])
        .filter(pl.col('date') == today)['game_id']
    )
    df_today = scraper.get_data_df(scraper.get_data(game_list_input))

    df_spring = pl.concat([df_spring, df_today]).sort('game_date', descending=True)
    df_spring_stuff = stuff_apply.stuff_apply(fe.feature_engineering(df_spring))

    # Total pitches per pitcher: the denominator of every percentage.
    df_pitcher_totals = df_spring_stuff.group_by(['pitcher_id']).agg(
        pl.col('start_speed').count().alias('pitcher_total')
    )

    velo = pl.col('start_speed')
    df_spring_group = df_spring_stuff.group_by(
        ['pitcher_id', 'pitcher_name', 'pitch_type']
    ).agg([
        velo.count().alias('count'),
        velo.mean().alias('start_speed'),
        velo.max().alias('max_start_speed'),
        pl.col('ivb').mean().alias('ivb'),
        pl.col('hb').mean().alias('hb'),
        pl.col('release_pos_z').mean().alias('release_pos_z'),
        pl.col('release_pos_x').mean().alias('release_pos_x'),
        pl.col('extension').mean().alias('extension'),
        pl.col('tj_stuff_plus').mean().alias('tj_stuff_plus'),
        # Fix: the two hand filters were swapped, so the "RHH%" column showed
        # pitches thrown to left-handed batters and vice versa.
        velo.filter(pl.col('batter_hand') == 'R').count().alias('rhh_count'),
        velo.filter(pl.col('batter_hand') == 'L').count().alias('lhh_count'),
    ])

    df_spring_group = df_spring_group.join(df_pitcher_totals, on=['pitcher_id'], how='left')

    # NOTE: the hand splits are divided by the pitcher's total pitch count,
    # not by the pitches thrown to that batter side.
    df_spring_group = df_spring_group.with_columns([
        (pl.col('count') / pl.col('pitcher_total')).alias('pitch_percent'),
        (pl.col('rhh_count') / pl.col('pitcher_total')).alias('rhh_percent'),
        (pl.col('lhh_count') / pl.col('pitcher_total')).alias('lhh_percent'),
    ])

    # Baseline columns that collide with spring columns receive the "_old" suffix.
    df_merge = df_spring_group.join(
        df_year_old_group, on=['pitcher_id', 'pitch_type'], how='left', suffix='_old'
    )

    df_merge = df_merge.with_columns(
        pl.col('pitcher_id').is_in(df_year_old_group['pitcher_id']).alias('exists_in_old')
    )

    # A pitch is "new" when the pitcher has baseline rows but none for this
    # pitch type; every other row stays null.
    df_merge = df_merge.with_columns(
        pl.when(pl.col('start_speed_old').is_null() & pl.col('exists_in_old'))
        .then(pl.lit(True))
        .otherwise(pl.lit(None))
        .alias('new_pitch')
    )

    metric_cols = [
        'start_speed',
        'max_start_speed',
        'ivb',
        'hb',
        'release_pos_z',
        'release_pos_x',
        'extension',
        'tj_stuff_plus',
    ]

    # Spring value minus baseline. Subtraction already yields null when the
    # baseline is missing, so no explicit null branch is needed.
    df_merge = df_merge.with_columns([
        (pl.col(m) - pl.col(m + '_old')).alias(m + '_diff')
        for m in metric_cols
    ])

    # Metric cells in this table show only the rounded value.
    df_merge = df_merge.with_columns([
        pl.col(m).round(1).cast(pl.Utf8).alias(m + '_formatted')
        for m in metric_cols
    ])

    # Baseline tjStuff+ and its signed delta become display strings; this must
    # run after the numeric difference above has been computed.
    df_merge = df_merge.with_columns([
        pl.col("tj_stuff_plus_old").round(1).cast(pl.Utf8).alias("tj_stuff_plus_old"),
        pl.col("tj_stuff_plus_diff")
        .round(1)
        .map_elements(lambda x: f"{x:+.1f}", return_dtype=pl.Utf8)
        .alias("tj_stuff_plus_diff"),
    ])

    percent_cols = ['pitch_percent', 'rhh_percent', 'lhh_percent']

    df_merge = df_merge.with_columns([
        (pl.col(c) * 100)
        .round(1)
        .map_elements(lambda x: f"{x:.1f}%", return_dtype=pl.Utf8)
        .alias(c + '_formatted')
        for c in percent_cols
    ])

    columns = [
        {"title": "Pitcher Name", "field": "pitcher_name", "width": 250, "headerFilter": "input", "frozen": True},
        {"title": "Team", "field": "pitcher_team", "width": 90, "headerFilter": "input", "frozen": True},
        {"title": "Pitch Type", "field": "pitch_type", "width": 125, "headerFilter": "input", "frozen": True},
        {"title": "New?", "field": "new_pitch", "width": 125, "headerFilter": "input", "frozen": False},
        {"title": "Pitches", "field": "count", "width": 100, "headerFilter": "input"},
        {"title": "Pitch%", "field": "pitch_percent_formatted", "width": 100, "headerFilter": "input"},
        {"title": "RHH%", "field": "rhh_percent_formatted", "width": 90, "headerFilter": "input"},
        {"title": "LHH%", "field": "lhh_percent_formatted", "width": 90, "headerFilter": "input"},
        {"title": "Velocity", "field": "start_speed_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "Max Velo", "field": "max_start_speed_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "iVB", "field": "ivb_formatted", "width": 80, "headerFilter": "input", "formatter": "textarea"},
        {"title": "HB", "field": "hb_formatted", "width": 80, "headerFilter": "input", "formatter": "textarea"},
        {"title": "RelH", "field": "release_pos_z_formatted", "width": 80, "headerFilter": "input", "formatter": "textarea"},
        {"title": "RelS", "field": "release_pos_x_formatted", "width": 80, "headerFilter": "input", "formatter": "textarea"},
        {"title": "Extension", "field": "extension_formatted", "width": 125, "headerFilter": "input", "formatter": "textarea"},
        {"title": "tjStuff+", "field": "tj_stuff_plus_formatted", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "2024 tjStuff+", "field": "tj_stuff_plus_old", "width": 100, "headerFilter": "input", "formatter": "textarea"},
        {"title": "Δ", "field": "tj_stuff_plus_diff", "width": 100, "headerFilter": "input", "formatter": "textarea"},
    ]

    # Sorted once here (the original sorted twice on the same keys).
    df_plot = df_merge.sort(['pitcher_id', 'count'], descending=True).to_pandas()

    # Team is looked up from the raw pitch rows and attached on the pandas side.
    team_dict = dict(zip(df_spring['pitcher_id'], df_spring['pitcher_team']))
    df_plot['pitcher_team'] = df_plot['pitcher_id'].map(team_dict)

    return Tabulator(
        df_plot,
        table_options=TableOptions(
            height=750,
            columns=columns,
        ),
    )
599
+
600
@output
@render_tabulator
@reactive.event(input.refresh)
def table_stuff_all():
    """Build the wide tjStuff+ table: one row per pitcher, one column per pitch type.

    Reads the stored spring-training pitch data, appends the pitches scraped
    for the current schedule day, scores them with the stuff model, averages
    ``tj_stuff_plus`` per pitcher and pitch type, adds a pitch-count-weighted
    "All" value per pitcher, and pivots the pitch types into columns.

    Returns:
        Tabulator: table widget with the pitcher's total pitch count and the
        rounded tjStuff+ for CH, CU, FC, FF, FS, SI, SL, ST and All.
    """
    import polars as pl

    df_spring = pl.read_parquet("hf://datasets/TJStatsApps/mlb_data/data/mlb_pitch_data_2025_spring.parquet")

    # The clock is shifted back 8 hours before the date is taken.
    # NOTE(review): presumably this approximates a US time zone on a UTC
    # server so late games still count as "today" - confirm.
    today = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
    print(datetime.datetime.now())

    # Spring-training ('S') games scheduled for the shifted date.
    game_list_input = (
        scraper.get_schedule(year_input=[today.year], sport_id=[1], game_type=['S'])
        .filter(pl.col('date') == today)['game_id']
    )
    df_today = scraper.get_data_df(scraper.get_data(game_list_input))

    df_spring = pl.concat([df_spring, df_today]).sort('game_date', descending=True)
    df_spring_stuff = stuff_apply.stuff_apply(fe.feature_engineering(df_spring))

    pitch_cols = ['CH', 'CU', 'FC', 'FF', 'FS', 'SI', 'SL', 'ST', 'All']

    # Pitch count and mean tjStuff+ for each pitcher / pitch type.
    df_by_pitch = df_spring_stuff.group_by(['pitcher_id', 'pitcher_name', 'pitch_type']).agg(
        pl.col('tj_stuff_plus').len().alias('count'),
        pl.col('tj_stuff_plus').mean(),
    )

    # Per-pitcher "All" row: the per-pitch means weighted by pitch count.
    # The final select keeps the same column order as df_by_pitch so the two
    # frames can be stacked.
    df_overall = (
        df_by_pitch
        .with_columns(
            (pl.col('tj_stuff_plus') * pl.col('count')).alias('weighted_tj_stuff_plus')
        )
        .group_by(['pitcher_id', 'pitcher_name'])
        .agg(
            pl.col('count').sum().alias('total_count'),
            pl.col('weighted_tj_stuff_plus').sum().alias('total_weighted_tj_stuff_plus'),
        )
        .with_columns(
            (pl.col('total_weighted_tj_stuff_plus') / pl.col('total_count')).alias('tj_stuff_plus')
        )
        .select([
            'pitcher_id',
            'pitcher_name',
            pl.lit("All").alias('pitch_type'),
            pl.col('total_count').alias('count'),
            'tj_stuff_plus',
        ])
    )

    df_small = pl.concat([df_by_pitch, df_overall])

    # Total pitches per pitcher, taken from the "All" rows.
    df_all_rows = df_small.filter(pl.col('pitch_type') == 'All')
    count_dict = dict(zip(df_all_rows['pitcher_id'], df_all_rows['count']))

    df_small_pivot = df_small.pivot(
        index=['pitcher_id', 'pitcher_name'],
        on='pitch_type',
        values='tj_stuff_plus',
    ).with_columns(
        pl.col("pitcher_id").replace_strict(count_dict, default=None).alias("count")
    )

    # Guarantee every displayed pitch-type column exists. The filler is typed
    # Float64 because an untyped null literal yields a Null-dtype column, which
    # the rounding step below is not expected to accept.
    for col in pitch_cols:
        if col not in df_small_pivot.columns:
            df_small_pivot = df_small_pivot.with_columns(
                pl.lit(None, dtype=pl.Float64).alias(col)
            )

    df_small_pivot = df_small_pivot.with_columns([
        pl.col(col).round(0).alias(col) for col in pitch_cols
    ])

    df_plot = df_small_pivot.sort(['pitcher_id', 'count'], descending=True).to_pandas()

    # Team is looked up from the raw pitch rows and attached on the pandas side.
    team_dict = dict(zip(df_spring['pitcher_id'], df_spring['pitcher_team']))
    df_plot['pitcher_team'] = df_plot['pitcher_id'].map(team_dict)

    columns = [
        {"title": "Pitcher Name", "field": "pitcher_name", "width": 250, "headerFilter": "input", "frozen": True},
        {"title": "Team", "field": "pitcher_team", "width": 90, "headerFilter": "input", "frozen": True},
        {"title": "Pitches", "field": "count", "width": 100, "headerFilter": "input"},
        {"title": "CH", "field": "CH", "width": 80, "formatter": "textarea"},
        {"title": "CU", "field": "CU", "width": 80, "formatter": "textarea"},
        {"title": "FC", "field": "FC", "width": 80, "formatter": "textarea"},
        {"title": "FF", "field": "FF", "width": 80, "formatter": "textarea"},
        {"title": "FS", "field": "FS", "width": 80, "formatter": "textarea"},
        {"title": "SI", "field": "SI", "width": 80, "formatter": "textarea"},
        {"title": "SL", "field": "SL", "width": 80, "formatter": "textarea"},
        {"title": "ST", "field": "ST", "width": 80, "formatter": "textarea"},
        {"title": "All", "field": "All", "width": 80, "formatter": "textarea"},
    ]

    return Tabulator(
        df_plot,
        table_options=TableOptions(
            height=750,
            columns=columns,
        ),
    )
732
+
733
+
734
+
735
+ app = App(app_ui, server)  # Shiny application object: page layout (app_ui) plus the server function