File size: 2,113 Bytes
2b7b43f
 
 
4e07a41
2b7b43f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import polars as pl
import joblib

model = joblib.load('stuff_model/lgbm_model_2020_2024.joblib')
# Read the values from the text file
with open('stuff_model/target_stats.txt', 'r') as file:
    lines = file.readlines()
    target_mean = float(lines[0].strip())
    target_std = float(lines[1].strip())
    
# Define the features to be used for training
features = ['start_speed',
            'spin_rate',
            'extension',
            'az',
            'ax',
            'x0',
            'z0',
            'speed_diff',
            'az_diff',
            'ax_diff']


def stuff_apply(df:pl.DataFrame) -> pl.DataFrame:
    # Filter the dataframe to include only the rows for the year 2024 and drop rows with null values in the specified features and target column
    # df_test = df.drop_nulls(subset=features)
    df_test = df.clone()

    # Predict the target values for the 2024 data using the trained model
    df_test = df_test.with_columns(
        pl.Series(name="target", values=model.predict(df_test[features].to_numpy()))
    )
    # Standardize the target column to create a z-score
    df_test = df_test.with_columns(
        ((pl.col('target') - target_mean) / target_std).alias('target_zscore')
    )

    # Convert the z-score to tj_stuff_plus
    df_test = df_test.with_columns(
        (100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus')
    )

    df_pitch_types = pl.read_csv('stuff_model/tj_stuff_plus_pitch.csv')

    # Join the pitch type statistics with the main DataFrame based on pitch_type
    df_pitch_all = df_test.join(df_pitch_types, left_on='pitch_type', right_on='pitch_type')

    # Normalize pitch_grade values to a range between -0.5 and 0.5 based on the percentiles
    df_pitch_all = df_pitch_all.with_columns(
        ((pl.col('tj_stuff_plus') - pl.col('mean')) / pl.col('std')).alias('pitch_grade')
    )

    # Scale the pitch_grade values to a range between 20 and 80
    df_pitch_all = df_pitch_all.with_columns(
        (pl.col('pitch_grade') * 10 + 50).clip(20, 80)
    )
    return df_pitch_all