#import streamlit as st
import pandas as pd
import numpy as np
import csv
import json
import matplotlib.pyplot as plt
import ast
#import pickle
import sklearn
from sklearn import linear_model

# Load the scraped ad data
df = pd.read_csv('emily_election.csv')
#loaded_model = pickle.load(open(filename, 'rb'))

# Feature 1: runtime in days, parsed from the 'cumulative_ad_runtime' string (text before 'days')
df['runtime'] = df['cumulative_ad_runtime'].apply(lambda s: int(s.split('days')[0]))

# Feature 2: total impressions, summed across all regions in the per-region dict
df['impressions'] = df['cumulative_impressions_by_region'].apply(ast.literal_eval)
df['impressions'] = df['impressions'].apply(lambda d: np.array(list(d.values())).sum())

# Feature 3 (for later): total estimated audience size, summed across all regions
df['audience_size'] = df['cumulative_est_audience'].apply(ast.literal_eval)
df['audience_size'] = df['audience_size'].apply(lambda d: np.array(list(d.values())).sum())

#data = df[['runtime', 'spend', 'impressions']]
data = df[['runtime', 'spend', 'audience_size', 'impressions']]

# Random 80/20 train/test split (.copy() avoids SettingWithCopyWarning below)
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
test = data[~msk].copy()

# Keep ads with spend above 250 and runtime longer than 4 days
#new_train = train[train['impressions'] < 1000000]
train['spend'] = pd.to_numeric(train['spend'], errors='coerce')
new_train = train[train['spend'] > 250]
new_train = new_train[new_train['runtime'] > 4].copy()
print(new_train.shape)

# This model predicts impressions given the runtime and the spend.
# Fit a linear regression on log-transformed features and target.
regr = linear_model.LinearRegression()
new_train['log_runtime'] = np.log(new_train['runtime'])
new_train['log_spend'] = np.log(new_train['spend'])
new_train['log_impressions'] = np.log(new_train['impressions'])

# Drop rows where the log transform produced +/-inf (e.g. zero impressions)
new_train.replace([np.inf, -np.inf], np.nan, inplace=True)
new_train.dropna(inplace=True)

x = np.asanyarray(new_train[['log_runtime', 'log_spend']])
y = np.asanyarray(new_train[['log_impressions']])
print(x)
regr.fit(x, y)

#y_pred = regr.predict(new_train[['log_runtime', 'log_spend']])
# The coefficients
#print(regr.coef_)
#print('R-squared score: %.2f' % regr.score(x, y))
#print('Standard Deviation: %.2f' % np.sqrt(sum((y - y_pred)**2) / (len(y) - 2)))
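
# Sketch (not in the original script): evaluating the fitted model on the held-out
# test split. It assumes the same cleaning, filters, and log transforms as the
# training data; the names eval_df, x_test, y_test, and y_pred are introduced here
# for illustration only.
test['spend'] = pd.to_numeric(test['spend'], errors='coerce')
eval_df = test[(test['spend'] > 250) & (test['runtime'] > 4)].copy()
eval_df['log_runtime'] = np.log(eval_df['runtime'])
eval_df['log_spend'] = np.log(eval_df['spend'])
eval_df['log_impressions'] = np.log(eval_df['impressions'])
eval_df.replace([np.inf, -np.inf], np.nan, inplace=True)
eval_df.dropna(inplace=True)

x_test = np.asanyarray(eval_df[['log_runtime', 'log_spend']])
y_test = np.asanyarray(eval_df[['log_impressions']])
y_pred = regr.predict(x_test)

# R-squared on unseen ads, plus the residual spread in log-impression units
print('Test R-squared: %.2f' % regr.score(x_test, y_test))
print('Residual std (log impressions): %.2f' % np.sqrt(((y_test - y_pred) ** 2).sum() / (len(y_test) - 2)))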