Note: Lines marked with the 💡 emoji contain our insights.
Goal: predict the total number of Washington D.C. bicycle users on an hourly basis.
The relevant count columns are `cnt` (the total), `casual`, and `registered`.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Bike Sharing Prediction
import sklearn
import os
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.impute import KNNImputer, SimpleImputer
sns.set()
df = pd.read_csv("hour.csv",index_col='instant',parse_dates=['dteday'],dayfirst=True)
df.head()
df.dtypes #check the variable types
# Check how many unique values each column has
df.nunique()
💡 From this we can say `weathersit` and `hr` are categorical variables; we will try to use `hr` as a categorical variable.
# checking for monotonicity
# monotonic is defined as either a non-increasing or non-decreasing set of ordered values
print(df.index.is_monotonic_increasing)
print(df.index.is_unique)
df.isna().sum()
💡 Five columns (`weathersit`, `temp`, `atemp`, `hum`, and `windspeed`) contain null values.
##see the distribution of our null values
plt.figure(figsize=(15,10))
sns.heatmap(df.isnull(), cbar=False)
plt.show()
gf = df.dteday.value_counts().sort_index().plot(figsize=(40,12))
gf.set_title("Inconsistency of the data",fontsize= 30)
gf.set_ylabel('Count of daily data',fontsize=20)
gf.tick_params(axis='both', which='major', labelsize=20)
plt.savefig('Inconsistency of Data.png')
df_in = df.groupby('dteday').count()
inconsistent_dates = list(df_in.loc[df_in['cnt'] < 10].index)
df.loc[df['dteday'].isin(inconsistent_dates)]
💡 Since dealing with inconsistency requires some domain knowledge, we researched those dates a little further. It can be inferred that these missing hours are due to some underlying cause and hence cannot be imputed; the quick check below lists exactly which hours are missing.
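To support this, a small sketch (using the `inconsistent_dates` list defined above) prints the hours that are absent on each of those dates:
# list the hours that are absent on each inconsistent date
for d in inconsistent_dates:
    present = set(df.loc[df['dteday'] == d, 'hr'])
    print(d.date(), sorted(set(range(24)) - present))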
# check for duplicates
df.duplicated().sum()
df.describe()
For normally distributed data (`temp`, `atemp`, `hum`, and `windspeed`), the skewness should be close to 0; if skewness > 0, the distribution has more weight in its right tail. A minimal sketch of the skewness formula pandas uses follows.
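As a reference, this is a minimal sketch (on made-up values, relying on the pandas/NumPy imports above) of the adjusted Fisher-Pearson sample skewness that `pandas.Series.skew()` reports:
x = pd.Series([0.2, 0.3, 0.3, 0.4, 0.9])        # toy values
n = len(x)
m2 = ((x - x.mean()) ** 2).mean()               # biased 2nd central moment
m3 = ((x - x.mean()) ** 3).mean()               # biased 3rd central moment
g1 = m3 / m2 ** 1.5                             # biased skewness
G1 = np.sqrt(n * (n - 1)) / (n - 2) * g1        # bias-adjusted skewness
print(round(G1, 5), round(x.skew(), 5))         # the two numbers should agree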
numerical_columns = ['temp', 'atemp', 'hum','windspeed']
fig, ax = plt.subplots(ncols=4, figsize=(56, 8))
for i, axes in enumerate(ax.ravel()):
    # histogram and kernel density estimate of each numerical variable
    df[numerical_columns[i]].plot.hist(ax=axes, alpha=0.3)
    axes_kde = axes.twinx()
    df[numerical_columns[i]].plot.kde(ax=axes_kde)
    axes.set_title(f'Distribution of {numerical_columns[i]}', fontsize=15)
plt.savefig('Numerical Skewness.png')
temp = round(df["temp"].skew(),5)
print ("the skew of temp is", temp)
atemp = round(df["atemp"].skew(),5)
print ("the skew of atemp is", atemp)
hum = round(df["hum"].skew(),5)
print ("the skew of hum is", hum)
windspeed = round(df["windspeed"].skew(),5)
print ("the skew of windspeed is", windspeed)
We can see that `windspeed` tends to skew more toward the right tail.
fig, ax = plt.subplots(2, 2, figsize=(20, 8))
for i, axes in enumerate(ax.ravel()):
    # box plot of each numerical variable
    df[numerical_columns[i]].plot.box(ax=axes)
    axes.set_title(f'Box plot of {numerical_columns[i]}', fontsize=15)
`windspeed` has some skewed values; we will decide later how to deal with it.
df['weathersit'].value_counts()
plt.figure(figsize=(10,7))
plt.pie(df['weathersit'].value_counts(),labels=['Clear_cloudy','Mist_cloudy','Light_Snow','Heavy_rain'],autopct='%1.0f%%',textprops={'fontsize': 14})
plt.title('Weather Situation Distribution',fontsize = 18)
plt.tight_layout()
plt.savefig('weather_distribution.png')
plt.show()
From the above we can see that 'Clear, Few clouds, Partly cloudy' accounts for by far the largest share.
df.dtypes
df.head()
plt.figure(figsize = (24,4))
sns.lineplot(x='dteday',y = 'cnt',data = df,ci=None)
plt.title('Demand of Bicycles over time',fontsize=20)
plt.xlabel(None)
plt.savefig('over_time.png')
Therefore, in the feature engineering part, we will extract features related to season and month to see how they affect the count of rental bikes.
# Checking the correlation between the variables
plt.figure(figsize = (10,6))
matrix = np.triu(df.corr())
sns.heatmap(df.corr(), annot = True, cmap="YlGnBu", mask=matrix)
plt.title("Correlation Matrix")
plt.show()
`temp` and `atemp` are highly correlated, so we have to select one. Both variables are normalized, but `temp` seems better distributed, with a median value of 0.5 and a skewness closer to 0 (the skew of `temp` is -0.0055, the skew of `atemp` is -0.08999).
`registered` and `casual` are directly related to `cnt` (data leakage), so we also exclude them from the feature set.
# 1. Season (1:Spring, 2:Summer, 3:Autumn, 4:Winter)
def season(mydate):
    # get the year from the date object
    year = mydate.year
    # create a list of (season, start, end) tuples for comparison
    seasons = [
        (2, dt.date(year, 12, 21), dt.date(year, 12, 31)),
        (4, dt.date(year, 6, 21), dt.date(year, 9, 20)),
        (1, dt.date(year, 9, 21), dt.date(year, 12, 20)),
        (2, dt.date(year, 1, 1), dt.date(year, 3, 20)),
        (3, dt.date(year, 3, 21), dt.date(year, 6, 20)),
    ]
    # find the corresponding season from the list and return its code
    for season in seasons:
        if mydate >= season[1] and mydate <= season[2]:
            return season[0]
# 2. Since the data comes from Washington D.C. in the U.S., we decided to use U.S. federal holidays to define our holiday function
from pandas.tseries.holiday import USFederalHolidayCalendar
def get_holiday(date):
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2011-01-01', end='2012-12-31').strftime("%Y-%m-%d")
    holidays = list(holidays)
    holidays.extend(['2011-01-01', '2011-12-25', '2012-01-01', '2012-11-11'])
    return np.where(date.isin(holidays), 1, 0)
df['season'] = df['dteday'].map(season)
df['month'] = df['dteday'].dt.month
df['year'] = df['dteday'].dt.year
df['weekday'] = df['dteday'].dt.weekday
# if it is weekday 1, if not 0
df['working_day'] = np.where(df['dteday'].dt.weekday < 5, 1, 0)
# office hours: defined as 9:00-17:00 on weekdays
df['office_hour'] = np.where((df['hr'] >= 9) & (df['hr'] < 17) & (df['dteday'].dt.weekday < 5), 1, 0)
# daytime: defined as 6:00-22:00
df['daytime'] = np.where((df['hr'] >= 6) & (df['hr'] < 22), 1, 0)
# morning rush hour: weekdays between 6:00 and 10:00
# evening rush hour: weekdays between 15:00 and 19:00
df['rushhour_morning'] = np.where((df['hr'] >= 6) & (df['hr'] < 10) & (df['dteday'].dt.weekday < 5), 1, 0)
df['rushhour_evening'] = np.where((df['hr'] >= 15) & (df['hr'] < 19) & (df['dteday'].dt.weekday < 5), 1, 0)
# high season - summer, if it is 1 is high season, if 0 is not
df['highseason'] = np.where(df['dteday'].map(season) == 3, 1, 0)
df['holiday'] = get_holiday(df['dteday'])
def get_temp_bins(temp):
    # define bins
    bins = [-np.inf, 0.19, 0.49, 0.69, 0.89, np.inf]
    labels = ['low', 'low-medium', 'medium', 'medium-high', 'high']
    return pd.cut(temp, bins, labels=labels).cat.codes.to_frame()

def get_hum_bins(hum):
    # define bins
    bins = [-np.inf, 0.19, 0.49, 0.69, 0.89, np.inf]
    labels = ['low', 'low-medium', 'medium', 'medium-high', 'high']
    return pd.cut(hum, bins, labels=labels).cat.codes.to_frame()
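As a quick sanity check (a sketch with made-up normalized temperatures), the binning helpers return the integer category codes 0-4 rather than the labels:
# hypothetical spot-check: 0.10 -> 'low' (0), 0.50 -> 'medium' (2), 0.95 -> 'high' (4)
print(get_temp_bins(pd.Series([0.10, 0.50, 0.95])))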
df['temp_bin'] = get_temp_bins(df['temp'].fillna(method='ffill'))
df['hum_bin'] = get_hum_bins(df['hum'].fillna(method='ffill'))
`windspeed` has a skewness issue and contains many outliers throughout its distribution.
# we use log1p, which returns the natural logarithm of one plus the input array
df['windspeed_log1p'] = np.log1p(df['windspeed'].fillna(method='ffill'))
windspeed_skew = round(df["windspeed"].skew(),5)
windspeedlog1p_skew = round(df["windspeed_log1p"].skew(),5)
print(windspeed_skew,windspeedlog1p_skew) # after log feature, the skewness is closer to 0
df.head(3)
plt.figure(figsize = (24,10))
matrix = np.triu(df.corr())
sns.heatmap(df.corr(), annot = True, cmap="YlGnBu", mask=matrix)
plt.title("Correlation Matric")
plt.show()
We keep `temp` and drop `atemp` in the pipeline because they are highly correlated, and we drop `casual` and `registered` due to data leakage with our target variable `cnt`.
`temp_bin` and `hum_bin` are highly correlated with `temp` and `hum` because they are binned from those two columns; we will use them further in our pipeline and run trials on them.
fig, ax = plt.subplots(figsize=(24, 6))
average_week_demand = df.groupby(["weekday", "hr"]).mean()["cnt"]
average_week_demand.plot(ax=ax)
_ = ax.set(
title="Average hourly bike demand during the week",
xticks=[i * 24 for i in range(7)],
xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat","Sun"],
xlabel="Time of the week",
ylabel="Number of bike rentals",
)
plt.savefig('Average hourly bike demand during week.png')
💡 The Average Hourly Bike Demand During the Week graph shows a repeating daily pattern, with weekday shapes differing from the weekend.
# Average Monthly Count Distribution plot
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 6))
group_month = pd.DataFrame(df.groupby(['month', 'working_day'])['cnt'].mean()).reset_index()
sns.barplot(data=group_month, x='month', y='cnt', hue='working_day', ax=axes)
axes.set(xlabel='Month', ylabel='Count', title='Average bike rentals per Month')
handles, _ = axes.get_legend_handles_labels()
axes.legend(handles, ['Not a Working Day', 'Working Day'])
plt.show()
mydata_w = df.loc[df.working_day==1]
mydata_nw = df.loc[df.working_day==0]
fig = plt.figure(figsize=(24, 8))
# Working Day
axes = fig.add_subplot(1, 2, 1)
f = axes.scatter(mydata_w.hr, mydata_w['cnt'], c=mydata_w.temp, cmap = 'YlOrRd')
axes.set(xticks = range(24), xlabel='Hours in day', ylabel='Count', title='Working Day: Count vs. Day Hour with Temperature Gradient')
cbar = plt.colorbar(f)
cbar.set_label('Temperature in degree C')
# Non Working Day
axes = fig.add_subplot(1, 2, 2)
f = axes.scatter(mydata_nw.hr, mydata_nw['cnt'], c=mydata_nw.temp, cmap = 'YlOrRd')
axes.set(xticks = range(24), xlabel='Hours in day', ylabel='Count', title='Non Working Day: Count vs. Day Hour with Temperature Gradient')
cbar = plt.colorbar(f)
cbar.set_label('Temperature in degree C')
plt.savefig('work_not_working.png')
plt.show()
💡 From the above plots we can see two distinct hourly patterns in bike rentals, one for working days and one for non-working days.
sns.set()
fig, axes = plt.subplots(nrows=3,ncols=3, sharey=True, figsize=(28,20))
fig.subplots_adjust(hspace=0.3, wspace=0.10)
cols = ['year','season','month','office_hour','rushhour_morning','working_day','holiday','daytime','rushhour_evening']
for i, ax in enumerate(axes.ravel()):
    sns.boxplot(y="cnt", x=cols[i], data=df, orient="v", notch=True, ax=ax, palette="Set3")
    ax.set(xlabel=cols[i], ylabel='Count', title=f"Box Plot On Count Across {cols[i]}")
fig.savefig('boxplot.png')
Season (1:Spring, 2:Summer, 3:Autumn, 4:Winter)
sns.set(context='talk',style='ticks',font_scale=0.8)
fig, axes = plt.subplots(nrows=1,ncols=1, sharey=True, figsize=(24,6))
sns.boxplot(y="cnt",x="hr", data=df, orient="v", notch=True, ax=axes)
axes.set(xlabel='Hour Of The Day', ylabel='Count', title="Box Plot On Count Across Hour Of The Day")
plt.savefig('Box Plot per Hour.png')
plt.show()
plt.close('all')
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24, 16))
fig.subplots_adjust(hspace=0.25, wspace=0.12)
seasonOrder = [1,2,3,4]
WeekdayOrder = [5, 6, 0, 1, 2, 3, 4]
# 1.Season (1:Spring, 2:Summer, 3:Autumn, 4:Winter)
ax1 = sns.lineplot(x="hr", y="cnt",hue="season", hue_order=seasonOrder, data=df,palette='Paired', ax=axes[0][0],ci=None)
#xticklabels=["Spring", "Summer", "Autumn", "Winter"],
ax1.set(xlabel='Hour Of The Day', ylabel='Average Count (continuous line )', title ="Average Bicycle Count By Hour Of The Day Across Season", label='big')
#ax1a = ax1.twinx()
ax1a = sns.lineplot(x="hr", y="temp", hue="season", hue_order=seasonOrder, data=df,palette='Paired',ax=axes[0][1],ci=None)
ax1a.set(xlabel='Hour Of The Day',ylabel='Temperature', label='big', ylim=(df.temp.min(),df.temp.max()))
ax2 = sns.lineplot(x="hr", y="cnt", hue="weekday", hue_order=WeekdayOrder, data=df,palette='Paired',ax=axes[1][0],ci=None)
ax2.set(xlabel='Hour Of The Day', ylabel='Average Count (continuous line)',title="Average Bicycle Count By Hour Of The Day Across Weekdays",label='big')
#ax2a = ax2.twinx()
ax2a = sns.lineplot(x="hr", y="temp", hue="weekday", hue_order=WeekdayOrder, data=df,palette='Paired',ax=axes[1][1],ci=None)
ax2a.set(xlabel='Hour Of The Day',ylabel='Temperature', label='big', ylim=(df.temp.min(),df.temp.max()))
plt.savefig('Seasons.png')
plt.show()
plt.close('all')
Season (1:Spring, 2:Summer, 3:Autumn, 4:Winter)
fig = plt.figure(figsize=(24, 8))
axes = fig.add_subplot(1, 3, 1)
sns.regplot(data=df, x='temp', y='cnt',ax=axes,line_kws={"color": "black"})
axes.set(title='Reg Plot for Temperature vs. Count')
axes = fig.add_subplot(1, 3, 2)
sns.regplot(data=df, x='hum', y='cnt',ax=axes, color='r',line_kws={"color": "black"})
axes.set(title='Reg Plot for Humidity vs. Count')
axes = fig.add_subplot(1, 3, 3)
sns.regplot(data=df, x='windspeed', y='cnt',ax=axes, color='g',line_kws={"color": "black"})
axes.set(title='Reg Plot for Windspeed vs. Count') #why do we not use log windspeed
plt.savefig('regplot.png')
plt.show()
Time Based Cross-validation
df.dtypes
#divide to train and test
train = df.loc[(df["dteday"].isin(pd.date_range(start = "01/01/2011", end = "30/09/2012")))==True]
test = df.loc[(df["dteday"].isin(pd.date_range(start = "01/01/2011", end = "30/09/2012")))==False] # year 2012 4th quarter
#create 'X_train', 'y_train', 'X_test', and 'y_test'
X_train = train.drop("cnt", axis="columns")
y_train = train['cnt']
X_test = test.drop("cnt", axis="columns")
y_test = test['cnt']
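A quick sketch to verify the split boundary (the test period should be October through December 2012):
print(train['dteday'].min().date(), train['dteday'].max().date())
print(test['dteday'].min().date(), test['dteday'].max().date())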
df.shape
from sklearn.model_selection import TimeSeriesSplit
ts_cv = TimeSeriesSplit(
n_splits=5,
gap=48,
max_train_size=10000,
test_size=1000,
)
Let us manually inspect the various splits to check that the TimeSeriesSplit works as we expect, starting with the first split:
all_splits = list(ts_cv.split(X_train, y_train))
train_0, test_0 = all_splits[0]
X_train.iloc[test_0].head(3)
X_train.iloc[test_0].shape
X_train.iloc[train_0].head(3)
X_train.iloc[train_0].shape
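To make the splits easier to interpret, here is a small sketch that summarizes the calendar range each fold covers (it relies on dteday still being present in X_train; it is only dropped later inside the transformers):
for fold, (train_idx, test_idx) in enumerate(all_splits):
    tr, te = X_train.iloc[train_idx], X_train.iloc[test_idx]
    print(f"fold {fold}: train {tr['dteday'].min().date()} to {tr['dteday'].max().date()}, "
          f"test {te['dteday'].min().date()} to {te['dteday'].max().date()}")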
In the pipeline we drop `casual` and `registered` (due to data leakage), `atemp` (highly correlated with `temp`), and `dteday`, because we already generated many time-related features and want to avoid putting extra weight on the time variable. For `hum` and `windspeed` we impute the median, which is more robust than imputing the mean. For `temp` we use the KNN imputer, which better suits the nature of temperature; a small illustration of both imputers follows the null-value check.
df.isna().sum() # now, we are going to deal with the null values
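As a small, hedged illustration of the two strategies (toy values, not the real data): SimpleImputer fills a missing entry with the column median, while KNNImputer averages the values of the nearest rows.
toy = pd.DataFrame({'temp': [0.20, 0.22, np.nan, 0.80],
                    'hr':   [7, 8, 9, 17]})
print(SimpleImputer(strategy='median').fit_transform(toy[['temp']]))  # NaN -> 0.22 (median)
print(KNNImputer(n_neighbors=2).fit_transform(toy))                   # NaN -> mean temp of the 2 nearest rows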
df.dtypes
sklearn.set_config(display='diagram')
transformer = make_column_transformer(
("drop", ["atemp",'dteday','casual','registered']),
(KNNImputer(n_neighbors=2), ["temp"]),
(SimpleImputer(strategy="median"), ["hum"]),
(SimpleImputer(strategy="median"), ["windspeed"]),
(make_pipeline(SimpleImputer(strategy='most_frequent'),OneHotEncoder(handle_unknown='ignore')),["weathersit"]),
remainder = MinMaxScaler()
)
transformer
time_related_features = [
'season', 'month', 'year', 'weekday','working_day', 'office_hour', 'daytime',
'rushhour_morning','rushhour_evening', 'highseason', 'holiday','hr'
]
one_hot_transformer = make_column_transformer(
("drop", ["atemp",'dteday','casual','registered']),
(KNNImputer(n_neighbors=2), ["temp"]),
(SimpleImputer(strategy="median"), ["hum"]),
(SimpleImputer(strategy="median"), ["windspeed"]),
(make_pipeline(SimpleImputer(strategy='most_frequent'),OneHotEncoder(handle_unknown='ignore')),["weathersit"]),
(OneHotEncoder(handle_unknown='ignore'), time_related_features),
remainder = MinMaxScaler()
)
one_hot_transformer
We encode each periodic time feature (`month`, `weekday`, and `hr`) using a sine and cosine transformation with the matching period (12 for `month`, 7 for `weekday`, 24 for `hr`); a quick check of the `hr` encoding follows the transformer below.
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))
cyclic_transformer = make_column_transformer(
("drop", ["atemp",'dteday','casual','registered']),
(KNNImputer(n_neighbors=2), ["temp"]),
(SimpleImputer(strategy="median"), ["hum"]),
(SimpleImputer(strategy="median"), ["windspeed"]),
(make_pipeline(SimpleImputer(strategy='most_frequent'),OneHotEncoder(handle_unknown='ignore')),["weathersit"]),
(sin_transformer(12), ["month"]),
(cos_transformer(12), ["month"]),
(sin_transformer(7), ["weekday"]),
(cos_transformer(7), ["weekday"]),
(sin_transformer(24), ["hr"]),
(cos_transformer(24), ["hr"]),
remainder=MinMaxScaler(),
)
cyclic_transformer
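A quick check of the hourly encoding (a standalone sketch, not part of the pipeline): with a period of 24, hour 23 and hour 1 map to nearby points on the sine/cosine circle even though they are far apart as raw integers.
hours = pd.DataFrame({'hr': [0, 1, 6, 12, 18, 23]})
hours['hr_sin'] = sin_transformer(24).fit_transform(hours['hr'])
hours['hr_cos'] = cos_transformer(24).fit_transform(hours['hr'])
print(hours.round(3))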
Bonus points for:
- Plotting validation results to justify further choices (parameter ranges, other validations...).
- Following an incremental approach (baseline models first, then more complex models, then combining models...)
# creating the pipeline with transformer 1
ct = Pipeline([
('transformer', transformer),
('model',LinearRegression())
])
ct
from sklearn.model_selection import cross_val_score
#R2 - coefficient of determination, best would be 1
lr_score = cross_val_score(ct,X_train,y_train,scoring='r2',cv=ts_cv)
lr_score
lr_score.mean()
We now add a OneHotEncoder for the categorical variables; a tiny illustration of what it does follows.
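A sketch on toy values: each distinct category becomes its own 0/1 indicator column, so on the full data `hr` alone contributes 24 columns.
enc = OneHotEncoder(handle_unknown='ignore')
print(enc.fit_transform(pd.DataFrame({'hr': [0, 6, 23]})).toarray())  # 3 toy hours -> 3 indicator columns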
#create a pipeline with one_hot_transformer
ct_cat = Pipeline([
('transformer', one_hot_transformer),
('model',LinearRegression())
])
ct_cat
lr2_score = cross_val_score(ct_cat,X_train,y_train,scoring='r2',cv=ts_cv)
lr2_score
lr2_score.mean()
Next, we evaluate the periodic (cyclic) feature engineering strategy.
#create a pipeline with cyclic transformer
ct_cyc = Pipeline([
('transformer', cyclic_transformer),
('model',LinearRegression())
])
ct_cyc
lr3_score = cross_val_score(ct_cyc,X_train,y_train,scoring='r2',cv=ts_cv)
lr3_score
lr3_score.mean()
ct2 = Pipeline([
('transformer', transformer),
('model',RandomForestRegressor())
])
rf_score = cross_val_score(ct2,X_train,y_train,scoring='r2',cv=ts_cv)
rf_score
rf_score.mean()
ct2_cat = Pipeline([
('transformer', one_hot_transformer),
('model',RandomForestRegressor())
])
rf_score2 = cross_val_score(ct2_cat,X_train,y_train,scoring='r2',cv=ts_cv)
rf_score2
rf_score2.mean()
ct2_cyc = Pipeline([
('transformer', cyclic_transformer),
('model',RandomForestRegressor())
])
rf_score3 = cross_val_score(ct2_cyc,X_train,y_train,scoring='r2',cv=ts_cv)
rf_score3
rf_score3.mean()
ct3 = Pipeline([
('transformer', transformer),
('model',DecisionTreeRegressor())
])
dt_score = cross_val_score(ct3,X_train,y_train,scoring='r2',cv=ts_cv)
dt_score
dt_score.mean()
ct3_cat = Pipeline([
('transformer', one_hot_transformer),
('model',DecisionTreeRegressor())
])
dt_score2 = cross_val_score(ct3_cat,X_train,y_train,scoring='r2',cv=ts_cv)
dt_score2
dt_score2.mean()
ct3_cyc = Pipeline([
('transformer', cyclic_transformer),
('model',DecisionTreeRegressor())
])
dt_score3 = cross_val_score(ct3_cyc,X_train,y_train,scoring='r2',cv=ts_cv)
dt_score3
dt_score3.mean()
#create three different pipelines based on different transformers
clf = Pipeline([
('transformer', transformer),
('model',LinearRegression())
])
clf2 = Pipeline([
('transformer', one_hot_transformer),
('model',LinearRegression())
])
clf3 = Pipeline([
('transformer', cyclic_transformer),
('model',LinearRegression())
])
clf
param_grid = [
{
"transformer__knnimputer__n_neighbors": range(2, 6),
"transformer__simpleimputer-1__strategy": ["mean",'median'],
"transformer__simpleimputer-2__strategy": ["mean",'median'],
"model": [LinearRegression()],
"model__normalize": [False,True],
},
{
"transformer__knnimputer__n_neighbors": range(2, 6),
"transformer__simpleimputer-1__strategy": ["mean",'median'],
"transformer__simpleimputer-2__strategy": ["mean",'median'],
"model": [DecisionTreeRegressor()],
"model__splitter":['best','random'],
"model__max_depth": [1,5,9,12],
"model__min_samples_leaf":[1,3,6,7],
"model__min_weight_fraction_leaf":[0.1,0.3,0.4],
"model__max_features":["auto","sqrt",None],
"model__max_leaf_nodes":[None,10,30,60],
"model__random_state": [42],
},
{
"transformer__knnimputer__n_neighbors": range(2, 6),
"transformer__simpleimputer-1__strategy": ["mean",'median'],
"transformer__simpleimputer-2__strategy": ["mean",'median'],
"model": [RandomForestRegressor()],
"model__random_state": [42],
"model__max_depth": [3,6,7,10],
"model__n_estimators": [30, 40, 70, 100],
"model__max_features": ["auto","sqrt"],
"model__min_samples_leaf": [1, 2, 4],
}
]
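Before running the search, it can help to know how many candidate settings the grid contains (a small sketch; GridSearchCV will fit each candidate once per CV split):
from sklearn.model_selection import ParameterGrid
print(len(ParameterGrid(param_grid)))  # number of parameter combinations across the three sub-grids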
clf
from sklearn.model_selection import GridSearchCV
try:
    results = pd.read_csv("grid_search_result_1.csv")
except:
    gs1 = GridSearchCV(clf, param_grid, n_jobs=-1, cv=ts_cv, scoring="r2")
    gs1.fit(X_train, y_train)
    print(gs1.best_score_)
    results = pd.DataFrame(gs1.cv_results_).sort_values("rank_test_score")
    results.to_csv("grid_search_result_1.csv")
results.head()
### Plot best value for each model
results['model'] = np.where(results['param_model'].str.contains('LinearRegression'), 'LinearRegression',
(np.where(results['param_model'].str.contains('RandomForest'),'RandomForestRegressor','DecisionTreeRegressor')))
model_maximum = results.groupby(['model'])[['mean_test_score']].max().reset_index()
model_maximum
sns.barplot(x='model',y='mean_test_score',data=model_maximum)
plt.show()
linear_regression_params = results.loc[results['param_model'].str.contains('LinearRegression')]
linear_regression_params = linear_regression_params[[
'param_transformer__knnimputer__n_neighbors',
'param_model__normalize',
'param_transformer__simpleimputer-1__strategy',
'param_transformer__simpleimputer-2__strategy',
'mean_test_score'
]].rename(columns={
"param_transformer__knnimputer__n_neighbors": "n_neighbors",
'param_model__normalize':'normalize',
'param_transformer__simpleimputer-1__strategy':'simpleimputer-1',
'param_transformer__simpleimputer-2__strategy':'simpleimputer-2',
}).dropna()
linear_regression_params.head(3)
fig, ax = plt.subplots(nrows=1,ncols=3,figsize=(22,8))
hues = ['normalize','simpleimputer-1','simpleimputer-2']
for i, axes in enumerate(ax.ravel()):
    sns.scatterplot(x='n_neighbors', y='mean_test_score', hue=hues[i], data=linear_regression_params, ax=axes)
plt.show()
rf_regression_params = results.loc[results['param_model'].str.contains('RandomForest')]
print(len(rf_regression_params))
rf_regression_params = rf_regression_params[[
'param_transformer__knnimputer__n_neighbors',
'param_transformer__simpleimputer-1__strategy',
'param_transformer__simpleimputer-2__strategy',
'param_model__max_depth',
'param_model__min_samples_leaf',
'param_model__n_estimators',
'param_model__max_features',
'mean_test_score'
]].rename(columns={
"param_transformer__knnimputer__n_neighbors": "n_neighbors",
'param_transformer__simpleimputer-1__strategy':'simpleimputer-1',
'param_transformer__simpleimputer-2__strategy':'simpleimputer-2',
'param_model__max_depth':'max_depth',
'param_model__min_samples_leaf':'min_samples_leaf',
'param_model__n_estimators':'n_estimators',
'param_model__max_features':'max_features',
})
rf_regression_params.head(3)
rf_regression_params.shape
fig, ax = plt.subplots(nrows=1,ncols=3,figsize=(22,8))
hues = ['simpleimputer-1','simpleimputer-2','max_features']
for i, axes in enumerate(ax.ravel()):
    sns.scatterplot(x='max_depth', y='mean_test_score', hue=hues[i], data=rf_regression_params, ax=axes)
plt.show()
fig, ax = plt.subplots(nrows=1,ncols=3,figsize=(22,8))
hues = ['simpleimputer-1','simpleimputer-2','max_features']
for i, axes in enumerate(ax.ravel()):
    sns.scatterplot(x='n_estimators', y='mean_test_score', hue=hues[i], data=rf_regression_params, ax=axes)
plt.show()
clf2
try:
    results2 = pd.read_csv("grid_search_result_2.csv")
except:
    gs2 = GridSearchCV(clf2, param_grid, n_jobs=-1, cv=ts_cv, scoring="r2")
    gs2.fit(X_train, y_train)
    print(gs2.best_score_)
    results2 = pd.DataFrame(gs2.cv_results_).sort_values("rank_test_score")
    results2.to_csv("grid_search_result_2.csv")
results2.head()
### Plot best value for each model
results2['model'] = np.where(results2['param_model'].str.contains('LinearRegression'), 'LinearRegression',
(np.where(results2['param_model'].str.contains('RandomForest'),'RandomForestRegressor','DecisionTreeRegressor')))
model_maximum2 = results2.groupby(['model'])[['mean_test_score']].max().reset_index()
model_maximum2
sns.barplot(x='model',y='mean_test_score',data=model_maximum2)
plt.show()
linear_regression_params = results2.loc[results2['param_model'].str.contains('LinearRegression')]
linear_regression_params = linear_regression_params[[
'param_transformer__knnimputer__n_neighbors',
'param_model__normalize',
'param_transformer__simpleimputer-1__strategy',
'param_transformer__simpleimputer-2__strategy',
'mean_test_score'
]].rename(columns={
"param_transformer__knnimputer__n_neighbors": "n_neighbors",
'param_model__normalize':'normalize',
'param_transformer__simpleimputer-1__strategy':'simpleimputer-1',
'param_transformer__simpleimputer-2__strategy':'simpleimputer-2',
}).dropna()
linear_regression_params.head(3)
fig, ax = plt.subplots(nrows=1,ncols=3,figsize=(22,8))
hues = ['normalize','simpleimputer-1','simpleimputer-2']
for i, axes in enumerate(ax.ravel()):
    sns.scatterplot(x='n_neighbors', y='mean_test_score', hue=hues[i], data=linear_regression_params, ax=axes)
plt.show()
rf_regression_params = results2.loc[results2['param_model'].str.contains('RandomForest')]
rf_regression_params = rf_regression_params[[
'param_transformer__knnimputer__n_neighbors',
'param_transformer__simpleimputer-1__strategy',
'param_transformer__simpleimputer-2__strategy',
'param_model__max_depth',
'param_model__min_samples_leaf',
'param_model__n_estimators',
'param_model__max_features',
'mean_test_score'
]].rename(columns={
"param_transformer__knnimputer__n_neighbors": "n_neighbors",
'param_transformer__simpleimputer-1__strategy':'simpleimputer-1',
'param_transformer__simpleimputer-2__strategy':'simpleimputer-2',
'param_model__max_depth':'max_depth',
'param_model__min_samples_leaf':'min_samples_leaf',
'param_model__n_estimators':'n_estimators',
'param_model__max_features':'max_features',
})
rf_regression_params.head(3)
fig, ax = plt.subplots(nrows=1,ncols=3,figsize=(22,8))
hues = ['simpleimputer-1','simpleimputer-2','max_features']
for i, axes in enumerate(ax.ravel()):
    sns.scatterplot(x='max_depth', y='mean_test_score', hue=hues[i], data=rf_regression_params, ax=axes)
plt.show()
fig, ax = plt.subplots(nrows=1,ncols=3,figsize=(22,8))
hues = ['simpleimputer-1','simpleimputer-2','max_features']
for i, axes in enumerate(ax.ravel()):
    sns.scatterplot(x='n_estimators', y='mean_test_score', hue=hues[i], data=rf_regression_params, ax=axes)
plt.show()
clf3
try:
    results3 = pd.read_csv("grid_search_result_3.csv")
except:
    gs3 = GridSearchCV(clf3, param_grid, n_jobs=-1, cv=ts_cv, scoring="r2")
    gs3.fit(X_train, y_train)
    print(gs3.best_score_)
    results3 = pd.DataFrame(gs3.cv_results_).sort_values("rank_test_score")
    results3.to_csv("grid_search_result_3.csv")
results3.head()
### Plot best value for each model
results3['model'] = np.where(results3['param_model'].str.contains('LinearRegression'), 'LinearRegression',
(np.where(results3['param_model'].str.contains('RandomForest'),'RandomForestRegressor','DecisionTreeRegressor')))
model_maximum3 = results3.groupby(['model'])[['mean_test_score']].max().reset_index()
model_maximum3
sns.barplot(x='model',y='mean_test_score',data=model_maximum3)
plt.show()
# since our best score came from pipeline one, we take the best parameters from its results
final_params = results.loc[results['rank_test_score']==1]['params'].values
print(final_params[0])
#use the best transformer
final_transformer = make_column_transformer(
("drop", ["atemp",'dteday','casual','registered']),
(KNNImputer(n_neighbors=5), ["temp"]),
(SimpleImputer(strategy="median"), ["hum"]),
(SimpleImputer(strategy="median"), ["windspeed"]),
(make_pipeline(SimpleImputer(strategy='most_frequent'),OneHotEncoder(handle_unknown='ignore')),["weathersit"]),
remainder = MinMaxScaler()
)
#create a final model with the best parameters
final_model = Pipeline([
('transformer', final_transformer),
('model',RandomForestRegressor(max_depth=10, min_samples_leaf=2, n_estimators=70,random_state=42))
])
final_model.fit(X_train,y_train)
y_pred = final_model.predict(X_test)
### the r2 score
from sklearn.metrics import r2_score
print(f'The R2 score on the test data is {r2_score(y_test, y_pred)}')
#plot Actual vs Predicted
sns.set()
fig, ax = plt.subplots(figsize=(24, 8))
ax.plot(y_test.values, label="actual values")
ax.plot(y_pred, label="predicted values", alpha=0.7)
ax.set_title("Actual vs Prediction", fontsize=20)
ax.legend()
plt.show()
#Train Actual and Test Predicted
sns.set()
fig, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(x=X_train.dteday, y=y_train.values, label="Original",ax=ax , ci=None)
sns.lineplot(x=X_test.dteday,y=y_pred, label="predicted values", alpha=0.7,ax=ax,ci=None)
ax.set_title("Train vs Test Predicted", fontsize=20)
ax.legend()
plt.show()
# scatter plot
fig, ax = plt.subplots(figsize=(20, 8))
ax.scatter(y_test.values,y_test.index, color='r', label="Actual")
ax.scatter(y_pred,y_test.index, color='b', label="Predictions")
ax.set_xlabel('Count')
ax.set_ylabel('Instant')
ax.set_title('Prediction vs Actual Comparison')
plt.legend()
plt.show()