Can we predict the type of tree that would grow in a small section of forest based on elevation, hydrologic, soil, and solar data? The purpose of this study is to predict the dominant tree species in patches of forested landscape.
Our dataset contains 54 attributes/features for the prediction of our target variable, Cover_Type. Because the target variable has seven distinct classes, this is a multi-class classification problem.
The dataset contains 10 numerical variables, 4 binary variables (Wilderness_Area) and 40 further binary variables (Soil_Type), for a total of 54 attributes.
Variable Name | Data Type | Measurement | Description |
---|---|---|---|
Elevation | Numerical | Meters | Elevation in meters |
Aspect | Numerical | Azimuth | Aspect in degrees Azimuth |
Slope | Numerical | Degrees | Slope in degrees |
Horizontal_Distance_To_Hydrology | Numerical | Meters | Horz Dist to nearest surface water features |
Vertical_Distance_To_Hydrology | Numerical | Meters | Vert Dist to nearest surface water features |
Horizontal_Distance_To_Roadways | Numerical | Meters | Horz Dist to nearest roadway |
Hillshade_9am | Numerical | (0 to 255 index) | Hillshade index at 9am, summer solstice |
Hillshade_Noon | Numerical | (0 to 255 index) | Hillshade index at noon, summer solstice |
Hillshade_3pm | Numerical | (0 to 255 index) | Hillshade index at 3pm, summer solstice |
Horizontal_Distance_To_Fire_Points | Numerical | Meters | Horz Dist to nearest wildfire ignition points |
Wilderness_Area (x4) | Binary | 0 = absence or 1 = presence | Wilderness area designation |
Soil_Type (x40) | Binary | 0 = absence or 1 = presence | Soil Type designation |
Cover_Type | Integer | 1 to 7 | Forest Cover Type designation |
Soil_Type and Wilderness_Area are "one-hot encoded" attributes.
Column Name | Name of Wilderness Areas |
---|---|
Wilderness_Area1 | Rawah Wilderness |
Wilderness_Area2 | Neota Wilderness Area |
Wilderness_Area3 | Comanche Wilderness Area |
Wilderness_Area4 | Cache La Poudre Wilderness Area |
Cover_Type (7 types, integers 1 to 7):
Integer Value | Name of Cover Type |
---|---|
1 | Spruce/Fir |
2 | Lodgepole Pine |
3 | Ponderosa Pine |
4 | Cottonwood/Willow |
5 | Aspen |
6 | Douglas-fir |
7 | Krummholz |
# Importing visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px
# Importing basic libraries
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import itertools
from numpy import percentile
# Importing Machine Learning libraries
import sklearn
# Transformers
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer, OneHotEncoder,FunctionTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectFromModel
from category_encoders import TargetEncoder
# Model Evaluation
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, KFold,RandomizedSearchCV, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
# Pipelines
from sklearn.pipeline import Pipeline,make_pipeline, FeatureUnion
# from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer,make_column_selector
# Machine Learning Models
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.extmath import softmax
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
sns.set()
# Importing the data
df_train = pd.read_csv('data/train.csv')
# Setting the Id as index might help in fetching data easier
df_train.set_index('Id',inplace=True)
# Inspecting first rows
df_train.head()
Elevation | Aspect | Slope | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | ... | Soil_Type32 | Soil_Type33 | Soil_Type34 | Soil_Type35 | Soil_Type36 | Soil_Type37 | Soil_Type38 | Soil_Type39 | Soil_Type40 | Cover_Type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Id | |||||||||||||||||||||
1 | 2596 | 51 | 3 | 258 | 0 | 510 | 221 | 232 | 148 | 6279 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
2 | 2590 | 56 | 2 | 212 | -6 | 390 | 220 | 235 | 151 | 6225 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
3 | 2804 | 139 | 9 | 268 | 65 | 3180 | 234 | 238 | 135 | 6121 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
4 | 2785 | 155 | 18 | 242 | 118 | 3090 | 238 | 238 | 122 | 6211 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
5 | 2595 | 45 | 2 | 153 | -1 | 391 | 220 | 234 | 150 | 6172 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
5 rows × 55 columns
print(f'Train set has {df_train.shape[0]} observations and {df_train.shape[1]-1} features.')
Train set has 15120 observations and 54 features.
Our dataset contains 54 features for the prediction of our target variable, Cover_Type. Because the target variable has seven distinct classes, this is a multi-class classification problem.
It contains information on the:
- tree cover type (Cover_Type)
- shade at different times of day (Hillshade_9am, Hillshade_Noon, Hillshade_3pm)
- distance to nearby landmarks (Horizontal_Distance_To_Fire_Points, Horizontal_Distance_To_Roadways, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology)
- terrain (Elevation, Aspect, Slope)
of 4 different Wilderness Areas and 40 different Soil Types.
As a result, it has 10 numerical variables, four binary variables (Wilderness_Area), and forty binary variables (Soil_Type), a total of 54 attributes. Let's now visualize the dataset to see where to begin.
Slope is the steepness or degree of incline of a surface. Then, the slope for a particular location is computed as the maximum rate of change of elevation between that location and its surroundings. Slope can be expressed either in degrees or as a percentage.
Aspect is the orientation of slope, measured clockwise in degrees from 0 to 360, where 0 is north-facing, 90 is east-facing, 180 is south-facing, and 270 is west-facing.
The azimuth is the angular direction of the sun, measured from north in clockwise degrees from 0 to 360. An azimuth of 90 degrees is east. The default azimuth is 315 degrees (NW).
Hillshading computes surface illumination as values from 0 (no light) to 255 (maximum light), based on a given compass direction to the sun (azimuth) and a certain angle above the horizon (altitude).
Hillshade at time t varies as a factor of: $$\cos(\text{Slope})\cos(90^{\circ}-\text{Altitude})+\sin(\text{Slope})\sin(90^{\circ}-\text{Altitude})\cos(\text{Azimuth}-\text{Aspect})$$
where Altitude is the angle of the Sun relative to the horizon and Azimuth relates to the direction the Sun is facing: North, South, East, or West. Azimuth of 90 degrees corresponds to East.
This equation actually arises from a theorem in Spherical geometry known as “Spherical law of Cosines” relating the sides and angles of triangles constructed on spherical surfaces.
In a unit sphere, lengths a, b, c correspond to the angles subtended by those sides from the center of the sphere. If we know the two sides a, b and the angle C between them, then the cosine of c is given by: $$\cos c = \cos a \cos b + \sin a \sin b \cos C$$
The altitude is the slope or angle of the illumination source above the horizon. The units are in degrees, from 0 (on the horizon) to 90 (overhead). The default is 45 degrees.
In short, the illumination of the patch (hillshade) is related to the altitude of the sun, the slope of the terrain and the aspect. More details can be found in How Hillshade works. We might therefore have features (Aspect, Slope and the Hillshade indices) that provide similar information.
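To make this relationship concrete, here is a minimal sketch (our own illustration, not part of the dataset's pipeline) that evaluates the hillshade factor from the formula above for a given slope, aspect and sun position:
import numpy as np

def hillshade_factor(slope_deg, aspect_deg, altitude_deg=45, azimuth_deg=315):
    """Illumination factor from the formula above; angles in degrees,
    result scaled to the 0-255 hillshade index."""
    slope = np.radians(slope_deg)
    aspect = np.radians(aspect_deg)
    zenith = np.radians(90 - altitude_deg)       # 90 - Altitude, measured from the vertical
    azimuth = np.radians(azimuth_deg)
    factor = (np.cos(slope) * np.cos(zenith)
              + np.sin(slope) * np.sin(zenith) * np.cos(azimuth - aspect))
    return float(np.clip(255 * factor, 0, 255))  # 0 = no light, 255 = maximum light

# Example: a 15-degree slope facing south-east under the default sun (45 degrees altitude, NW azimuth)
print(hillshade_factor(slope_deg=15, aspect_deg=135))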
#profile = ProfileReport(df_train, title="Pandas Profiling Report", minimal=False)
#profile
# Create lists of columns for each feature group
features = [x for x in df_train.columns if x not in ['Id','Cover_Type']]
wilderness_cols = [x for x in features if x.startswith('Wild')]
soiltype_cols = [x for x in features if x.startswith('Soil')]
binary_cols = [x for x in features if x.startswith('Soil') or x in wilderness_cols]
numerical_cols = [x for x in features if x not in binary_cols]
# Check data type of columns
df_train.dtypes
Elevation int64 Aspect int64 Slope int64 Horizontal_Distance_To_Hydrology int64 Vertical_Distance_To_Hydrology int64 Horizontal_Distance_To_Roadways int64 Hillshade_9am int64 Hillshade_Noon int64 Hillshade_3pm int64 Horizontal_Distance_To_Fire_Points int64 Wilderness_Area1 int64 Wilderness_Area2 int64 Wilderness_Area3 int64 Wilderness_Area4 int64 Soil_Type1 int64 Soil_Type2 int64 Soil_Type3 int64 Soil_Type4 int64 Soil_Type5 int64 Soil_Type6 int64 Soil_Type7 int64 Soil_Type8 int64 Soil_Type9 int64 Soil_Type10 int64 Soil_Type11 int64 Soil_Type12 int64 Soil_Type13 int64 Soil_Type14 int64 Soil_Type15 int64 Soil_Type16 int64 Soil_Type17 int64 Soil_Type18 int64 Soil_Type19 int64 Soil_Type20 int64 Soil_Type21 int64 Soil_Type22 int64 Soil_Type23 int64 Soil_Type24 int64 Soil_Type25 int64 Soil_Type26 int64 Soil_Type27 int64 Soil_Type28 int64 Soil_Type29 int64 Soil_Type30 int64 Soil_Type31 int64 Soil_Type32 int64 Soil_Type33 int64 Soil_Type34 int64 Soil_Type35 int64 Soil_Type36 int64 Soil_Type37 int64 Soil_Type38 int64 Soil_Type39 int64 Soil_Type40 int64 Cover_Type int64 dtype: object
All the fields are of integer data type, so no conversions are really needed; however, we will create another data frame with Wilderness_Type and Soil_Type not one-hot encoded, for easier analysis and to test whether one-hot encoding gives the most accurate results.
val_df_train = df_train.filter(regex="Soil_Type", axis=1)
df_train1 = df_train.copy()
ohe_cols = ["Wilderness_Area", "Soil_Type"]
for idx, val in enumerate(ohe_cols):
# Filter data not on its contents but on its index labels
val_df_train = df_train.filter(regex=val, axis=1)
# Check if every row of the selected columns has one and only one column coded as 1
print(f"{val} only one and only column encoded as one? {((val_df_train.sum(axis=1)) == 1).all()}")
df_train1[val] = val_df_train.dot(val_df_train.columns)
Wilderness_Area only one and only column encoded as one? True Soil_Type only one and only column encoded as one? True
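The `val_df_train.dot(val_df_train.columns)` trick above works because each row contains exactly one 1: the dot product of the 0/1 values with the column names returns the name of that single column. A tiny hypothetical example:
import pandas as pd

demo = pd.DataFrame({'Wilderness_Area1': [1, 0, 0],
                     'Wilderness_Area2': [0, 0, 1],
                     'Wilderness_Area3': [0, 1, 0]})
# Each row has exactly one 1, so multiplying the 0/1 values by the column
# names and summing yields that column's name for every row.
print(demo.dot(demo.columns))
# 0    Wilderness_Area1
# 1    Wilderness_Area3
# 2    Wilderness_Area2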
df_train1 with the One-Hot-Encoded variables as categorical
# Function from GitHub
def split_numbers_chars(row):
'''This function fetches the numerical characters at the end of a string
and returns the alphabetical and numerical characters respectively'''
head = row.rstrip('0123456789')
tail = row[len(head):]
return head, tail
def reverse_one_hot_encode(dataframe, start_loc, end_loc, numeric_column_name):
''' this function takes the start and end location of the one-hot-encoded column set and numeric column name to be created as arguments
1) transforms one-hot-encoded columns into one column consisting of column names with string data type
2) splits string column into the alphabetical and numerical characters
3) fetches numerical character and creates numeric column in the given dataframe
'''
dataframe['String_Column'] = (dataframe.iloc[:, start_loc:end_loc] == 1).idxmax(1)
dataframe['Tuple_Column'] = dataframe['String_Column'].apply(split_numbers_chars)
dataframe[numeric_column_name] = dataframe['Tuple_Column'].apply(lambda x: x[1]).astype('int64')
dataframe.drop(columns=['String_Column','Tuple_Column'], inplace=True)
# Creating a copy of the training frame
df_train1 = df_train.copy()
# Reverse One-Hot Encode Categorical Variables
# (with Id as the index, Wilderness_Area1-4 occupy column positions 10-13 and Soil_Type1-40 positions 14-53)
reverse_one_hot_encode(df_train1, 10, 14, "Wilderness_Type")
reverse_one_hot_encode(df_train1, 14, 54, "Soil_Type")
# Drop Wilderness and soil_type cols
df_train1 = df_train1.drop(columns = wilderness_cols)
df_train1 = df_train1.drop(columns = soiltype_cols)
# Change to categorical after
df_train1['Soil_Type'] = df_train1.Soil_Type.astype('category')
df_train1['Wilderness_Type'] = df_train1.Wilderness_Type.astype('category')
There are no missing values in the dataset, hence all features could be used without any imputing strategy.
# Checking Missing Values
df_train[df_train.isnull().any(axis=1)]
Elevation | Aspect | Slope | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | ... | Soil_Type32 | Soil_Type33 | Soil_Type34 | Soil_Type35 | Soil_Type36 | Soil_Type37 | Soil_Type38 | Soil_Type39 | Soil_Type40 | Cover_Type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Id |
0 rows × 55 columns
# Count per Type of Forest Cover Type
df_train["Cover_Type"].value_counts()
5 2160 2 2160 1 2160 7 2160 3 2160 6 2160 4 2160 Name: Cover_Type, dtype: int64
Let’s look at the underlying data distribution of each column and see if they follow a normal distribution. Based on that, we will decide whether to identify outliers by z-score or by IQR.
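As a small illustration of the two rules (a sketch using the df_train frame loaded above), the z-score rule flags values more than k standard deviations from the mean and assumes an approximately normal distribution, while the IQR rule flags values outside [Q1 - k*IQR, Q3 + k*IQR] and makes no distributional assumption:
def count_outliers_zscore(series, k=3):
    # z-score rule: sensible only if the column is roughly normal
    z = (series - series.mean()) / series.std()
    return int((z.abs() > k).sum())

def count_outliers_iqr(series, k=3):
    # IQR rule: distribution-free, so it also works for skewed columns
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return int(((series < q1 - k * iqr) | (series > q3 + k * iqr)).sum())

col = 'Horizontal_Distance_To_Fire_Points'
print(count_outliers_zscore(df_train[col]), count_outliers_iqr(df_train[col]))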
# Plotting histograms for numerical features
numerical_features = df_train[numerical_cols]
palette = itertools.cycle(sns.color_palette('Set2'))
plt.figure(figsize = (10,15))
for i, col in enumerate(numerical_features):
plt.subplot(5,2,i+1)
sns.distplot(numerical_features[col],color=next(palette))
plt.title(f'{col} Distribution', fontsize = 15)
plt.tight_layout()
plt.show()
- Elevation and Hillshade_3pm are fairly symmetrical.
- Aspect seems bimodal.
- Horizontal_Distance_To_Fire_Points, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Hydrology, Horizontal_Distance_To_Roadways and Slope are right skewed, where the median is lower than the mean: the mass of the distribution is concentrated to the left, and most observations have low (often 0) values for these features.
- Hillshade_Noon and Hillshade_9am are left skewed, where the median is greater than the mean.
We will plot a box plot for easier analysis and use the descriptive statistics to dive deeper; a quick numeric skewness check is shown below.
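As a sketch (using the numerical_cols list defined earlier), pandas' skew() quantifies these impressions: positive values indicate right skew, negative values left skew, and values near zero a roughly symmetrical distribution.
# Skewness per numerical feature, largest (most right-skewed) first
print(df_train[numerical_cols].skew().sort_values(ascending=False).round(2))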
Maybe it makes more sense to plot the histograms per Cover Type
palette = itertools.cycle(sns.color_palette())
def plot_numerical_graph1(feature):
plt.subplots(2, 4, figsize=(20, 8))
for i, label in enumerate(range(1, 8), start = 1):  # all 7 cover types
plt.subplot(2, 4, i)
plt.gca().set_aspect('auto')
sns.histplot(df_train[df_train.Cover_Type == label][feature],color = next(palette), kde=True, stat = 'density', bins=50, label=f'cover type {label}')
plt.xlabel(feature, fontsize=12)
plt.legend()
plt.title(f'{feature} Distribution for each Cover Type');
plt.show()
plot_numerical_graph1('Elevation')
plt.show()
We can confirm that Elevation is fairly symmetrical. Within each cover type, the Hillshade features (Hillshade_9am, Hillshade_Noon, Hillshade_3pm) range from slightly right skewed to roughly normally distributed.
# Observe describe statistics of numerical values
df_train[numerical_cols].describe().T.drop(columns='count').sort_values('std', ascending=False).round(3)
mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|
Horizontal_Distance_To_Roadways | 1714.023 | 1325.066 | 0.0 | 764.0 | 1316.0 | 2270.00 | 6890.0 |
Horizontal_Distance_To_Fire_Points | 1511.147 | 1099.936 | 0.0 | 730.0 | 1256.0 | 1988.25 | 6993.0 |
Elevation | 2749.323 | 417.678 | 1863.0 | 2376.0 | 2752.0 | 3104.00 | 3849.0 |
Horizontal_Distance_To_Hydrology | 227.196 | 210.075 | 0.0 | 67.0 | 180.0 | 330.00 | 1343.0 |
Aspect | 156.677 | 110.086 | 0.0 | 65.0 | 126.0 | 261.00 | 360.0 |
Vertical_Distance_To_Hydrology | 51.077 | 61.239 | -146.0 | 5.0 | 32.0 | 79.00 | 554.0 |
Hillshade_3pm | 135.092 | 45.895 | 0.0 | 106.0 | 138.0 | 167.00 | 248.0 |
Hillshade_9am | 212.704 | 30.561 | 0.0 | 196.0 | 220.0 | 235.00 | 254.0 |
Hillshade_Noon | 218.966 | 22.802 | 99.0 | 207.0 | 223.0 | 235.00 | 254.0 |
Slope | 16.502 | 8.454 | 0.0 | 10.0 | 15.0 | 22.00 | 52.0 |
Standard deviation tells us how spread out the data is from the mean.
- Horizontal_Distance_To_Roadways is the most spread, followed by Horizontal_Distance_To_Fire_Points and Elevation. Slope is the most densely packed and closest to its mean, followed by the three Hillshade features.
- Vertical_Distance_To_Hydrology has some negative values. The definition is "vertical distance to nearest surface water features"; based on research and logic, negative values imply that the nearest surface water is below that data point or below sea level. We therefore do not treat these as errors and keep the negative values.
- The Hillshade attributes have a similar maximum value.
- Elevation, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways and Horizontal_Distance_To_Fire_Points are all measured in meters.
- Aspect and Slope are measured in degrees, so their maximum values make sense.
This will also give us an idea of outliers.
%matplotlib inline
plt.subplots(figsize = (21,14))
sns.boxplot(data = df_train[numerical_cols], orient='h', palette=sns.color_palette('hls'))
plt.title('Spread of data in Numerical Features', size = 20)
plt.xlabel('Value', size = 15)
plt.ylabel('Features', size = 15)
plt.xticks(size = 12)
plt.yticks(size = 12)
plt.show()
- Slope has the most squeezed box plot, although it has outliers.
- Aspect is the only attribute without outliers. It has a greater maximum value and therefore a much bigger range.
- The Hillshade variables have box plots similar to Slope, with many outliers and a small range.
- The Vertical_Distance_To_Hydrology plot is similar to Slope, except that its minimum value is negative, as we saw in the descriptive statistics.
- Elevation and Hillshade_Noon (excluding Vertical_Distance_To_Hydrology) are the only variables that don't have a minimum value of 0.
- Horizontal_Distance_To_Roadways is the most spread as it has the highest standard deviation, followed by Horizontal_Distance_To_Fire_Points.
We will check the count of 0's per column, as many of the numerical features seem to have a large number of 0's.
for col in numerical_cols:
print(f'Sum of 0 values in {col}:{(df_train[col] == 0).sum()}')
Sum of 0 values in Elevation:0 Sum of 0 values in Aspect:110 Sum of 0 values in Slope:5 Sum of 0 values in Horizontal_Distance_To_Hydrology:1590 Sum of 0 values in Vertical_Distance_To_Hydrology:1890 Sum of 0 values in Horizontal_Distance_To_Roadways:3 Sum of 0 values in Hillshade_9am:1 Sum of 0 values in Hillshade_Noon:0 Sum of 0 values in Hillshade_3pm:88 Sum of 0 values in Horizontal_Distance_To_Fire_Points:2
We find that Hillshade_3pm has many 0 values, and we suspect this is an error since 0 means no light.
We create a function that trains a random forest regressor and predicts the values of Hillshade_3pm where it is 0.
def split_missing(data):
feature_cols_missing= [col for col in data.columns if col not in ['Hillshade_3pm','Id']]
X_train=data[feature_cols_missing][data.Hillshade_3pm!=0]
y_train=data['Hillshade_3pm'][data.Hillshade_3pm!=0]
X_valid=data[feature_cols_missing][data.Hillshade_3pm==0]
return X_train,y_train,X_valid
def fill_missing(data):
x_train,y_train,x_valid=split_missing(data)
rfg=RandomForestRegressor(n_estimators=100)
rfg.fit(x_train,y_train)
data.loc[data.Hillshade_3pm==0, 'Hillshade_3pm'] = np.around(rfg.predict(x_valid))
return data
df_train = fill_missing(df_train)
Since not all the features follow a normal distribution, we will use the IQR to find outliers. We are using 3 times the IQR for each feature to identify the outliers.
# Finding outliers
def outlier_function(df_train, col_name):
''' This function detects the first and third quartile and the interquartile range for a given column of a dataframe,
then calculates upper and lower limits to determine outliers conservatively.
Returns the lower limit, the upper limit and the number of outliers respectively.
'''
first_quartile = np.percentile(np.array(df_train[col_name].tolist()), 25)
third_quartile = np.percentile(np.array(df_train[col_name].tolist()), 75)
IQR = third_quartile - first_quartile
upper_limit = third_quartile+(3*IQR)
lower_limit = first_quartile-(3*IQR)
outlier_count = 0
for value in df_train[col_name].tolist():
if (value < lower_limit) | (value > upper_limit):
outlier_count +=1
return lower_limit, upper_limit, outlier_count
# Finding outliers per columns
for col in numerical_cols:
if outlier_function(df_train, col)[2] > 0:
print("There are {} outliers in {}".format(outlier_function(df_train, col)[2], col))
There are 53 outliers in Horizontal_Distance_To_Hydrology There are 49 outliers in Vertical_Distance_To_Hydrology There are 3 outliers in Horizontal_Distance_To_Roadways There are 7 outliers in Hillshade_9am There are 20 outliers in Hillshade_Noon There are 132 outliers in Horizontal_Distance_To_Fire_Points
We see that there are 264 outliers, but we figure it makes more sense to look at outliers per Cover_Type, as the outliers could help us predict the tree class.
# Find outliers per cover type
from numpy import percentile
import itertools
def findCutOff(feature):
#print(feature.index[feature>20])
q25, q75 = percentile(feature, 25), percentile(feature, 75)
iqr = q75 - q25
# calculate the outlier cutoff
cut_off = iqr * 3
lower, upper = q25 - cut_off, q75 + cut_off
#return lower, upper
return feature.index[(feature<lower)|(feature>upper)].tolist()
columns = ['Slope','Vertical_Distance_To_Hydrology','Horizontal_Distance_To_Hydrology','Horizontal_Distance_To_Roadways',
'Horizontal_Distance_To_Fire_Points','Hillshade_9am','Hillshade_Noon','Hillshade_3pm']
all_outliers=[]
for column in columns:
#outliers
slope= df_train.groupby('Cover_Type')[column].agg(findCutOff)
merged_slope = list(itertools.chain.from_iterable(slope))
all_outliers.append(merged_slope)
all_outliers = list(itertools.chain.from_iterable(all_outliers))
print("Total number of outliers: ",len(all_outliers))
print("Total number of outliers by cover type: ")
unique_index=np.unique(all_outliers)
df_train.loc[unique_index,:].Cover_Type.value_counts()
Total number of outliers: 170 Total number of outliers by cover type:
5 82 2 41 1 26 7 9 6 1 Name: Cover_Type, dtype: int64
There are a total of 170 outliers, and most of them belong to Cover_Type 5, followed by Cover_Type 2 and Cover_Type 1. We initially removed the outliers before modelling under the assumption that their removal would improve performance, but the model performed just as well with the outliers included, so we keep them (the drop is left commented out below).
#df_train.drop(unique_index,axis='rows',inplace=True)
This will give us an idea of which variables are important to predict Cover Type.
cmap = sns.color_palette("Set2")
labels = 'Type1: Spruce/Fir', 'Type2: Lodgepole Pine', 'Type3: Ponderosa Pine', 'Type4: Cottonwood/Willow', 'Type 5: Aspen', 'Type 6: Douglas-fir', 'Type 7: Krummholz'
numerical_features['Cover_Type'] = df_train['Cover_Type']
plt.figure(figsize = (20,30))
for i, col in enumerate(numerical_cols):
plt.subplot(5,2,i+1)
ax = sns.boxplot(x =df_train['Cover_Type'], y = col, data = df_train, palette = cmap,width = 0.5)
ax.set_xticklabels(labels, rotation=30)
plt.title(f'Distribution of {col} with Cover Type', fontsize = 15)
plt.tight_layout()
plt.show()
Looking at the data distribution in the context of class separation, Elevation could be the most significant factor in determining cover type. Apart from Elevation, the rest of the features are distributed almost identically between Cover_Type 1 and Cover_Type 2.
classes = np.array(list(df_train.Cover_Type.values))
classes
array([5, 5, 2, ..., 3, 3, 3], dtype=int64)
Remember the hillshade formula above, which links illumination to Aspect; the scatter plots below colour each point by cover type.
classes = np.array(list(df_train.Cover_Type.values))
cmap = sns.color_palette("Set2")
def plotRelation(feat1, feat2,cmap ='plasma' ):
plt.scatter(feat1, feat2, c = classes, s=10,cmap=cmap)
plt.xlabel(feat1.name)
plt.ylabel(feat2.name)
ax = plt.figure(figsize = (20,4))
ax.add_subplot(131)
plotRelation( df_train.Aspect,df_train.Hillshade_9am)
ax.add_subplot(132)
plotRelation(df_train.Aspect,df_train.Hillshade_Noon)
ax.add_subplot(133)
plotRelation(df_train.Aspect,df_train.Hillshade_3pm)
The strongest light for hillshade is 255, and it varies directly with the Aspect direction. Perhaps these two variables can be combined into one using some equation.
Note: we still haven't imputed the 0 values in Hillshade_3pm. (SEE WHETHER TO APPLY A COLUMN TRANSFORMER OR NOT)
255 is the brightest light for hillshade and it is directly correlated to the Aspect direction. Maybe these two variables can be merged into one with some equation. There is 1 record of Hillshade_9am with a zero value and around 70 records of Hillshade_3pm with zero values; but as per the calculation of hillshade, 0 values are possible, hence we kept the values as is.
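To quantify this relationship before deciding whether to merge the variables, one simple check (our own sketch, not part of the original analysis) is to correlate the Hillshade indices with the sine and cosine of Aspect, since Aspect is circular (0 and 360 degrees are the same direction):
import numpy as np

aspect_rad = np.radians(df_train['Aspect'])
check = df_train[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']].copy()
check['Aspect_sin'] = np.sin(aspect_rad)   # east-west component of the aspect
check['Aspect_cos'] = np.cos(aspect_rad)   # north-south component of the aspect
# Correlation of each hillshade index with the two circular aspect components
print(check.corr().loc[['Aspect_sin', 'Aspect_cos'],
                       ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']].round(2))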
Remember that the hillshade formula also depends on Slope; below we plot Slope against each Hillshade index.
ax = plt.figure(figsize=(25,20))
ax.add_subplot(331)
plotRelation( df_train.Slope,df_train.Hillshade_9am,cmap='hsv')
ax.add_subplot(332)
plotRelation(df_train.Slope,df_train.Hillshade_Noon,cmap='hsv')
ax.add_subplot(333)
plotRelation(df_train.Slope,df_train.Hillshade_3pm,cmap='hsv')
Slope and the Hillshade features also appear to be correlated. We can also notice on the Hillshade_3pm graph that the 0's we haven't removed look erroneous.
Let's look at the distribution of the soil types. There are 40 soil types in the dataset.
soil_dummies = df_train.loc[:,df_train.columns.str.startswith('Soil_Type')]
#wild = wild_dummies.idxmax(axis=1)
soil = soil_dummies.idxmax(axis=1)
#wild.name = 'Wilderness'
soil.name = 'Soil'
df_train['Soil'] = soil
plt.figure(figsize=(20,8))
sns.countplot(x='Soil',data=df_train,palette=cmap);
plt.xticks(rotation=90);
plt.xlabel('Soil Data Types')
plt.ylabel('No. of observations')
plt.title('Count of each Soil Type')
Text(0.5, 1.0, 'Count of each Soil Type')
Soil_Type10 and Soil_Type29 are the most frequent soil types. Soil_Type15 and Soil_Type7 do not appear in our dataset at all. We create a function transformer for later use.
irrelevant_soil_types_remover FunctionTransformer
Creating this function transformer, we will be able to see in GridSearch whether it improves our results.
temp = df_train[soiltype_cols].sum().sort_values()
irrelevant_cols = pd.DataFrame(temp[temp < 100]).T.columns.tolist()
irrelevant_cols
['Soil_Type15', 'Soil_Type7', 'Soil_Type8', 'Soil_Type25', 'Soil_Type28', 'Soil_Type36', 'Soil_Type9', 'Soil_Type27', 'Soil_Type21', 'Soil_Type34', 'Soil_Type37', 'Soil_Type19', 'Soil_Type26', 'Soil_Type18']
# def remove_irrelevant_soil_types(df_train):
# df_train = df_train.drop(columns = ['Soil_Type15','Soil_Type7','Soil_Type8','Soil_Type25','Soil_Type28','Soil_Type36',
# 'Soil_Type9','Soil_Type27','Soil_Type21','Soil_Type34','Soil_Type37','Soil_Type19',
# 'Soil_Type26','Soil_Type18'])
# return df_train
#irrelevant_soil_types_remover = FunctionTransformer(remove_irrelevant_soil_types)
Soil types 7, 8 and 15 do not have any records, hence they can be deleted.
def remove_soil_type71518(df_train):
df_train.drop(['Soil_Type7','Soil_Type15','Soil_Type8'],axis='columns',inplace=True)
return df_train
#df_train.drop(['Soil_Type7','Soil_Type15','Soil_Type8'],axis='columns',inplace=True)
df_train.dtypes
Elevation int64 Aspect int64 Slope int64 Horizontal_Distance_To_Hydrology int64 Vertical_Distance_To_Hydrology int64 Horizontal_Distance_To_Roadways int64 Hillshade_9am int64 Hillshade_Noon int64 Hillshade_3pm float64 Horizontal_Distance_To_Fire_Points int64 Wilderness_Area1 int64 Wilderness_Area2 int64 Wilderness_Area3 int64 Wilderness_Area4 int64 Soil_Type1 int64 Soil_Type2 int64 Soil_Type3 int64 Soil_Type4 int64 Soil_Type5 int64 Soil_Type6 int64 Soil_Type7 int64 Soil_Type8 int64 Soil_Type9 int64 Soil_Type10 int64 Soil_Type11 int64 Soil_Type12 int64 Soil_Type13 int64 Soil_Type14 int64 Soil_Type15 int64 Soil_Type16 int64 Soil_Type17 int64 Soil_Type18 int64 Soil_Type19 int64 Soil_Type20 int64 Soil_Type21 int64 Soil_Type22 int64 Soil_Type23 int64 Soil_Type24 int64 Soil_Type25 int64 Soil_Type26 int64 Soil_Type27 int64 Soil_Type28 int64 Soil_Type29 int64 Soil_Type30 int64 Soil_Type31 int64 Soil_Type32 int64 Soil_Type33 int64 Soil_Type34 int64 Soil_Type35 int64 Soil_Type36 int64 Soil_Type37 int64 Soil_Type38 int64 Soil_Type39 int64 Soil_Type40 int64 Cover_Type int64 Soil object dtype: object
df_train.drop(['Soil_Type7','Soil_Type15','Soil_Type8','Slope'],axis='columns',inplace=True)
We do observe a strong correlation between hillshade and aspect as described in the section “Understanding the Feature Dependencies” above.
size = 9
corrmat = df_train.iloc[:,:size].corr()
f, ax = plt.subplots(figsize = (10,8))
sns.heatmap(corrmat,vmax=0.8,square=True, center=0, annot=True, annot_kws={"size": 15}, cbar_kws={"shrink": .8},
cmap=plt.cm.coolwarm);
We do observe some strong correlations among the numerical features, most notably between Aspect and the Hillshade indices and between the two distance-to-hydrology measures.
Our hypothesis is that the feature selection process later in this notebook will allow us to determine which of the highly correlated features should be dropped from our feature set, if any.
To compare ML models and create a baseline, the training features and labels are divided into training and validation sets.
Once the best hyperparameters are chosen, we will merge the training and validation sets to train the model on a bigger dataset, and then evaluate the model's final performance on the held-out Kaggle data, which has no labels, in order to have an unbiased evaluation of a final model fitted on the training dataset. This is the final Kaggle score we will have. The validation set is used to validate our models, i.e. to see whether they will work in production on unseen data.
We EVALUATE our models in either of these two ways:
1. With the training/validation split (cell below)
2. or with the `cross_val_score` function
The first approach is only used on the baseline models. This is because the splitting is done randomly, so the performance estimate can be either too optimistic or too pessimistic. An example of how the performance metric may change is in 6.1. Naive Bayes Classifier, where we compute both the cross_val_score and then use the train/validation split to see the classification report. As you can see (below), the solution to this is to split the data into many different splits, compute the performance of each and average them. This is cross-validation, and it is what the cross_val_score function does: we use the same model and the same dataset, only changing the random assignment of observations to the training and validation folds.
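As a sketch of this point (illustrative only; it builds its own simple pipeline on the columns available at this stage), reporting the individual fold scores rather than just their mean shows how much a single random split can move the metric:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Per-fold scores show the spread a single random split could land anywhere within.
X_demo = df_train.drop(columns=['Cover_Type', 'Soil'])   # numeric features only
y_demo = df_train['Cover_Type']
demo_pipe = Pipeline([('scaler', StandardScaler()),
                      ('lg', LogisticRegression(max_iter=1000))])
fold_scores = cross_val_score(demo_pipe, X_demo, y_demo, cv=5, scoring='accuracy')
print('Per-fold accuracy:', fold_scores.round(3))
print('Mean +/- std:', round(fold_scores.mean(), 3), '+/-', round(fold_scores.std(), 3))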
# Now split the data randomly into train and validation set
X = df_train.drop(['Cover_Type'], axis = 1)
y = df_train['Cover_Type']
X_train, X_val, y_train, y_val = train_test_split(
X, y, test_size=0.20, random_state=42,stratify=y)
print("The shape of validation data:{} and {} ".format(X_val.shape,y_val.shape))
print("The shape of training data:{} and {} ".format(X_train.shape,y_train.shape))
# Added the label temporarily for EDA
X_train['Cover_Type'] = y_train
The shape of validation data:(3024, 51) and (3024,) The shape of training data:(12096, 51) and (12096,)
X_train.drop(['Cover_Type','Soil'], axis = 1,inplace=True)
X_val.drop(['Soil'], axis = 1,inplace=True)
We run basic models as baseline models to explore in which direction we should refine our model.
We make a copy of the original training and validation sets.
X_train_orig=X_train.copy()
y_train_orig = y_train.copy()
X_val_orig = X_val.copy()
y_val_orig = y_val.copy()
We run three models from the original data provided:
from sklearn.naive_bayes import MultinomialNB
nb_pipeline = Pipeline(steps = [('rscale',RobustScaler()),
('minmax',MinMaxScaler()),
('nb',MultinomialNB())])
X = df_train.iloc[:,:50]
y = df_train.Cover_Type
print("Accuracy of the full dataset Naive Bayes classifier = {0:.4f}".format(np.mean(cross_val_score(nb_pipeline, X, y, scoring = 'accuracy'))))
Accuracy of the full dataset Naive Bayes classifier = 0.5713
nb_pipeline.fit(X_train_orig, y_train_orig)
accuracy_score(y_val_orig, nb_pipeline.predict(X_val_orig))
0.5846560846560847
As you can see, this time we got lucky with the splitting, leading to an inflated performance metric. Nevertheless, we use it to have a look at the classification report.
target_names = ['Cover Type 1', 'Cover Type 2', 'Cover Type 3','Cover Type 4','Cover Type 5','Cover Type 6','Cover Type 7']
labels=[1,2,3,4,5,6,7]
report= classification_report(y_val_orig, nb_pipeline.predict(X_val_orig),target_names=target_names,
labels=labels)
print(report)
precision recall f1-score support Cover Type 1 0.58 0.43 0.49 432 Cover Type 2 0.41 0.56 0.47 432 Cover Type 3 0.50 0.36 0.42 432 Cover Type 4 0.69 0.82 0.75 432 Cover Type 5 0.57 0.54 0.55 432 Cover Type 6 0.51 0.58 0.54 432 Cover Type 7 0.89 0.81 0.85 432 accuracy 0.58 3024 macro avg 0.59 0.58 0.58 3024 weighted avg 0.59 0.58 0.58 3024
cm = confusion_matrix(y_val_orig,nb_pipeline.predict(X_val_orig))
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm.diagonal()
array([0.42592593, 0.55787037, 0.3587963 , 0.81944444, 0.53703704, 0.58101852, 0.8125 ])
This array indicates the per-class recall (the diagonal of the row-normalised confusion matrix); it is lowest for Cover Type 3, followed by Cover Types 1, 5, 2 and 6.
We run the supervised, non-parametric classification algorithm KNN, trying different values of k. k is a hyperparameter that refers to the number of nearest neighbors included in the majority voting process.
KNN simply calculates the distance from a new data point to all training data points. The distance can be of any type, e.g. Euclidean or Manhattan. It is therefore important to SCALE the features, otherwise the algorithm is fooled by differences in scale. It then selects the K nearest data points, where K can be any integer, and finally assigns the new data point to the class to which the majority of those K points belong.
The main ideas (main hyperparameters) are how you define similarity and how many data points (k) you consider. The default distance metric in scikit-learn is minkowski, which with p=2 is equivalent to the standard Euclidean metric, the one we use.
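To make the scaling point concrete, here is a short sketch (our own illustration, using the X_train_orig/X_val_orig split from above) comparing KNN with and without standardization:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Same classifier, with and without feature scaling
raw_knn = KNeighborsClassifier().fit(X_train_orig, y_train_orig)
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier()).fit(X_train_orig, y_train_orig)

print('KNN without scaling:', round(accuracy_score(y_val_orig, raw_knn.predict(X_val_orig)), 4))
print('KNN with scaling   :', round(accuracy_score(y_val_orig, scaled_knn.predict(X_val_orig)), 4))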
from sklearn.pipeline import Pipeline
knn_pipeline = Pipeline(steps = [('scaler',StandardScaler()),
('knn',KNeighborsClassifier())
])
X = df_train.iloc[:,:50]
y = df_train.Cover_Type
print("Accuracy of the full dataset KNN classifier = {0:.4f}".format(np.mean(cross_val_score(knn_pipeline, X, y, scoring = 'accuracy'))))
# Run KNN with only the Original data provided
Accuracy of the full dataset KNN classifier = 0.6911
sklearn.set_config(display = 'diagram')
#Split arrays or matrices into train and valid subsets
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train_orig,y_train_orig)
print("Preliminary model score:")
print(knn.score(X_val_orig,y_val_orig))
no_neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(no_neighbors))
valid_accuracy = np.empty(len(no_neighbors))
for i, k in enumerate(no_neighbors):
# We instantiate the classifier
knn = KNeighborsClassifier(n_neighbors=k)
# Fit the classifier to the training data
knn.fit(X_train_orig,y_train_orig)
# Compute accuracy on the training set
train_accuracy[i] = knn.score(X_train_orig, y_train_orig)
# Compute accuracy on the validation set
valid_accuracy[i] = knn.score(X_val_orig, y_val_orig)
# Visualization of k values vs accuracy
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(no_neighbors, valid_accuracy, label = 'Validation Accuracy')
plt.plot(no_neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()
Preliminary model score: 0.8032407407407407
As you can see here, when k = 1 the training score is really high and the validation score really low. This is OVERFITTING: with k = 1 the model makes its decision based on a single data point, so it is very sensitive to noise in the training set.
As you increase k, the decision boundary becomes more stable, but if k is too large the algorithm considers too many data points and effectively predicts the average of the data, i.e. it does not learn anything (UNDERFITTING).
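Rather than reading k off the plot, we could let cross-validation pick it (a sketch; the parameter grid below is our own choice, not tuned):
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

knn_search = GridSearchCV(
    Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]),
    param_grid={'knn__n_neighbors': list(range(1, 10))},   # hypothetical search range
    cv=5, scoring='accuracy', n_jobs=-1)
knn_search.fit(X_train_orig, y_train_orig)
print('Best k:', knn_search.best_params_['knn__n_neighbors'],
      '| CV accuracy:', round(knn_search.best_score_, 4))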
from sklearn.pipeline import Pipeline
lg_clf = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs')
lg_pipeline = Pipeline(steps = [('scaler',StandardScaler()),
('lg',lg_clf)
])
X = df_train.iloc[:,:50]
y = df_train.Cover_Type
print("Accuracy of the full dataset Logistic Regression classifier = {0:.4f}".format(np.mean(cross_val_score(lg_pipeline, X, y, scoring = 'accuracy'))))
Accuracy of the full dataset Logistic Regression classifier = 0.6448
scores = cross_val_score(lg_pipeline, X, y)
lg_pipeline.fit(X_train_orig, y_train_orig)
print("Accuracy of our baseline logistic regression model is",accuracy_score(y_val_orig,lg_pipeline.predict(X_val_orig)))
Accuracy of our baseline logistic regression model is 0.691468253968254
Feature engineering is the process of selecting and transforming features when building a machine learning predictive model. It is an effective method for improving prediction models because it entails extracting crucial information, identifying patterns, and bringing in domain expertise for creating the new features. Feature engineering not only prepares the dataset for compatibility with the algorithm, it also increases the performance of the machine learning models.
Feature engineering basically means making features:
1) Informative, i.e. relevant to what you are trying to predict.
2) Discriminating, i.e. an ideal feature separates well the classes you want to predict.
3) Independent, i.e. we do not want correlated features in the model (the ideal scenario of no correlation is rarely achievable).
Good features play an incredible role in model evaluation. We created:
def construct_features(df):
df['Ele_minus_VDtHyd'] = (df['Elevation']-df['Vertical_Distance_To_Hydrology'])
df['Ele_plus_VDtHyd'] = (df['Elevation']+df['Vertical_Distance_To_Hydrology'])
df['Distance_to_Hydrolody'] = (df['Horizontal_Distance_To_Hydrology']**2+df['Vertical_Distance_To_Hydrology']**2)**0.5
df['Hydro_plus_Fire'] = df['Horizontal_Distance_To_Hydrology']+df['Horizontal_Distance_To_Fire_Points']
df['Hydro_minus_Fire'] = df['Horizontal_Distance_To_Hydrology']-df['Horizontal_Distance_To_Fire_Points']
df['Hydro_plus_Road'] = df['Horizontal_Distance_To_Hydrology']+df['Horizontal_Distance_To_Roadways']
df['Hydro_minus_Road'] = df['Horizontal_Distance_To_Hydrology']-df['Horizontal_Distance_To_Roadways']
df['Fire_plus_Road'] = df['Horizontal_Distance_To_Fire_Points']+df['Horizontal_Distance_To_Roadways']
df['Fire_minus_Road'] = df['Horizontal_Distance_To_Fire_Points']-df['Horizontal_Distance_To_Roadways']
return df
construct_features(X_train)
construct_features(X_val)
Elevation | Aspect | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | Wilderness_Area1 | ... | Soil_Type40 | Ele_minus_VDtHyd | Ele_plus_VDtHyd | Distance_to_Hydrolody | Hydro_plus_Fire | Hydro_minus_Fire | Hydro_plus_Road | Hydro_minus_Road | Fire_plus_Road | Fire_minus_Road | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Id | |||||||||||||||||||||
3552 | 2311 | 102 | 525 | 17 | 1392 | 252 | 199 | 59.0 | 947 | 0 | ... | 0 | 2294 | 2328 | 525.275166 | 1472 | -422 | 1917 | -867 | 2339 | -445 |
5929 | 2315 | 315 | 134 | 28 | 1500 | 170 | 223 | 192.0 | 660 | 0 | ... | 0 | 2287 | 2343 | 136.894120 | 794 | -526 | 1634 | -1366 | 2160 | -840 |
13058 | 2276 | 91 | 0 | 0 | 1072 | 243 | 212 | 93.0 | 1201 | 0 | ... | 0 | 2276 | 2276 | 0.000000 | 1201 | -1201 | 1072 | -1072 | 2273 | 129 |
2816 | 2797 | 174 | 272 | 91 | 600 | 226 | 245 | 138.0 | 577 | 1 | ... | 0 | 2706 | 2888 | 286.818758 | 849 | -305 | 872 | -328 | 1177 | -23 |
12193 | 2038 | 15 | 0 | 0 | 700 | 217 | 233 | 154.0 | 691 | 0 | ... | 0 | 2038 | 2038 | 0.000000 | 691 | -691 | 700 | -700 | 1391 | -9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9602 | 3497 | 306 | 324 | 37 | 5287 | 190 | 234 | 184.0 | 4290 | 0 | ... | 0 | 3460 | 3534 | 326.105811 | 4614 | -3966 | 5611 | -4963 | 9577 | -997 |
14279 | 2750 | 281 | 371 | 49 | 1381 | 206 | 240 | 173.0 | 1935 | 0 | ... | 0 | 2701 | 2799 | 374.221859 | 2306 | -1564 | 1752 | -1010 | 3316 | 554 |
13277 | 2341 | 88 | 0 | 0 | 853 | 245 | 203 | 79.0 | 1050 | 0 | ... | 0 | 2341 | 2341 | 0.000000 | 1050 | -1050 | 853 | -853 | 1903 | 197 |
4313 | 2342 | 342 | 134 | 40 | 1248 | 184 | 213 | 167.0 | 351 | 0 | ... | 0 | 2302 | 2382 | 139.842769 | 485 | -217 | 1382 | -1114 | 1599 | -897 |
7149 | 2611 | 123 | 216 | 50 | 1514 | 247 | 224 | 99.0 | 1015 | 0 | ... | 0 | 2561 | 2661 | 221.711524 | 1231 | -799 | 1730 | -1298 | 2529 | -499 |
3024 rows × 59 columns
X_train.drop(['Vertical_Distance_To_Hydrology'],axis='columns',inplace=True)
X_val.drop(['Vertical_Distance_To_Hydrology'],axis='columns',inplace=True)
print("Number of initial features ",X_train_orig.shape[1])
print("Number of features after feature engineering",X_train.shape[1])
Number of initial features 50 Number of features after feature engineering 58
all_data = pd.concat([X_train, X_val], ignore_index=True)
all_y = np.hstack((y_train,y_val))
all_data['Cover_Type'] = all_y
Feature selection can be an important part of the machine learning process as it has the ability to greatly improve the performance of our models. While it might seem intuitive to provide a model with all of the information you have, with the thinking that the more data you provide, the better it can learn and generalize, it's important to also recognize that the data necessary to train a model grows exponentially with the number of features used to train the model.
We will run L1 regularization, L2 regularization and Feature Importance with ExtraTreesClassifier first, fetch the important features and re-run the models again to evaluate performance.
The Extra Trees classifier is an ensemble model similar to random forest. Random forest is a bagging ensemble model where each tree considers a random subset of features (roughly the square root of the total number of features) at each split and is trained on bootstrap samples, i.e. samples drawn with replacement. The main difference of the Extremely Randomized Trees classifier is how it draws the samples: each decision tree is trained on the sample without replacement, so all the samples used are unique. The number of features considered per split remains the same (square root of the number of features for a classification task). In addition, instead of computing the locally optimal split for each candidate feature, a random split value is selected, leading to more diversified trees. Extremely Randomized Trees classifiers normally outperform Random Forest classifiers when there are noisy features; when all the features are relevant, they perform similarly.
ExtraTreesClassifier is a randomized decision tree classifier which samples a random subset of the feature-space when deciding where to make the next split. Extra trees seem to keep a higher performance in presence of noisy features.
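A quick way to see this difference in practice (a sketch; the settings below are our own and not tuned) is to cross-validate both ensembles on the same training data:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

for name, model in [('RandomForest', RandomForestClassifier(n_estimators=100, random_state=0)),
                    ('ExtraTrees', ExtraTreesClassifier(n_estimators=100, random_state=0))]:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    print(f'{name}: mean CV accuracy = {scores.mean():.4f} (+/- {scores.std():.4f})')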
def getImportance(classifier,cols,title):
""" Create a chart of feature importances given a tree classifier."""
importances = classifier.feature_importances_
indices=np.argsort(importances)[::-1][:30]
# Plot the feature importances of the forest
plt.figure(figsize=(50,20))
plt.title(title,fontsize=45)
plt.bar(range(30), importances[indices],
align="center",alpha=.5)
plt.xticks(range(30), cols[indices], rotation=45, rotation_mode="anchor", ha="right",fontsize=30)
plt.yticks(fontsize=30)
plt.xlim([-1, 30])
plt.show()
# Build a forest and compute the feature importances
n_estimators = list(range(50, 250,5))
criterion=['gini','entropy']
param_grid = dict(n_estimators=n_estimators, criterion=criterion)
forest = ExtraTreesClassifier(random_state=0)
grid_etc = RandomizedSearchCV(forest, param_grid, cv=5, scoring="accuracy" ,return_train_score=False)
grid_etc.fit(X_train, y_train)
print("The best score: ",grid_etc.best_score_.round(4))
#Parameter setting that gave the best results on the hold out data.
print("The best parameter: ",grid_etc.best_params_)
grid_etc.best_estimator_
The best score: 0.894 The best parameter: {'n_estimators': 160, 'criterion': 'gini'}
ExtraTreesClassifier(n_estimators=160, random_state=0)
#Find Feature importance
etc_final = ExtraTreesClassifier(n_estimators=grid_etc.best_params_['n_estimators'],
criterion=grid_etc.best_params_['criterion'],random_state=0)
etc_final.fit(X_train, y_train)
etc_acc = etc_final.score(X_val,y_val)
print("The accuracy score of the ExtraTreesClassifier: ",round(etc_acc,2))
importances = etc_final.feature_importances_
indices = np.argsort(importances)[::-1]
getImportance(etc_final,X_train.columns,"The most important features from ExtraTrees Classifier")
The accuracy score of the ExtraTreesClassifier: 0.9
Extract a smaller feature set: we will use a threshold of 0.01 on feature importance to keep only the important features.
select_model = SelectFromModel(etc_final, threshold=0.01, prefit=True)
X_train_selected = select_model.transform(X_train)
X_val_selected = select_model.transform(X_val)
print("The shape of the selected training dataset " ,X_train_selected.shape)
print("The shape of the selected validation dataset " ,X_val_selected.shape )
The shape of the selected training dataset (12096, 26) The shape of the selected validation dataset (3024, 26)
# Build a forest with the important features only
grid_etc_selected = RandomizedSearchCV(forest, param_grid, cv=5, scoring="accuracy" ,return_train_score=False)
grid_etc_selected.fit(X_train_selected, y_train)
print("The best score: ",grid_etc_selected.best_score_.round(4))
#Parameter setting that gave the best results on the hold out data.
print("The best parameter: ",grid_etc_selected.best_params_)
grid_etc_selected.best_estimator_
The best score: 0.8855 The best parameter: {'n_estimators': 135, 'criterion': 'entropy'}
ExtraTreesClassifier(criterion='entropy', n_estimators=135, random_state=0)
#Find Feature importance
etc_selected = ExtraTreesClassifier(n_estimators=grid_etc_selected.best_params_['n_estimators'],
criterion=grid_etc_selected.best_params_['criterion'],random_state=0)
etc_selected.fit(X_train_selected, y_train)
etc_sel_acc = etc_selected.score(X_val_selected,y_val)
print("The accuracy score of the ExtraTreesClassifier with the selected features: ",round(etc_sel_acc,2))
The accuracy score of the ExtraTreesClassifier with the selected features: 0.89
As seen above, the accuracy of the model did not decrease much after reducing the feature set to 26 features. That means the dropped features were not explaining much additional variance or influencing the outcome.
To avoid overfitting there are only two solutions: add more observations to the training set, or regularization. Since we have not been provided more data, we will apply two regularization techniques. Both share the same idea of adding constraints on the weights of the independent variables; they differ in how they constrain them. First, let's look at Ridge regression, also called Tikhonov regularization. It simply adds a regularization term to the cost function which keeps the model weights as small as possible; how small depends on the hyperparameter λ. When λ = 0 there is no regularization and the model will overfit; when λ is extremely large, the model focuses on the regularization error rather than the residuals. The only difference in Lasso regression is that it adds the L1 norm of the weight vector to the cost function, which allows Lasso to eliminate the least important features, i.e. it performs automatic feature selection.
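In symbols (our notation, with λ the regularization strength, w the weight vector and p the number of features), the two penalized objectives differ only in the norm of the penalty:
$$J_{\text{ridge}}(w) = \text{Loss}(w) + \lambda \sum_{j=1}^{p} w_j^{2}, \qquad J_{\text{lasso}}(w) = \text{Loss}(w) + \lambda \sum_{j=1}^{p} |w_j|$$
Because the absolute-value penalty has a constant gradient, it can push individual coefficients exactly to zero, which is why the L1 model below performs implicit feature selection; note that scikit-learn's LogisticRegression exposes the inverse strength C = 1/λ.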
We now fit a Logistic Regression with L1 regularization, discarding the zero-coefficient features. We then run an L2 Logistic Regression too.
def get_feature_importance(clf, feature_names):
"""
Function to print the most important features of a logreg classifier based on the coefficient values
"""
return pd.DataFrame({'variable': feature_names, # Feature names
'coefficient': clf.coef_[0] # Feature Coefficients
}) \
.round(decimals=2) \
.sort_values('coefficient', ascending=False) \
.style.bar(color=['red', 'green'], align='zero')
#Run Logistic Regression with engineered features provided & L1 Regularization
logreg_l1 = LogisticRegression(penalty = 'l1',solver='saga', multi_class='multinomial',max_iter=10000)
logreg_l1.fit(X_train, y_train)
y_pred_l1 = logreg_l1.predict(X_val)
print("Accuracy with L1 regularization and Engineered Data",accuracy_score(y_val,y_pred_l1))
model = SelectFromModel(logreg_l1, threshold=0.00, prefit=True)
X_train_l1_selected = model.transform(X_train)
X_val_l1_selected = model.transform(X_val)
print("The shape of the selected training dataset " ,X_train_l1_selected.shape)
print("The shape of the selected validation dataset " ,X_val_l1_selected.shape )
Accuracy with L1 regularization and Engineered Data 0.5393518518518519 The shape of the selected training dataset (12096, 58) The shape of the selected validation dataset (3024, 58)
get_feature_importance(logreg_l1, X_train.columns.get_level_values(0).tolist())
variable | coefficient | |
---|---|---|
49 | Ele_minus_VDtHyd | 0.010000 |
0 | Elevation | 0.000000 |
42 | Soil_Type34 | -0.000000 |
31 | Soil_Type23 | 0.000000 |
32 | Soil_Type24 | 0.000000 |
33 | Soil_Type25 | 0.000000 |
34 | Soil_Type26 | 0.000000 |
35 | Soil_Type27 | 0.000000 |
36 | Soil_Type28 | -0.000000 |
37 | Soil_Type29 | 0.000000 |
38 | Soil_Type30 | -0.000000 |
39 | Soil_Type31 | 0.000000 |
40 | Soil_Type32 | 0.000000 |
41 | Soil_Type33 | 0.000000 |
43 | Soil_Type35 | -0.000000 |
1 | Aspect | -0.000000 |
44 | Soil_Type36 | -0.000000 |
45 | Soil_Type37 | -0.000000 |
46 | Soil_Type38 | -0.000000 |
47 | Soil_Type39 | -0.000000 |
48 | Soil_Type40 | -0.000000 |
50 | Ele_plus_VDtHyd | -0.000000 |
52 | Hydro_plus_Fire | 0.000000 |
53 | Hydro_minus_Fire | 0.000000 |
54 | Hydro_plus_Road | 0.000000 |
55 | Hydro_minus_Road | 0.000000 |
56 | Fire_plus_Road | 0.000000 |
30 | Soil_Type22 | 0.000000 |
29 | Soil_Type21 | 0.000000 |
28 | Soil_Type20 | 0.000000 |
15 | Soil_Type4 | -0.000000 |
2 | Horizontal_Distance_To_Hydrology | 0.000000 |
3 | Horizontal_Distance_To_Roadways | 0.000000 |
7 | Horizontal_Distance_To_Fire_Points | 0.000000 |
8 | Wilderness_Area1 | 0.000000 |
9 | Wilderness_Area2 | 0.000000 |
10 | Wilderness_Area3 | -0.000000 |
11 | Wilderness_Area4 | -0.000000 |
12 | Soil_Type1 | -0.000000 |
13 | Soil_Type2 | -0.000000 |
27 | Soil_Type19 | 0.000000 |
14 | Soil_Type3 | -0.000000 |
16 | Soil_Type5 | -0.000000 |
17 | Soil_Type6 | -0.000000 |
18 | Soil_Type9 | 0.000000 |
19 | Soil_Type10 | -0.000000 |
20 | Soil_Type11 | -0.000000 |
21 | Soil_Type12 | -0.000000 |
22 | Soil_Type13 | -0.000000 |
23 | Soil_Type14 | 0.000000 |
24 | Soil_Type16 | 0.000000 |
25 | Soil_Type17 | -0.000000 |
26 | Soil_Type18 | -0.000000 |
57 | Fire_minus_Road | 0.000000 |
51 | Distance_to_Hydrolody | -0.010000 |
6 | Hillshade_3pm | -0.020000 |
5 | Hillshade_Noon | -0.030000 |
4 | Hillshade_9am | -0.050000 |
logreg_l2 = LogisticRegression(penalty = 'l2',solver='saga', multi_class='multinomial',max_iter=10000)
logreg_l2.fit(X_train, y_train)
y_pred_l2 = logreg_l2.predict(X_val)
print("Accuracy with L2 regularization and Engineered Data",accuracy_score(y_val,y_pred_l2))
model2 = SelectFromModel(logreg_l2, threshold=0.00, prefit=True)
X_train_l2_selected = model2.transform(X_train)
X_val_l2_selected = model2.transform(X_val)
print("The shape of the selected training dataset " ,X_train_l2_selected.shape)
print("The shape of the selected validation dataset " ,X_val_l2_selected.shape )
Accuracy with L2 regularization and Engineered Data 0.5393518518518519 The shape of the selected training dataset (12096, 58) The shape of the selected validation dataset (3024, 58)
get_feature_importance(logreg_l2, X_train.columns.get_level_values(0).tolist())
variable | coefficient | |
---|---|---|
49 | Ele_minus_VDtHyd | 0.010000 |
0 | Elevation | 0.000000 |
42 | Soil_Type34 | -0.000000 |
31 | Soil_Type23 | 0.000000 |
32 | Soil_Type24 | 0.000000 |
33 | Soil_Type25 | -0.000000 |
34 | Soil_Type26 | -0.000000 |
35 | Soil_Type27 | 0.000000 |
36 | Soil_Type28 | -0.000000 |
37 | Soil_Type29 | 0.000000 |
38 | Soil_Type30 | -0.000000 |
39 | Soil_Type31 | 0.000000 |
40 | Soil_Type32 | 0.000000 |
41 | Soil_Type33 | 0.000000 |
43 | Soil_Type35 | -0.000000 |
1 | Aspect | -0.000000 |
44 | Soil_Type36 | -0.000000 |
45 | Soil_Type37 | -0.000000 |
46 | Soil_Type38 | -0.000000 |
47 | Soil_Type39 | -0.000000 |
48 | Soil_Type40 | -0.000000 |
50 | Ele_plus_VDtHyd | -0.000000 |
52 | Hydro_plus_Fire | 0.000000 |
53 | Hydro_minus_Fire | 0.000000 |
54 | Hydro_plus_Road | 0.000000 |
55 | Hydro_minus_Road | 0.000000 |
56 | Fire_plus_Road | 0.000000 |
30 | Soil_Type22 | 0.000000 |
29 | Soil_Type21 | 0.000000 |
28 | Soil_Type20 | 0.000000 |
15 | Soil_Type4 | -0.000000 |
2 | Horizontal_Distance_To_Hydrology | 0.000000 |
3 | Horizontal_Distance_To_Roadways | 0.000000 |
7 | Horizontal_Distance_To_Fire_Points | 0.000000 |
8 | Wilderness_Area1 | 0.000000 |
9 | Wilderness_Area2 | 0.000000 |
10 | Wilderness_Area3 | -0.000000 |
11 | Wilderness_Area4 | -0.000000 |
12 | Soil_Type1 | -0.000000 |
13 | Soil_Type2 | -0.000000 |
27 | Soil_Type19 | 0.000000 |
14 | Soil_Type3 | -0.000000 |
16 | Soil_Type5 | -0.000000 |
17 | Soil_Type6 | -0.000000 |
18 | Soil_Type9 | 0.000000 |
19 | Soil_Type10 | -0.000000 |
20 | Soil_Type11 | -0.000000 |
21 | Soil_Type12 | -0.000000 |
22 | Soil_Type13 | -0.000000 |
23 | Soil_Type14 | -0.000000 |
24 | Soil_Type16 | 0.000000 |
25 | Soil_Type17 | -0.000000 |
26 | Soil_Type18 | -0.000000 |
57 | Fire_minus_Road | 0.000000 |
51 | Distance_to_Hydrolody | -0.010000 |
6 | Hillshade_3pm | -0.020000 |
5 | Hillshade_Noon | -0.030000 |
4 | Hillshade_9am | -0.050000 |
Performance is quite similar; however, we have achieved it with almost half the features of the original model. We use L1 regularization because it gives the same accuracy as L2 regularization while removing more features.
Principal Component Analysis (PCA) is a technique (not a learning algorithm) for reducing the dimensionality of a dataset. It finds a small number of representative variables (linear combinations of the predictors, known as principal components, PCs) that capture as much of the variance as possible. It combines correlated features into principal components, summarising many variables in a few dimensions.
# Run PCA to determine the number of principal components to use
from sklearn.decomposition import PCA
pca_mod = PCA()
pca_mod.fit(X_train)
num_features_selected = X_train.shape[1]
print("Number of features in combined, feature-reduced data sets = %d" % num_features_selected)
var_arr=np.zeros((num_features_selected))
cum_var_arr=np.zeros((num_features_selected))
cum_var=0.0
for n_comp in range(1,num_features_selected + 1):
cum_var = cum_var + pca_mod.explained_variance_ratio_[n_comp - 1]
var_arr[n_comp - 1] = round(pca_mod.explained_variance_ratio_[n_comp - 1],2)
cum_var_arr[n_comp - 1] = round(cum_var,3)
var_rslts = pd.DataFrame({'Principal Component':range(1,num_features_selected + 1),'Variance Explained'
:var_arr,'Cumulative Variance Explained':cum_var_arr})
Number of features in combined, feature-reduced data sets = 58
#Plot cumulative variance explained
plt.plot(var_rslts['Principal Component'],var_rslts['Cumulative Variance Explained'])
plt.xlabel("Principal Component")
plt.ylabel("Cumulative Variance Explained")
var_rslts.set_index('Principal Component',inplace=True)
var_rslts
Variance Explained | Cumulative Variance Explained | |
---|---|---|
Principal Component | ||
1 | 0.73 | 0.727 |
2 | 0.24 | 0.962 |
3 | 0.03 | 0.987 |
4 | 0.01 | 0.999 |
5 | 0.00 | 1.000 |
6 | 0.00 | 1.000 |
7 | 0.00 | 1.000 |
8 | 0.00 | 1.000 |
9 | 0.00 | 1.000 |
10 | 0.00 | 1.000 |
11 | 0.00 | 1.000 |
12 | 0.00 | 1.000 |
13 | 0.00 | 1.000 |
14 | 0.00 | 1.000 |
15 | 0.00 | 1.000 |
16 | 0.00 | 1.000 |
17 | 0.00 | 1.000 |
18 | 0.00 | 1.000 |
19 | 0.00 | 1.000 |
20 | 0.00 | 1.000 |
21 | 0.00 | 1.000 |
22 | 0.00 | 1.000 |
23 | 0.00 | 1.000 |
24 | 0.00 | 1.000 |
25 | 0.00 | 1.000 |
26 | 0.00 | 1.000 |
27 | 0.00 | 1.000 |
28 | 0.00 | 1.000 |
29 | 0.00 | 1.000 |
30 | 0.00 | 1.000 |
31 | 0.00 | 1.000 |
32 | 0.00 | 1.000 |
33 | 0.00 | 1.000 |
34 | 0.00 | 1.000 |
35 | 0.00 | 1.000 |
36 | 0.00 | 1.000 |
37 | 0.00 | 1.000 |
38 | 0.00 | 1.000 |
39 | 0.00 | 1.000 |
40 | 0.00 | 1.000 |
41 | 0.00 | 1.000 |
42 | 0.00 | 1.000 |
43 | 0.00 | 1.000 |
44 | 0.00 | 1.000 |
45 | 0.00 | 1.000 |
46 | 0.00 | 1.000 |
47 | 0.00 | 1.000 |
48 | 0.00 | 1.000 |
49 | 0.00 | 1.000 |
50 | 0.00 | 1.000 |
51 | 0.00 | 1.000 |
52 | 0.00 | 1.000 |
53 | 0.00 | 1.000 |
54 | 0.00 | 1.000 |
55 | 0.00 | 1.000 |
56 | 0.00 | 1.000 |
57 | 0.00 | 1.000 |
58 | 0.00 | 1.000 |
Looks like most of the variance (99.9%) is explained once we reach 5 principal components, so let's create training and validation data sets using the first 5 principal components.
pca_mod = PCA(n_components=5)
dimr_X_train_final_selected = pca_mod.fit_transform(X_train)
dimr_X_val_final_selected = pca_mod.transform(X_val)
print("Size of the dimensionality reduced training dataset", dimr_X_train_final_selected.shape)
print("Size of the dimensionality reduced validation dataset", dimr_X_val_final_selected.shape)
Size of the dimensionality reduced training dataset (12096, 5) Size of the dimensionality reduced validation dataset (3024, 5)
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=100,n_jobs=-1,random_state=0)
etc.fit(dimr_X_train_final_selected, y_train)
y_pred_etc = etc.predict(dimr_X_val_final_selected)
print("The accuracy score of the selected feature : ",accuracy_score(y_pred_etc,y_val))
The accuracy score of the selected feature : 0.8485449735449735
Even though PCA reduces the number of features to 5, our accuracy dropped by about 4%, so we won't be using PCA for our final model.
from sklearn.ensemble import GradientBoostingClassifier
# param_list = {'n_estimators':range(20,81,10),
# 'max_depth':range(5,16,2),
# 'min_samples_split':range(200,1001,200),
# 'min_samples_split':range(1000,2100,200),
# 'min_samples_leaf':range(30,71,10),
# 'max_features':range(7,20,2)}
param_list = {}
gbc = GradientBoostingClassifier(learning_rate=0.1, subsample=0.8, random_state=10)
gs = GridSearchCV(estimator = gbc,param_grid = param_list, scoring='accuracy',n_jobs=-1, cv=5)
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)
y_pred_gbc = gs.best_estimator_.predict(X_val)
print("The accuracy score of the selected feature : ",accuracy_score(y_pred_gbc,y_val))
{} 0.8163030621901679 The accuracy score of the selected feature : 0.8121693121693122
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=0)
n_estimators = [100,200, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]
# param_list = dict(n_estimators = n_estimators, max_depth = max_depth,
# min_samples_split = min_samples_split,
# min_samples_leaf = min_samples_leaf)
param_list={}
gs = GridSearchCV(rfc, param_grid = param_list, cv = 5, n_jobs = -1,scoring='accuracy')
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)
y_pred_rfc = gs.best_estimator_.predict(X_val)
print("The accuracy score of the selected feature : ",accuracy_score(y_pred_rfc,y_val))
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=0)
# param_grid = {
# 'n_estimators': [100,300,500,600,650,750],
# 'max_depth': [2,8,16,32,50],
# 'min_samples_split': [2,4,6],
# 'min_samples_leaf': [1,2,5,10],
# 'max_features':[0.2,0.25,0.3,0.35]
# }
param_grid={}
gs = GridSearchCV(etc, param_grid = param_grid, cv = 5, n_jobs = -1,scoring='accuracy')
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)
y_pred_etc = gs.best_estimator_.predict(X_val)
print("The accuracy score of the selected feature : ",accuracy_score(y_pred_etc,y_val))
{} 0.8924437732961165 The accuracy score of the selected feature : 0.8998015873015873
X_train_l1_selected.shape
(12096, 58)
all_x_train = pd.concat([X_train, X_val], ignore_index=True)
all_x_train.shape
all_y_train = np.hstack((y_train,y_val))
all_y_train.shape
from sklearn.model_selection import cross_val_score
def cv_score(clf,x_train,y_train):
score=cross_val_score(clf, x_train, y_train, scoring=None,
cv=10, n_jobs=1, verbose=0, fit_params=None,
pre_dispatch='2*n_jobs')
return score.mean()
clf=ExtraTreesClassifier(max_features= 0.35, n_estimators= 650)
score=cv_score(clf,all_x_train,all_y_train)
print(score)
# Read in the valid data
kaggle_df_train = pd.read_csv('data/valid.csv', engine='c')
kaggle_df_train.head()
kaggle_df_train[kaggle_df_train.isnull().any(axis=1)]
kaggle_df_train.set_index('Id',inplace=True)
# drop the soil types (and Slope) that were dropped from the training data
kaggle_df_train.drop(['Soil_Type7','Soil_Type15','Soil_Type8','Slope'],axis='columns',inplace=True)
### Repeat the feature engineering applied to the training data
kaggle_df_train['Ele_minus_VDtHyd'] = kaggle_df_train.Elevation-kaggle_df_train.Vertical_Distance_To_Hydrology
kaggle_df_train['Ele_plus_VDtHyd'] = kaggle_df_train.Elevation+kaggle_df_train.Vertical_Distance_To_Hydrology
kaggle_df_train['Distance_to_Hydrolody'] = (kaggle_df_train['Horizontal_Distance_To_Hydrology']**2+kaggle_df_train['Vertical_Distance_To_Hydrology']**2)**0.5  # column name must match the training frame
kaggle_df_train['Hydro_plus_Fire'] = kaggle_df_train['Horizontal_Distance_To_Hydrology']+kaggle_df_train['Horizontal_Distance_To_Fire_Points']
kaggle_df_train['Hydro_minus_Fire'] = kaggle_df_train['Horizontal_Distance_To_Hydrology']-kaggle_df_train['Horizontal_Distance_To_Fire_Points']
kaggle_df_train['Hydro_plus_Road'] = kaggle_df_train['Horizontal_Distance_To_Hydrology']+kaggle_df_train['Horizontal_Distance_To_Roadways']
kaggle_df_train['Hydro_minus_Road'] = kaggle_df_train['Horizontal_Distance_To_Hydrology']-kaggle_df_train['Horizontal_Distance_To_Roadways']
kaggle_df_train['Fire_plus_Road'] = kaggle_df_train['Horizontal_Distance_To_Fire_Points']+kaggle_df_train['Horizontal_Distance_To_Roadways']
kaggle_df_train['Fire_minus_Road'] = kaggle_df_train['Horizontal_Distance_To_Fire_Points']-kaggle_df_train['Horizontal_Distance_To_Roadways']
kaggle_df_train.drop(['Vertical_Distance_To_Hydrology'],axis='columns',inplace=True)
kaggle_df_train.shape
valid_ids = kaggle_df_train.index
def clf_predict(clf,X_train,y,X_valid):
y_pred=clf.fit(X_train,y).predict(X_valid)
return y_pred
def write_result(valid_ids,y_pred,filename):
with open(filename, "w") as outfile:
outfile.write("Id,Cover_Type\n")
for e, val in enumerate(list(y_pred)):
outfile.write("%s,%s\n"%(valid_ids[e],val))
all_x_train
pd.Series(all_y_train)
clf=ExtraTreesClassifier(max_features= 0.35, n_estimators= 650)
y_pred=clf_predict(clf,all_x_train,all_y_train,kaggle_df_train)
write_result(valid_ids,y_pred,'submission7.csv')
df_train_sub = pd.read_csv("submission7.csv")
df_train_sub.head()
df_train_sub['Cover_Type'].value_counts()