Kaggle Competition: Leaf Classification

Posted on Fri 02 December 2016 in Kaggle

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [3]:
def encode(train, test):
    le = LabelEncoder().fit(train.species)
    labels = le.transform(train.species)    # encode species strings
    classes = list(le.classes_)             # save column names for submission
    test_ids = test.id                      # save test ids for submission
    
    train = train.drop(['species', 'id'], axis=1)
    test = test.drop(['id'], axis=1)
    
    return train, labels, test, test_ids, classes
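In case the LabelEncoder step is unfamiliar: it maps each species string to an integer class index, and le.classes_ holds the sorted class names that later become the submission's probability columns. A minimal sketch with two made-up species names (hypothetical values, not rows from the dataset):

# Sketch: how LabelEncoder turns strings into class indices
demo = ['Acer_Opalus', 'Quercus_Rubra', 'Acer_Opalus']  # hypothetical species
le_demo = LabelEncoder().fit(demo)
print(list(le_demo.classes_))   # ['Acer_Opalus', 'Quercus_Rubra'] (sorted, unique)
print(le_demo.transform(demo))  # [0 1 0]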
In [4]:
train, labels, test, test_ids, classes = encode(train, test)
train.head(1)
Out[4]:
margin1 margin2 margin3 margin4 margin5 margin6 margin7 margin8 margin9 margin10 ... texture55 texture56 texture57 texture58 texture59 texture60 texture61 texture62 texture63 texture64
0 0.007812 0.023438 0.023438 0.003906 0.011719 0.009766 0.027344 0.0 0.001953 0.033203 ... 0.007812 0.0 0.00293 0.00293 0.035156 0.0 0.0 0.004883 0.0 0.025391

1 rows × 192 columns

In [5]:
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)

# Note: each iteration overwrites the previous one, so only the last
# of the 10 stratified splits is kept for the comparison below.
for train_index, test_index in sss.split(train, labels):
    X_train, X_test = train.values[train_index], train.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
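Since only the final split survives the loop, the scores below reflect a single 80/20 hold-out. As a sketch (not part of the original run), one could instead average log loss over all ten splits for a more stable estimate, reusing the same sss, train, and labels:

# Sketch: average one classifier's log loss across all 10 splits
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import log_loss

scores = []
for tr, te in sss.split(train, labels):
    clf = LinearDiscriminantAnalysis()
    clf.fit(train.values[tr], labels[tr])
    scores.append(log_loss(labels[te], clf.predict_proba(train.values[te])))
print(np.mean(scores))  # mean hold-out log loss over the 10 splits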
In [6]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
]

# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("=" * 30)
    print(name)
    
    print("**** Results ****")
    predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    probabilities = clf.predict_proba(X_test)
    ll = log_loss(y_test, probabilities)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc * 100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("=" * 30)
==============================
KNeighborsClassifier
**** Results ****
Accuracy: 88.8889%
Log Loss: 1.5755075129933762
==============================
SVC
**** Results ****
Accuracy: 81.8182%
Log Loss: 4.595257536682281
==============================
NuSVC
**** Results ****
Accuracy: 88.3838%
Log Loss: 2.486174622894385
==============================
DecisionTreeClassifier
**** Results ****
Accuracy: 66.1616%
Log Loss: 11.687363729591087
==============================
RandomForestClassifier
**** Results ****
Accuracy: 87.8788%
Log Loss: 1.1360000913463715
==============================
AdaBoostClassifier
**** Results ****
Accuracy: 4.5455%
Log Loss: 4.207215776494153
==============================
GradientBoostingClassifier
**** Results ****
Accuracy: 58.5859%
Log Loss: 2.7505649196010884
==============================
GaussianNB
**** Results ****
Accuracy: 57.0707%
Log Loss: 14.827252492813216
==============================
LinearDiscriminantAnalysis
**** Results ****
Accuracy: 97.9798%
Log Loss: 0.930197776313929
==============================
QuadraticDiscriminantAnalysis
**** Results ****
Accuracy: 2.0202%
Log Loss: 33.84102333642773
==============================
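The competition is scored on multi-class log loss rather than accuracy, which is why LinearDiscriminantAnalysis, leading on both metrics, is the clear pick. Log loss is the mean of -log(p) where p is the probability the model assigned to the true class, so a confident wrong answer costs far more than an unsure one. A small illustration with toy probabilities:

# Sketch: log loss rewards well-calibrated probabilities
from sklearn.metrics import log_loss

y_true = [0, 1]
confident = [[0.99, 0.01], [0.01, 0.99]]  # right and sure  -> low loss
hedged    = [[0.60, 0.40], [0.40, 0.60]]  # right but unsure -> moderate loss
wrong     = [[0.01, 0.99], [0.99, 0.01]]  # wrong and sure  -> huge loss
for name, p in [('confident', confident), ('hedged', hedged), ('wrong', wrong)]:
    print(name, log_loss(y_true, p))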
In [7]:
sns.set_color_codes("muted")
sns.barplot(x="Accuracy", y="Classifier", data=log, color="b")

plt.xlabel('Accuracy %')
plt.title('Classifier Accuracy')
plt.show()
In [8]:
sns.set_color_codes("muted")
sns.barplot(x='Log Loss', y='Classifier', data=log, color='g')

plt.xlabel('Log Loss')
plt.title('Classifier Log Loss')
plt.show()
In [9]:
favorite_clf = LinearDiscriminantAnalysis()
favorite_clf.fit(X_train, y_train)
test_predictions = favorite_clf.predict_proba(test)
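The defaults (solver='svd') already top the comparison, but a natural next step, sketched here as a hypothetical follow-up rather than part of the submission, would be to grid-search LDA's shrinkage-capable solvers on the earlier X_train/y_train split before refitting on the full training set:

# Sketch: tune LDA's solver/shrinkage with a grid search (hypothetical)
from sklearn.model_selection import GridSearchCV

param_grid = {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.1, 0.5]}
grid = GridSearchCV(LinearDiscriminantAnalysis(), param_grid,
                    scoring='neg_log_loss', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)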
In [10]:
submission = pd.DataFrame(test_predictions, columns=classes)
submission.insert(0, 'id', test_ids)

submission.to_csv('submission.csv', index=False)
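Before uploading, it's worth a quick sanity check that the file matches the expected format: an id column, one probability column per class, and rows that sum to 1. A minimal sketch:

# Sketch: sanity-check the submission file before uploading
check = pd.read_csv('submission.csv')
assert list(check.columns) == ['id'] + classes
assert np.allclose(check[classes].sum(axis=1), 1.0)
print(check.shape)  # expect (number of test rows, 1 + number of classes)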