Before we start the lesson, please download the datasets.
Problem Statement:
The objective is to build a predictive model that can distinguish between the main product categories. Many identical products get classified differently, so the quality of the product analysis depends heavily on the ability to accurately group similar products.
Importing the data
import pandas as pd

train = pd.read_csv("C:/Users/Personal/Google Drive/train_ecom.csv")
train.shape

test = pd.read_csv("C:/Users/Personal/Google Drive/test_ecom.csv")
test.shape
fullData = pd.concat([train, test], axis=0)  # Combine the train and test data sets
fullData.shape
fullData.columns # This will show all the column names
fullData.head(10) # Show first 10 records of dataframe
fullData.describe()  # Summary of the numerical fields using the describe() function
fullData.columns.values
# Check for missing values
fullData.isnull().sum()
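If any columns did report missing values, they would need to be handled before modeling. A minimal sketch (assuming the feature columns are numeric) is to impute each column with its median:

# Impute missing numeric values with the column median (only needed if any were found)
num_cols = fullData.select_dtypes(include='number').columns
fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].median())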
# Since sklearn requires all inputs to be numeric, we convert our categorical variable into numeric form by encoding the categories:
from sklearn.preprocessing import LabelEncoder

var_mod = ['Category']
le = LabelEncoder()
for i in var_mod:
    fullData[i] = le.fit_transform(fullData[i])
fullData.dtypes
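LabelEncoder assigns an arbitrary integer to each category; keeping the fitted encoder around lets us decode predictions back into the original category names later:

# Mapping from encoded integers back to the original category names
print(dict(enumerate(le.classes_)))
# le.inverse_transform can decode encoded values, e.g. le.inverse_transform([0, 1])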
Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer sklearn versions

features = list(fullData.columns[1:101])
X1 = fullData[features]
y1 = fullData['Category']
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, y1, train_size=0.8, random_state=90)
X1_train.shape, Y1_train.shape, X1_test.shape, Y1_test.shape
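For a classification target it is often safer to stratify the split so every category keeps the same proportion in the train and test sets. An optional variant of the split above (not used in the rest of this lesson):

# Optional: a stratified 80/20 split, preserving class proportions
Xs_train, Xs_test, Ys_train, Ys_test = train_test_split(
    X1, y1, train_size=0.8, random_state=90, stratify=y1)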
Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X1_train,Y1_train)
#Naive Bayes classifier on training data
predict1 = model.predict(X1_train)
predict1
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(Y1_train, predict1)
print(cm1)
#Accuracy on training data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_train,predict1)
#Naive Bayes classifier on test data
predict = model.predict(X1_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y1_test, predict)
print(cm)
from sklearn import metrics
print(metrics.classification_report(Y1_test, predict))
print(metrics.confusion_matrix(Y1_test, predict))
print(metrics.accuracy_score(Y1_test, predict))
#Accuracy on test data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_test,predict)
Decision Tree
Decision Trees (DTs) are a non-parametric supervised learning method used for classification. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X1_train, Y1_train)
clf
# Save the tree as a dot file
with open("tree1.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)
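The dot file has to be converted to a PNG before it can be displayed below. One way, assuming Graphviz is installed and its dot executable is on the PATH, is to call it from Python:

import subprocess

# Convert the saved dot file to a PNG with the Graphviz command-line tool
# (assumes Graphviz is installed and `dot` is on the PATH)
subprocess.check_call(["dot", "-Tpng", "tree1.dot", "-o", "tree1.png"])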
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
Image("tree1.png")
# Decision tree on training data
predict = clf.predict(X1_train)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y1_train, predict)
print(cm)
#Accuracy on training data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_train,predict)
# Decision tree on test data
predict_d = clf.predict(X1_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y1_test, predict_d)
print(cm)
#Accuracy on test data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_test,predict_d)
# The model performs much better on the training set than on the test set (overfitting).
# Let's prune the tree by limiting its depth to simplify the model.
tree1 = tree.DecisionTreeClassifier(criterion='gini', max_depth=16,random_state=90)
tree1.fit(X1_train,Y1_train)
# Decision tree on training data
predict_1 = tree1.predict(X1_train)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y1_train, predict_1)
print(cm)
#Accuracy on training data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_train,predict_1)
# Decision tree on test data
predict_2 = tree1.predict(X1_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y1_test, predict_2)
print(cm)
#Accuracy on test data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_test,predict_2)
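To see how the depth limit trades training accuracy against test accuracy, one can sweep a few candidate depths. A quick sketch (the depth grid here is an arbitrary choice):

from sklearn import tree
from sklearn.metrics import accuracy_score

# Compare train/test accuracy across a few depth limits to pick a reasonable cut-off
for depth in [4, 8, 12, 16, 20, None]:
    t = tree.DecisionTreeClassifier(criterion='gini', max_depth=depth, random_state=90)
    t.fit(X1_train, Y1_train)
    print(depth,
          accuracy_score(Y1_train, t.predict(X1_train)),
          accuracy_score(Y1_test, t.predict(X1_test)))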
Random Forest
The sklearn.ensemble module includes two averaging algorithms based on randomized decision trees: the RandomForest algorithm and the Extra-Trees method. Both algorithms are perturb-and-combine techniques [B1998] specifically designed for trees. This means a diverse set of classifiers is created by introducing randomness in the classifier construction. The prediction of the ensemble is given as the averaged prediction of the individual classifiers.
from sklearn.ensemble import RandomForestClassifier

# max_features='sqrt' replaces the deprecated 'auto' (equivalent for classifiers)
forest = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=16, min_samples_split=4,
                                min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt',
                                max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=90,
                                verbose=0, warm_start=False, class_weight=None)
forest.fit(X1_train,Y1_train)
#RandomForest on training data
Predicted=forest.predict(X1_train)
from sklearn.metrics import confusion_matrix as cm
ConfusionMatrix = cm(Y1_train,Predicted)
print(ConfusionMatrix)
#Accuracy on training data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_train,Predicted)
#RandomForest on test data
Predicted1=forest.predict(X1_test)
from sklearn.metrics import confusion_matrix as cm
Confusion_Matrix = cm(Y1_test,Predicted1)
print(Confusion_Matrix)
#Accuracy on test data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_test,Predicted1)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

importances = pd.DataFrame({'feature': X1_test.columns,
                            'importance': np.round(forest.feature_importances_, 3)})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
print(importances)
importances.plot(kind='bar', figsize=(16, 6))  # a readable figure size; figsize is specified in inches
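With 100 features the full bar chart is crowded. One option is to plot only the most important features (a sketch; the cut-off of 20 is an arbitrary choice):

# Keep only the 20 most important features (cut-off is arbitrary)
top_features = importances.head(20)
top_features.plot(kind='bar', figsize=(10, 4), legend=False)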
Extremely randomized trees classifier
In extremely randomized trees, randomness goes one step further in the way splits are computed. As in random forests, a random subset of candidate features is used, but instead of looking for the most discriminative thresholds, thresholds are drawn at random for each candidate feature and the best of these randomly generated thresholds is picked as the splitting rule. This usually makes it possible to reduce the variance of the model a bit more, at the expense of a slightly greater increase in bias.
#Extremely randomized trees classifier
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=10,random_state=90,max_depth=22)
clf
clf.fit(X1_train,Y1_train)
#Extremely randomized trees classifier on training data
Predicted2=clf.predict(X1_train)
from sklearn.metrics import confusion_matrix as cm
ConfusionMatrix = cm(Y1_train,Predicted2)
print(ConfusionMatrix)
#Accuracy on training data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_train,Predicted2)
#Extremely randomized trees classifier on test data
Predicted3=clf.predict(X1_test)
from sklearn.metrics import confusion_matrix as cm
Confusion_Matrix = cm(Y1_test,Predicted3)
print(Confusion_Matrix)
#Accuracy on test data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_test,Predicted3)
GradientBoostingClassifier
Gradient Tree Boosting or Gradient Boosted Regression Trees (GBRT) is a generalization of boosting to arbitrary differentiable loss functions. GBRT is an accurate and effective off-the-shelf procedure that can be used for both regression and classification problems.
from sklearn.ensemble import GradientBoostingClassifier
clf_b = GradientBoostingClassifier(n_estimators=10, learning_rate=0.125,random_state=90, max_depth=4)
clf_b
clf_b.fit(X1_train, Y1_train)
#GradientBoostingClassifier on training data
Predicted4=clf_b.predict(X1_train)
from sklearn.metrics import confusion_matrix as cm
ConfusionMatrix = cm(Y1_train,Predicted4)
print(ConfusionMatrix)
#Accuracy on training data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_train,Predicted4)
#GradientBoostingClassifier on test data
Predicted5=clf_b.predict(X1_test)
from sklearn.metrics import confusion_matrix as cm
Confusion_Matrix = cm(Y1_test,Predicted5)
print(Confusion_Matrix)
#Accuracy on test data
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y1_test,Predicted5)
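Because boosting builds the ensemble stage by stage, staged_predict can show how test accuracy evolves as each tree is added. A short sketch using the fitted clf_b from above:

from sklearn.metrics import accuracy_score

# Test accuracy after each of the 10 boosting stages
for i, stage_pred in enumerate(clf_b.staged_predict(X1_test), start=1):
    print(i, accuracy_score(Y1_test, stage_pred))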
K-Fold Cross Validation (using Random Forest)
from sklearn.model_selection import train_test_split, cross_val_score  # replaces the removed sklearn.cross_validation module

features = list(fullData.columns[1:101])
X1 = fullData[features]
y1 = fullData['Category']
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, y1, train_size=0.8, random_state=90)
X1_train.shape, Y1_train.shape, X1_test.shape, Y1_test.shape

# 10-fold cross validation with the random forest model
score = cross_val_score(forest, X1, y1, cv=10)
score
print(forest.score(X1_test, Y1_test))
print(score)
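A common way to summarize the fold scores is by their mean and standard deviation:

# Summarize the 10 fold scores
print("CV accuracy: %0.3f (+/- %0.3f)" % (score.mean(), score.std()))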
Conclusion
In this project I used five classification models.
Naive Bayes classifier: accuracy is 61% on the training data and 61% on the test data.
Decision Tree: accuracy is 76% on the training data and 70% on the test data.
Random Forest: accuracy is 80% on the training data and 75% on the test data.
Extremely randomized trees classifier: accuracy is 77% on the training data and 70% on the test data.
Gradient Boosting Machine: accuracy is 75% on the training data and 74% on the test data.
K-Fold cross validation accuracy is 75%.
The RandomForestClassifier does the best job here, achieving the highest test accuracy.