Before starting the lesson, please download the dataset.
Abstract: The main objective is to predict the `result` of each website, i.e. whether it is a legitimate site or a phishing site.
In [6]:
import pandas as pd
# Read the phishing-websites CSV into a DataFrame.
train1 = pd.read_csv(filepath_or_buffer="~/datasets/phishing_websites.csv")
# Last expression of the cell: rendered as the cell output, a (rows, columns) tuple.
train1.shape
Out[6]:
In [4]:
# Preview the first five rows (5 is the default for DataFrame.head).
train1.head(n=5)
Out[4]:
In [6]:
import pandas as pd
import sklearn as sk
import math
import numpy as np
from scipy import stats
import matplotlib as matlab
import statsmodels
# Reload the data set and inspect it.
#
# A notebook cell renders only its LAST bare expression, so the intermediate
# inspections below are printed/displayed explicitly; as bare expressions they
# were evaluated and silently thrown away.
from IPython.display import display

train1 = pd.read_csv("~/datasets/phishing_websites.csv")

print(train1.shape)            # (number of rows, number of columns)
print(train1.columns.values)   # column names as a NumPy array
display(train1.head(10))       # first ten rows, rich HTML rendering
train1.describe()              # summary statistics (cell output)
Out[6]:
association between result and a1
In [7]:
# Point-biserial correlation between the two columns; the cell output is the
# (correlation, p-value) result returned by SciPy.
stats.pointbiserialr(train1["result"], train1["a1"])
Out[7]:
Logistic Regression
In [11]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on all 30 predictors (a1 .. a30) and predict on the
# same rows it was trained on (so this is a training-set fit, not a validation).
#
# The column list is built once instead of being spelled out twice as a chain
# of 30 list concatenations.
all_feature_cols = ['a%d' % i for i in range(1, 31)]

logistic1 = LogisticRegression()
# The target is passed as a 1-D Series: scikit-learn expects y of shape
# (n_samples,), and a one-column DataFrame triggers a DataConversionWarning.
logistic1.fit(train1[all_feature_cols], train1['result'])
predict1 = logistic1.predict(train1[all_feature_cols])

# Only the last bare expression of a cell is rendered, so the predictions are
# printed explicitly.
print(predict1)
logistic1
Out[11]:
confusion matrix
In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
# `sklearn.cross_validation` was removed from scikit-learn; its contents live in
# `sklearn.model_selection`. Importing from the old module makes this whole cell
# fail with ImportError even though train_test_split is not used here.
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Confusion matrix of the first logistic model on the training data.
# Rows are the actual classes, columns the predicted classes.
cm1 = confusion_matrix(train1['result'], predict1)
print(cm1)
# Total number of classified samples (sum over every cell of the matrix).
total1 = cm1.sum()
Accuracy
In [13]:
# Accuracy = correctly classified samples (the two diagonal cells of the
# 2x2 confusion matrix) divided by the total number of samples.
accuracy1 = (cm1[1, 1] + cm1[0, 0]) / total1
accuracy1
Out[13]:
`result` is the target variable and contains the values -1 and 1: -1 denotes a phishing (fake) website and 1 denotes a legitimate (genuine) website. We recode -1 as 0 because logistic regression predicts a binary outcome (1 / 0) given a set of independent variables. In simple words, it predicts the probability of occurrence of an event by fitting the data to a logit function.
In [14]:
# Build a 0/1 version of the target for statsmodels' Logit.
#
# Take an explicit copy first: assigning into a bare column selection raises
# pandas' SettingWithCopyWarning and leaves it ambiguous whether `train1` is
# modified as well. With .copy(), `train1` is guaranteed to keep its -1/1 coding.
newdata1 = train1[['result']].copy()
# Every value <= -1 (the phishing class) becomes 0; the 1s are left untouched.
newdata1[newdata1 <= -1] = 0
newdata1.head()
Out[14]:
In [15]:
# Fit the same logistic model with statsmodels to obtain a coefficient table
# (estimates, standard errors, p-values) for every predictor.
#
# Model classes such as Logit are exposed by `statsmodels.api`;
# `statsmodels.formula.api` only provides the lower-case formula interface
# (`logit("y ~ x", data)`) in current releases, so `sm.Logit` failed there.
import statsmodels.api as sm

all_feature_cols = ['a%d' % i for i in range(1, 31)]   # a1 .. a30

# NOTE(review): Logit does not add an intercept by itself, so this model is fit
# without a constant term, unlike scikit-learn's LogisticRegression, which fits
# one by default. Wrap the predictors in sm.add_constant(...) if an intercept is
# intended.
logistic = sm.Logit(newdata1, train1[all_feature_cols])
result1 = logistic.fit()
summary_1 = result1.summary()
summary_1
Out[15]:
The variables a5, a9, a10, a18 and a22 have the least impact on the result, so they are left out of the final model below.
In [49]:
import pandas as pd
# Reload the data so `result` has its original -1/1 coding again.
train1=pd.read_csv("~/datasets/phishing_websites.csv")
from sklearn.linear_model import LogisticRegression

# Final model: all predictors a1 .. a30 except the five low-impact ones.
# Building the list once replaces two hand-written 25-term concatenations that
# had to be kept in sync manually.
dropped_feature_ids = (5, 9, 10, 18, 22)
selected_feature_cols = ['a%d' % i for i in range(1, 31)
                         if i not in dropped_feature_ids]

logistic2 = LogisticRegression()
# 1-D target: a one-column DataFrame triggers a DataConversionWarning.
logistic2.fit(train1[selected_feature_cols], train1['result'])
# Predictions on the training rows themselves.
predict2 = logistic2.predict(train1[selected_feature_cols])
predict2
Out[49]:
confusion matrix
In [58]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
# `sklearn.cross_validation` was removed from scikit-learn; its contents live in
# `sklearn.model_selection`. Importing from the old module makes this whole cell
# fail with ImportError even though train_test_split is not used here.
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Confusion matrix of the reduced (final) logistic model on the training data.
# Rows are the actual classes, columns the predicted classes.
cm2 = confusion_matrix(train1['result'], predict2)
print(cm2)
# Total number of classified samples (sum over every cell of the matrix).
total2 = cm2.sum()
Accuracy of the final logistic model
In [59]:
# Accuracy = correctly classified samples (the two diagonal cells of the
# 2x2 confusion matrix) divided by the total number of samples.
accuracy2 = (cm2[1, 1] + cm2[0, 0]) / total2
accuracy2
Out[59]:
ROC curve and AUC
In [71]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC curve of the final logistic model on the training data.
actual = train1[['result']]
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predict2)

# The AUC must be computed BEFORE plotting: the legend label below formats
# `roc_auc`, which previously was only assigned after plt.show(), so a fresh
# kernel failed with NameError.
roc_auc = auc(false_positive_rate, true_positive_rate)

# NOTE(review): `predict2` holds hard class labels, so the curve has a single
# interior point. Passing scores (e.g. logistic2.predict_proba(...)[:, 1]) would
# give a full ROC curve. Kept as-is to preserve the reported number.
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate,label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')          # diagonal = random-guess baseline
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
roc_auc
Out[71]:
Decision tree
In [9]:
import pandas as pd
train1=pd.read_csv("~/datasets/phishing_websites.csv")
from sklearn import tree

# Predictors: the first 30 columns of the file.
# NOTE(review): assumes those are a1 .. a30 and that `result` comes after them;
# confirm against the CSV header.
features = list(train1.columns[:30])
X = train1[features]
y = train1['result']          # 1-D target

clf = tree.DecisionTreeClassifier()
# fit() takes (features, target) in that order. The arguments were swapped,
# which trained a tree that predicts the 30 predictors from `result`.
clf = clf.fit(X, y)
clf
Out[9]:
In [29]:
# Render the fitted tree inline: export it as Graphviz DOT text into an
# in-memory buffer (nothing is written to disk), convert it to PNG with pydot
# and display the image.
from IPython.display import Image
# `sklearn.externals.six` has been removed from scikit-learn; on Python 3 its
# StringIO was simply io.StringIO.
from io import StringIO
import matplotlib.pyplot as plt
import pydot

dot_data = StringIO()
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=features,
                     filled=True, rounded=True,
                     impurity=False)

# Current pydot returns a LIST of graphs from graph_from_dot_data, older
# releases returned a single graph; accept both.
graphs = pydot.graph_from_dot_data(dot_data.getvalue())
graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
Image(graph.create_png())
In [32]:
# Training-set predictions of the decision tree. The estimator method is
# `predict`; `predict3` does not exist and raised AttributeError.
# Relies on `clf`, `X` and `y` from the decision-tree cell.
predict3 = clf.predict(X)

from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns the predicted classes.
cm3 = confusion_matrix(y, predict3)
print(cm3)
total3 = cm3.sum()
# Accuracy = diagonal of the 2x2 matrix / all samples. Shown as the cell output
# (it was computed but never displayed before).
accuracy3 = (cm3[0,0]+cm3[1,1])/total3
accuracy3
K fold cross validation(using logistic model)
In [43]:
import numpy as np
# `sklearn.cross_validation` was removed from scikit-learn; KFold and
# cross_val_score now live in `sklearn.model_selection`, where KFold takes
# `n_splits` instead of (n, n_folds).
from sklearn.model_selection import KFold, cross_val_score

# 10 consecutive, unshuffled folds (same splitting as the old KFold default).
kfold = KFold(n_splits=10)

all_feature_cols = ['a%d' % i for i in range(1, 31)]   # a1 .. a30
X = train1[all_feature_cols]
y = train1['result']          # 1-D target avoids a DataConversionWarning per fold

# Accuracy of the first logistic model (`logistic1`, defined in the Logistic
# Regression cell) on each fold; cross_val_score refits a clone per fold.
# Fixes:
#  * the result was stored in `score` while `scores` was printed (NameError on
#    a fresh kernel, or a stale value from another cell);
#  * a preceding cross-validation run with scoring='mean_squared_error' was
#    immediately overwritten, so it only cost time, and that scorer name is no
#    longer accepted by scikit-learn.
scores = cross_val_score(logistic1, X, y, scoring='accuracy', cv=kfold)
print("Accuracy per fold: ")
print(scores)
print("Average accuracy: ", scores.mean())
In [47]:
# Arithmetic mean of the per-fold scores.
mean = np.asarray(scores).mean()
mean
Out[47]:
In [45]:
# Population standard deviation (ddof=0, NumPy's default) of the per-fold scores.
std = np.asarray(scores).std()
std
Out[45]:
Holdout
In [7]:
import numpy as np
# `sklearn.cross_validation` was removed from scikit-learn; train_test_split
# lives in `sklearn.model_selection` with the same signature.
from sklearn.model_selection import train_test_split

# Hold-out split: 75% training rows, 25% test rows.
k_train, k_test = train_test_split(
    train1,                      # Data set to split
    test_size=0.25,              # Split ratio
    random_state=1,              # Set random seed
    stratify=train1["result"],   # Keep the class proportions equal in both parts
)
print(k_train.shape)
print(k_test.shape)
In [10]:
# Manual 10-fold cross-validation of the decision tree on the reduced
# predictor set (a1 .. a30 without a5, a9, a10, a18, a22).
#
# Changes from the previous version of this cell:
#  * `sklearn.cross_validation` was removed; KFold now comes from
#    `sklearn.model_selection`, takes `n_splits`, and yields the index pairs
#    from `.split(data)` rather than by iterating the object itself.
#  * A seed only has an effect when the folds are shuffled (current releases
#    reject random_state without shuffle=True), so shuffle=True is set to honour
#    the seed. This means the folds are now shuffled, not consecutive.
#  * KFold yields POSITIONS, so rows are selected with .iloc, not label-based .loc.
#  * Each fold trains a fresh clone of `clf`; refitting `clf` itself silently
#    replaced the tree fitted in the decision-tree cell.
#  * The trailing "n" in the first print was a garbled "\n".
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn import tree

selected_feature_cols = ['a%d' % i for i in range(1, 31)
                         if i not in (5, 9, 10, 18, 22)]

cv = KFold(n_splits=10,        # Desired number of cv folds
           shuffle=True,       # Required for the seed to have any effect
           random_state=12)    # Set a random seed

fold_accuracy = []
for train_fold, valid_fold in cv.split(train1):
    train = train1.iloc[train_fold]   # Extract train data with kf indices
    valid = train1.iloc[valid_fold]   # Extract valid data with kf indices

    model = clone(clf).fit(X=train[selected_feature_cols], y=train['result'])
    valid_acc = model.score(X=valid[selected_feature_cols], y=valid['result'])
    fold_accuracy.append(valid_acc)

print("Accuracy per fold: ", fold_accuracy, "\n")
print("Average accuracy: ", sum(fold_accuracy)/len(fold_accuracy))
K-fold cross validation(using decision tree)
In [19]:
# 10-fold cross-validated accuracy of the decision tree on the reduced
# predictor set (a1 .. a30 without a5, a9, a10, a18, a22).
#
# `sklearn.cross_validation` was removed from scikit-learn; cross_val_score
# lives in `sklearn.model_selection` with the same call signature.
from sklearn.model_selection import cross_val_score

selected_feature_cols = ['a%d' % i for i in range(1, 31)
                         if i not in (5, 9, 10, 18, 22)]

scores = cross_val_score(estimator=clf,                   # Model to test
                         X=train1[selected_feature_cols],
                         y=train1['result'],              # 1-D target avoids a warning per fold
                         cv=10, scoring="accuracy")
print("Accuracy per fold: ")
print(scores)
print("Average accuracy: ", scores.mean())
Conclusion:
The realistic (cross-validated) accuracy of the model on this data is about 92%.


