Classification using Logistic Regression
This dataset contains information about red wine and the various physicochemical factors affecting its quality. It was preprocessed and downloaded from the UCI Machine Learning Repository, and is a simple, cleaned practice dataset for classification modelling. Source of this dataset: https://archive.ics.uci.edu/ml/datasets/wine+quality
Attribute Information:
Input variables (based on physicochemical tests): fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol. Output variable: quality (good / bad).
This analysis focuses on finding the attributes that significantly affect wine quality and training a predictive model that classifies wine quality into good and bad based on those attributes. The analysis is pivoted on the target variable quality. Exploratory data analysis steps such as removing null values, observing summary statistics, visualizing the variables, removing outliers, and checking for correlations are carried out.
The following significant correlations are observed.
A 70-30 split is used to divide the dataset into train and test sets.
Ten variables are selected using automated RFE. Further manual selection is carried out using the p-value method.
Models are built on the train data using the statsmodels.api package.
The final model is built on the following variables: citric acid, fixed acidity, volatile acidity, alcohol, sulphates, total sulfur dioxide.
Variance inflation factors are calculated for the final selection of variables. All VIFs are below 5, so no significant multicollinearity is observed.
ROC, precision-recall, and sensitivity-specificity curves have been plotted. The optimum threshold for classification appears to be 0.5.
Model metrics on train data at a classification threshold of 0.5:
Model metrics on test data at a classification threshold of 0.5:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('./wine_quality_classification.csv')
data.head()
data.info()
data.isnull().sum()
quality is our target variable. It has two levels, good and bad. There are no null or missing values. All the other variables are continuous.
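As a quick sanity check (a small sketch added here, not part of the original output), the class balance can be inspected before encoding:
# distribution of good vs bad wines before encoding
data['quality'].value_counts()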
Replacing quality levels with 0 and 1:
data['quality'] = data['quality'].replace({'good' : 1, 'bad' : 0})
data.describe()
data.quantile(np.linspace(0.90,1,12))
The following continuous variables are inspected for outliers: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, pH, sulphates, and alcohol.
x_vars = data.columns[data.columns != 'quality']
fig,ax = plt.subplots(len(x_vars))
fig.set_figheight(24)
fig.set_figwidth(12)
for num,i in enumerate(x_vars) :
    ax[num].set_title(i)
    ax[num].set_xlabel('')
    sns.boxplot(data[i],ax=ax[num])
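Before any rows are dropped, the number of 1.5*IQR outliers per variable can be counted. This is a hedged sketch using the same rule applied in the next cell; the counts are not reported in the original write-up:
# count 1.5*IQR outliers per variable before removing them
for i in x_vars :
    q1 = data[i].quantile(0.25)
    q3 = data[i].quantile(0.75)
    iqr = q3 - q1
    n_out = ((data[i] < q1 - 1.5*iqr) | (data[i] > q3 + 1.5*iqr)).sum()
    print(i, ':', n_out, 'outliers')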
# removing outliers using the 1.5*IQR rule :
x_vars = data.columns[data.columns != 'quality']
for i in x_vars :
    q1 = data[i].quantile(0.25)
    q3 = data[i].quantile(0.75)
    upper_extreme = q3 + 1.5*(q3-q1) # q3-q1 is IQR
    lower_extreme = q1 - 1.5*(q3-q1)
    mask = (data[i] < lower_extreme) | (data[i] > upper_extreme) # outlier rows
    outliers = data[mask].index
    data = data.drop(index=outliers)
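A minimal check of how many rows survive the filtering (the exact count is not recorded in the original notebook):
# rows remaining after outlier removal
print('Remaining rows :', data.shape[0])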
from sklearn.model_selection import train_test_split
y = data.pop('quality')
X = data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=100)
# In our case, all the independent variables are continuous
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train[X_train.columns] = scaler.fit_transform(X_train[X_train.columns])
# Scaling test set for later use
X_test[X_train.columns] = scaler.transform(X_test[X_train.columns])
plt.figure(figsize=[20,10])
sns.heatmap(X_train.corr(),annot=True)
plt.title('Visualizing Correlations')
plt.show()
High correlations:
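The heatmap is read visually; as an optional cross-check (a sketch, not part of the original analysis), the strongest pairwise correlations can also be listed programmatically:
# list the strongest pairwise correlations (upper triangle only, so each pair appears once)
corr = X_train.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
print(upper.unstack().dropna().sort_values(ascending=False).head(10))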
import statsmodels.api as sm
# Logistic Regression Model
logm1 = sm.GLM(y_train, sm.add_constant(X_train),family=sm.families.Binomial())
logm1.fit().summary()
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
from sklearn.feature_selection import RFE
rfe = RFE(logReg, n_features_to_select=10)
rfe = rfe.fit(X_train,y_train)
## RFE results
rfe_results = list(zip(X_train.columns,rfe.support_,rfe.ranking_))
sorted(rfe_results,key=lambda x : (x[2]))
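The columns retained by RFE can also be pulled directly from the fitted selector (a small sketch; the same information is visible in the ranking above):
# columns kept by RFE (support_ is True for selected features)
print(X_train.columns[rfe.support_])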
Based on the RFE ranking, pH can be dropped.
X_train.drop(columns=['pH'],inplace=True)
X_test.drop(columns=['pH'],inplace=True)
logm1 = sm.GLM(y_train, sm.add_constant(X_train),family=sm.families.Binomial())
logm1.fit().summary()
-- Dropping free sulfur dioxide because of high p-value
X = X_train.loc[:,X_train.columns != 'free sulfur dioxide']
logm2 = sm.GLM(y_train, sm.add_constant(X),family=sm.families.Binomial())
logm2.fit().summary()
-- Dropping free sulfur dioxide because of high p-value
X = X.loc[:,X.columns != 'free sulfur dioxide']
logm3 = sm.GLM(y_train, sm.add_constant(X),family=sm.families.Binomial())
logm3.fit().summary()
-- Dropping density because of high p-value
X = X.loc[:,X.columns != 'density']
logm4 = sm.GLM(y_train, sm.add_constant(X),family=sm.families.Binomial())
logm4.fit().summary()
-- Dropping chlorides because of high p-value
X = X.loc[:,X.columns != 'chlorides']
logm5 = sm.GLM(y_train, sm.add_constant(X),family=sm.families.Binomial())
logm5.fit().summary()
-- Dropping residual sugar because of high p-value
X = X.loc[:,X.columns != 'residual sugar']
logm6 = sm.GLM(y_train, sm.add_constant(X),family=sm.families.Binomial())
logm6.fit().summary()
from statsmodels.stats.outliers_influence import variance_inflation_factor
def vif(X) :
    # compute VIF for each column of the design matrix (including the constant)
    df = sm.add_constant(X)
    vif_values = [variance_inflation_factor(df.values,i) for i in range(df.shape[1])]
    vif_frame = pd.DataFrame({'vif' : vif_values},index = df.columns).reset_index()
    print(vif_frame.sort_values(by='vif',ascending=False))
vif(X)
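For reference, each value reported by variance_inflation_factor is equivalent to 1 / (1 - R²), where R² comes from regressing that predictor on all the other predictors. A minimal sketch of that equivalence using sklearn's LinearRegression (an illustration added here, not part of the original notebook):
from sklearn.linear_model import LinearRegression
def vif_manual(frame, col) :
    # regress one predictor on the remaining predictors and convert R^2 to a VIF
    others = frame.drop(columns=[col])
    r2 = LinearRegression().fit(others, frame[col]).score(others, frame[col])
    return 1.0/(1.0 - r2)
print({col : round(vif_manual(X, col), 2) for col in X.columns})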
print('Selected columns :' , X.columns)
logm_final = sm.GLM(y_train, sm.add_constant(X_train[X.columns]),family=sm.families.Binomial())
res = logm_final.fit()
res.summary()
selected_vars = X.columns
y_train_pred = res.predict(sm.add_constant(X_train[X.columns]))
print(y_train_pred.head())
predictions = pd.DataFrame({'Quality' : y_train.values,'class_probability' : y_train_pred.values.reshape(-1)}, index=X_train.index)
print(predictions.head())
predictions['Predicted_Quality'] = predictions['class_probability'].apply(lambda x : 1 if x > 0.5 else 0)
print(predictions.head())
from sklearn import metrics
confusion = metrics.confusion_matrix(predictions['Quality'],predictions['Predicted_Quality'])
print(confusion)
| Actual \ Predicted | 0 | 1 |
|---|---|---|
| 0 | TN = 390 | FP = 130 |
| 1 | FN = 147 | TP = 452 |

0 : bad, 1 : good
# Accuracy of the model
print(metrics.accuracy_score(predictions['Quality'],predictions['Predicted_Quality']))
TP = confusion[1,1]
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]
#### Metrics
def model_metrics(TP,TN,FP,FN) :
    print('Accuracy :' , round((TP + TN)/float(TP+TN+FP+FN),3))
    print('Misclassification Rate / Error Rate :', round((FP + FN)/float(TP+TN+FP+FN),3))
    sensitivity = round(TP/float(FN + TP),3)
    print('Sensitivity / True Positive Rate / Recall :', sensitivity)
    specificity = round(TN/float(TN + FP),3)
    print('Specificity / True Negative Rate :', specificity)
    print('False Positive Rate :',round(FP/float(TN + FP),3))
    precision = round(TP/float(TP + FP),3)
    print('Precision / Positive Predictive Value :', precision)
    print('Prevalence :',round((FN + TP)/float(TP+TN+FP+FN),3))
    print('Negative Predictive Value :', round(TN/float(TN + FN),3))
    print('Positive Likelihood Ratio (Sensitivity / (1 - Specificity)) :', round(sensitivity/float(1-specificity),3))
    print('F1-score :', round(2*precision*sensitivity/(precision + sensitivity),3))
model_metrics(TP,TN,FP,FN)
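sklearn's classification_report offers a quick cross-check of the precision, recall, and F1 values printed above (a sketch; output not reproduced here):
from sklearn.metrics import classification_report
print(classification_report(predictions['Quality'], predictions['Predicted_Quality'], target_names=['bad','good']))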
print(predictions.head())
# generating predictions for cutoffs between 0 and 1
cutoffs = pd.DataFrame()
for i in np.arange(0,1,0.1) :
    cutoffs[i] = predictions['class_probability'].map(lambda x : 1 if x > i else 0)
tpr = []
fpr = []
for column in cutoffs.columns :
    confusion = metrics.confusion_matrix(predictions['Quality'],cutoffs[column])
    TP = confusion[1,1] # true positives
    TN = confusion[0,0] # true negatives
    FP = confusion[0,1] # false positives
    FN = confusion[1,0] # false negatives
    tpr.append(TP/float(TP + FN))
    fpr.append(FP/float(FP + TN))
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
sns.scatterplot(x=fpr,y=tpr);
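As a cross-check on the manually constructed curve, sklearn can compute the full ROC curve and the area under it from the same probabilities (a sketch; the AUC value is not reported in the original write-up):
from sklearn.metrics import roc_curve, roc_auc_score
fpr_sk, tpr_sk, thresholds = roc_curve(predictions['Quality'], predictions['class_probability'])
print('Train AUC :', round(roc_auc_score(predictions['Quality'], predictions['class_probability']), 3))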
sensitivity = []
specificity = []
accuracy = []
for column in cutoffs.columns :
    confusion = metrics.confusion_matrix(predictions['Quality'],cutoffs[column])
    TP = confusion[1,1] # true positives
    TN = confusion[0,0] # true negatives
    FP = confusion[0,1] # false positives
    FN = confusion[1,0] # false negatives
    sensitivity.append(TP/float(TP + FN))
    specificity.append(1 - FP/float(FP + TN))
    accuracy.append((TP + TN)/(TP + TN + FP + FN))
fig,ax = plt.subplots()
ax.set_xlabel('Cutoffs')
ax.plot(cutoffs.columns,sensitivity,label='sensitivity')
ax.plot(cutoffs.columns,specificity,label='specificity')
ax.plot(cutoffs.columns,accuracy,label='accuracy')
ax.legend(('sensitivity','specificity','accuracy'))
plt.show()
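One way to make the visual threshold choice explicit (a hedged sketch, not part of the original analysis) is to pick the cutoff where sensitivity and specificity are closest, which, given the curves above, should land near 0.5:
# cutoff where sensitivity and specificity are closest to each other
gaps = [abs(sens - spec) for sens, spec in zip(sensitivity, specificity)]
best_cutoff = cutoffs.columns[int(np.argmin(gaps))]
print('Cutoff with smallest sensitivity-specificity gap :', round(best_cutoff, 1))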
predictions['Final_Predictions'] = predictions['class_probability'].map(lambda x : 1 if x > 0.5 else 0) # final cutoff of 0.5 applied to the class probabilities
confusion_final = metrics.confusion_matrix(predictions['Quality'],predictions['Final_Predictions'])
TP = confusion_final[1,1]
TN = confusion_final[0,0]
FP = confusion_final[0,1]
FN = confusion_final[1,0]
#### Metrics
model_metrics(TP,TN,FP,FN)
precision = [] # positive predictive value - TP / (TP + FP)
recall = [] # same as sensitivity
for column in cutoffs.columns :
    confusion = metrics.confusion_matrix(predictions['Quality'],cutoffs[column])
    TP = confusion[1,1] # true positives
    TN = confusion[0,0] # true negatives
    FP = confusion[0,1] # false positives
    FN = confusion[1,0] # false negatives
    precision.append(TP/float(TP + FP))
    recall.append(TP/float(FN + TP))
fig,ax = plt.subplots()
ax.set_xlabel('Cutoffs')
ax.plot(cutoffs.columns,precision,label='precision')
ax.plot(cutoffs.columns,recall,label='recall')
ax.legend(('precision','recall'))
plt.show()
# using sklearn utilities
from sklearn.metrics import precision_score, recall_score
print('Precision',precision_score(predictions['Quality'],predictions['Predicted_Quality']))
print('Recall', recall_score(predictions['Quality'],predictions['Predicted_Quality']))
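precision_recall_curve sweeps all thresholds at once and can serve as a cross-check on the loop above (a sketch, not part of the original notebook):
from sklearn.metrics import precision_recall_curve
prec, rec, thresholds = precision_recall_curve(predictions['Quality'], predictions['class_probability'])
plt.plot(thresholds, prec[:-1], label='precision')
plt.plot(thresholds, rec[:-1], label='recall')
plt.xlabel('Cutoffs')
plt.legend()
plt.show()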
print(X_test[X.columns].head())
test_predictions = pd.DataFrame()
X_test_ = X_test[X.columns]
test_predictions['Class_Probabilities'] = res.predict(sm.add_constant(X_test_))
test_predictions['Original'] = y_test
test_predictions.index = y_test.index
# Predictions are made using 0.5 as the threshold
test_predictions['Predicted'] = test_predictions['Class_Probabilities'].map(lambda x : 1 if x > 0.5 else 0)
#### Metrics
TN,FP,FN,TP = metrics.confusion_matrix(test_predictions['Original'],test_predictions['Predicted']).reshape(-1)
model_metrics(TP,TN,FP,FN)
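As a final hedged addition, the test-set AUC summarizes discrimination across all thresholds in a single number (not reported in the original write-up):
from sklearn.metrics import roc_auc_score
print('Test AUC :', round(roc_auc_score(test_predictions['Original'], test_predictions['Class_Probabilities']), 3))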