See the course "Understanding Machine Learning with Python".
This is essentially a verbatim copy of the course notebook, so there is little commentary of my own.
Run it in a Jupyter Notebook; the data file has to be obtained separately.
# coding: utf-8
# # Predicting Diabetes
# ## Import Libraries
# In[4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
get_ipython().magic('matplotlib inline')
# ## Loading and reviewing the data
# In[5]:
df = pd.read_csv("Notebooks/data/pima-data.csv")
# In[6]:
df.shape
# In[7]:
df.head(5)
# In[8]:
df.tail(5)
# ## Check for null values
# In[9]:
df.isnull().values.any()
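# Note (an aside, not in the original notebook): isnull() only catches NaN values.
# As the "Hidden Missing Values" step later shows, the real gaps in this dataset are
# encoded as 0, so a per-column NaN count can look clean even though data is missing.
df.isnull().sum()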
# In[10]:
def plot_corr(df, size=11):
    """Plot the correlation matrix of the DataFrame's columns as a heatmap."""
    corr = df.corr()                                   # pairwise correlations
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)                                   # heatmap of the matrix
    plt.xticks(range(len(corr.columns)), corr.columns)  # column names on the x axis
    plt.yticks(range(len(corr.columns)), corr.columns)  # column names on the y axis
# In[11]:
plot_corr(df)
# In[12]:
df.corr()
# In[13]:
df.head()
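# The correlation matrix above suggests that skin and thickness are almost perfectly
# correlated, which is why one of the two columns is dropped next. A quick check
# (an aside, not in the original notebook):
print(df['skin'].corr(df['thickness']))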
# In[14]:
del df['skin']
# In[15]:
df.head()
# In[16]:
plot_corr(df)
# ## Check Data Types
# In[17]:
df.head(5)
# Change True to 1, False to 0
# In[18]:
diabetes_map = {True : 1, False : 0}
# In[19]:
df['diabetes'] = df['diabetes'].map(diabetes_map)
# In[20]:
df.head(5)
# ## Check true/false ratio
# In[21]:
num_true = len(df.loc[df['diabetes'] == True])
num_false = len(df.loc[df['diabetes'] == False])
print("Number of True cases: {0} ({1:2.2f}%)".format(num_true, (num_true/ (num_true + num_false)) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/ (num_true + num_false)) * 100))
# ## Splitting the data
# 70% for training, 30% for testing
# In[22]:
# sklearn.cross_validation was removed in newer scikit-learn; model_selection is its replacement
from sklearn.model_selection import train_test_split
feature_col_names = ['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'diab_pred', 'age']
predicted_class_names = ['diabetes']
x = df[feature_col_names].values
y = df[predicted_class_names].values
split_test_size = 0.30
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_test_size, random_state=42)
# We check to ensure we have the desired 70% train, 30% test split of the data
# In[23]:
print("{0:0.2f}% in training set".format((len(x_train)/len(df.index)) * 100))
print("{0:0.2f}% in test set".format((len(x_test)/len(df.index)) * 100))
# #### Verifying predicted value was split correctly
# In[24]:
print("Original True : {0} ({1:0.2f}%)".format(len(df.loc[df['diabetes'] == 1]), (len(df.loc[df['diabetes'] == 1])/len(df.index)) * 100))
print("Original False : {0} ({1:0.2f}%)".format(len(df.loc[df['diabetes'] == 0]), (len(df.loc[df['diabetes'] == 0])/len(df.index)) * 100))
print("")
print("Training True : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train)) * 100))
print("Training False : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train)) * 100))
print("")
print("Test True : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test)) * 100))
print("Test False : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test)) * 100))
print("")
# ### Post-split Data Preparation
# #### Hidden Missing Values
# In[25]:
df.head()
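# How widespread are the hidden zeros? A small check (an aside, not in the original
# notebook), counting the rows whose value is 0 in each feature column:
for col in feature_col_names:
    print("{0}: {1} rows with 0".format(col, int((df[col] == 0).sum())))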
# #### Impute with the mean
# In[26]:
# Imputer was removed in newer scikit-learn; SimpleImputer is its replacement
from sklearn.impute import SimpleImputer
fill_0 = SimpleImputer(missing_values=0, strategy="mean")
x_train = fill_0.fit_transform(x_train)
x_test = fill_0.transform(x_test)  # reuse the training-set means; do not refit on the test data
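# Quick sanity check (an aside, not in the original notebook): after imputation the
# glucose column (index 1 in feature_col_names) should contain no zeros.
print((x_train[:, 1] == 0).sum(), (x_test[:, 1] == 0).sum())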
# ## Training Initial Algorithm - Naive Bayes
# In[27]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(x_train, y_train.ravel())
# ### Performance on Training Data
# In[28]:
nb_predict_train = nb_model.predict(x_train)
from sklearn import metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, nb_predict_train)))
print()
# #### Metrics
# In[29]:
nb_predict_test = nb_model.predict(x_test)
from sklearn import metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, nb_predict_test)))
print()
# In[30]:
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, nb_predict_test, labels=[1, 0])))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, nb_predict_test, labels=[1, 0]))
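# The report above already lists recall per class; since recall on the diabetic class
# is the metric tuned later, it can also be pulled out directly (an aside, not in the
# original notebook):
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, nb_predict_test)))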
# ## Random Forest
# In[31]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train.ravel())
# ### Predict Training Data
# In[32]:
rf_predict_train = rf_model.predict(x_train)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_train)))
# ### Predict Test Data
# In[33]:
rf_predict_test = rf_model.predict(x_test)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))
# In[34]:
print(metrics.confusion_matrix(y_test, rf_predict_test, labels=[1, 0]))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test, labels=[1, 0]))
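# If training accuracy is much higher than test accuracy here, the forest is
# overfitting. A hedged sketch (not from the original notebook): constraining tree
# depth is one common way to narrow that gap.
rf_small = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42)
rf_small.fit(x_train, y_train.ravel())
print("Train accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_small.predict(x_train))))
print("Test accuracy : {0:.4f}".format(metrics.accuracy_score(y_test, rf_small.predict(x_test))))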
# ### Logistic Regression
# In[35]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(C=0.7, random_state=42)
lr_model.fit(x_train, y_train.ravel())
lr_predict_test = lr_model.predict(x_test)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_predict_test)))
print(metrics.confusion_matrix(y_test, lr_predict_test, labels=[1, 0]))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test, labels=[1, 0]))
# Sweeping the regularization parameter C to find the best recall
# In[39]:
C_start = 0.1
C_end = 5
C_inc = 0.1
C_values, recall_scores = [], []
C_val = C_start
best_recall_score = 0
while C_val < C_end:
    C_values.append(C_val)
    lr_model_loop = LogisticRegression(C=C_val, random_state=42)
    lr_model_loop.fit(x_train, y_train.ravel())
    lr_predict_loop_test = lr_model_loop.predict(x_test)
    recall_score = metrics.recall_score(y_test, lr_predict_loop_test)
    recall_scores.append(recall_score)
    if recall_score > best_recall_score:
        best_recall_score = recall_score
        best_lr_predict_test = lr_predict_loop_test
    C_val = C_val + C_inc
best_score_C_val = C_values[recall_scores.index(best_recall_score)]
print("1st max value of {0:.3f} occurred at C={1:.3f}".format(best_recall_score, best_score_C_val))
get_ipython().magic('matplotlib inline')
plt.plot(C_values, recall_scores, "-")
plt.xlabel("C value")
plt.ylabel("recall score")
# ### Logistic regression with class_weight='balanced'
# In[40]:
C_start = 0.1
C_end = 5
C_inc = 0.1
C_values, recall_scores = [], []
C_val = C_start
best_recall_score = 0
while C_val < C_end:
    C_values.append(C_val)
    lr_model_loop = LogisticRegression(C=C_val, class_weight="balanced", random_state=42)
    lr_model_loop.fit(x_train, y_train.ravel())
    lr_predict_loop_test = lr_model_loop.predict(x_test)
    recall_score = metrics.recall_score(y_test, lr_predict_loop_test)
    recall_scores.append(recall_score)
    if recall_score > best_recall_score:
        best_recall_score = recall_score
        best_lr_predict_test = lr_predict_loop_test
    C_val = C_val + C_inc
best_score_C_val = C_values[recall_scores.index(best_recall_score)]
print("1st max value of {0:.3f} occurred at C={1:.3f}".format(best_recall_score, best_score_C_val))
get_ipython().magic('matplotlib inline')
plt.plot(C_values, recall_scores, "-")
plt.xlabel("C value")
plt.ylabel("recall score")
# In[41]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(class_weight="balanced", C=best_score_C_val, random_state=42)
lr_model.fit(x_train, y_train.ravel())
lr_predict_test = lr_model.predict(x_test)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_predict_test)))
print(metrics.confusion_matrix(y_test, lr_predict_test, labels=[1, 0]))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test, labels=[1, 0]))
print(metrics.recall_score(y_test, lr_predict_test))
# ### LogisticRegressionCV
# In[42]:
from sklearn.linear_model import LogisticRegressionCV
lr_cv_model = LogisticRegressionCV(n_jobs=-1, random_state=42, Cs=3, cv=10, refit=True, class_weight="balanced")
lr_cv_model.fit(x_train, y_train.ravel())
# ### Predict on Test data
# In[43]:
lr_cv_predict_test = lr_cv_model.predict(x_test)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_cv_predict_test)))
print(metrics.confusion_matrix(y_test, lr_cv_predict_test, labels=[1, 0]))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_cv_predict_test, labels=[1, 0]))
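# The regularization strength chosen by the internal cross-validation can be inspected
# (an aside, not in the original notebook); for a binary problem C_ holds a single value.
print(lr_cv_model.C_)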
# In[ ]: