Evaluate the Diabetes Dataset

Evaluate the Diabetes Dataset

Happy coding!

1: Import the dataset

#Import the required libraries
import pandas as pd

# Load the raw diabetes data file. header=None tells pandas the file has no
# header row, so the columns are numbered instead of named.
df_diabetes_data = pd.read_csv(
    'C:\\Users\\kevin.zhang\\Lesson 8-2\\pima-indians-diabetes.data',
    header=None,
)

2: Analyze the dataset

# Preview the first five rows of the dataset (five is also head()'s default)
df_diabetes_data.head(n=5)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	0	1	2	3	4	5	6	7	8
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

3: Find the features of the dataset

# Column names for the dataset, taken from its .NAMES file and listed in
# file order. The final entry, 'label', is the column later used as the
# response to predict.
feature_names = [
    'Pregnant',
    'Glucose',
    'bp',
    'skin',
    'insulin',
    'bmi',
    'pedigree',
    'age',
    'label',
]

# Apply the feature names set earlier as the column headers of the dataset.
# The data is already in memory from the initial load, so the existing frame
# is relabelled here instead of reading the same file from disk a second time
# (which also avoids repeating the hard-coded file path).
df_diabetes_data.columns = feature_names

# Verify that the dataset now shows the new headers
df_diabetes_data.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Pregnant	Glucose	bp	skin	insulin	bmi	pedigree	age	label
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

#View the dimensions of the dataset as a (rows, columns) tuple,
#i.e. (number of observations, number of columns including the label)
df_diabetes_data.shape

(768, 9)

4: Find the response of the dataset

# Predictor columns used to build the model (a subset of all the columns)
feature_select_cols = ['Pregnant', 'insulin', 'bmi', 'age']

# Feature object: every row, restricted to the selected predictor columns
X_feature = df_diabetes_data.loc[:, feature_select_cols]

# Response object: every row of the 'label' column
Y_target = df_diabetes_data.loc[:, 'label']

#View the shape of the feature object as (rows, selected feature columns)
X_feature.shape

(768, 4)

#View the shape of the target object; it is a single column, so the
#shape has only one entry (the number of rows)
Y_target.shape

(768,)

5: Use training and testing datasets to train the model

# Split the features and the response into training and testing parts.
# random_state=1 fixes the shuffle so the same split is produced on every run;
# no test size is passed, so scikit-learn's default proportion applies.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X_feature,
    Y_target,
    random_state=1,
)

# Print the shape of each part on its own line: train/test features first,
# then train/test responses.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

(576, 4)
(192, 4)
(576,)
(192,)

6: Create a model to predict the diabetes outcome

# Build a logistic regression classifier with scikit-learn's default settings
# and fit it on the training split.
# NOTE(review): the variable is named `linreg` although the model is a
# logistic (not linear) regression; the name is kept because later cells use it.
from sklearn.linear_model import LogisticRegression

linreg = LogisticRegression()
linreg.fit(X=x_train, y=y_train)

LogisticRegression()

# Predict the label for every row of the held-out testing features
y_pred = linreg.predict(X=x_test)

7: Check the accuracy of the model

# Evaluate the model: fraction of testing rows whose predicted label
# matches the actual label.
from sklearn import metrics

print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.6927083333333334

# Print the first 30 actual and predicted responses for a side-by-side look.
# The actual values are taken by position (the first 30 rows of the testing
# split, whatever their original row labels are).
print('actual', y_test.iloc[:30])
print('predict', y_pred[:30])

actual 285    0
101    0
581    0
352    0
726    0
472    0
233    0
385    0
556    0
59     0
756    0
341    0
445    1
614    1
371    0
355    1
19     1
711    0
430    0
117    0
493    1
218    1
159    1
667    1
553    0
628    0
620    0
238    1
640    0
750    1
Name: label, dtype: int64
predict [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]