Diabetes - dataset for machine learning

In [2]:
# raw_data = spark.read.format("csv").option("header","true").option("inferSchema", "true").load(r".\\diabetes.csv")
ds = spark.read.csv('diabetes.csv', inferSchema=True, header=True)
In [3]:
ds.show() # let me see a sample of the data
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|
|         10|    115|            0|            0|      0|35.3|                   0.134| 29|      0|
|          2|    197|           70|           45|    543|30.5|                   0.158| 53|      1|
|          8|    125|           96|            0|      0| 0.0|                   0.232| 54|      1|
|          4|    110|           92|            0|      0|37.6|                   0.191| 30|      0|
|         10|    168|           74|            0|      0|38.0|                   0.537| 34|      1|
|         10|    139|           80|            0|      0|27.1|                   1.441| 57|      0|
|          1|    189|           60|           23|    846|30.1|                   0.398| 59|      1|
|          5|    166|           72|           19|    175|25.8|                   0.587| 51|      1|
|          7|    100|            0|            0|      0|30.0|                   0.484| 32|      1|
|          0|    118|           84|           47|    230|45.8|                   0.551| 31|      1|
|          7|    107|           74|            0|      0|29.6|                   0.254| 31|      1|
|          1|    103|           30|           38|     83|43.3|                   0.183| 33|      0|
|          1|    115|           70|           30|     96|34.6|                   0.529| 32|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 20 rows

In [7]:
ds.count() # how many records are there in this dataset?
Out[7]:
768
In [4]:
ds.columns # list all the headings
Out[4]:
['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

check the statistics for each column

In [5]:
# a min value of 0 shows that some entries are invalid placeholders (human error) in the dataset
ds.describe().select("Summary","Pregnancies","Glucose","BloodPressure").show()
+-------+------------------+-----------------+------------------+
|Summary|       Pregnancies|          Glucose|     BloodPressure|
+-------+------------------+-----------------+------------------+
|  count|               768|              768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|
|    min|                 0|                0|                 0|
|    max|                17|              199|               122|
+-------+------------------+-----------------+------------------+

In [6]:
ds.describe().select("Summary","SkinThickness","Insulin").show()
+-------+------------------+------------------+
|Summary|     SkinThickness|           Insulin|
+-------+------------------+------------------+
|  count|               768|               768|
|   mean|20.536458333333332| 79.79947916666667|
| stddev|15.952217567727642|115.24400235133803|
|    min|                 0|                 0|
|    max|                99|               846|
+-------+------------------+------------------+

In [9]:
ds.describe().select("Summary","BMI","DiabetesPedigreeFunction","Age").show()
+-------+------------------+------------------------+------------------+
|Summary|               BMI|DiabetesPedigreeFunction|               Age|
+-------+------------------+------------------------+------------------+
|  count|               768|                     768|               768|
|   mean|31.992578124999977|      0.4718763020833327|33.240885416666664|
| stddev| 7.884160320375441|       0.331328595012775|11.760231540678689|
|    min|               0.0|                   0.078|                21|
|    max|              67.1|                    2.42|                81|
+-------+------------------+------------------------+------------------+
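as a quick check on that human-error hypothesis, the zero placeholders per column can be counted directly (a sketch, not part of the original run; column names as listed above):

from pyspark.sql.functions import col, count, when
zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
# count(when(...)) counts only the rows where the condition holds (non-matching rows become null)
ds.select([count(when(col(c) == 0, c)).alias(c) for c in zero_cols]).show()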

In [8]:
ds.printSchema() # another view of the data
root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)

clean the data

In [11]:
# replace the zero values with NaN as part of the data cleaning process
import numpy as np
from pyspark.sql.functions import when
ds = ds.withColumn("Glucose",when(ds.Glucose==0,np.nan).otherwise (ds.Glucose))
ds = ds.withColumn("BloodPressure",when(ds.BloodPressure==0,np.nan).otherwise(ds.BloodPressure))
ds = ds.withColumn("SkinThickness",when(ds.SkinThickness==0,np.nan).otherwise(ds.SkinThickness))
ds = ds.withColumn("BMI",when(ds.BMI==0,np.nan).otherwise(ds.BMI))
ds = ds.withColumn("Insulin",when(ds.Insulin==0,np.nan).otherwise(ds.Insulin))
In [12]:
ds.select("Insulin","Glucose","BloodPressure","SkinThickness","BMI").show(5) # check if the data is clean
+-------+-------+-------------+-------------+----+
|Insulin|Glucose|BloodPressure|SkinThickness| BMI|
+-------+-------+-------------+-------------+----+
|    NaN|  148.0|         72.0|         35.0|33.6|
|    NaN|   85.0|         66.0|         29.0|26.6|
|    NaN|  183.0|         64.0|          NaN|23.3|
|   94.0|   89.0|         66.0|         23.0|28.1|
|  168.0|  137.0|         40.0|         35.0|43.1|
+-------+-------+-------------+-------------+----+
only showing top 5 rows

In [15]:
# impute - replace the NaN values with the column mean (Imputer's default strategy)
from pyspark.ml.feature import Imputer
imputer = Imputer(inputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"],
                outputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"])
model = imputer.fit(ds)
ds = model.transform(ds)
ds.show(5) # did everything work as expected?
+-----------+-------+-------------+------------------+-----------------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|     SkinThickness|          Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+------------------+-----------------+----+------------------------+---+-------+
|          6|  148.0|         72.0|              35.0|155.5482233502538|33.6|                   0.627| 50|      1|
|          1|   85.0|         66.0|              29.0|155.5482233502538|26.6|                   0.351| 31|      0|
|          8|  183.0|         64.0|29.153419593345657|155.5482233502538|23.3|                   0.672| 32|      1|
|          1|   89.0|         66.0|              23.0|             94.0|28.1|                   0.167| 21|      0|
|          0|  137.0|         40.0|              35.0|            168.0|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+------------------+-----------------+----+------------------------+---+-------+
only showing top 5 rows
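the fractional values above (e.g. 155.548 for Insulin) are column means, Imputer's default. if a median were preferred, say because Insulin is heavily skewed, the strategy parameter can be set explicitly; a sketch:

from pyspark.ml.feature import Imputer
imputer = Imputer(strategy="median",  # default is "mean"; missingValue defaults to NaN, matching the cleaning step above
                  inputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"],
                  outputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"])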

make feature vector

more research required

In [16]:
# combine all the features into one single feature vector.
cols = ds.columns
cols.remove("Outcome")
# Let us import the vector assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols,outputCol="features")
# Now let us use the transform method to transform our dataset
ds = assembler.transform(ds)
ds.select("features").show(truncate=False)
+-----------------------------------------------------------------------------------+
|features                                                                           |
+-----------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,155.5482233502538,33.6,0.627,50.0]                            |
|[1.0,85.0,66.0,29.0,155.5482233502538,26.6,0.351,31.0]                             |
|[8.0,183.0,64.0,29.153419593345657,155.5482233502538,23.3,0.672,32.0]              |
|[1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0]                                          |
|[0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0]                                        |
|[5.0,116.0,74.0,29.153419593345657,155.5482233502538,25.6,0.201,30.0]              |
|[3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0]                                          |
|[10.0,115.0,72.40518417462484,29.153419593345657,155.5482233502538,35.3,0.134,29.0]|
|[2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0]                                        |
|[8.0,125.0,96.0,29.153419593345657,155.5482233502538,32.45746367239099,0.232,54.0] |
|[4.0,110.0,92.0,29.153419593345657,155.5482233502538,37.6,0.191,30.0]              |
|[10.0,168.0,74.0,29.153419593345657,155.5482233502538,38.0,0.537,34.0]             |
|[10.0,139.0,80.0,29.153419593345657,155.5482233502538,27.1,1.441,57.0]             |
|[1.0,189.0,60.0,23.0,846.0,30.1,0.398,59.0]                                        |
|[5.0,166.0,72.0,19.0,175.0,25.8,0.587,51.0]                                        |
|[7.0,100.0,72.40518417462484,29.153419593345657,155.5482233502538,30.0,0.484,32.0] |
|[0.0,118.0,84.0,47.0,230.0,45.8,0.551,31.0]                                        |
|[7.0,107.0,74.0,29.153419593345657,155.5482233502538,29.6,0.254,31.0]              |
|[1.0,103.0,30.0,38.0,83.0,43.3,0.183,33.0]                                         |
|[1.0,115.0,70.0,30.0,96.0,34.6,0.529,32.0]                                         |
+-----------------------------------------------------------------------------------+
only showing top 20 rows

In machine learning and pattern recognition, a feature is an individual measurable property or characteristic of a phenomenon being observed. Choosing informative, discriminating and independent features is a crucial step for effective algorithms in pattern recognition, classification and regression. Features are usually numeric, but structural features such as strings and graphs are used in syntactic pattern recognition. The concept of "feature" is related to that of explanatory variable used in statistical techniques such as linear regression. [https://en.wikipedia.org/wiki/Feature_(machine_learning)]

apply feature scaling (standardization)

In [22]:
# Standard Scaler - scaling as described in [https://www.w3schools.com/python/python_ml_scale.asp]
from pyspark.ml.feature import StandardScaler
standardscaler = StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
ds = standardscaler.fit(ds).transform(ds)
ds.select("features","Scaled_features").show(5)
+--------------------+--------------------+
|            features|     Scaled_features|
+--------------------+--------------------+
|[6.0,148.0,72.0,3...|[1.78063837321943...|
|[1.0,85.0,66.0,29...|[0.29677306220323...|
|[8.0,183.0,64.0,2...|[2.37418449762590...|
|[1.0,89.0,66.0,23...|[0.29677306220323...|
|[0.0,137.0,40.0,3...|[0.0,4.5012560836...|
+--------------------+--------------------+
only showing top 5 rows

StandardScaler is an Estimator which can be fit on a dataset to produce a StandardScalerModel; this amounts to computing summary statistics. The model can then transform a Vector column in a dataset to have unit standard deviation and/or zero mean features.

Note that if the standard deviation of a feature is zero, it will return default 0.0 value in the Vector for that feature. [https://spark.apache.org/docs/latest/ml-features#standardscaler]
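by default Spark's StandardScaler divides by the standard deviation only (withStd=True, withMean=False), so the scaled features above are not centred at zero. a sketch of the explicit configuration:

from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="Scaled_features",
                        withStd=True,   # scale to unit standard deviation (the default)
                        withMean=False) # set True to also centre at zero (densifies sparse vectors)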

train / test data

In [23]:
# Train, test split
train, test = ds.randomSplit([0.8, 0.2], seed=12345)
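a quick sanity check on the split sizes (a sketch):

# randomSplit assigns each row probabilistically, so the counts only
# approximate the requested 80/20 ratio
print(train.count(), test.count())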

apply balancing ratio

much more info needed

In [26]:
# measure the class imbalance in the training set
train_size = float(train.select("Outcome").count())
numPositives = train.select("Outcome").where('Outcome == 1').count()
per_ones = (float(numPositives) / train_size) * 100
numNegatives = train_size - numPositives
print('The number of ones is {}'.format(numPositives))
print('Percentage of ones is {}'.format(per_ones))
The number of ones is 221
Percentage of ones is 35.24720893141946
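the same class counts can be read off directly with a groupBy (a sketch):

train.groupBy("Outcome").count().show()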
In [27]:
BalancingRatio = numNegatives / train_size
print('BalancingRatio = {}'.format(BalancingRatio))
BalancingRatio = 0.6475279106858054
In [28]:
# assign a class weight to each training row to correct the imbalance
train = train.withColumn("classWeights", when(train.Outcome == 1,BalancingRatio).otherwise(1-BalancingRatio))
train.select("classWeights").show(5)
+------------------+
|      classWeights|
+------------------+
|0.3524720893141946|
|0.3524720893141946|
|0.3524720893141946|
|0.3524720893141946|
|0.3524720893141946|
+------------------+
only showing top 5 rows
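the weights are chosen so that the two classes contribute equal total weight: numPositives * BalancingRatio ≈ numNegatives * (1 - BalancingRatio). a sketch to verify:

from pyspark.sql.functions import sum as spark_sum
# both classes should show approximately the same total weight
train.groupBy("Outcome").agg(spark_sum("classWeights").alias("totalWeight")).show()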

Feature selection

more info needed

In [29]:
# Feature selection using ChiSqSelector
from pyspark.ml.feature import ChiSqSelector
css = ChiSqSelector(featuresCol='Scaled_features',outputCol='Aspect',labelCol='Outcome',fpr=0.05)
css_model = css.fit(train)  # fit the selector on the training set only, to avoid data leakage
train = css_model.transform(train)
test = css_model.transform(test)
test.select("Aspect").show(5,truncate=False)
+----------------------------------------------------------------------------------------------------------------------------------------+
|Aspect                                                                                                                                  |
+----------------------------------------------------------------------------------------------------------------------------------------+
|[0.0,2.7598942410664704,6.778906518747398,3.526357045954172,1.4702231396384098,5.556241336417194,0.7032293726142661,1.955743806611537]  |
|[0.0,3.3184442660442084,5.290853868290652,1.93380870262003,1.8295247783934943,3.0544782215906037,0.7605742570763735,1.785679127775751]  |
|[0.0,3.3184442660442084,5.373523459982693,3.1850966866682846,1.8295247783934943,3.5781030595775647,0.7153019798694465,1.870711467193644]|
|[0.0,3.3513001498664283,6.2002193769031075,2.616329421191805,1.8295247783934943,4.720981710256498,1.726382837490816,1.785679127775751]  |
|[0.0,3.449867801333088,5.621532235058818,2.5025759680965094,1.8295247783934943,2.9090268777053367,0.7122838280556514,1.870711467193644] |
+----------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows
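note that in Spark ML the fpr threshold only takes effect when selectorType is set to 'fpr'; with the default selectorType ('numTopFeatures', default 50) all eight features pass through, which is consistent with the eight-component Aspect vectors above. a sketch of the explicit configuration:

from pyspark.ml.feature import ChiSqSelector
css = ChiSqSelector(featuresCol='Scaled_features', outputCol='Aspect', labelCol='Outcome',
                    selectorType='fpr',  # make the fpr=0.05 threshold actually apply
                    fpr=0.05)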

Building a classification model using Logistic Regression (LR)

In [30]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="Outcome", featuresCol="Aspect",weightCol="classWeights",maxIter=10)
model = lr.fit(train)
predict_train = model.transform(train)
predict_test = model.transform(test)
predict_test.select("Outcome","prediction").show(10)
+-------+----------+
|Outcome|prediction|
+-------+----------+
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      1|       1.0|
|      1|       1.0|
+-------+----------+
only showing top 10 rows

my results differ from the lecture notes; more info required (note: randomSplit with a fixed seed can still vary across Spark versions and data partitionings)

Evaluating the model

In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="Outcome")
predict_test.select("Outcome","rawPrediction","prediction","probability").show(5)
+-------+--------------------+----------+--------------------+
|Outcome|       rawPrediction|prediction|         probability|
+-------+--------------------+----------+--------------------+
|      0|[2.18942177236358...|       0.0|[0.89929555245987...|
|      0|[2.96897901803907...|       0.0|[0.95115286290599...|
|      0|[2.76972148989964...|       0.0|[0.94101753013781...|
|      0|[1.60574060539613...|       0.0|[0.83281918539839...|
|      0|[3.13428460989505...|       0.0|[0.95828500630564...|
+-------+--------------------+----------+--------------------+
only showing top 5 rows
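the evaluator defined above defaults to the areaUnderROC metric, so the test AUC can be computed directly (a sketch):

auc = evaluator.evaluate(predict_test)
print('Test areaUnderROC: {}'.format(auc))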

In [33]:
import matplotlib.pyplot as plt
pr = model.summary.pr.toPandas()
plt.plot(pr['recall'],pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()
In [34]:
print(model.summary)
<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary object at 0x7f97ad0639d0>
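print(model.summary) only shows the object repr; the training metrics live on the summary's attributes. a sketch:

training_summary = model.summary
print('Training areaUnderROC: {}'.format(training_summary.areaUnderROC))
# the roc and pr DataFrames used for plotting are also available:
training_summary.roc.show(5)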

Some conclusions to make sense of the analysis are still to be written.