Diabetes - dataset for machine learning

In [2]:
# raw_data = spark.read.format("csv").option("header","true").option("inferSchema", "true").load(r".\\diabetes.csv")
ds = spark.read.csv('diabetes.csv', inferSchema=True, header=True)
In [3]:
ds.show() # let me see a sample of the data
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|
|         10|    115|            0|            0|      0|35.3|                   0.134| 29|      0|
|          2|    197|           70|           45|    543|30.5|                   0.158| 53|      1|
|          8|    125|           96|            0|      0| 0.0|                   0.232| 54|      1|
|          4|    110|           92|            0|      0|37.6|                   0.191| 30|      0|
|         10|    168|           74|            0|      0|38.0|                   0.537| 34|      1|
|         10|    139|           80|            0|      0|27.1|                   1.441| 57|      0|
|          1|    189|           60|           23|    846|30.1|                   0.398| 59|      1|
|          5|    166|           72|           19|    175|25.8|                   0.587| 51|      1|
|          7|    100|            0|            0|      0|30.0|                   0.484| 32|      1|
|          0|    118|           84|           47|    230|45.8|                   0.551| 31|      1|
|          7|    107|           74|            0|      0|29.6|                   0.254| 31|      1|
|          1|    103|           30|           38|     83|43.3|                   0.183| 33|      0|
|          1|    115|           70|           30|     96|34.6|                   0.529| 32|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 20 rows

In [7]:
ds.count() # how many records are there in this dataset?
Out[7]:
768
In [4]:
ds.columns # list all the headings
Out[4]:
['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

check the statistics for each column

In [5]:
# a min value of 0 shows that some entries are invalid placeholders (human error) in the dataset
ds.describe().select("Summary","Pregnancies","Glucose","BloodPressure").show()
+-------+------------------+-----------------+------------------+
|Summary|       Pregnancies|          Glucose|     BloodPressure|
+-------+------------------+-----------------+------------------+
|  count|               768|              768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|
|    min|                 0|                0|                 0|
|    max|                17|              199|               122|
+-------+------------------+-----------------+------------------+

In [6]:
ds.describe().select("Summary","SkinThickness","Insulin").show()
+-------+------------------+------------------+
|Summary|     SkinThickness|           Insulin|
+-------+------------------+------------------+
|  count|               768|               768|
|   mean|20.536458333333332| 79.79947916666667|
| stddev|15.952217567727642|115.24400235133803|
|    min|                 0|                 0|
|    max|                99|               846|
+-------+------------------+------------------+

In [9]:
ds.describe().select("Summary","BMI","DiabetesPedigreeFunction","Age").show()
+-------+------------------+------------------------+------------------+
|Summary|               BMI|DiabetesPedigreeFunction|               Age|
+-------+------------------+------------------------+------------------+
|  count|               768|                     768|               768|
|   mean|31.992578124999977|      0.4718763020833327|33.240885416666664|
| stddev| 7.884160320375441|       0.331328595012775|11.760231540678689|
|    min|               0.0|                   0.078|                21|
|    max|              67.1|                    2.42|                81|
+-------+------------------+------------------------+------------------+
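as a quick check on that human-error hypothesis, the zero placeholders per column can be counted directly (a sketch, not part of the original run; column names as listed above):

from pyspark.sql.functions import col, count, when
zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
# count(when(...)) counts only the rows where the condition holds (non-matching rows become null)
ds.select([count(when(col(c) == 0, c)).alias(c) for c in zero_cols]).show()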

In [8]:
ds.printSchema() # another view of the data
root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)

clean the data

In [11]:
# replace the zero values with NaN as part of the data cleaning process
import numpy as np
from pyspark.sql.functions import when
ds = ds.withColumn("Glucose",when(ds.Glucose==0,np.nan).otherwise (ds.Glucose))
ds = ds.withColumn("BloodPressure",when(ds.BloodPressure==0,np.nan).otherwise(ds.BloodPressure))
ds = ds.withColumn("SkinThickness",when(ds.SkinThickness==0,np.nan).otherwise(ds.SkinThickness))
ds = ds.withColumn("BMI",when(ds.BMI==0,np.nan).otherwise(ds.BMI))
ds = ds.withColumn("Insulin",when(ds.Insulin==0,np.nan).otherwise(ds.Insulin))
In [12]:
ds.select("Insulin","Glucose","BloodPressure","SkinThickness","BMI").show(5) # check if the data is clean
+-------+-------+-------------+-------------+----+
|Insulin|Glucose|BloodPressure|SkinThickness| BMI|
+-------+-------+-------------+-------------+----+
|    NaN|  148.0|         72.0|         35.0|33.6|
|    NaN|   85.0|         66.0|         29.0|26.6|
|    NaN|  183.0|         64.0|          NaN|23.3|
|   94.0|   89.0|         66.0|         23.0|28.1|
|  168.0|  137.0|         40.0|         35.0|43.1|
+-------+-------+-------------+-------------+----+
only showing top 5 rows

In [15]:
# impute - replace the NaN values with the column mean (Imputer's default strategy)
from pyspark.ml.feature import Imputer
imputer = Imputer(inputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"],
                outputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"])
model = imputer.fit(ds)
ds = model.transform(ds)
ds.show(5) # did everything work as expected?
+-----------+-------+-------------+------------------+-----------------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|     SkinThickness|          Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+------------------+-----------------+----+------------------------+---+-------+
|          6|  148.0|         72.0|              35.0|155.5482233502538|33.6|                   0.627| 50|      1|
|          1|   85.0|         66.0|              29.0|155.5482233502538|26.6|                   0.351| 31|      0|
|          8|  183.0|         64.0|29.153419593345657|155.5482233502538|23.3|                   0.672| 32|      1|
|          1|   89.0|         66.0|              23.0|             94.0|28.1|                   0.167| 21|      0|
|          0|  137.0|         40.0|              35.0|            168.0|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+------------------+-----------------+----+------------------------+---+-------+
only showing top 5 rows
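the fractional values above (e.g. 155.548 for Insulin) are column means, Imputer's default. if a median were preferred, say because Insulin is heavily skewed, the strategy parameter can be set explicitly; a sketch:

from pyspark.ml.feature import Imputer
imputer = Imputer(strategy="median",  # default is "mean"; missingValue defaults to NaN, matching the cleaning step above
                  inputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"],
                  outputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"])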

make feature vector

more research required

In [16]:
# combine all the features into one single feature vector.
cols = ds.columns
cols.remove("Outcome")
# Let us import the vector assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols,outputCol="features")
# Now let us use the transform method to transform our dataset
ds = assembler.transform(ds)
ds.select("features").show(truncate=False)
+-----------------------------------------------------------------------------------+
|features                                                                           |
+-----------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,155.5482233502538,33.6,0.627,50.0]                            |
|[1.0,85.0,66.0,29.0,155.5482233502538,26.6,0.351,31.0]                             |
|[8.0,183.0,64.0,29.153419593345657,155.5482233502538,23.3,0.672,32.0]              |
|[1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0]                                          |
|[0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0]                                        |
|[5.0,116.0,74.0,29.153419593345657,155.5482233502538,25.6,0.201,30.0]              |
|[3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0]                                          |
|[10.0,115.0,72.40518417462484,29.153419593345657,155.5482233502538,35.3,0.134,29.0]|
|[2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0]                                        |
|[8.0,125.0,96.0,29.153419593345657,155.5482233502538,32.45746367239099,0.232,54.0] |
|[4.0,110.0,92.0,29.153419593345657,155.5482233502538,37.6,0.191,30.0]              |
|[10.0,168.0,74.0,29.153419593345657,155.5482233502538,38.0,0.537,34.0]             |
|[10.0,139.0,80.0,29.153419593345657,155.5482233502538,27.1,1.441,57.0]             |
|[1.0,189.0,60.0,23.0,846.0,30.1,0.398,59.0]                                        |
|[5.0,166.0,72.0,19.0,175.0,25.8,0.587,51.0]                                        |
|[7.0,100.0,72.40518417462484,29.153419593345657,155.5482233502538,30.0,0.484,32.0] |
|[0.0,118.0,84.0,47.0,230.0,45.8,0.551,31.0]                                        |
|[7.0,107.0,74.0,29.153419593345657,155.5482233502538,29.6,0.254,31.0]              |
|[1.0,103.0,30.0,38.0,83.0,43.3,0.183,33.0]                                         |
|[1.0,115.0,70.0,30.0,96.0,34.6,0.529,32.0]                                         |
+-----------------------------------------------------------------------------------+
only showing top 20 rows

In machine learning and pattern recognition, a feature is an individual measurable property or characteristic of a phenomenon being observed. Choosing informative, discriminating and independent features is a crucial step for effective algorithms in pattern recognition, classification and regression. Features are usually numeric, but structural features such as strings and graphs are used in syntactic pattern recognition. The concept of "feature" is related to that of explanatory variable used in statistical techniques such as linear regression. [https://en.wikipedia.org/wiki/Feature_(machine_learning)]

apply feature scaling (standardization)

In [22]:
# Standard Scaler - scaling as described in [https://www.w3schools.com/python/python_ml_scale.asp]
from pyspark.ml.feature import StandardScaler
standardscaler = StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
ds = standardscaler.fit(ds).transform(ds)
ds.select("features","Scaled_features").show(5)
+--------------------+--------------------+
|            features|     Scaled_features|
+--------------------+--------------------+
|[6.0,148.0,72.0,3...|[1.78063837321943...|
|[1.0,85.0,66.0,29...|[0.29677306220323...|
|[8.0,183.0,64.0,2...|[2.37418449762590...|
|[1.0,89.0,66.0,23...|[0.29677306220323...|
|[0.0,137.0,40.0,3...|[0.0,4.5012560836...|
+--------------------+--------------------+
only showing top 5 rows

StandardScaler is an Estimator which can be fit on a dataset to produce a StandardScalerModel; this amounts to computing summary statistics. The model can then transform a Vector column in a dataset to have unit standard deviation and/or zero mean features.

Note that if the standard deviation of a feature is zero, it will return default 0.0 value in the Vector for that feature. [https://spark.apache.org/docs/latest/ml-features#standardscaler]
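by default Spark's StandardScaler divides by the standard deviation only (withStd=True, withMean=False), so the scaled features above are not centred at zero. a sketch of the explicit configuration:

from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="Scaled_features",
                        withStd=True,   # scale to unit standard deviation (the default)
                        withMean=False) # set True to also centre at zero (densifies sparse vectors)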

train / test data

In [23]:
# Train, test split
train, test = ds.randomSplit([0.8, 0.2], seed=12345)
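a quick sanity check on the split sizes (a sketch):

# randomSplit assigns each row probabilistically, so the counts only
# approximate the requested 80/20 ratio
print(train.count(), test.count())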

apply balancing ratio

much more info needed

In [26]:
# measure the class imbalance in the training set
train_size = float(train.select("Outcome").count())
numPositives = train.select("Outcome").where('Outcome == 1').count()
per_ones = (float(numPositives) / train_size) * 100
numNegatives = train_size - numPositives
print('The number of ones is {}'.format(numPositives))
print('Percentage of ones is {}'.format(per_ones))
The number of ones is 221
Percentage of ones is 35.24720893141946
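the same class counts can be read off directly with a groupBy (a sketch):

train.groupBy("Outcome").count().show()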
In [27]:
BalancingRatio = numNegatives / train_size
print('BalancingRatio = {}'.format(BalancingRatio))
BalancingRatio = 0.6475279106858054
In [28]:
# assign a class weight to each training row to correct the imbalance
train = train.withColumn("classWeights", when(train.Outcome == 1,BalancingRatio).otherwise(1-BalancingRatio))
train.select("classWeights").show(5)
+------------------+
|      classWeights|
+------------------+
|0.3524720893141946|
|0.3524720893141946|
|0.3524720893141946|
|0.3524720893141946|
|0.3524720893141946|
+------------------+
only showing top 5 rows
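the weights are chosen so that the two classes contribute equal total weight: numPositives * BalancingRatio ≈ numNegatives * (1 - BalancingRatio). a sketch to verify:

from pyspark.sql.functions import sum as spark_sum
# both classes should show approximately the same total weight
train.groupBy("Outcome").agg(spark_sum("classWeights").alias("totalWeight")).show()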

Feature selection

more info needed

In [29]:
# Feature selection using ChiSqSelector
from pyspark.ml.feature import ChiSqSelector
css = ChiSqSelector(featuresCol='Scaled_features',outputCol='Aspect',labelCol='Outcome',fpr=0.05)
css_model = css.fit(train)  # fit the selector on the training set only, to avoid data leakage
train = css_model.transform(train)
test = css_model.transform(test)
test.select("Aspect").show(5,truncate=False)
+----------------------------------------------------------------------------------------------------------------------------------------+
|Aspect                                                                                                                                  |
+----------------------------------------------------------------------------------------------------------------------------------------+
|[0.0,2.7598942410664704,6.778906518747398,3.526357045954172,1.4702231396384098,5.556241336417194,0.7032293726142661,1.955743806611537]  |
|[0.0,3.3184442660442084,5.290853868290652,1.93380870262003,1.8295247783934943,3.0544782215906037,0.7605742570763735,1.785679127775751]  |
|[0.0,3.3184442660442084,5.373523459982693,3.1850966866682846,1.8295247783934943,3.5781030595775647,0.7153019798694465,1.870711467193644]|
|[0.0,3.3513001498664283,6.2002193769031075,2.616329421191805,1.8295247783934943,4.720981710256498,1.726382837490816,1.785679127775751]  |
|[0.0,3.449867801333088,5.621532235058818,2.5025759680965094,1.8295247783934943,2.9090268777053367,0.7122838280556514,1.870711467193644] |
+----------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows
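note that in Spark ML the fpr threshold only takes effect when selectorType is set to 'fpr'; with the default selectorType ('numTopFeatures', default 50) all eight features pass through, which is consistent with the eight-component Aspect vectors above. a sketch of the explicit configuration:

from pyspark.ml.feature import ChiSqSelector
css = ChiSqSelector(featuresCol='Scaled_features', outputCol='Aspect', labelCol='Outcome',
                    selectorType='fpr',  # make the fpr=0.05 threshold actually apply
                    fpr=0.05)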

Building a classification model using Logistic Regression (LR)

In [30]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="Outcome", featuresCol="Aspect",weightCol="classWeights",maxIter=10)
model = lr.fit(train)
predict_train = model.transform(train)
predict_test = model.transform(test)
predict_test.select("Outcome","prediction").show(10)
+-------+----------+
|Outcome|prediction|
+-------+----------+
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      1|       1.0|
|      1|       1.0|
+-------+----------+
only showing top 10 rows

my results differ from the lecture notes; more info required (note: randomSplit with a fixed seed can still vary across Spark versions and data partitionings)

Evaluating the model

In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="Outcome")
predict_test.select("Outcome","rawPrediction","prediction","probability").show(5)
+-------+--------------------+----------+--------------------+
|Outcome|       rawPrediction|prediction|         probability|
+-------+--------------------+----------+--------------------+
|      0|[2.18942177236358...|       0.0|[0.89929555245987...|
|      0|[2.96897901803907...|       0.0|[0.95115286290599...|
|      0|[2.76972148989964...|       0.0|[0.94101753013781...|
|      0|[1.60574060539613...|       0.0|[0.83281918539839...|
|      0|[3.13428460989505...|       0.0|[0.95828500630564...|
+-------+--------------------+----------+--------------------+
only showing top 5 rows
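the evaluator defined above defaults to the areaUnderROC metric, so the test AUC can be computed directly (a sketch):

auc = evaluator.evaluate(predict_test)
print('Test areaUnderROC: {}'.format(auc))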

In [33]:
import matplotlib.pyplot as plt
pr = model.summary.pr.toPandas()
plt.plot(pr['recall'],pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()
In [34]:
print(model.summary)
<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary object at 0x7f97ad0639d0>
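print(model.summary) only shows the object repr; the training metrics live on the summary's attributes. a sketch:

training_summary = model.summary
print('Training areaUnderROC: {}'.format(training_summary.areaUnderROC))
# the roc and pr DataFrames used for plotting are also available:
training_summary.roc.show(5)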

Some conclusions to make sense of the analysis are still to be written.