In [6]:
# Alternative: spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("creditN.csv")
ds = spark.read.csv('creditN.csv', inferSchema=True, header=True)
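This assumes a SparkSession named spark was created in an earlier cell; a minimal sketch of that setup (the app name is arbitrary):

from pyspark.sql import SparkSession

# Reuse an existing session or start a new one
spark = SparkSession.builder.appName("credit-rf").getOrCreate()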
In [7]:
ds.show(5)
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+
|creditability|balance|duration|history|purpose|amount|savings|employment|instPercent|sexMarried|guarantors|residenceDuration|assets|age|concCredit|apartment|credits|occupation|dependents|hasPhone|foreign|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+
|            1|      1|      18|      4|      2|  1049|      1|         2|          4|         2|         1|                4|     2| 21|         3|        1|      1|         3|         1|       1|      1|
|            1|      1|       9|      4|      0|  2799|      1|         3|          2|         3|         1|                2|     1| 36|         3|        1|      2|         3|         2|       1|      1|
|            1|      2|      12|      2|      9|   841|      2|         4|          2|         2|         1|                4|     1| 23|         3|        1|      1|         2|         1|       1|      1|
|            1|      1|      12|      4|      0|  2122|      1|         3|          3|         3|         1|                2|     1| 39|         3|        1|      2|         2|         2|       1|      2|
|            1|      1|      12|      4|      0|  2171|      1|         3|          4|         3|         1|                4|     2| 38|         1|        2|      2|         2|         1|       1|      2|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+
only showing top 5 rows
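A couple of quick sanity checks on the load; a minimal sketch (the German credit dataset is commonly distributed with 1,000 rows):

# Row count and summary statistics for two numeric columns
print(ds.count())
ds.describe('amount', 'age').show()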

In [8]:
ds.columns
Out[8]:
['creditability',
 'balance',
 'duration',
 'history',
 'purpose',
 'amount',
 'savings',
 'employment',
 'instPercent',
 'sexMarried',
 'guarantors',
 'residenceDuration',
 'assets',
 'age',
 'concCredit',
 'apartment',
 'credits',
 'occupation',
 'dependents',
 'hasPhone',
 'foreign']
In [9]:
cols = ds.columns

# Import VectorAssembler and combine every column into a single feature vector
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols, outputCol="features")
# Transform the dataset. Note that this first vector still contains the label
# column "creditability"; it is rebuilt without the label in the cells below.
ds = assembler.transform(ds)
ds.select("features").show(truncate=False)
+-------------------------------------------------------------------------------------------+
|features                                                                                   |
+-------------------------------------------------------------------------------------------+
|[1.0,1.0,18.0,4.0,2.0,1049.0,1.0,2.0,4.0,2.0,1.0,4.0,2.0,21.0,3.0,1.0,1.0,3.0,1.0,1.0,1.0] |
|[1.0,1.0,9.0,4.0,0.0,2799.0,1.0,3.0,2.0,3.0,1.0,2.0,1.0,36.0,3.0,1.0,2.0,3.0,2.0,1.0,1.0]  |
|[1.0,2.0,12.0,2.0,9.0,841.0,2.0,4.0,2.0,2.0,1.0,4.0,1.0,23.0,3.0,1.0,1.0,2.0,1.0,1.0,1.0]  |
|[1.0,1.0,12.0,4.0,0.0,2122.0,1.0,3.0,3.0,3.0,1.0,2.0,1.0,39.0,3.0,1.0,2.0,2.0,2.0,1.0,2.0] |
|[1.0,1.0,12.0,4.0,0.0,2171.0,1.0,3.0,4.0,3.0,1.0,4.0,2.0,38.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0] |
|[1.0,1.0,10.0,4.0,0.0,2241.0,1.0,2.0,1.0,3.0,1.0,3.0,1.0,48.0,3.0,1.0,2.0,2.0,2.0,1.0,2.0] |
|[1.0,1.0,8.0,4.0,0.0,3398.0,1.0,4.0,1.0,3.0,1.0,4.0,1.0,39.0,3.0,2.0,2.0,2.0,1.0,1.0,2.0]  |
|[1.0,1.0,6.0,4.0,0.0,1361.0,1.0,2.0,2.0,3.0,1.0,4.0,1.0,40.0,3.0,2.0,1.0,2.0,2.0,1.0,2.0]  |
|[1.0,4.0,18.0,4.0,3.0,1098.0,1.0,1.0,4.0,2.0,1.0,4.0,3.0,65.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0] |
|[1.0,2.0,24.0,2.0,3.0,3758.0,3.0,1.0,1.0,2.0,1.0,4.0,4.0,23.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0] |
|[1.0,1.0,11.0,4.0,0.0,3905.0,1.0,3.0,2.0,3.0,1.0,2.0,1.0,36.0,3.0,1.0,2.0,3.0,2.0,1.0,1.0] |
|[1.0,1.0,30.0,4.0,1.0,6187.0,2.0,4.0,1.0,4.0,1.0,4.0,3.0,24.0,3.0,1.0,2.0,3.0,1.0,1.0,1.0] |
|[1.0,1.0,6.0,4.0,3.0,1957.0,1.0,4.0,1.0,2.0,1.0,4.0,3.0,31.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0]  |
|[1.0,2.0,48.0,3.0,10.0,7582.0,2.0,1.0,2.0,3.0,1.0,4.0,4.0,31.0,3.0,2.0,1.0,4.0,1.0,2.0,1.0]|
|[1.0,1.0,18.0,2.0,3.0,1936.0,5.0,4.0,2.0,4.0,1.0,4.0,3.0,23.0,3.0,1.0,2.0,2.0,1.0,1.0,1.0] |
|[1.0,1.0,6.0,2.0,3.0,2647.0,3.0,3.0,2.0,3.0,1.0,3.0,1.0,44.0,3.0,1.0,1.0,3.0,2.0,1.0,1.0]  |
|[1.0,1.0,11.0,4.0,0.0,3939.0,1.0,3.0,1.0,3.0,1.0,2.0,1.0,40.0,3.0,2.0,2.0,2.0,2.0,1.0,1.0] |
|[1.0,2.0,18.0,2.0,3.0,3213.0,3.0,2.0,1.0,4.0,1.0,3.0,1.0,25.0,3.0,1.0,1.0,3.0,1.0,1.0,1.0] |
|[1.0,2.0,36.0,4.0,3.0,2337.0,1.0,5.0,4.0,3.0,1.0,4.0,1.0,36.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0] |
|[1.0,4.0,11.0,4.0,0.0,7228.0,1.0,3.0,1.0,3.0,1.0,4.0,2.0,39.0,3.0,2.0,2.0,2.0,1.0,1.0,1.0] |
+-------------------------------------------------------------------------------------------+
only showing top 20 rows

In [11]:
# Combine all predictor columns into a single feature vector; Spark ML
# estimators require this. The label column "creditability" must be dropped
# from the inputs, otherwise the model would train on its own target.
cols = ds.columns
cols.remove("creditability")
In [12]:
# Build a second assembler over the predictor columns only
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols, outputCol="featuresNN")
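As an aside, the assembler and a classifier can be chained into a single pyspark.ml.Pipeline so that feature assembly and model fitting travel together; a minimal sketch (raw_train is a hypothetical split of the untransformed DataFrame):

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol='featuresNN', labelCol='creditability')
pipeline = Pipeline(stages=[assembler, rf])
# pipeline_model = pipeline.fit(raw_train)  # fits the assembler and the forest in one call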
In [13]:
# Use the transform method to append the new feature vector to the dataset
datasetFv = assembler.transform(ds)
datasetFv.printSchema()
datasetFv.show(10)
root
 |-- creditability: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- history: integer (nullable = true)
 |-- purpose: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- savings: integer (nullable = true)
 |-- employment: integer (nullable = true)
 |-- instPercent: integer (nullable = true)
 |-- sexMarried: integer (nullable = true)
 |-- guarantors: integer (nullable = true)
 |-- residenceDuration: integer (nullable = true)
 |-- assets: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- concCredit: integer (nullable = true)
 |-- apartment: integer (nullable = true)
 |-- credits: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- dependents: integer (nullable = true)
 |-- hasPhone: integer (nullable = true)
 |-- foreign: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- featuresNN: vector (nullable = true)

+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+
|creditability|balance|duration|history|purpose|amount|savings|employment|instPercent|sexMarried|guarantors|residenceDuration|assets|age|concCredit|apartment|credits|occupation|dependents|hasPhone|foreign|            features|          featuresNN|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+
|            1|      1|      18|      4|      2|  1049|      1|         2|          4|         2|         1|                4|     2| 21|         3|        1|      1|         3|         1|       1|      1|[1.0,1.0,18.0,4.0...|[1.0,18.0,4.0,2.0...|
|            1|      1|       9|      4|      0|  2799|      1|         3|          2|         3|         1|                2|     1| 36|         3|        1|      2|         3|         2|       1|      1|[1.0,1.0,9.0,4.0,...|[1.0,9.0,4.0,0.0,...|
|            1|      2|      12|      2|      9|   841|      2|         4|          2|         2|         1|                4|     1| 23|         3|        1|      1|         2|         1|       1|      1|[1.0,2.0,12.0,2.0...|[2.0,12.0,2.0,9.0...|
|            1|      1|      12|      4|      0|  2122|      1|         3|          3|         3|         1|                2|     1| 39|         3|        1|      2|         2|         2|       1|      2|[1.0,1.0,12.0,4.0...|[1.0,12.0,4.0,0.0...|
|            1|      1|      12|      4|      0|  2171|      1|         3|          4|         3|         1|                4|     2| 38|         1|        2|      2|         2|         1|       1|      2|[1.0,1.0,12.0,4.0...|[1.0,12.0,4.0,0.0...|
|            1|      1|      10|      4|      0|  2241|      1|         2|          1|         3|         1|                3|     1| 48|         3|        1|      2|         2|         2|       1|      2|[1.0,1.0,10.0,4.0...|[1.0,10.0,4.0,0.0...|
|            1|      1|       8|      4|      0|  3398|      1|         4|          1|         3|         1|                4|     1| 39|         3|        2|      2|         2|         1|       1|      2|[1.0,1.0,8.0,4.0,...|[1.0,8.0,4.0,0.0,...|
|            1|      1|       6|      4|      0|  1361|      1|         2|          2|         3|         1|                4|     1| 40|         3|        2|      1|         2|         2|       1|      2|[1.0,1.0,6.0,4.0,...|[1.0,6.0,4.0,0.0,...|
|            1|      4|      18|      4|      3|  1098|      1|         1|          4|         2|         1|                4|     3| 65|         3|        2|      2|         1|         1|       1|      1|[1.0,4.0,18.0,4.0...|[4.0,18.0,4.0,3.0...|
|            1|      2|      24|      2|      3|  3758|      3|         1|          1|         2|         1|                4|     4| 23|         3|        1|      1|         1|         1|       1|      1|[1.0,2.0,24.0,2.0...|[2.0,24.0,2.0,3.0...|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+
only showing top 10 rows

In [14]:
from pyspark.ml.classification import RandomForestClassifier
train, test = datasetFv.randomSplit([0.8, 0.2], seed=12345)
Random_Forest = RandomForestClassifier(featuresCol='featuresNN', labelCol='creditability')
model = Random_Forest.fit(train)
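The fitted model exposes featureImportances, a vector aligned with the assembler's inputCols, so the scores can be paired with the column names in cols; a short sketch:

# Rank predictor columns by the forest's importance scores
importances = sorted(zip(cols, model.featureImportances.toArray()),
                     key=lambda kv: kv[1], reverse=True)
for name, score in importances[:5]:
    print(name, round(score, 4))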
In [15]:
# Run the model on the test set to get predictions

predictionsDf = model.transform(test)

predictionsDf.show(10)

# Check the predicted label and class probabilities against the actual label

predictionsDf.select(['creditability', 'prediction', 'probability']).show()
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+
|creditability|balance|duration|history|purpose|amount|savings|employment|instPercent|sexMarried|guarantors|residenceDuration|assets|age|concCredit|apartment|credits|occupation|dependents|hasPhone|foreign|            features|          featuresNN|       rawPrediction|         probability|prediction|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+
|            0|      1|       9|      2|      3|  1366|      1|         2|          3|         2|         1|                4|     2| 22|         3|        1|      1|         3|         1|       1|      1|[0.0,1.0,9.0,2.0,...|[1.0,9.0,2.0,3.0,...|[15.9951224117837...|[0.79975612058918...|       0.0|
|            0|      1|      12|      2|      4|   741|      2|         1|          4|         2|         1|                3|     2| 22|         3|        2|      1|         3|         1|       1|      1|[0.0,1.0,12.0,2.0...|[1.0,12.0,2.0,4.0...|[16.8028147194760...|[0.84014073597380...|       0.0|
|            0|      1|      12|      2|      6|   684|      1|         3|          4|         3|         1|                4|     3| 40|         3|        1|      1|         2|         2|       1|      1|[0.0,1.0,12.0,2.0...|[1.0,12.0,2.0,6.0...|[16.0526342278021...|[0.80263171139010...|       0.0|
|            0|      1|      12|      4|      0|  3499|      1|         3|          3|         2|         2|                2|     1| 29|         3|        2|      2|         3|         1|       1|      1|[0.0,1.0,12.0,4.0...|[1.0,12.0,4.0,0.0...|[13.5933760002454...|[0.67966880001227...|       0.0|
|            0|      1|      16|      4|      0|  2625|      1|         5|          2|         3|         3|                4|     2| 43|         3|        1|      1|         3|         1|       2|      2|[0.0,1.0,16.0,4.0...|[1.0,16.0,4.0,0.0...|[12.0090054942415...|[0.60045027471207...|       0.0|
|            0|      1|      18|      2|      3|  1345|      1|         3|          4|         4|         1|                3|     1| 26|         3|        2|      1|         3|         1|       1|      1|[0.0,1.0,18.0,2.0...|[1.0,18.0,2.0,3.0...|[16.8230584702264...|[0.84115292351132...|       0.0|
|            0|      1|      18|      2|      3|  3190|      1|         3|          2|         2|         1|                2|     1| 24|         3|        2|      1|         3|         1|       1|      1|[0.0,1.0,18.0,2.0...|[1.0,18.0,2.0,3.0...|[16.9772417229781...|[0.84886208614890...|       0.0|
|            0|      1|      24|      2|      0|  1207|      1|         2|          4|         2|         1|                4|     2| 24|         3|        1|      1|         3|         1|       1|      1|[0.0,1.0,24.0,2.0...|[1.0,24.0,2.0,0.0...|[17.7838855157080...|[0.88919427578540...|       0.0|
|            0|      1|      33|      4|      2|  4281|      3|         3|          1|         2|         1|                4|     3| 23|         3|        2|      2|         3|         1|       1|      1|[0.0,1.0,33.0,4.0...|[1.0,33.0,4.0,2.0...|[16.7344762022007...|[0.83672381011003...|       0.0|
|            0|      1|      36|      2|      0|  9271|      1|         4|          2|         3|         1|                1|     3| 24|         3|        2|      1|         3|         1|       2|      1|[0.0,1.0,36.0,2.0...|[1.0,36.0,2.0,0.0...|[18.8316317873303...|[0.94158158936651...|       0.0|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 10 rows

+-------------+----------+--------------------+
|creditability|prediction|         probability|
+-------------+----------+--------------------+
|            0|       0.0|[0.79975612058918...|
|            0|       0.0|[0.84014073597380...|
|            0|       0.0|[0.80263171139010...|
|            0|       0.0|[0.67966880001227...|
|            0|       0.0|[0.60045027471207...|
|            0|       0.0|[0.84115292351132...|
|            0|       0.0|[0.84886208614890...|
|            0|       0.0|[0.88919427578540...|
|            0|       0.0|[0.83672381011003...|
|            0|       0.0|[0.94158158936651...|
|            0|       0.0|[0.78512994077297...|
|            0|       0.0|[0.92388640873015...|
|            0|       0.0|[0.70667109480431...|
|            0|       0.0|[0.77753491765686...|
|            0|       0.0|[0.84072269870947...|
|            0|       0.0|[0.69245448253822...|
|            0|       0.0|[0.80829365786165...|
|            0|       0.0|[0.75069415625101...|
|            0|       0.0|[0.84936745272271...|
|            0|       0.0|[0.73319036326455...|
+-------------+----------+--------------------+
only showing top 20 rows
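The probability column holds ML vectors rather than plain floats. On Spark 3.0 or later, pyspark.ml.functions.vector_to_array can expose the positive-class probability as an ordinary column; a minimal sketch:

from pyspark.sql import functions as F
from pyspark.ml.functions import vector_to_array

# Element 1 of the probability vector is P(creditability = 1)
probs = predictionsDf.withColumn("p1", vector_to_array("probability")[1])
probs.select("creditability", "prediction", F.round("p1", 4).alias("p1")).show(5)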

In [16]:
# Evaluate the model with BinaryClassificationEvaluator, using area under the ROC curve

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='creditability', metricName='areaUnderROC')
auc = evaluator.evaluate(predictionsDf)
print(auc)
1.0
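Note that areaUnderROC is a ranking metric, not accuracy, which is why the variable is named auc above; a perfect 1.0 on held-out data is unusual and worth double-checking. Plain accuracy can be computed with MulticlassClassificationEvaluator; a minimal sketch:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_evaluator = MulticlassClassificationEvaluator(
    labelCol='creditability', predictionCol='prediction', metricName='accuracy')
print(acc_evaluator.evaluate(predictionsDf))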