# Assumes an active SparkSession named spark (as provided by spark-shell or a notebook).
# Alternative reader syntax:
# raw_data = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("creditN.csv")
ds = spark.read.csv('creditN.csv', inferSchema=True, header=True)
ds.show(5)
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+
|creditability|balance|duration|history|purpose|amount|savings|employment|instPercent|sexMarried|guarantors|residenceDuration|assets|age|concCredit|apartment|credits|occupation|dependents|hasPhone|foreign|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+
|            1|      1|      18|      4|      2|  1049|      1|         2|          4|         2|         1|                4|     2| 21|         3|        1|      1|         3|         1|       1|      1|
|            1|      1|       9|      4|      0|  2799|      1|         3|          2|         3|         1|                2|     1| 36|         3|        1|      2|         3|         2|       1|      1|
|            1|      2|      12|      2|      9|   841|      2|         4|          2|         2|         1|                4|     1| 23|         3|        1|      1|         2|         1|       1|      1|
|            1|      1|      12|      4|      0|  2122|      1|         3|          3|         3|         1|                2|     1| 39|         3|        1|      2|         2|         2|       1|      2|
|            1|      1|      12|      4|      0|  2171|      1|         3|          4|         3|         1|                4|     2| 38|         1|        2|      2|         2|         1|       1|      2|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+
only showing top 5 rows
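# Optional sanity check (a sketch, not part of the original run): inspect the label
# distribution before modelling, since an imbalanced target affects both training
# and how the evaluation metrics should be read:
ds.groupBy('creditability').count().show()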
ds.columns
['creditability', 'balance', 'duration', 'history', 'purpose', 'amount', 'savings', 'employment', 'instPercent', 'sexMarried', 'guarantors', 'residenceDuration', 'assets', 'age', 'concCredit', 'apartment', 'credits', 'occupation', 'dependents', 'hasPhone', 'foreign']
cols = ds.columns
# Let us import the vector assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols,outputCol="features")
# Now let us use the transform method to transform our dataset
ds=assembler.transform(ds)
ds.select("features").show(truncate=False)
+-------------------------------------------------------------------------------------------+
|features                                                                                   |
+-------------------------------------------------------------------------------------------+
|[1.0,1.0,18.0,4.0,2.0,1049.0,1.0,2.0,4.0,2.0,1.0,4.0,2.0,21.0,3.0,1.0,1.0,3.0,1.0,1.0,1.0] |
|[1.0,1.0,9.0,4.0,0.0,2799.0,1.0,3.0,2.0,3.0,1.0,2.0,1.0,36.0,3.0,1.0,2.0,3.0,2.0,1.0,1.0]  |
|[1.0,2.0,12.0,2.0,9.0,841.0,2.0,4.0,2.0,2.0,1.0,4.0,1.0,23.0,3.0,1.0,1.0,2.0,1.0,1.0,1.0]  |
|[1.0,1.0,12.0,4.0,0.0,2122.0,1.0,3.0,3.0,3.0,1.0,2.0,1.0,39.0,3.0,1.0,2.0,2.0,2.0,1.0,2.0] |
|[1.0,1.0,12.0,4.0,0.0,2171.0,1.0,3.0,4.0,3.0,1.0,4.0,2.0,38.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0] |
|[1.0,1.0,10.0,4.0,0.0,2241.0,1.0,2.0,1.0,3.0,1.0,3.0,1.0,48.0,3.0,1.0,2.0,2.0,2.0,1.0,2.0] |
|[1.0,1.0,8.0,4.0,0.0,3398.0,1.0,4.0,1.0,3.0,1.0,4.0,1.0,39.0,3.0,2.0,2.0,2.0,1.0,1.0,2.0]  |
|[1.0,1.0,6.0,4.0,0.0,1361.0,1.0,2.0,2.0,3.0,1.0,4.0,1.0,40.0,3.0,2.0,1.0,2.0,2.0,1.0,2.0]  |
|[1.0,4.0,18.0,4.0,3.0,1098.0,1.0,1.0,4.0,2.0,1.0,4.0,3.0,65.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0] |
|[1.0,2.0,24.0,2.0,3.0,3758.0,3.0,1.0,1.0,2.0,1.0,4.0,4.0,23.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0] |
|[1.0,1.0,11.0,4.0,0.0,3905.0,1.0,3.0,2.0,3.0,1.0,2.0,1.0,36.0,3.0,1.0,2.0,3.0,2.0,1.0,1.0] |
|[1.0,1.0,30.0,4.0,1.0,6187.0,2.0,4.0,1.0,4.0,1.0,4.0,3.0,24.0,3.0,1.0,2.0,3.0,1.0,1.0,1.0] |
|[1.0,1.0,6.0,4.0,3.0,1957.0,1.0,4.0,1.0,2.0,1.0,4.0,3.0,31.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0]  |
|[1.0,2.0,48.0,3.0,10.0,7582.0,2.0,1.0,2.0,3.0,1.0,4.0,4.0,31.0,3.0,2.0,1.0,4.0,1.0,2.0,1.0]|
|[1.0,1.0,18.0,2.0,3.0,1936.0,5.0,4.0,2.0,4.0,1.0,4.0,3.0,23.0,3.0,1.0,2.0,2.0,1.0,1.0,1.0] |
|[1.0,1.0,6.0,2.0,3.0,2647.0,3.0,3.0,2.0,3.0,1.0,3.0,1.0,44.0,3.0,1.0,1.0,3.0,2.0,1.0,1.0]  |
|[1.0,1.0,11.0,4.0,0.0,3939.0,1.0,3.0,1.0,3.0,1.0,2.0,1.0,40.0,3.0,2.0,2.0,2.0,2.0,1.0,1.0] |
|[1.0,2.0,18.0,2.0,3.0,3213.0,3.0,2.0,1.0,4.0,1.0,3.0,1.0,25.0,3.0,1.0,1.0,3.0,1.0,1.0,1.0] |
|[1.0,2.0,36.0,4.0,3.0,2337.0,1.0,5.0,4.0,3.0,1.0,4.0,1.0,36.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0] |
|[1.0,4.0,11.0,4.0,0.0,7228.0,1.0,3.0,1.0,3.0,1.0,4.0,2.0,39.0,3.0,2.0,2.0,2.0,1.0,1.0,1.0] |
+-------------------------------------------------------------------------------------------+
only showing top 20 rows
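# Quick sanity check (a sketch): the assembled vector should have one slot per input
# column, and slot 0 here is the label itself, which is why this vector must not be
# fed to a model. len() works for both dense and sparse pyspark.ml vectors:
first_vec = ds.select("features").first()["features"]
print(len(cols), len(first_vec))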
# Let us combine all the predictors into one single feature vector, excluding the label. This step is compulsory.
cols = ds.columns
cols.remove("creditability")
# Also remove the demonstration "features" column built above: VectorAssembler flattens vector
# inputs, so leaving it in would smuggle the label into the new vector and leak it to the model.
cols.remove("features")
# Reuse the VectorAssembler, this time writing the clean vector to a new column
assembler = VectorAssembler(inputCols=cols, outputCol="featuresNN")
datasetFv = assembler.transform(ds)
datasetFv.printSchema()
datasetFv.show(10)
root
 |-- creditability: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- history: integer (nullable = true)
 |-- purpose: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- savings: integer (nullable = true)
 |-- employment: integer (nullable = true)
 |-- instPercent: integer (nullable = true)
 |-- sexMarried: integer (nullable = true)
 |-- guarantors: integer (nullable = true)
 |-- residenceDuration: integer (nullable = true)
 |-- assets: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- concCredit: integer (nullable = true)
 |-- apartment: integer (nullable = true)
 |-- credits: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- dependents: integer (nullable = true)
 |-- hasPhone: integer (nullable = true)
 |-- foreign: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- featuresNN: vector (nullable = true)

+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+
|creditability|balance|duration|history|purpose|amount|savings|employment|instPercent|sexMarried|guarantors|residenceDuration|assets|age|concCredit|apartment|credits|occupation|dependents|hasPhone|foreign|            features|          featuresNN|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+
|            1|      1|      18|      4|      2|  1049|      1|         2|          4|         2|         1|                4|     2| 21|         3|        1|      1|         3|         1|       1|      1|[1.0,1.0,18.0,4.0...|[1.0,18.0,4.0,2.0...|
|            1|      1|       9|      4|      0|  2799|      1|         3|          2|         3|         1|                2|     1| 36|         3|        1|      2|         3|         2|       1|      1|[1.0,1.0,9.0,4.0,...|[1.0,9.0,4.0,0.0,...|
|            1|      2|      12|      2|      9|   841|      2|         4|          2|         2|         1|                4|     1| 23|         3|        1|      1|         2|         1|       1|      1|[1.0,2.0,12.0,2.0...|[2.0,12.0,2.0,9.0...|
|            1|      1|      12|      4|      0|  2122|      1|         3|          3|         3|         1|                2|     1| 39|         3|        1|      2|         2|         2|       1|      2|[1.0,1.0,12.0,4.0...|[1.0,12.0,4.0,0.0...|
|            1|      1|      12|      4|      0|  2171|      1|         3|          4|         3|         1|                4|     2| 38|         1|        2|      2|         2|         1|       1|      2|[1.0,1.0,12.0,4.0...|[1.0,12.0,4.0,0.0...|
|            1|      1|      10|      4|      0|  2241|      1|         2|          1|         3|         1|                3|     1| 48|         3|        1|      2|         2|         2|       1|      2|[1.0,1.0,10.0,4.0...|[1.0,10.0,4.0,0.0...|
|            1|      1|       8|      4|      0|  3398|      1|         4|          1|         3|         1|                4|     1| 39|         3|        2|      2|         2|         1|       1|      2|[1.0,1.0,8.0,4.0,...|[1.0,8.0,4.0,0.0,...|
|            1|      1|       6|      4|      0|  1361|      1|         2|          2|         3|         1|                4|     1| 40|         3|        2|      1|         2|         2|       1|      2|[1.0,1.0,6.0,4.0,...|[1.0,6.0,4.0,0.0,...|
|            1|      4|      18|      4|      3|  1098|      1|         1|          4|         2|         1|                4|     3| 65|         3|        2|      2|         1|         1|       1|      1|[1.0,4.0,18.0,4.0...|[4.0,18.0,4.0,3.0...|
|            1|      2|      24|      2|      3|  3758|      3|         1|          1|         2|         1|                4|     4| 23|         3|        1|      1|         1|         1|       1|      1|[1.0,2.0,24.0,2.0...|[2.0,24.0,2.0,3.0...|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+
only showing top 10 rows
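# Sanity check (a sketch): featuresNN should have one slot per predictor, i.e. 20
# (21 columns minus the label). If this printed 41 instead, the stale "features"
# vector leaked into the assembly (see the note above):
print(len(datasetFv.select("featuresNN").first()["featuresNN"]))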
from pyspark.ml.classification import RandomForestClassifier
train, test = datasetFv.randomSplit([0.8, 0.2], seed=12345)
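# randomSplit is proportional but approximate and not stratified, so it is worth
# confirming the resulting sizes before training:
print(train.count(), test.count())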
rf = RandomForestClassifier(featuresCol='featuresNN', labelCol='creditability')
model = rf.fit(train)
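# Once fitted, the forest exposes per-feature importances; zipping them with the
# input column names makes them readable (a sketch using the fitted model's
# featureImportances attribute, converted to a plain array):
importances = model.featureImportances.toArray()
for name, score in sorted(zip(cols, importances), key=lambda t: -t[1]):
    print(name, round(float(score), 4))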
# Run the model on the test dataset to get predictions
predictionsDf = model.transform(test)
predictionsDf.show(10)
# Checking prediction values against the true labels
predictionsDf.select(['creditability', 'prediction', 'probability']).show()
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+
|creditability|balance|duration|history|purpose|amount|savings|employment|instPercent|sexMarried|guarantors|residenceDuration|assets|age|concCredit|apartment|credits|occupation|dependents|hasPhone|foreign|            features|          featuresNN|       rawPrediction|         probability|prediction|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+
|            0|      1|       9|      2|      3|  1366|      1|         2|          3|         2|         1|                4|     2| 22|         3|        1|      1|         3|         1|       1|      1|[0.0,1.0,9.0,2.0,...|[1.0,9.0,2.0,3.0,...|[15.9951224117837...|[0.79975612058918...|       0.0|
|            0|      1|      12|      2|      4|   741|      2|         1|          4|         2|         1|                3|     2| 22|         3|        2|      1|         3|         1|       1|      1|[0.0,1.0,12.0,2.0...|[1.0,12.0,2.0,4.0...|[16.8028147194760...|[0.84014073597380...|       0.0|
|            0|      1|      12|      2|      6|   684|      1|         3|          4|         3|         1|                4|     3| 40|         3|        1|      1|         2|         2|       1|      1|[0.0,1.0,12.0,2.0...|[1.0,12.0,2.0,6.0...|[16.0526342278021...|[0.80263171139010...|       0.0|
|            0|      1|      12|      4|      0|  3499|      1|         3|          3|         2|         2|                2|     1| 29|         3|        2|      2|         3|         1|       1|      1|[0.0,1.0,12.0,4.0...|[1.0,12.0,4.0,0.0...|[13.5933760002454...|[0.67966880001227...|       0.0|
|            0|      1|      16|      4|      0|  2625|      1|         5|          2|         3|         3|                4|     2| 43|         3|        1|      1|         3|         1|       2|      2|[0.0,1.0,16.0,4.0...|[1.0,16.0,4.0,0.0...|[12.0090054942415...|[0.60045027471207...|       0.0|
|            0|      1|      18|      2|      3|  1345|      1|         3|          4|         4|         1|                3|     1| 26|         3|        2|      1|         3|         1|       1|      1|[0.0,1.0,18.0,2.0...|[1.0,18.0,2.0,3.0...|[16.8230584702264...|[0.84115292351132...|       0.0|
|            0|      1|      18|      2|      3|  3190|      1|         3|          2|         2|         1|                2|     1| 24|         3|        2|      1|         3|         1|       1|      1|[0.0,1.0,18.0,2.0...|[1.0,18.0,2.0,3.0...|[16.9772417229781...|[0.84886208614890...|       0.0|
|            0|      1|      24|      2|      0|  1207|      1|         2|          4|         2|         1|                4|     2| 24|         3|        1|      1|         3|         1|       1|      1|[0.0,1.0,24.0,2.0...|[1.0,24.0,2.0,0.0...|[17.7838855157080...|[0.88919427578540...|       0.0|
|            0|      1|      33|      4|      2|  4281|      3|         3|          1|         2|         1|                4|     3| 23|         3|        2|      2|         3|         1|       1|      1|[0.0,1.0,33.0,4.0...|[1.0,33.0,4.0,2.0...|[16.7344762022007...|[0.83672381011003...|       0.0|
|            0|      1|      36|      2|      0|  9271|      1|         4|          2|         3|         1|                1|     3| 24|         3|        2|      1|         3|         1|       2|      1|[0.0,1.0,36.0,2.0...|[1.0,36.0,2.0,0.0...|[18.8316317873303...|[0.94158158936651...|       0.0|
+-------------+-------+--------+-------+-------+------+-------+----------+-----------+----------+----------+-----------------+------+---+----------+---------+-------+----------+----------+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 10 rows

+-------------+----------+--------------------+
|creditability|prediction|         probability|
+-------------+----------+--------------------+
|            0|       0.0|[0.79975612058918...|
|            0|       0.0|[0.84014073597380...|
|            0|       0.0|[0.80263171139010...|
|            0|       0.0|[0.67966880001227...|
|            0|       0.0|[0.60045027471207...|
|            0|       0.0|[0.84115292351132...|
|            0|       0.0|[0.84886208614890...|
|            0|       0.0|[0.88919427578540...|
|            0|       0.0|[0.83672381011003...|
|            0|       0.0|[0.94158158936651...|
|            0|       0.0|[0.78512994077297...|
|            0|       0.0|[0.92388640873015...|
|            0|       0.0|[0.70667109480431...|
|            0|       0.0|[0.77753491765686...|
|            0|       0.0|[0.84072269870947...|
|            0|       0.0|[0.69245448253822...|
|            0|       0.0|[0.80829365786165...|
|            0|       0.0|[0.75069415625101...|
|            0|       0.0|[0.84936745272271...|
|            0|       0.0|[0.73319036326455...|
+-------------+----------+--------------------+
only showing top 20 rows
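# Beyond eyeballing individual rows, a confusion matrix summarises where the model
# errs; groupBy on the label and prediction columns is a simple way to get one:
predictionsDf.groupBy('creditability', 'prediction').count().show()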
# Evaluate the model with BinaryClassificationEvaluator, using the area under the ROC curve
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='creditability', metricName='areaUnderROC')
auc = evaluator.evaluate(predictionsDf)
print(auc)
1.0
# A perfect AUC on held-out data is a red flag rather than a success: it almost always
# means the label leaked into the features. That is exactly what happens if the stale
# "features" vector (which embeds creditability) is left in cols when assembling
# featuresNN; with cols.remove("features") applied above, expect a realistic score below 1.0.
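# areaUnderROC is a ranking metric, not accuracy. For plain accuracy on the same
# predictions, a minimal sketch with MulticlassClassificationEvaluator (it handles
# binary labels as a two-class problem):
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_evaluator = MulticlassClassificationEvaluator(labelCol='creditability', predictionCol='prediction', metricName='accuracy')
print(acc_evaluator.evaluate(predictionsDf))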