In [0]:
# every package has to install each time googlecolab runs jyputernoot book.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
# starts the spark only three steps
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
from google.colab import drive
drive.mount('/content/drive')
dataset = spark.read.csv('drive/My Drive/boston_housing.csv',inferSchema=True, header =True)
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
In [0]:
dataset.columns
dataset.show()
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2|386.63|29.93|16.5|
|0.17004|12.5| 7.87|   0|0.524|6.004| 85.9|6.5921|  5|311|   15.2|386.71| 17.1|18.9|
|0.22489|12.5| 7.87|   0|0.524|6.377| 94.3|6.3467|  5|311|   15.2|392.52|20.45|15.0|
|0.11747|12.5| 7.87|   0|0.524|6.009| 82.9|6.2267|  5|311|   15.2| 396.9|13.27|18.9|
|0.09378|12.5| 7.87|   0|0.524|5.889| 39.0|5.4509|  5|311|   15.2| 390.5|15.71|21.7|
|0.62976| 0.0| 8.14|   0|0.538|5.949| 61.8|4.7075|  4|307|   21.0| 396.9| 8.26|20.4|
|0.63796| 0.0| 8.14|   0|0.538|6.096| 84.5|4.4619|  4|307|   21.0|380.02|10.26|18.2|
|0.62739| 0.0| 8.14|   0|0.538|5.834| 56.5|4.4986|  4|307|   21.0|395.62| 8.47|19.9|
|1.05393| 0.0| 8.14|   0|0.538|5.935| 29.3|4.4986|  4|307|   21.0|386.85| 6.58|23.1|
| 0.7842| 0.0| 8.14|   0|0.538| 5.99| 81.7|4.2579|  4|307|   21.0|386.75|14.67|17.5|
|0.80271| 0.0| 8.14|   0|0.538|5.456| 36.6|3.7965|  4|307|   21.0|288.99|11.69|20.2|
| 0.7258| 0.0| 8.14|   0|0.538|5.727| 69.5|3.7965|  4|307|   21.0|390.95|11.28|18.2|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
only showing top 20 rows

In [0]:
feature_columns = dataset.columns[:-1] 
feature_columns
Out[0]:
['crim',
 'zn',
 'indus',
 'chas',
 'nox',
 'rm',
 'age',
 'dis',
 'rad',
 'tax',
 'ptratio',
 'b',
 'lstat']
In [0]:
#@title

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=feature_columns,outputCol="features")
data_2 = assembler.transform(dataset)
data_2.show()
data_2.select("features").show()
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|            features|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|[0.08829,12.5,7.8...|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|[0.14455,12.5,7.8...|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2|386.63|29.93|16.5|[0.21124,12.5,7.8...|
|0.17004|12.5| 7.87|   0|0.524|6.004| 85.9|6.5921|  5|311|   15.2|386.71| 17.1|18.9|[0.17004,12.5,7.8...|
|0.22489|12.5| 7.87|   0|0.524|6.377| 94.3|6.3467|  5|311|   15.2|392.52|20.45|15.0|[0.22489,12.5,7.8...|
|0.11747|12.5| 7.87|   0|0.524|6.009| 82.9|6.2267|  5|311|   15.2| 396.9|13.27|18.9|[0.11747,12.5,7.8...|
|0.09378|12.5| 7.87|   0|0.524|5.889| 39.0|5.4509|  5|311|   15.2| 390.5|15.71|21.7|[0.09378,12.5,7.8...|
|0.62976| 0.0| 8.14|   0|0.538|5.949| 61.8|4.7075|  4|307|   21.0| 396.9| 8.26|20.4|[0.62976,0.0,8.14...|
|0.63796| 0.0| 8.14|   0|0.538|6.096| 84.5|4.4619|  4|307|   21.0|380.02|10.26|18.2|[0.63796,0.0,8.14...|
|0.62739| 0.0| 8.14|   0|0.538|5.834| 56.5|4.4986|  4|307|   21.0|395.62| 8.47|19.9|[0.62739,0.0,8.14...|
|1.05393| 0.0| 8.14|   0|0.538|5.935| 29.3|4.4986|  4|307|   21.0|386.85| 6.58|23.1|[1.05393,0.0,8.14...|
| 0.7842| 0.0| 8.14|   0|0.538| 5.99| 81.7|4.2579|  4|307|   21.0|386.75|14.67|17.5|[0.7842,0.0,8.14,...|
|0.80271| 0.0| 8.14|   0|0.538|5.456| 36.6|3.7965|  4|307|   21.0|288.99|11.69|20.2|[0.80271,0.0,8.14...|
| 0.7258| 0.0| 8.14|   0|0.538|5.727| 69.5|3.7965|  4|307|   21.0|390.95|11.28|18.2|[0.7258,0.0,8.14,...|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
only showing top 20 rows

+--------------------+
|            features|
+--------------------+
|[0.00632,18.0,2.3...|
|[0.02731,0.0,7.07...|
|[0.02729,0.0,7.07...|
|[0.03237,0.0,2.18...|
|[0.06905,0.0,2.18...|
|[0.02985,0.0,2.18...|
|[0.08829,12.5,7.8...|
|[0.14455,12.5,7.8...|
|[0.21124,12.5,7.8...|
|[0.17004,12.5,7.8...|
|[0.22489,12.5,7.8...|
|[0.11747,12.5,7.8...|
|[0.09378,12.5,7.8...|
|[0.62976,0.0,8.14...|
|[0.63796,0.0,8.14...|
|[0.62739,0.0,8.14...|
|[1.05393,0.0,8.14...|
|[0.7842,0.0,8.14,...|
|[0.80271,0.0,8.14...|
|[0.7258,0.0,8.14,...|
+--------------------+
only showing top 20 rows

In [0]:
#checking the remove of col
mycols=dataset.columns
mycols.remove("nox")
mycols
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=mycols,outputCol="myfeatures")
data_3 = assembler.transform(dataset)
data_3.show()
#data_3.select("myfeatures").show()
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|          myfeatures|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|[0.08829,12.5,7.8...|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|[0.14455,12.5,7.8...|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2|386.63|29.93|16.5|[0.21124,12.5,7.8...|
|0.17004|12.5| 7.87|   0|0.524|6.004| 85.9|6.5921|  5|311|   15.2|386.71| 17.1|18.9|[0.17004,12.5,7.8...|
|0.22489|12.5| 7.87|   0|0.524|6.377| 94.3|6.3467|  5|311|   15.2|392.52|20.45|15.0|[0.22489,12.5,7.8...|
|0.11747|12.5| 7.87|   0|0.524|6.009| 82.9|6.2267|  5|311|   15.2| 396.9|13.27|18.9|[0.11747,12.5,7.8...|
|0.09378|12.5| 7.87|   0|0.524|5.889| 39.0|5.4509|  5|311|   15.2| 390.5|15.71|21.7|[0.09378,12.5,7.8...|
|0.62976| 0.0| 8.14|   0|0.538|5.949| 61.8|4.7075|  4|307|   21.0| 396.9| 8.26|20.4|[0.62976,0.0,8.14...|
|0.63796| 0.0| 8.14|   0|0.538|6.096| 84.5|4.4619|  4|307|   21.0|380.02|10.26|18.2|[0.63796,0.0,8.14...|
|0.62739| 0.0| 8.14|   0|0.538|5.834| 56.5|4.4986|  4|307|   21.0|395.62| 8.47|19.9|[0.62739,0.0,8.14...|
|1.05393| 0.0| 8.14|   0|0.538|5.935| 29.3|4.4986|  4|307|   21.0|386.85| 6.58|23.1|[1.05393,0.0,8.14...|
| 0.7842| 0.0| 8.14|   0|0.538| 5.99| 81.7|4.2579|  4|307|   21.0|386.75|14.67|17.5|[0.7842,0.0,8.14,...|
|0.80271| 0.0| 8.14|   0|0.538|5.456| 36.6|3.7965|  4|307|   21.0|288.99|11.69|20.2|[0.80271,0.0,8.14...|
| 0.7258| 0.0| 8.14|   0|0.538|5.727| 69.5|3.7965|  4|307|   21.0|390.95|11.28|18.2|[0.7258,0.0,8.14,...|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
only showing top 20 rows

In [0]:
train, test = data_2.randomSplit([0.7, 0.3])
from pyspark.ml.regression import LinearRegression
algo = LinearRegression(featuresCol="features", labelCol="medv")
model = algo.fit(train)
In [0]:
evaluation_summary = model.evaluate(test)
evaluation_summary.meanAbsoluteError
evaluation_summary.rootMeanSquaredError
evaluation_summary.r2
Out[0]:
0.8205587307919501
In [0]:
predictions = model.transform(test)
predictions.show()
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+--------------------+------------------+
|   crim|  zn|indus|chas|   nox|   rm| age|   dis|rad|tax|ptratio|     b|lstat|medv|            features|        prediction|
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+--------------------+------------------+
|0.00906|90.0| 2.97|   0|   0.4|7.088|20.8|7.3073|  1|285|   15.3|394.72| 7.85|32.2|[0.00906,90.0,2.9...|31.130005187033618|
|0.01301|35.0| 1.52|   0| 0.442|7.241|49.3|7.0379|  1|284|   15.5|394.74| 5.49|32.7|[0.01301,35.0,1.5...|30.051538532597974|
| 0.0136|75.0|  4.0|   0|  0.41|5.888|47.6|7.3197|  3|469|   21.1| 396.9| 14.8|18.9|[0.0136,75.0,4.0,...|15.511922838169287|
|0.01439|60.0| 2.93|   0| 0.401|6.604|18.8|6.2196|  1|265|   15.6| 376.7| 4.38|29.1|[0.01439,60.0,2.9...|31.715144625539686|
|0.01501|90.0| 1.21|   1| 0.401|7.923|24.8| 5.885|  1|198|   13.6|395.52| 3.16|50.0|[0.01501,90.0,1.2...|44.561651147412014|
|0.02009|95.0| 2.68|   0|0.4161|8.034|31.9| 5.118|  4|224|   14.7|390.55| 2.88|50.0|[0.02009,95.0,2.6...| 42.62350143300318|
|0.02177|82.5| 2.03|   0| 0.415| 7.61|15.7|  6.27|  2|348|   14.7|395.38| 3.11|42.3|[0.02177,82.5,2.0...| 36.69741849123246|
|0.03237| 0.0| 2.18|   0| 0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|28.557985574787445|
|0.03445|82.5| 2.03|   0| 0.415|6.162|38.4|  6.27|  2|348|   14.7|393.77| 7.43|24.1|[0.03445,82.5,2.0...|29.750110495289537|
|0.03502|80.0| 4.95|   0| 0.411|6.861|27.9|5.1167|  4|245|   19.2| 396.9| 3.33|28.5|[0.03502,80.0,4.9...|33.585405662868254|
|0.03584|80.0| 3.37|   0| 0.398| 6.29|17.8|6.6115|  4|337|   16.1| 396.9| 4.67|23.5|[0.03584,80.0,3.3...|30.741099039843533|
|0.03705|20.0| 3.33|   0|0.4429|6.968|37.2|5.2447|  5|216|   14.9|392.23| 4.59|35.4|[0.03705,20.0,3.3...| 34.14004378029615|
|0.03768|80.0| 1.52|   0| 0.404|7.274|38.3| 7.309|  2|329|   12.6| 392.2| 6.62|34.6|[0.03768,80.0,1.5...| 34.38386223615609|
|0.03932| 0.0| 3.41|   0| 0.489|6.405|73.9|3.0921|  2|270|   17.8|393.55|  8.2|22.0|[0.03932,0.0,3.41...|27.735380888133534|
|0.03961| 0.0| 5.19|   0| 0.515|6.037|34.5|5.9853|  5|224|   20.2| 396.9| 8.01|21.1|[0.03961,0.0,5.19...|20.611816133247807|
|0.04011|80.0| 1.52|   0| 0.404|7.287|34.1| 7.309|  2|329|   12.6| 396.9| 4.08|33.3|[0.04011,80.0,1.5...| 35.97164470367424|
|0.04113|25.0| 4.86|   0| 0.426|6.727|33.5|5.4007|  4|281|   19.0| 396.9| 5.29|28.0|[0.04113,25.0,4.8...|28.384256556085887|
|0.04417|70.0| 2.24|   0|   0.4|6.871|47.4|7.8278|  5|358|   14.8|390.86| 6.07|24.8|[0.04417,70.0,2.2...|30.823153861831443|
|0.04527| 0.0|11.93|   0| 0.573| 6.12|76.7|2.2875|  1|273|   21.0| 396.9| 9.08|20.6|[0.04527,0.0,11.9...|22.744515966451146|
|0.04666|80.0| 1.52|   0| 0.404|7.107|36.6| 7.309|  2|329|   12.6|354.31| 8.61|30.3|[0.04666,80.0,1.5...| 32.37537747462373|
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+--------------------+------------------+
only showing top 20 rows