My Practice

In [0]:
# Install Java 8, fetch and unpack Spark 2.4.4 (Hadoop 2.7 build), and install findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark
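Note: once a Spark release is superseded, the Apache mirror tree (including www-eu.apache.org/dist) stops carrying it and the tarball moves to the archive. If the wget above returns a 404, this fallback should work:

In [0]:
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz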
In [0]:
# Point JAVA_HOME and SPARK_HOME at the installs above
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
In [0]:
# Put the unpacked Spark distribution on sys.path so pyspark is importable
import findspark
findspark.init()
In [0]:
from pyspark.sql import SparkSession
# Local session with 4 worker threads and Hive support enabled
spark = SparkSession.builder.master("local[4]").appName("example").enableHiveSupport().getOrCreate()
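An optional sanity check: the session object reports the version and master it was built with, which should match the setup above.

In [0]:
print(spark.version)              # expected: 2.4.4
print(spark.sparkContext.master)  # expected: local[4]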
In [0]:
# RDD-based statistics API, used for the colStats cross-check further down
from pyspark.mllib.stat import Statistics
In [0]:
import numpy
import pandas as pd

# Three columns of 50 uniform draws each from [0, 5)
A = numpy.random.uniform(0.0, 5.0, 50)
B = numpy.random.uniform(0.0, 5.0, 50)
C = numpy.random.uniform(0.0, 5.0, 50)
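Because these are random draws, the exact values below will differ on every run. Seeding NumPy's global generator before the three uniform() calls would make the notebook reproducible (42 is an arbitrary choice):

In [0]:
# Run before the draws above; the seed value is arbitrary
numpy.random.seed(42)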
In [26]:
myData = {'col 1': A, 'col 2': B, 'col 3': C}
df_myData = pd.DataFrame(myData, columns=['col 1', 'col 2', 'col 3'])
df_myData
Out[26]:
col 1 col 2 col 3
0 2.076811 4.504808 1.185009
1 1.934227 3.667571 3.251078
2 2.309815 3.100999 2.669341
3 4.733721 4.736521 0.540907
4 0.947666 3.223237 1.688003
5 3.028272 4.862867 2.630330
6 0.810942 1.304632 2.360165
7 2.224395 3.492871 3.281828
8 0.792280 1.591555 1.110427
9 3.292726 1.410364 2.150106
10 3.059236 3.341691 0.720368
11 1.462383 0.307339 4.805191
12 3.926427 1.143685 4.683265
13 0.192009 1.906677 4.710594
14 2.785568 3.028357 3.882144
15 1.132007 4.828728 1.328386
16 0.788626 1.243279 3.018588
17 0.859967 1.833478 0.979773
18 1.553708 2.054370 1.658596
19 0.183227 4.725916 3.034490
20 2.899136 4.490044 1.093030
21 2.693121 3.277379 3.610852
22 4.116490 0.888438 2.852708
23 4.116207 3.803362 1.548007
24 4.753051 4.441588 2.075379
25 0.980783 1.641690 3.973315
26 3.401274 1.605742 3.635228
27 2.145107 2.038139 2.478654
28 0.126935 0.791612 2.179851
29 3.805047 0.443080 2.612522
30 1.015546 0.514713 2.957102
31 4.192259 0.997767 4.659578
32 3.106782 0.194211 4.866433
33 4.104189 0.401162 3.395980
34 1.229888 0.012068 0.919222
35 1.735839 3.486452 3.734895
36 4.811291 1.783623 2.495885
37 3.791955 3.929932 2.224936
38 1.411379 4.049912 3.865664
39 3.770632 2.902380 3.003475
40 1.041498 4.308844 2.933885
41 0.548339 3.250213 1.348262
42 3.149948 3.189463 3.890871
43 4.061976 4.859229 2.541517
44 2.170328 0.630645 1.466513
45 3.552004 4.945298 0.394305
46 2.712664 0.187814 3.268831
47 0.346897 0.239367 4.010723
48 0.707725 0.616417 4.884030
49 3.130152 0.348009 1.719771
In [0]:
# Convert the pandas DataFrame to a Spark DataFrame
df_mySparkData = spark.createDataFrame(df_myData)
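It is worth confirming that Spark inferred the schema correctly from the pandas frame; all three columns should come through as nullable doubles.

In [0]:
df_mySparkData.printSchema()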
In [28]:
df_mySparkData.select('col 1').describe().show()
+-------+------------------+
|summary|             col 1|
+-------+------------------+
|  count|                50|
|   mean| 2.354449066498408|
| stddev|1.3968864071189608|
|    min|0.1269349072738568|
|    max| 4.811291438359658|
+-------+------------------+

In [29]:
df_mySparkData.select('col 2').describe().show()
+-------+--------------------+
|summary|               col 2|
+-------+--------------------+
|  count|                  50|
|   mean|   2.411550805961591|
| stddev|   1.623769658832128|
|    min|0.012067865014394608|
|    max|   4.945298155785863|
+-------+--------------------+

In [30]:
df_mySparkData.select('col 3').describe().show()
+-------+------------------+
|summary|             col 3|
+-------+------------------+
|  count|                50|
|   mean| 2.686600283100578|
| stddev|1.2485293848922936|
|    min|0.3943048957760886|
|    max|4.8840303033810875|
+-------+------------------+

In [31]:
df_mySparkData.select(['col 1', 'col 2', 'col 3']).describe().show()
+-------+------------------+--------------------+------------------+
|summary|             col 1|               col 2|             col 3|
+-------+------------------+--------------------+------------------+
|  count|                50|                  50|                50|
|   mean| 2.354449066498408|   2.411550805961591| 2.686600283100578|
| stddev|1.3968864071189608|   1.623769658832128|1.2485293848922936|
|    min|0.1269349072738568|0.012067865014394608|0.3943048957760886|
|    max| 4.811291438359658|   4.945298155785863|4.8840303033810875|
+-------+------------------+--------------------+------------------+
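
The Statistics import from earlier gives an RDD-based route to the same numbers: colStats computes per-column count, mean, variance, min, and max in a single pass. A minimal sketch; the variances should match the squared stddev values from describe() up to floating-point differences.

In [0]:
# colStats takes an RDD of per-row value vectors
summary = Statistics.colStats(
    df_mySparkData.rdd.map(lambda row: [row['col 1'], row['col 2'], row['col 3']])
)
print(summary.count())     # 50
print(summary.mean())      # per-column means
print(summary.variance())  # per-column sample variances
print(summary.min())
print(summary.max())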

In [0]:
from pyspark.sql import functions as f
In [38]:
df_mySparkData.select(f.skewness('col 1'), f.kurtosis('col 1')).show()
+-------------------+-------------------+
|    skewness(col 1)|    kurtosis(col 1)|
+-------------------+-------------------+
|0.06188109739788434|-1.2510277035937163|
+-------------------+-------------------+
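
As a rough cross-check, pandas exposes the same moments on the original frame. Note that pandas uses bias-adjusted sample estimators, so its values should be close to Spark's but not identical; both report excess kurtosis, for which a normal distribution scores 0 and a uniform one about -1.2.

In [0]:
# Bias-adjusted sample skewness and excess kurtosis from pandas
print(df_myData['col 1'].skew())
print(df_myData['col 1'].kurt())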

In [37]:
boxplot = df_myData.boxplot()
boxplot
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f93ba2b6518>
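The bare Axes repr above is enough to render the plot inside a notebook, but outside one (or if the figure does not appear) an explicit matplotlib call is needed. A minimal sketch:

In [0]:
import matplotlib.pyplot as plt

# Draw the same per-column boxplot and force the figure to display
df_myData.boxplot()
plt.show()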