!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark
import os

# Point the tooling at the JDK and at the unpacked Spark distribution.
# These must be set before findspark.init() is called.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
    }
)

import findspark

# Makes the pyspark package importable; the pyspark import has to come after it.
findspark.init()

from pyspark.sql import SparkSession

# Session named "example" on the local[4] master, with Hive support enabled.
# getOrCreate() reuses an already-running session if there is one.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("example")
    .enableHiveSupport()
    .getOrCreate()
)
from pyspark.mllib.stat import Statistics
import numpy
import pandas as pd
# Draw three independent samples of 50 values each, uniform on [0.0, 5.0).
# The generator is consumed left to right, so A, B and C are drawn in that order.
A, B, C = (numpy.random.uniform(0.0, 5.0, 50) for _ in range(3))

# Label each sample; the dict's insertion order also fixes the column order.
myData = dict(zip(('col 1', 'col 2', 'col 3'), (A, B, C)))
df_myData = pd.DataFrame(myData, columns=list(myData))

# Bare expression: renders the frame when it is the last line of a notebook cell.
df_myData
# Convert the pandas frame into a Spark DataFrame.
df_mySparkData = spark.createDataFrame(df_myData)

# Print the describe() summary table for each column on its own ...
for column in ('col 1', 'col 2', 'col 3'):
    df_mySparkData.select(column).describe().show()

# ... and then one combined table covering all three columns.
df_mySparkData.select(['col 1', 'col 2', 'col 3']).describe().show()
from pyspark.sql import functions as f

# Skewness and kurtosis of 'col 1', computed by Spark and printed as a one-row table.
col1_moments = df_mySparkData.select(
    f.skewness('col 1'),
    f.kurtosis('col 1'),
)
col1_moments.show()

# Box plot of every column, drawn from the pandas copy of the data.
boxplot = df_myData.boxplot()
# Bare expression: displays the returned plot object at the end of a notebook cell.
boxplot