!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"]="/content/spark-2.4.4-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[4]").appName("example").enableHiveSupport().getOrCreate()
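# Optional sanity-check sketch (not required by the steps below): confirm the session
# started by printing the Spark version and running a trivial job.
print(spark.version)
spark.range(5).count()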
unsw_rdd = spark.sparkContext.textFile("./UNSW-NB15.csv") # Import data into Spark's RDD
unsw_rdd.count()
unsw_rdd.take(2)
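# Optional sketch: before indexing column 48 below, verify that every row splits into
# the same number of comma-separated fields. The full UNSW-NB15 CSV is expected to have
# 49 columns (47 features plus attack_cat and label); adjust if your copy differs.
unsw_rdd.map(lambda l: len(l.split(","))).distinct().collect()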
def AddNormalLabel(line):
    line = line.split(",")
    if line[47] == "":
        line[47] = 'normal'
    return line
unsw_update_rdd = unsw_rdd.map(AddNormalLabel) # replace empty attack-category values (index 47) with 'normal'
unsw_update_rdd.take(2) # check the changes on the first two rows
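# Optional check: after the map above, no row should still have an empty attack
# category at index 47, so this count is expected to be 0.
unsw_update_rdd.filter(lambda x: x[47] == "").count()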
# print out the distinct "connection types" (attack categories) in column 48
connections_rdd = unsw_update_rdd.map(lambda x: x[47].strip()).distinct() # column 48 is index 47; strip() removes stray whitespace
Labels = connections_rdd.collect()
Labels
shellcode_rdd = unsw_update_rdd.filter(lambda line: line[47].strip() == 'Shellcode') # count the number of "Shellcode" connections
shellcode_rdd.count()
DoS_rdd = unsw_update_rdd.filter(lambda line: line[47].strip() == 'DoS') # count the number of "DoS" connections
DoS_rdd.count()
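# Instead of filtering once per label, a single pass with countByValue() returns the
# number of connections for every attack category at once (a sketch using the same
# stripped label column as above).
label_counts = unsw_update_rdd.map(lambda x: x[47].strip()).countByValue()
label_counts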
# print out the "states of the protocol", which is at column 6
states_rdd = unsw_update_rdd.map(lambda x: x[5]).distinct() # then, get the 2nd index for "protocols"
States = states_rdd.collect()
States
# Count how many connections are in each protocol state (one Spark job per state)
def State_Count(states):
    count = []
    for state in states:
        state_rdd = unsw_update_rdd.filter(lambda x: x[5] == state)
        count.append(state_rdd.count())
    return count
State_Count(States)
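# The loop above launches one Spark job per state. As a sketch of a cheaper alternative,
# a single reduceByKey pass produces the same per-state counts in one job.
state_counts = (unsw_update_rdd
                .map(lambda x: (x[5], 1))
                .reduceByKey(lambda a, b: a + b)
                .collect())
state_counts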
# print out the "services", which is at column 14
services_rdd = unsw_update_rdd.map(lambda x: x[13]).distinct()
services_rdd.collect()
import pandas as pd
link = "https://www.dropbox.com/s/2b0tbw4muxg4yr5/data_sample.csv?dl=1"
data = pd.read_csv(link)
data.sample(10)
data.count() # count the number of non-null values per column; there are 1794 records in this sample
# Since the data has no header row, it is easier to understand each attribute by adding column names
Column_names = ['DateTime', 'Event', 'Country', 'User_ID', 'Source', 'Topic']
data = pd.read_csv(link, delimiter=';', names = Column_names)
data.sample(10)
data.head()
data.tail()
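# Optional sketch: with the column names in place, data.info() summarises the inferred
# dtype and non-null count of each attribute.
data.info()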
# Select specific columns of your dataframe
data[['Country','Event','Source']]
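# A sketch of a follow-up selection: value_counts() on one of the selected columns
# shows how many events come from each country.
data['Country'].value_counts()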