unsw research

In [1]:
dataset = spark.read.csv('UNSW-NB15.csv',inferSchema=True, header =True)
In [16]:
dataset.show()
+----------+-----+-------------+-----+---+---+------------+-----+------+---+---+---+---+--------+---------+---------+---+---+-----+-----+----------+----------+---+----+---+---+----------+---------+----------+----------+------------+------------+------------+------------+------------+---+---+---+---+---+---+---+---+---+---+---+---+----+---+
|59.166.0.3|56716|149.171.126.8|  143|tcp|FIN|  0.82546002| 7812| 16236| 31| 29| 30| 32|       -| 75090.25|156111.73|122|126|25518|25519|2751097753|2748686736| 64| 129|024|025| 445.25928| 474.9451|1421970774|1421970775|   6.8190908|    6.599896|5.9700001E-4|4.6899999E-4|    0.000128|035|036|037|038|039|  2|  7|142|  4|144|145|146|_c47|048|
+----------+-----+-------------+-----+---+---+------------+-----+------+---+---+---+---+--------+---------+---------+---+---+-----+-----+----------+----------+---+----+---+---+----------+---------+----------+----------+------------+------------+------------+------------+------------+---+---+---+---+---+---+---+---+---+---+---+---+----+---+
|59.166.0.0|43467|149.171.126.6|49729|tcp|FIN|    0.101815| 4238| 65628| 31| 29|  7| 30|       -|328438.84|5087030.5| 72| 74|  255|  255| 961515433|3225510659| 59| 887|  0|  0|       0.0|91.579567|1421970775|1421970775|    1.429493|    1.387192|      6.8E-4|5.4600002E-4|     1.34E-4|  0|  0|  0|  0|  0|  7|  4|  1|  6|  1|  1|  1|null|  0|
|59.166.0.5|41289|149.171.126.2| 9574|tcp|FIN| 0.044002999| 2750| 29104| 31| 29|  7| 17|       -|488693.97|5181101.5| 44| 48|  255|  255|3291096757|1191410228| 63| 606|  0|  0| 78.126968|62.206562|1421970775|1421970775|    1.014977|  0.92583001|     0.00125|     4.85E-4|     7.65E-4|  0|  0|  0|  0|  0|  3|  5|  3|  3|  1|  1|  1|null|  0|
|59.166.0.9|43785|149.171.126.0| 6881|tcp|FIN|   2.7908299|10476|395734| 31| 29| 16|143|       -|29863.518|1130840.8|180|320|  255|  255|3934392726|3961690324| 58|1237|  0|  0| 2707.4927| 2018.976|1421970772|1421970775|   15.589459|   8.7470121|6.8400003E-4|5.3199998E-4|1.5199999E-4|  0|  0|  0|  0|  0| 11|  4|  3|  2|  1|  1|  1|null|  0|
|59.166.0.8|40691|149.171.126.9| 6881|tcp|FIN|   2.6335001|13350|548216| 31| 29| 21|197|       -|40381.238|1661560.6|232|438|  255|  255|   1518931|  18267719| 58|1252|  0|  0| 718.33679|500.57288|1421970773|1421970775|   11.399026|   6.0251832|     6.19E-4|     4.89E-4|      1.3E-4|  0|  0|  0|  0|  0| 16|  7|  7|  1|  1|  1|  1|null|  0|
|59.166.0.3|20393|149.171.126.3| 5190|tcp|FIN|    0.115048| 1958|  2308| 31| 29|  6|  6|       -|129963.15|153814.06| 22| 24|  255|  255|3646899201|3651364285| 89|  96|  0|  0| 435.26627|417.08563|1421970775|1421970775|    5.460381|    4.976913|7.0999999E-4|     5.73E-4|     1.37E-4|  0|  0|  0|  0|  0|  2|  6|  1|  4|  1|  1|  1|null|  0|
|59.166.0.7|19792|149.171.126.0|   53|udp|CON|    0.003362|  146|   178| 31| 29|  0|  0|     dns|173706.13| 211778.7|  2|  2|    0|    0|         0|         0| 73|  89|  0|  0|       0.0|      0.0|1421970775|1421970775|       0.011|0.0060000001|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0|  3|  2|  3|  3|  3|  1|  1|null|  0|
|59.166.0.3|14382|149.171.126.9| 3354|tcp|FIN|  0.45305201|  424|  8824| 31| 29|  1|  4|ftp-data| 6551.124| 142835.7|  8| 12|  255|  255|2206905053|3307670308| 53| 735|  0|  0| 3906.7949|3074.6694|1421970775|1421970775|   64.671288|   41.134998|      6.8E-4|5.5900001E-4|     1.21E-4|  0|  0|  0|  0|  0|  4|  6|  7|  4|  1|  1|  2|null|  0|
|59.166.0.9|37074|149.171.126.2|   53|udp|CON|    0.001088|  146|   178| 31| 29|  0|  0|     dns|536764.69|654411.75|  2|  2|    0|    0|         0|         0| 73|  89|  0|  0|       0.0|      0.0|1421970775|1421970775|       0.001|0.0089999996|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0|  2|  5|  3|  2|  1|  1|  1|null|  0|
|59.166.0.7|12569|149.171.126.5|   53|udp|CON|9.6899999E-4|  146|   178| 31| 29|  0|  0|     dns|602683.19|734778.13|  2|  2|    0|    0|         0|         0| 73|  89|  0|  0|       0.0|      0.0|1421970775|1421970775|0.0099999998|       0.003|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0|  3|  1|  2|  3|  3|  1|  1|null|  0|
|59.166.0.1|12792|149.171.126.7|   53|udp|CON|0.0010629999|  146|   178| 31| 29|  0|  0|     dns|549388.56| 669802.5|  2|  2|    0|    0|         0|         0| 73|  89|  0|  0|       0.0|      0.0|1421970780|1421970780|0.0020000001|       0.003|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0|  5|  3|  2|  4|  3|  1|  1|null|  0|
|59.166.0.0|63414|149.171.126.9|10330|tcp|FIN|  0.26501101| 8928|   320| 31| 29|  4|  1|ftp-data|250283.94|8060.0425| 14|  6|  255|  255|2386904726|3699988372|638|  53|  0|  0| 1725.2916|71.464249|1421970775|1421970776|   20.385462|   52.644802|7.7500002E-4|6.3299999E-4|     1.42E-4|  0|  0|  0|  0|  0|  3|  6|  6|  5|  1|  1|  2|null|  0|
|59.166.0.1|33555|149.171.126.3| 6881|tcp|FIN|  0.51712799| 1540|  1644| 31| 29|  4|  4|       -|22338.764|  24025.0| 16| 18|  255|  255|1741520309|3943579644| 96|  91|  0|  0| 2036.1301| 51.91766|1421970776|1421970776|   34.433331|   30.388353|6.5399997E-4|5.2200002E-4|     1.32E-4|  0|  0|  0|  0|  0|  4|  6|  5|  7|  4|  1|  4|null|  0|
|59.166.0.8|10867|149.171.126.8|  111|udp|CON|0.0053389999|  568|   312| 31| 29|  0|  0|       -|638321.81|350627.47|  4|  4|    0|    0|         0|         0|142|  78|  0|  0| 1.7430201| 1.757632|1421970776|1421970776|        1.24|    1.252333|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0| 16|  7|  5|  5|  1|  1|  4|null|  0|
|59.166.0.8|12411|149.171.126.8| 1715|udp|CON|    0.001739|  512|   304| 31| 29|  0|  0|       -|1766532.5|1048878.6|  4|  4|    0|    0|         0|         0|128|  76|  0|  0|0.63026798| 0.318434|1421970776|1421970776|  0.44966701|    0.227667|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0| 16|  7|  5|  5|  1|  1|  4|null|  0|
|59.166.0.8|46725|149.171.126.2|   53|udp|CON|    0.001018|  146|   178| 31| 29|  0|  0|     dns|573673.88|699410.63|  2|  2|    0|    0|         0|         0| 73|  89|  0|  0|       0.0|      0.0|1421970776|1421970776|       0.011|       0.011|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0|  2|  5|  3|  5|  1|  1|  1|null|  0|
|59.166.0.1|51562|149.171.126.4|   53|udp|CON|    0.001044|  146|   178| 31| 29|  0|  0|     dns|559386.94|681992.31|  2|  2|    0|    0|         0|         0| 73|  89|  0|  0|       0.0|      0.0|1421970776|1421970776|       0.011|       0.003|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0|  2|  3|  5|  7|  2|  1|  3|null|  0|
|59.166.0.3|48838|149.171.126.2|   53|udp|CON|9.8699995E-4|  146|   178| 31| 29|  0|  0|     dns| 591692.0|721377.94|  2|  2|    0|    0|         0|         0| 73|  89|  0|  0|       0.0|      0.0|1421970776|1421970776|0.0080000004|       0.003|         0.0|         0.0|         0.0|  0|  0|  0|  0|  0|  2|  5|  3|  3|  1|  1|  1|null|  0|
|59.166.0.0|16907|149.171.126.9|   21|tcp|FIN|   2.2547121| 2934|  3740| 31| 29| 11| 15|     ftp|10211.503|13025.166| 52| 54|  255|  255| 241515551|2584135680| 56|  69|  0|  0| 3141.3708| 99.93412|1421970773|1421970776|   44.203648|   42.531734|6.6100003E-4|5.2499998E-4|     1.36E-4|  0|  0|  0|  0|  0|  1|  3|  6|  5|  1|  1|  2|null|  0|
|59.166.0.0| 1915|149.171.126.4|32945|tcp|FIN| 0.051220998| 2854| 29104| 31| 29|  7| 17|       -|436071.16|4450987.0| 46| 48|  255|  255|1921515932|1974066994| 62| 606|  0|  0| 78.226654|76.387978|1421970776|1421970776|   1.1294219|   1.0781699|      8.1E-4|     5.43E-4|     2.67E-4|  0|  0|  0|  0|  0|  7|  2|  5|  5|  1|  1|  2|null|  0|
|59.166.0.4| 1309|149.171.126.8|52139|tcp|FIN| 0.091375001| 4238| 65628| 31| 29|  7| 30|       -|365964.44|5668246.0| 72| 74|  255|  255|1681522413|1731423574| 59| 887|  0|  0|       0.0|77.518913|1421970776|1421970776|    1.268014|     1.24474|6.3899998E-4|     5.05E-4|     1.34E-4|  0|  0|  0|  0|  0|  2|  7|  5|  2|  1|  1|  1|null|  0|
+----------+-----+-------------+-----+---+---+------------+-----+------+---+---+---+---+--------+---------+---------+---+---+-----+-----+----------+----------+---+----+---+---+----------+---------+----------+----------+------------+------------+------------+------------+------------+---+---+---+---+---+---+---+---+---+---+---+---+----+---+
only showing top 20 rows

In [2]:
dataset.columns
Out[2]:
['59.166.0.3',
 '56716',
 '149.171.126.8',
 '143',
 'tcp',
 'FIN',
 '0.82546002',
 '7812',
 '16236',
 '31',
 '29',
 '30',
 '32',
 '-',
 '75090.25',
 '156111.73',
 '122',
 '126',
 '25518',
 '25519',
 '2751097753',
 '2748686736',
 '64',
 '129',
 '024',
 '025',
 '445.25928',
 '474.9451',
 '1421970774',
 '1421970775',
 '6.8190908',
 '6.599896',
 '5.9700001E-4',
 '4.6899999E-4',
 '0.000128',
 '035',
 '036',
 '037',
 '038',
 '039',
 '2',
 '7',
 '142',
 '4',
 '144',
 '145',
 '146',
 '_c47',
 '048']
In [3]:
dataset.count()
Out[3]:
2539738
In [6]:
dataset.head()
Out[6]:
Row(59.166.0.3='59.166.0.0', 56716=43467, 149.171.126.8='149.171.126.6', 143=49729, tcp='tcp', FIN='FIN', 0.82546002=0.101815, 7812=4238, 16236=65628, 31=31, 29=29, 30=7, 32=30, -='-', 75090.25=328438.84, 156111.73=5087030.5, 122=72, 126=74, 25518=255, 25519=255, 2751097753=961515433, 2748686736=3225510659, 64=59, 129=887, 024=0, 025=0, 445.25928=0.0, 474.9451=91.579567, 1421970774=1421970775, 1421970775=1421970775, 6.8190908=1.429493, 6.599896=1.387192, 5.9700001E-4=0.00068, 4.6899999E-4=0.00054600002, 0.000128=0.000134, 035=0, 036=0, 037=0, 038=0, 039=0, 2=7, 7=4, 142=1, 4=6, 144=1, 145=1, 146=1, _c47=None, 048=0)
In [7]:
dataset.take(2)
Out[7]:
[Row(59.166.0.3='59.166.0.0', 56716=43467, 149.171.126.8='149.171.126.6', 143=49729, tcp='tcp', FIN='FIN', 0.82546002=0.101815, 7812=4238, 16236=65628, 31=31, 29=29, 30=7, 32=30, -='-', 75090.25=328438.84, 156111.73=5087030.5, 122=72, 126=74, 25518=255, 25519=255, 2751097753=961515433, 2748686736=3225510659, 64=59, 129=887, 024=0, 025=0, 445.25928=0.0, 474.9451=91.579567, 1421970774=1421970775, 1421970775=1421970775, 6.8190908=1.429493, 6.599896=1.387192, 5.9700001E-4=0.00068, 4.6899999E-4=0.00054600002, 0.000128=0.000134, 035=0, 036=0, 037=0, 038=0, 039=0, 2=7, 7=4, 142=1, 4=6, 144=1, 145=1, 146=1, _c47=None, 048=0),
 Row(59.166.0.3='59.166.0.5', 56716=41289, 149.171.126.8='149.171.126.2', 143=9574, tcp='tcp', FIN='FIN', 0.82546002=0.044002999, 7812=2750, 16236=29104, 31=31, 29=29, 30=7, 32=17, -='-', 75090.25=488693.97, 156111.73=5181101.5, 122=44, 126=48, 25518=255, 25519=255, 2751097753=3291096757, 2748686736=1191410228, 64=63, 129=606, 024=0, 025=0, 445.25928=78.126968, 474.9451=62.206562, 1421970774=1421970775, 1421970775=1421970775, 6.8190908=1.014977, 6.599896=0.92583001, 5.9700001E-4=0.00125, 4.6899999E-4=0.000485, 0.000128=0.000765, 035=0, 036=0, 037=0, 038=0, 039=0, 2=3, 7=5, 142=3, 4=3, 144=1, 145=1, 146=1, _c47=None, 048=0)]
In [8]:
def AddNormalLabel(line):
    line = line.split(",")
    if line[47] == "":
        line[47] = 'Normal'
        
    return line

updated_dataset = dataset.map(AddNormalLabel) stops working here!

In [13]:
dataset.printSchema()
root
 |-- 59.166.0.3: string (nullable = true)
 |-- 56716: integer (nullable = true)
 |-- 149.171.126.8: string (nullable = true)
 |-- 143: integer (nullable = true)
 |-- tcp: string (nullable = true)
 |-- FIN: string (nullable = true)
 |-- 0.82546002: double (nullable = true)
 |-- 7812: integer (nullable = true)
 |-- 16236: integer (nullable = true)
 |-- 31: integer (nullable = true)
 |-- 29: integer (nullable = true)
 |-- 30: integer (nullable = true)
 |-- 32: integer (nullable = true)
 |-- -: string (nullable = true)
 |-- 75090.25: double (nullable = true)
 |-- 156111.73: double (nullable = true)
 |-- 122: integer (nullable = true)
 |-- 126: integer (nullable = true)
 |-- 25518: integer (nullable = true)
 |-- 25519: integer (nullable = true)
 |-- 2751097753: long (nullable = true)
 |-- 2748686736: long (nullable = true)
 |-- 64: integer (nullable = true)
 |-- 129: integer (nullable = true)
 |-- 024: integer (nullable = true)
 |-- 025: integer (nullable = true)
 |-- 445.25928: double (nullable = true)
 |-- 474.9451: double (nullable = true)
 |-- 1421970774: integer (nullable = true)
 |-- 1421970775: integer (nullable = true)
 |-- 6.8190908: double (nullable = true)
 |-- 6.599896: double (nullable = true)
 |-- 5.9700001E-4: double (nullable = true)
 |-- 4.6899999E-4: double (nullable = true)
 |-- 0.000128: double (nullable = true)
 |-- 035: integer (nullable = true)
 |-- 036: integer (nullable = true)
 |-- 037: integer (nullable = true)
 |-- 038: integer (nullable = true)
 |-- 039: integer (nullable = true)
 |-- 2: integer (nullable = true)
 |-- 7: integer (nullable = true)
 |-- 142: integer (nullable = true)
 |-- 4: integer (nullable = true)
 |-- 144: integer (nullable = true)
 |-- 145: integer (nullable = true)
 |-- 146: integer (nullable = true)
 |-- _c47: string (nullable = true)
 |-- 048: integer (nullable = true)

In [15]:
cols = dataset.columns

# Let us import the vector assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols,outputCol="features")
# Now let us use the transform method to transform our dataset
dataset=assembler.transform(dataset)
dataset.select("features").show(truncate=False)
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
~/Documents/spark-3.0.0/python/pyspark/sql/utils.py in deco(*a, **kw)
     97         try:
---> 98             return f(*a, **kw)
     99         except py4j.protocol.Py4JJavaError as e:

~/Documents/spark-3.0.0/python/lib/py4j-0.10.8.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:

Py4JJavaError: An error occurred while calling o46.transform.
: java.lang.IllegalArgumentException: Data type string of column 59.166.0.3 is not supported.
Data type string of column 149.171.126.8 is not supported.
Data type string of column tcp is not supported.
Data type string of column FIN is not supported.
Data type string of column - is not supported.
Data type string of column _c47 is not supported.
	at org.apache.spark.ml.feature.VectorAssembler.transformSchema(VectorAssembler.scala:168)
	at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:75)
	at org.apache.spark.ml.feature.VectorAssembler.transform(VectorAssembler.scala:85)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)


During handling of the above exception, another exception occurred:

IllegalArgumentException                  Traceback (most recent call last)
<ipython-input-15-cb1addfb5fab> in <module>
      5 assembler = VectorAssembler(inputCols=cols,outputCol="features")
      6 # Now let us use the transform method to transform our dataset
----> 7 dataset=assembler.transform(dataset)
      8 dataset.select("features").show(truncate=False)

~/Documents/spark-3.0.0/python/pyspark/ml/base.py in transform(self, dataset, params)
    170                 return self.copy(params)._transform(dataset)
    171             else:
--> 172                 return self._transform(dataset)
    173         else:
    174             raise ValueError("Params must be a param map but got %s." % type(params))

~/Documents/spark-3.0.0/python/pyspark/ml/wrapper.py in _transform(self, dataset)
    336     def _transform(self, dataset):
    337         self._transfer_params_to_java()
--> 338         return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
    339 
    340 

~/Documents/spark-3.0.0/python/lib/py4j-0.10.8.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1284         answer = self.gateway_client.send_command(command)
   1285         return_value = get_return_value(
-> 1286             answer, self.gateway_client, self.target_id, self.name)
   1287 
   1288         for temp_arg in temp_args:

~/Documents/spark-3.0.0/python/pyspark/sql/utils.py in deco(*a, **kw)
    100             converted = convert_exception(e.java_exception)
    101             if not isinstance(converted, UnknownException):
--> 102                 raise converted
    103             else:
    104                 raise

IllegalArgumentException: Data type string of column 59.166.0.3 is not supported.
Data type string of column 149.171.126.8 is not supported.
Data type string of column tcp is not supported.
Data type string of column FIN is not supported.
Data type string of column - is not supported.
Data type string of column _c47 is not supported.
In [ ]: