dataset = spark.read.csv('UNSW-NB15.csv',inferSchema=True, header =True)
dataset.show()
+----------+-----+-------------+-----+---+---+------------+-----+------+---+---+---+---+--------+---------+---------+---+---+-----+-----+----------+----------+---+----+---+---+----------+---------+----------+----------+------------+------------+------------+------------+------------+---+---+---+---+---+---+---+---+---+---+---+---+----+---+ |59.166.0.3|56716|149.171.126.8| 143|tcp|FIN| 0.82546002| 7812| 16236| 31| 29| 30| 32| -| 75090.25|156111.73|122|126|25518|25519|2751097753|2748686736| 64| 129|024|025| 445.25928| 474.9451|1421970774|1421970775| 6.8190908| 6.599896|5.9700001E-4|4.6899999E-4| 0.000128|035|036|037|038|039| 2| 7|142| 4|144|145|146|_c47|048| +----------+-----+-------------+-----+---+---+------------+-----+------+---+---+---+---+--------+---------+---------+---+---+-----+-----+----------+----------+---+----+---+---+----------+---------+----------+----------+------------+------------+------------+------------+------------+---+---+---+---+---+---+---+---+---+---+---+---+----+---+ |59.166.0.0|43467|149.171.126.6|49729|tcp|FIN| 0.101815| 4238| 65628| 31| 29| 7| 30| -|328438.84|5087030.5| 72| 74| 255| 255| 961515433|3225510659| 59| 887| 0| 0| 0.0|91.579567|1421970775|1421970775| 1.429493| 1.387192| 6.8E-4|5.4600002E-4| 1.34E-4| 0| 0| 0| 0| 0| 7| 4| 1| 6| 1| 1| 1|null| 0| |59.166.0.5|41289|149.171.126.2| 9574|tcp|FIN| 0.044002999| 2750| 29104| 31| 29| 7| 17| -|488693.97|5181101.5| 44| 48| 255| 255|3291096757|1191410228| 63| 606| 0| 0| 78.126968|62.206562|1421970775|1421970775| 1.014977| 0.92583001| 0.00125| 4.85E-4| 7.65E-4| 0| 0| 0| 0| 0| 3| 5| 3| 3| 1| 1| 1|null| 0| |59.166.0.9|43785|149.171.126.0| 6881|tcp|FIN| 2.7908299|10476|395734| 31| 29| 16|143| -|29863.518|1130840.8|180|320| 255| 255|3934392726|3961690324| 58|1237| 0| 0| 2707.4927| 2018.976|1421970772|1421970775| 15.589459| 8.7470121|6.8400003E-4|5.3199998E-4|1.5199999E-4| 0| 0| 0| 0| 0| 11| 4| 3| 2| 1| 1| 1|null| 0| |59.166.0.8|40691|149.171.126.9| 6881|tcp|FIN| 2.6335001|13350|548216| 31| 29| 21|197| -|40381.238|1661560.6|232|438| 255| 255| 1518931| 18267719| 58|1252| 0| 0| 718.33679|500.57288|1421970773|1421970775| 11.399026| 6.0251832| 6.19E-4| 4.89E-4| 1.3E-4| 0| 0| 0| 0| 0| 16| 7| 7| 1| 1| 1| 1|null| 0| |59.166.0.3|20393|149.171.126.3| 5190|tcp|FIN| 0.115048| 1958| 2308| 31| 29| 6| 6| -|129963.15|153814.06| 22| 24| 255| 255|3646899201|3651364285| 89| 96| 0| 0| 435.26627|417.08563|1421970775|1421970775| 5.460381| 4.976913|7.0999999E-4| 5.73E-4| 1.37E-4| 0| 0| 0| 0| 0| 2| 6| 1| 4| 1| 1| 1|null| 0| |59.166.0.7|19792|149.171.126.0| 53|udp|CON| 0.003362| 146| 178| 31| 29| 0| 0| dns|173706.13| 211778.7| 2| 2| 0| 0| 0| 0| 73| 89| 0| 0| 0.0| 0.0|1421970775|1421970775| 0.011|0.0060000001| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 3| 2| 3| 3| 3| 1| 1|null| 0| |59.166.0.3|14382|149.171.126.9| 3354|tcp|FIN| 0.45305201| 424| 8824| 31| 29| 1| 4|ftp-data| 6551.124| 142835.7| 8| 12| 255| 255|2206905053|3307670308| 53| 735| 0| 0| 3906.7949|3074.6694|1421970775|1421970775| 64.671288| 41.134998| 6.8E-4|5.5900001E-4| 1.21E-4| 0| 0| 0| 0| 0| 4| 6| 7| 4| 1| 1| 2|null| 0| |59.166.0.9|37074|149.171.126.2| 53|udp|CON| 0.001088| 146| 178| 31| 29| 0| 0| dns|536764.69|654411.75| 2| 2| 0| 0| 0| 0| 73| 89| 0| 0| 0.0| 0.0|1421970775|1421970775| 0.001|0.0089999996| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 2| 5| 3| 2| 1| 1| 1|null| 0| |59.166.0.7|12569|149.171.126.5| 53|udp|CON|9.6899999E-4| 146| 178| 31| 29| 0| 0| dns|602683.19|734778.13| 2| 2| 0| 0| 0| 0| 73| 89| 0| 0| 0.0| 0.0|1421970775|1421970775|0.0099999998| 0.003| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 3| 1| 2| 3| 3| 1| 1|null| 0| |59.166.0.1|12792|149.171.126.7| 53|udp|CON|0.0010629999| 146| 178| 31| 29| 0| 0| dns|549388.56| 669802.5| 2| 2| 0| 0| 0| 0| 73| 89| 0| 0| 0.0| 0.0|1421970780|1421970780|0.0020000001| 0.003| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 5| 3| 2| 4| 3| 1| 1|null| 0| |59.166.0.0|63414|149.171.126.9|10330|tcp|FIN| 0.26501101| 8928| 320| 31| 29| 4| 1|ftp-data|250283.94|8060.0425| 14| 6| 255| 255|2386904726|3699988372|638| 53| 0| 0| 1725.2916|71.464249|1421970775|1421970776| 20.385462| 52.644802|7.7500002E-4|6.3299999E-4| 1.42E-4| 0| 0| 0| 0| 0| 3| 6| 6| 5| 1| 1| 2|null| 0| |59.166.0.1|33555|149.171.126.3| 6881|tcp|FIN| 0.51712799| 1540| 1644| 31| 29| 4| 4| -|22338.764| 24025.0| 16| 18| 255| 255|1741520309|3943579644| 96| 91| 0| 0| 2036.1301| 51.91766|1421970776|1421970776| 34.433331| 30.388353|6.5399997E-4|5.2200002E-4| 1.32E-4| 0| 0| 0| 0| 0| 4| 6| 5| 7| 4| 1| 4|null| 0| |59.166.0.8|10867|149.171.126.8| 111|udp|CON|0.0053389999| 568| 312| 31| 29| 0| 0| -|638321.81|350627.47| 4| 4| 0| 0| 0| 0|142| 78| 0| 0| 1.7430201| 1.757632|1421970776|1421970776| 1.24| 1.252333| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 16| 7| 5| 5| 1| 1| 4|null| 0| |59.166.0.8|12411|149.171.126.8| 1715|udp|CON| 0.001739| 512| 304| 31| 29| 0| 0| -|1766532.5|1048878.6| 4| 4| 0| 0| 0| 0|128| 76| 0| 0|0.63026798| 0.318434|1421970776|1421970776| 0.44966701| 0.227667| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 16| 7| 5| 5| 1| 1| 4|null| 0| |59.166.0.8|46725|149.171.126.2| 53|udp|CON| 0.001018| 146| 178| 31| 29| 0| 0| dns|573673.88|699410.63| 2| 2| 0| 0| 0| 0| 73| 89| 0| 0| 0.0| 0.0|1421970776|1421970776| 0.011| 0.011| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 2| 5| 3| 5| 1| 1| 1|null| 0| |59.166.0.1|51562|149.171.126.4| 53|udp|CON| 0.001044| 146| 178| 31| 29| 0| 0| dns|559386.94|681992.31| 2| 2| 0| 0| 0| 0| 73| 89| 0| 0| 0.0| 0.0|1421970776|1421970776| 0.011| 0.003| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 2| 3| 5| 7| 2| 1| 3|null| 0| |59.166.0.3|48838|149.171.126.2| 53|udp|CON|9.8699995E-4| 146| 178| 31| 29| 0| 0| dns| 591692.0|721377.94| 2| 2| 0| 0| 0| 0| 73| 89| 0| 0| 0.0| 0.0|1421970776|1421970776|0.0080000004| 0.003| 0.0| 0.0| 0.0| 0| 0| 0| 0| 0| 2| 5| 3| 3| 1| 1| 1|null| 0| |59.166.0.0|16907|149.171.126.9| 21|tcp|FIN| 2.2547121| 2934| 3740| 31| 29| 11| 15| ftp|10211.503|13025.166| 52| 54| 255| 255| 241515551|2584135680| 56| 69| 0| 0| 3141.3708| 99.93412|1421970773|1421970776| 44.203648| 42.531734|6.6100003E-4|5.2499998E-4| 1.36E-4| 0| 0| 0| 0| 0| 1| 3| 6| 5| 1| 1| 2|null| 0| |59.166.0.0| 1915|149.171.126.4|32945|tcp|FIN| 0.051220998| 2854| 29104| 31| 29| 7| 17| -|436071.16|4450987.0| 46| 48| 255| 255|1921515932|1974066994| 62| 606| 0| 0| 78.226654|76.387978|1421970776|1421970776| 1.1294219| 1.0781699| 8.1E-4| 5.43E-4| 2.67E-4| 0| 0| 0| 0| 0| 7| 2| 5| 5| 1| 1| 2|null| 0| |59.166.0.4| 1309|149.171.126.8|52139|tcp|FIN| 0.091375001| 4238| 65628| 31| 29| 7| 30| -|365964.44|5668246.0| 72| 74| 255| 255|1681522413|1731423574| 59| 887| 0| 0| 0.0|77.518913|1421970776|1421970776| 1.268014| 1.24474|6.3899998E-4| 5.05E-4| 1.34E-4| 0| 0| 0| 0| 0| 2| 7| 5| 2| 1| 1| 1|null| 0| +----------+-----+-------------+-----+---+---+------------+-----+------+---+---+---+---+--------+---------+---------+---+---+-----+-----+----------+----------+---+----+---+---+----------+---------+----------+----------+------------+------------+------------+------------+------------+---+---+---+---+---+---+---+---+---+---+---+---+----+---+ only showing top 20 rows
dataset.columns
['59.166.0.3', '56716', '149.171.126.8', '143', 'tcp', 'FIN', '0.82546002', '7812', '16236', '31', '29', '30', '32', '-', '75090.25', '156111.73', '122', '126', '25518', '25519', '2751097753', '2748686736', '64', '129', '024', '025', '445.25928', '474.9451', '1421970774', '1421970775', '6.8190908', '6.599896', '5.9700001E-4', '4.6899999E-4', '0.000128', '035', '036', '037', '038', '039', '2', '7', '142', '4', '144', '145', '146', '_c47', '048']
dataset.count()
2539738
dataset.head()
Row(59.166.0.3='59.166.0.0', 56716=43467, 149.171.126.8='149.171.126.6', 143=49729, tcp='tcp', FIN='FIN', 0.82546002=0.101815, 7812=4238, 16236=65628, 31=31, 29=29, 30=7, 32=30, -='-', 75090.25=328438.84, 156111.73=5087030.5, 122=72, 126=74, 25518=255, 25519=255, 2751097753=961515433, 2748686736=3225510659, 64=59, 129=887, 024=0, 025=0, 445.25928=0.0, 474.9451=91.579567, 1421970774=1421970775, 1421970775=1421970775, 6.8190908=1.429493, 6.599896=1.387192, 5.9700001E-4=0.00068, 4.6899999E-4=0.00054600002, 0.000128=0.000134, 035=0, 036=0, 037=0, 038=0, 039=0, 2=7, 7=4, 142=1, 4=6, 144=1, 145=1, 146=1, _c47=None, 048=0)
dataset.take(2)
[Row(59.166.0.3='59.166.0.0', 56716=43467, 149.171.126.8='149.171.126.6', 143=49729, tcp='tcp', FIN='FIN', 0.82546002=0.101815, 7812=4238, 16236=65628, 31=31, 29=29, 30=7, 32=30, -='-', 75090.25=328438.84, 156111.73=5087030.5, 122=72, 126=74, 25518=255, 25519=255, 2751097753=961515433, 2748686736=3225510659, 64=59, 129=887, 024=0, 025=0, 445.25928=0.0, 474.9451=91.579567, 1421970774=1421970775, 1421970775=1421970775, 6.8190908=1.429493, 6.599896=1.387192, 5.9700001E-4=0.00068, 4.6899999E-4=0.00054600002, 0.000128=0.000134, 035=0, 036=0, 037=0, 038=0, 039=0, 2=7, 7=4, 142=1, 4=6, 144=1, 145=1, 146=1, _c47=None, 048=0), Row(59.166.0.3='59.166.0.5', 56716=41289, 149.171.126.8='149.171.126.2', 143=9574, tcp='tcp', FIN='FIN', 0.82546002=0.044002999, 7812=2750, 16236=29104, 31=31, 29=29, 30=7, 32=17, -='-', 75090.25=488693.97, 156111.73=5181101.5, 122=44, 126=48, 25518=255, 25519=255, 2751097753=3291096757, 2748686736=1191410228, 64=63, 129=606, 024=0, 025=0, 445.25928=78.126968, 474.9451=62.206562, 1421970774=1421970775, 1421970775=1421970775, 6.8190908=1.014977, 6.599896=0.92583001, 5.9700001E-4=0.00125, 4.6899999E-4=0.000485, 0.000128=0.000765, 035=0, 036=0, 037=0, 038=0, 039=0, 2=3, 7=5, 142=3, 4=3, 144=1, 145=1, 146=1, _c47=None, 048=0)]
def AddNormalLabel(line):
line = line.split(",")
if line[47] == "":
line[47] = 'Normal'
return line
updated_dataset = dataset.map(AddNormalLabel) stops working here!
dataset.printSchema()
root |-- 59.166.0.3: string (nullable = true) |-- 56716: integer (nullable = true) |-- 149.171.126.8: string (nullable = true) |-- 143: integer (nullable = true) |-- tcp: string (nullable = true) |-- FIN: string (nullable = true) |-- 0.82546002: double (nullable = true) |-- 7812: integer (nullable = true) |-- 16236: integer (nullable = true) |-- 31: integer (nullable = true) |-- 29: integer (nullable = true) |-- 30: integer (nullable = true) |-- 32: integer (nullable = true) |-- -: string (nullable = true) |-- 75090.25: double (nullable = true) |-- 156111.73: double (nullable = true) |-- 122: integer (nullable = true) |-- 126: integer (nullable = true) |-- 25518: integer (nullable = true) |-- 25519: integer (nullable = true) |-- 2751097753: long (nullable = true) |-- 2748686736: long (nullable = true) |-- 64: integer (nullable = true) |-- 129: integer (nullable = true) |-- 024: integer (nullable = true) |-- 025: integer (nullable = true) |-- 445.25928: double (nullable = true) |-- 474.9451: double (nullable = true) |-- 1421970774: integer (nullable = true) |-- 1421970775: integer (nullable = true) |-- 6.8190908: double (nullable = true) |-- 6.599896: double (nullable = true) |-- 5.9700001E-4: double (nullable = true) |-- 4.6899999E-4: double (nullable = true) |-- 0.000128: double (nullable = true) |-- 035: integer (nullable = true) |-- 036: integer (nullable = true) |-- 037: integer (nullable = true) |-- 038: integer (nullable = true) |-- 039: integer (nullable = true) |-- 2: integer (nullable = true) |-- 7: integer (nullable = true) |-- 142: integer (nullable = true) |-- 4: integer (nullable = true) |-- 144: integer (nullable = true) |-- 145: integer (nullable = true) |-- 146: integer (nullable = true) |-- _c47: string (nullable = true) |-- 048: integer (nullable = true)
cols = dataset.columns
# Let us import the vector assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols,outputCol="features")
# Now let us use the transform method to transform our dataset
dataset=assembler.transform(dataset)
dataset.select("features").show(truncate=False)
--------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) ~/Documents/spark-3.0.0/python/pyspark/sql/utils.py in deco(*a, **kw) 97 try: ---> 98 return f(*a, **kw) 99 except py4j.protocol.Py4JJavaError as e: ~/Documents/spark-3.0.0/python/lib/py4j-0.10.8.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 327 "An error occurred while calling {0}{1}{2}.\n". --> 328 format(target_id, ".", name), value) 329 else: Py4JJavaError: An error occurred while calling o46.transform. : java.lang.IllegalArgumentException: Data type string of column 59.166.0.3 is not supported. Data type string of column 149.171.126.8 is not supported. Data type string of column tcp is not supported. Data type string of column FIN is not supported. Data type string of column - is not supported. Data type string of column _c47 is not supported. at org.apache.spark.ml.feature.VectorAssembler.transformSchema(VectorAssembler.scala:168) at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:75) at org.apache.spark.ml.feature.VectorAssembler.transform(VectorAssembler.scala:85) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:566) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.base/java.lang.Thread.run(Thread.java:834) During handling of the above exception, another exception occurred: IllegalArgumentException Traceback (most recent call last) <ipython-input-15-cb1addfb5fab> in <module> 5 assembler = VectorAssembler(inputCols=cols,outputCol="features") 6 # Now let us use the transform method to transform our dataset ----> 7 dataset=assembler.transform(dataset) 8 dataset.select("features").show(truncate=False) ~/Documents/spark-3.0.0/python/pyspark/ml/base.py in transform(self, dataset, params) 170 return self.copy(params)._transform(dataset) 171 else: --> 172 return self._transform(dataset) 173 else: 174 raise ValueError("Params must be a param map but got %s." % type(params)) ~/Documents/spark-3.0.0/python/pyspark/ml/wrapper.py in _transform(self, dataset) 336 def _transform(self, dataset): 337 self._transfer_params_to_java() --> 338 return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) 339 340 ~/Documents/spark-3.0.0/python/lib/py4j-0.10.8.1-src.zip/py4j/java_gateway.py in __call__(self, *args) 1284 answer = self.gateway_client.send_command(command) 1285 return_value = get_return_value( -> 1286 answer, self.gateway_client, self.target_id, self.name) 1287 1288 for temp_arg in temp_args: ~/Documents/spark-3.0.0/python/pyspark/sql/utils.py in deco(*a, **kw) 100 converted = convert_exception(e.java_exception) 101 if not isinstance(converted, UnknownException): --> 102 raise converted 103 else: 104 raise IllegalArgumentException: Data type string of column 59.166.0.3 is not supported. Data type string of column 149.171.126.8 is not supported. Data type string of column tcp is not supported. Data type string of column FIN is not supported. Data type string of column - is not supported. Data type string of column _c47 is not supported.