In [3]:
# Sample integers used as the source collection for the RDD examples.
data = [1, 2, 4, 7, 11, 15, 20]
data
Out[3]:
[1, 2, 4, 7, 11, 15, 20]
In [7]:
# Turn the local list into an RDD, asking Spark to split it into 4 partitions.
rdd = sc.parallelize(data, numSlices=4)
In [8]:
# Display the RDD object itself (its repr), not the elements it holds.
rdd
Out[8]:
ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:247
In [9]:
# collect() is an action: it returns the RDD's elements as a plain Python list.
rdd.collect()
Out[9]:
[1, 2, 4, 7, 11, 15, 20]
In [10]:
# map() is a transformation: the function is applied to every element,
# producing exactly one output value per input value.
rdd1 = rdd.map(lambda value: (value + 2) * 4)
rdd1.collect()
Out[10]:
[12, 16, 24, 36, 52, 68, 88]
In [11]:
# filter() keeps the elements for which the function returns a truthy value.
# (value + 2) * 4 is non-zero for every element of `data`, so nothing is
# dropped; unlike map(), the elements themselves come back unchanged.
rdd2 = rdd.filter(lambda value: (value + 2) * 4)
rdd2.collect()
Out[11]:
[1, 2, 4, 7, 11, 15, 20]
In [12]:
# Keep only the elements that are exact multiples of 3.
rdd3 = rdd.filter(lambda value: value % 3 == 0)
rdd3.collect()
Out[12]:
[15]
In [13]:
# The same predicate passed to map() yields one boolean per element
# instead of selecting elements.
rdd4 = rdd.map(lambda value: value % 3 == 0)
rdd4.collect()
Out[13]:
[False, False, False, False, False, True, False]
In [14]:
# An RDD that contains repeated values (2, 7 and 40 each occur more than once).
rdd5 = sc.parallelize([4, 2, 2, 6, 7, 7, 19, 40, 41, 40, 40])
In [15]:
# distinct() returns another RDD; the cell shows that RDD's repr, not its
# elements. Call collect() on it to see the values.
rdd5.distinct()
Out[15]:
PythonRDD[10] at RDD at PythonRDD.scala:53
In [16]:
# Collect the de-duplicated elements. Note that the output order differs
# from the order of the input list.
rdd5.distinct().collect()
Out[16]:
[4, 40, 41, 2, 6, 7, 19]
In [17]:
# Small RDD used to contrast map() with flatMap().
rdd6 = sc.parallelize([1, 2, 3, 4])
In [18]:
# map() keeps one output per input, so each element becomes its own
# three-item list and the result is a list of lists.
rdd7 = rdd6.map(lambda value: [value, value + 2, value + 7])
rdd7.collect()
Out[18]:
[[1, 3, 8], [2, 4, 9], [3, 5, 10], [4, 6, 11]]
In [20]:
# flatMap() applies the same function but flattens the returned lists,
# so the result is a single flat sequence of values.
rdd8 = rdd6.flatMap(lambda value: [value, value + 2, value + 7])
rdd8.collect()
Out[20]:
[1, 3, 8, 2, 4, 9, 3, 5, 10, 4, 6, 11]
In [21]:
# Print the map() result as plain text: a list of three-item lists.
print(rdd7.collect())
[[1, 3, 8], [2, 4, 9], [3, 5, 10], [4, 6, 11]]
In [22]:
# Print the flatMap() result as plain text: one flat list of values.
print(rdd8.collect())
[1, 3, 8, 2, 4, 9, 3, 5, 10, 4, 6, 11]
In [23]:
# reduce() folds the elements pairwise with the given function; here it
# multiplies every element of rdd8 together into a single number.
rdd8.reduce(lambda left, right: left * right)
Out[23]:
68428800
In [ ]: