val rdd0_1000000 = sc.parallelize(Seq.range(0, 1000000)) // <Shift+Enter> to create an RDD of a million integers: 0,1,2,...,999999
rdd0_1000000: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[181332] at parallelize at <console>:34
rdd0_1000000.take(5) // <Ctrl+Enter> gives the first 5 elements of the RDD, (0, 1, 2, 3, 4)
res0: Array[Int] = Array(0, 1, 2, 3, 4)
rdd0_1000000.takeOrdered(5) // <Shift+Enter> gives the same result as rdd0_1000000.take(5) here: the 5 smallest elements
res1: Array[Int] = Array(0, 1, 2, 3, 4)
rdd0_1000000.takeOrdered(5)(Ordering[Int].reverse) // <Ctrl+Enter> to get the last 5 elements of the RDD 999999, 999998, ..., 999995
res2: Array[Int] = Array(999999, 999998, 999997, 999996, 999995)
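// ASIDE: RDD.top(k) is a shorthand for takeOrdered(k) under the reversed natural ordering,
// so this sketch should return the same Array(999999, 999998, 999997, 999996, 999995) as above
rdd0_1000000.top(5) // <Ctrl+Enter> should give the 5 largest elements in descending order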
// HOMEWORK: edit the numbers below to get the last 20 elements of an RDD made of a sequence of integers from 669966 to 969696
sc.parallelize(Seq.range(0, 10)).takeOrdered(5)(Ordering[Int].reverse) // <Ctrl+Enter> evaluate this cell after editing it for the right answer
res3: Array[Int] = Array(9, 8, 7, 6, 5)
val rdd = sc.parallelize(Seq(1, 2, 3, 4)) // <Shift+Enter> to evaluate this cell (using default number of partitions)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[181337] at parallelize at <console>:34
rdd.map( x => x*2) // <Ctrl+Enter> to transform rdd with a map that doubles each element
res4: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[181338] at map at <console>:36
rdd.map( x => x*2).count() // <Shift+Enter> to perform the count action: the number of elements in the RDD = 4
res5: Long = 4
rdd.map( x => x*2).collect() // <Shift+Enter> to perform collect (action) to show 2, 4, 6, 8
res6: Array[Int] = Array(2, 4, 6, 8)
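// ASIDE: map is a lazy transformation, so nothing is computed until an action such as count() or collect() runs;
// as a sketch, toDebugString should print the recorded lineage (a MapPartitionsRDD on top of the ParallelCollectionRDD)
rdd.map( x => x*2 ).toDebugString // <Ctrl+Enter> to inspect the lineage of the transformed RDD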
// HOMEWORK: uncomment the last line in this cell and modify the '<Fill-In-Here>' in the code below
// to collect and display the square (x*x) of each element of the RDD
// the answer should be Array[Int] = Array(1, 4, 9, 16)
// Press <Ctrl+Enter> to evaluate the cell after modifying '<Fill-In-Here>'
//sc.parallelize(Seq(1, 2, 3, 4)).map( x => <Fill-In-Here> ).collect()
val rddFiltered = rdd.filter( x => x%2==0 ) // <Ctrl+Enter> to declare rddFiltered by filtering rdd down to its even elements
rddFiltered: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[181341] at filter at <console>:34
rddFiltered.collect() // <Ctrl+Enter> to collect (action) elements of rddFiltered; should be (2, 4)
res8: Array[Int] = Array(2, 4)
val rdd = sc.parallelize(Array(1,2,3,4,5))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[181342] at parallelize at <console>:34
rdd.reduce( (x,y)=>x+y ) // <Shift+Enter> to do reduce (action) to sum and return Int = 15
res9: Int = 15
rdd.reduce( _ + _ ) // <Shift+Enter> to do the same sum as above and return Int = 15 (underscore syntax)
res10: Int = 15
rdd.reduce( (x,y)=>x*y ) // <Shift+Enter> to do reduce (action) to multiply and return Int = 120
res11: Int = 120
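// ASIDE: for numeric RDDs Spark also offers a sum() action (via an implicit conversion to DoubleRDDFunctions);
// as a sketch it should return the same total as the reduce above, but as a Double
rdd.sum() // <Ctrl+Enter> should return Double = 15.0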
val rdd0_1000000 = sc.parallelize(Seq.range(0, 1000000)) // <Shift+Enter> to create an RDD of a million integers: 0,1,2,...,999999
rdd0_1000000: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[181343] at parallelize at <console>:34
rdd0_1000000.reduce( (x,y)=>x+y ) // <Ctrl+Enter> to do reduce (action) to sum and return Int = 1783293664 (this has silently overflowed Int's limits!)
res12: Int = 1783293664
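// ASIDE: the true sum is 0+1+...+999999 = 499999500000, which does not fit in an Int;
// a minimal sketch of a fix: map to Long before reducing
rdd0_1000000.map( x => x.toLong ).reduce( (x,y) => x+y ) // <Ctrl+Enter> should return Long = 499999500000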
// the following correctly returns Int = 0, although for the wrong reason:
// we have flowed out of Int's numeric limits!!! (but got lucky, since 0*x=0 for any Int x)
// <Shift+Enter> to do reduce (action) to multiply and return Int = 0
rdd0_1000000.reduce( (x,y)=>x*y )
res13: Int = 0
// <Ctrl+Enter> to do reduce (action) to multiply 1*2*...*9*10 and return the correct answer Int = 3628800
sc.parallelize(Seq.range(1, 11)).reduce( (x,y)=>x*y )
res14: Int = 3628800
// <Ctrl+Enter> to do reduce (action) to multiply 1*2*...*20 and return the wrong answer Int = -2102132736
// we have overflowed out of Int's range and wrapped around to negative Ints!!! (rigorous distributed numerics, anyone?)
sc.parallelize(Seq.range(1, 21)).reduce( (x,y)=>x*y )
res17: Int = -2102132736
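// ASIDE: 20! = 2432902008176640000 still fits in a Long, so as a sketch, mapping to Long before reducing should give the correct product;
// for arbitrarily large factorials, mapping to BigInt would work too
sc.parallelize(Seq.range(1, 21)).map( x => x.toLong ).reduce( (x,y) => x*y ) // <Ctrl+Enter> should return Long = 2432902008176640000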
SDS-2.2-360-in-525-01: Intro to Apache Spark for Data Scientists
SDS-2.2, Scalable Data Science