// Shift+Enter to make RDD x and RDD y that is mapped from x
val x = sc.parallelize(Array("b", "a", "c")) // make RDD x: [b, a, c]
val y = x.map(z => (z, 1))                   // map x into RDD y: [(b, 1), (a, 1), (c, 1)]
x: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[89] at parallelize at command-112937334110684:2
y: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[90] at map at command-112937334110684:3
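Note that map is a lazy transformation, so the cell above builds y without computing it. As a small illustrative check (an addition, not part of the original notebook), collecting y should return the pairs promised in the comment:

y.collect() // expected: Array((b,1), (a,1), (c,1))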
//Shift+Enter to make RDD x and filter it by (n => n % 2 == 1) to make RDD y
val x = sc.parallelize(Array(1, 2, 3))
// the closure (n => n % 2 == 1) in the filter will
// return true if element n in RDD x has remainder 1 when divided by 2 (i.e., if n is odd)
val y = x.filter(n => n % 2 == 1)
x: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[91] at parallelize at command-112937334110688:2
y: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[92] at filter at command-112937334110688:5
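filter is likewise lazy; to confirm that only the odd elements survive, one could collect y (an added check, not in the original cell):

y.collect() // expected: Array(1, 3), the odd elements of [1, 2, 3]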
//Shift+Enter to make RDD x and flatMap it into RDD y by the closure (n => Array(n, n*100, 42))
val x = sc.parallelize(Array(1, 2, 3))
val y = x.flatMap(n => Array(n, n*100, 42))
x: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[100] at parallelize at command-112937334110696:2
y: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[101] at flatMap at command-112937334110696:3
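flatMap emits zero or more output elements per input element and flattens the results into a single RDD. Collecting y (an illustrative addition, not in the original cell) would therefore show three output elements for each of the three inputs:

y.collect() // expected: Array(1, 100, 42, 2, 200, 42, 3, 300, 42)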
// Ctrl+Enter to make RDD words and display it by collect
val words = sc.parallelize(Array("a", "b", "a", "a", "b", "b", "a", "a", "a", "b", "b"))
words.collect()
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[114] at parallelize at command-112937334110699:2
res30: Array[String] = Array(a, b, a, a, b, b, a, a, a, b, b)
// Ctrl+Enter to make and collect the Pair RDD wordCountPairRDD
val wordCountPairRDD = words.map(s => (s, 1))
wordCountPairRDD.collect()
wordCountPairRDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[115] at map at command-112937334110701:2
res31: Array[(String, Int)] = Array((a,1), (b,1), (a,1), (a,1), (b,1), (b,1), (a,1), (a,1), (a,1), (b,1), (b,1))
// Ctrl+Enter to reduceByKey and collect the wordcounts RDD
//val wordcounts = wordCountPairRDD.reduceByKey( _ + _ )
val wordcounts = wordCountPairRDD.reduceByKey((value1, value2) => value1 + value2)
wordcounts.collect()
wordcounts: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[116] at reduceByKey at command-112937334110704:3
res32: Array[(String, Int)] = Array((a,6), (b,5))
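As an aside (an addition, not in the original notebook), Spark's built-in countByValue action computes the same tallies directly from the words RDD, returning a Scala Map on the driver rather than an RDD:

words.countByValue() // expected: Map(a -> 6, b -> 5) (entry order may vary)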
//Ctrl+Enter to make words RDD and do the word count in two lines
val words = sc.parallelize(Array("a", "b", "a", "a", "b", "b", "a", "a", "a", "b", "b"))
val wordcounts = words
                 .map(s => (s, 1))
                 .reduceByKey(_ + _)
                 .collect()
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[117] at parallelize at command-112937334110706:2
wordcounts: Array[(String, Int)] = Array((a,6), (b,5))
// Shift+Enter and comprehend the code
val words = sc.parallelize(Array("a", "b", "a", "a", "b", "b", "a", "a", "a", "b", "b"))
val wordCountPairRDD = words.map(s => (s, 1))
val wordCountPairRDDSortedByKey = wordCountPairRDD.sortByKey()
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[120] at parallelize at command-112937334110708:2
wordCountPairRDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[121] at map at command-112937334110708:3
wordCountPairRDDSortedByKey: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[124] at sortByKey at command-112937334110708:4
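sortByKey is also a lazy transformation, so the sorted pair RDD is never materialized above. Collecting it (an illustrative addition, not in the original cell) would show the pairs ordered by key, with all the a-pairs before the b-pairs:

wordCountPairRDDSortedByKey.collect()
// expected: Array((a,1), (a,1), (a,1), (a,1), (a,1), (a,1), (b,1), (b,1), (b,1), (b,1), (b,1))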