println(sc)
println(sqlContext)
org.apache.spark.SparkContext@31081837
org.apache.spark.sql.hive.HiveContext@774252ff
val x = sc.parallelize(Array(1, 2, 3), 2) // <Ctrl+Enter> to evaluate this cell (using 2 partitions)
x: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[181331] at parallelize at <console>:34
x.collect() // <Ctrl+Enter> to collect (action) elements of rdd; should be (1, 2, 3)
res0: Array[Int] = Array(1, 2, 3)
// <Ctrl+Enter> to evaluate this cell and find the number of partitions in RDD x
x.getNumPartitions
res1: Int = 2
x.glom().collect() // glom() groups the elements of each partition into an array
res2: Array[Array[Int]] = Array(Array(1), Array(2, 3))
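A small sketch (not in the original notebook): asking for more partitions than elements makes the empty partitions visible via glom().
sc.parallelize(Array(1, 2, 3), 4).glom().collect() // e.g. Array(Array(), Array(1), Array(2), Array(3))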
val x = sc.parallelize(Seq(1, 2, 3)) // <Shift+Enter> to evaluate this cell (using default number of partitions)
//x.take( ) // uncomment by removing '//' before x in the cell, fill in the parentheses to take just one element from RDD x, and press Ctrl+Enter
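One possible completion of the exercise above (the argument is left blank in the original): take(1) returns the first element as a local Array.
x.take(1) // Array(1)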
// <Shift+Enter> to make RDD x and RDD y that is mapped from x
val x = sc.parallelize(Array("b", "a", "c")) // make RDD x: [b, a, c]
val y = x.map(z => (z,1))                    // map x into RDD y: [(b, 1), (a, 1), (c, 1)]
x: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[734] at parallelize at <console>:37
y: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[735] at map at <console>:38
// <Ctrl+Enter> to collect and print the two RDDs
println(x.collect().mkString(", "))
println(y.collect().mkString(", "))
b, a, c
(b,1), (a,1), (c,1)
// <Shift+Enter> to make RDD x and filter it by (n => n%2 == 1) to make RDD y
val x = sc.parallelize(Array(1,2,3))
// the closure (n => n%2 == 1) in the filter will
// return true if element n in RDD x has remainder 1 when divided by 2 (i.e., if n is odd)
val y = x.filter(n => n%2 == 1)
x: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[768] at parallelize at <console>:37
y: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[769] at filter at <console>:40
// <Ctrl+Enter> to collect and print the two RDDs
println(x.collect().mkString(", "))
println(y.collect().mkString(", "))
1, 2, 3
1, 3
// <Shift+Enter> to make RDD x of integers 1,2,3,4 and reduce it to its sum
val x = sc.parallelize(Array(1,2,3,4))
val y = x.reduce((a,b) => a+b)
x: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[808] at parallelize at <console>:37
y: Int = 10
// <Ctrl+Enter> to collect and print RDD x and the Int y, the sum of x
println(x.collect.mkString(", "))
println(y)
1, 2, 3, 4
10
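A small sketch (not in the original notebook), reusing the RDD x above: reduce's function should be associative and commutative because partial results from different partitions are combined in no fixed order; fold and sum give the same total.
x.fold(0)(_ + _) // 10: like reduce, but with an explicit zero element
x.sum()          // 10.0: numeric RDDs also provide sum, which returns a Double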
// <Shift+Enter> to make RDD x and flatMap it into RDD y by the closure (n => Array(n, n*100, 42))
val x = sc.parallelize(Array(1,2,3))
val y = x.flatMap(n => Array(n, n*100, 42))
x: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[844] at parallelize at <console>:37
y: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[845] at flatMap at <console>:38
// <Ctrl+Enter> to collect and print RDDs x and y
println(x.collect().mkString(", "))
println(y.collect().mkString(", "))
1, 2, 3
1, 100, 42, 2, 200, 42, 3, 300, 42
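For contrast (a sketch reusing the RDD x above): the same closure under map keeps each returned Array as a single nested element instead of flattening it.
val nested = x.map(n => Array(n, n*100, 42)) // RDD[Array[Int]] rather than RDD[Int]
nested.collect()                             // Array(Array(1, 100, 42), Array(2, 200, 42), Array(3, 300, 42))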
// <Ctrl+Enter> to make RDD words and display it by collect
val words = sc.parallelize(Array("a", "b", "a", "a", "b", "b", "a", "a", "a", "b", "b"))
words.collect()
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[859] at parallelize at <console>:35
res13: Array[String] = Array(a, b, a, a, b, b, a, a, a, b, b)
// <Ctrl+Enter> to make and collect the Pair RDD wordCountPairRDD
val wordCountPairRDD = words.map(s => (s, 1))
wordCountPairRDD.collect()
wordCountPairRDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[867] at map at <console>:37
res14: Array[(String, Int)] = Array((a,1), (b,1), (a,1), (a,1), (b,1), (b,1), (a,1), (a,1), (a,1), (b,1), (b,1))
// <Ctrl+Enter> to reduceByKey and collect the wordcounts RDD
//val wordcounts = wordCountPairRDD.reduceByKey( _ + _ )
val wordcounts = wordCountPairRDD.reduceByKey( (v1,v2) => v1+v2 )
wordcounts.collect()
wordcounts: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[908] at reduceByKey at <console>:42
res16: Array[(String, Int)] = Array((a,6), (b,5))
// <Ctrl+Enter> to make the words RDD and do the word count in two lines
val words = sc.parallelize(Array("a", "b", "a", "a", "b", "b", "a", "a", "a", "b", "b"))
val wordcounts = words.map(s => (s, 1)).reduceByKey(_ + _).collect()
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[914] at parallelize at <console>:37
wordcounts: Array[(String, Int)] = Array((a,6), (b,5))
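An equivalent shortcut (a sketch, not in the original notebook): countByValue computes the same counts but aggregates on the driver and returns a local Map rather than an RDD, so it only suits a small number of distinct values.
words.countByValue() // e.g. Map(a -> 6, b -> 5)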
// <Shift+Enter> and comprehend the code
val words = sc.parallelize(Array("a", "b", "a", "a", "b", "b", "a", "a", "a", "b", "b"))
val wordCountPairRDD = words.map(s => (s, 1))
val wordCountPairRDDSortedByKey = wordCountPairRDD.sortByKey()
val wordCountPairRDDGroupByKey = wordCountPairRDD.groupByKey() // <Shift+Enter> CAUTION: this transformation can be very wide!
wordCountPairRDDGroupByKey: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[984] at groupByKey at <console>:38
wordCountPairRDDGroupByKey.collect() // <Ctrl+Enter>
res19: Array[(String, Iterable[Int])] = Array((a,CompactBuffer(1, 1, 1, 1, 1, 1)), (b,CompactBuffer(1, 1, 1, 1, 1)))
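The grouped RDD can be turned into the same counts by summing each key's Iterable (a sketch); reduceByKey is usually preferred for this kind of aggregation because it pre-aggregates within each partition before the shuffle, whereas groupByKey ships every (word, 1) pair across the network.
wordCountPairRDDGroupByKey.mapValues(_.sum).collect() // Array((a,6), (b,5))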
val list = 1 to 10
var sum = 0
list.map(x => sum = sum + x)
print(sum)
55
list: scala.collection.immutable.Range.Inclusive = Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
sum: Int = 55
val rdd = sc.parallelize(1 to 10)
var sum = 0
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[358776] at parallelize at <console>:34
sum: Int = 0
val rdd1 = rdd.map(x => sum = sum + x) // each task would update its own copy of sum; the driver's sum stays 0
rdd1: org.apache.spark.rdd.RDD[Unit] = MapPartitionsRDD[358778] at map at <console>:38
val rdd1 = rdd.map(x => { var sum = 0; sum = sum + x; sum }) // here sum is local to each element's closure, so rdd1 has the same values as rdd
rdd1: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[358779] at map at <console>:38
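The cells above show the pitfall but not a fix: each task mutates its own copy of the closure's sum, so the driver-side sum never changes. A sketch of two approaches that do sum across partitions (assuming the same sc; longAccumulator is the Spark 2.x accumulator API):
val rdd = sc.parallelize(1 to 10)
rdd.reduce(_ + _)                   // 55: aggregate with an action

val acc = sc.longAccumulator("sum") // accumulator whose value is visible to the driver
rdd.foreach(x => acc.add(x))        // foreach is an action, so the adds actually execute
acc.value                           // 55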
SDS-2.2-360-in-525-01: Intro to Apache Spark for Data Scientists
SDS-2.2, Scalable Data Science