// TestKey is assumed to have been defined in an earlier, ordinary notebook cell, e.g.
// case class TestKey(id: Long, str: String)  (field names assumed; the types match the calls below)
// We use that class as a key in the groupByKey:
val rdd = sc.parallelize(Array(
  (TestKey(1L, "abd"), "dss"), (TestKey(2L, "ggs"), "dse"), (TestKey(1L, "abd"), "qrf")))
rdd.groupByKey().collect
rdd: org.apache.spark.rdd.RDD[(TestKey, String)] = ParallelCollectionRDD[0] at parallelize at command-2972105651607260:2
res0: Array[(TestKey, Iterable[String])] = Array((TestKey(2,ggs),CompactBuffer(dse)), (TestKey(1,abd),CompactBuffer(qrf)), (TestKey(1,abd),CompactBuffer(dss)))
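Note that TestKey(1,abd) lands in two separate groups: the notebook REPL wraps each cell in its own generated class, so seemingly identical case-class instances created under different wrappers may not compare equal, and grouping by such keys silently misbehaves. The remedy is to define the class once in a package cell. A minimal sketch of that cell, assuming the field names (only the types, a Long and a String, are visible in the calls above):

package com.databricks.example

case class TestKey(id: Long, str: String)  // field names assumed

Re-running the same job with the package-qualified TestKey then groups correctly: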
import com.databricks.example
val rdd = sc.parallelize(Array(
(example.TestKey(1L, "abd"), "dss"), (example.TestKey(2L, "ggs"), "dse"), (example.TestKey(1L, "abd"), "qrf")))
rdd.groupByKey().collect
import com.databricks.example
rdd: org.apache.spark.rdd.RDD[(com.databricks.example.TestKey, String)] = ParallelCollectionRDD[2] at parallelize at command-2972105651607263:3
res2: Array[(com.databricks.example.TestKey, Iterable[String])] = Array((TestKey(2,ggs),CompactBuffer(dse)), (TestKey(1,abd),CompactBuffer(dss, qrf)))
With the package-cell definition, both TestKey(1,abd) records now land in a single group, as expected. Package cells are restricted, though: they may contain only imports and class, object, and trait definitions, so top-level vals and defs fail to compile:

package x.y.z
val aNumber = 5 // won't work
def functionThatWillNotWork(a: Int): Int = a + 1
<notebook>:4: error: expected class or object definition
val aNumber = 5 // won't work
^
<notebook>:6: error: expected class or object definition
def functionThatWillNotWork(a: Int): Int = a + 1
^
Compilation failed.
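Top-level values and methods can still be shipped in a package cell by wrapping them in an object; a minimal sketch (the object and member names here are ours, not from the course notebook):

package x.y.z

object Utils {
  // legal: vals and defs are members of an object, not top-level definitions
  val aNumber = 5
  def functionThatNowWorks(a: Int): Int = a + 1
}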
Imports and class/object definitions, on the other hand, compile cleanly, and a SparkContext can be passed in as a constructor or method parameter:

package x.y.z

import org.apache.spark.SparkContext

case class IntArray(values: Array[Int])

class MyClass(sc: SparkContext) {
  def sparkSum(array: IntArray): Int = {
    sc.parallelize(array.values).reduce(_ + _)
  }
}

object MyClass {
  def sparkSum(sc: SparkContext, array: IntArray): Int = {
    sc.parallelize(array.values).reduce(_ + _)
  }
}
Warning: classes defined within packages cannot be redefined without a cluster restart.
Compilation successful.
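The compiled definitions can then be used from an ordinary notebook cell; a short usage sketch, assuming the driver's SparkContext is bound to sc as in the cells above:

import x.y.z.{IntArray, MyClass}

// companion-object version: pass the SparkContext explicitly
MyClass.sparkSum(sc, IntArray(Array(1, 2, 3)))   // 6

// instance version: the SparkContext is captured at construction
val summer = new MyClass(sc)
summer.sparkSum(IntArray(Array(1, 2, 3)))        // 6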