// We'll use the Diamonds dataset from the R datasets hosted on DBC.
// NOTE: in the exported single-line form, the leading `//` commented out the
// `val` definition itself; the comment must be on its own line.
val diamondsFilePath = "dbfs:/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv"
diamondsFilePath: String = dbfs:/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv
// Peek at the first two lines of the raw file — it looks like a CSV, as it should.
val firstTwoLines = sc.textFile(diamondsFilePath).take(2)
firstTwoLines
res2: Array[String] = Array("","carat","cut","color","clarity","depth","table","price","x","y","z", "1",0.23,"Ideal","E","SI2",61.5,55,326,3.95,3.98,2.43)
// Read the CSV into a DataFrame.
// We use sqlContext instead of SparkSession for backwards compatibility with Spark 1.x.
// NOTE: the exported single-line form was broken — the first inline `//`
// commented out the rest of the reader chain. Restored as a multi-line chain.
val diamondsRawDF = sqlContext.read
  .format("com.databricks.spark.csv") // use the spark-csv package
  .option("header", "true")           // use the first line of all files as the header
  .option("inferSchema", "true")      // automatically infer column data types
  //.option("delimiter", ",")         // comma is already the DEFAULT delimiter
  .load(diamondsFilePath)
diamondsRawDF: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 9 more fields]
// There are 10 columns. We will try to predict the price of diamonds,
// treating the other 9 columns as features.
// NOTE: in the exported single-line form, the leading `//` commented out the call.
diamondsRawDF.printSchema()
root
|-- _c0: integer (nullable = true)
|-- carat: double (nullable = true)
|-- cut: string (nullable = true)
|-- color: string (nullable = true)
|-- clarity: string (nullable = true)
|-- depth: double (nullable = true)
|-- table: double (nullable = true)
|-- price: integer (nullable = true)
|-- x: double (nullable = true)
|-- y: double (nullable = true)
|-- z: double (nullable = true)
// Display the first ten rows of the raw DataFrame (values truncated by default).
diamondsRawDF.show(numRows = 10)
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|_c0|carat| cut|color|clarity|depth|table|price| x| y| z|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 1| 0.23| Ideal| E| SI2| 61.5| 55.0| 326|3.95|3.98|2.43|
| 2| 0.21| Premium| E| SI1| 59.8| 61.0| 326|3.89|3.84|2.31|
| 3| 0.23| Good| E| VS1| 56.9| 65.0| 327|4.05|4.07|2.31|
| 4| 0.29| Premium| I| VS2| 62.4| 58.0| 334| 4.2|4.23|2.63|
| 5| 0.31| Good| J| SI2| 63.3| 58.0| 335|4.34|4.35|2.75|
| 6| 0.24|Very Good| J| VVS2| 62.8| 57.0| 336|3.94|3.96|2.48|
| 7| 0.24|Very Good| I| VVS1| 62.3| 57.0| 336|3.95|3.98|2.47|
| 8| 0.26|Very Good| H| SI1| 61.9| 55.0| 337|4.07|4.11|2.53|
| 9| 0.22| Fair| E| VS2| 65.1| 61.0| 337|3.87|3.78|2.49|
| 10| 0.23|Very Good| H| VS1| 59.4| 61.0| 338| 4.0|4.05|2.39|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows
import org.apache.spark.sql.types.DoubleType

// Convert the price column from Int to Double so that downstream ML tasks
// (model, fit, predict) can consume it as a continuous label. The index
// column `_c0` is dropped by selecting only the 10 feature/label columns.
// NOTE: the exported single-line form was broken — the first inline `//`
// commented out the select/cache/printSchema statements.
val diamondsDF = diamondsRawDF.select(
  $"carat", $"cut", $"color", $"clarity", $"depth", $"table",
  $"price".cast(DoubleType).as("price"),
  $"x", $"y", $"z")
diamondsDF.cache()      // cache it for reuse
diamondsDF.printSchema  // verify price is now Double
root
|-- carat: double (nullable = true)
|-- cut: string (nullable = true)
|-- color: string (nullable = true)
|-- clarity: string (nullable = true)
|-- depth: double (nullable = true)
|-- table: double (nullable = true)
|-- price: double (nullable = true)
|-- x: double (nullable = true)
|-- y: double (nullable = true)
|-- z: double (nullable = true)
import org.apache.spark.sql.types.DoubleType
diamondsDF: org.apache.spark.sql.DataFrame = [carat: double, cut: string ... 8 more fields]
// Show ten rows without truncating cell values — notice that the price
// column now holds Double values ending in '.0'.
diamondsDF.show(10, truncate = false)
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|cut |color|clarity|depth|table|price|x |y |z |
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|0.23 |Ideal |E |SI2 |61.5 |55.0 |326.0|3.95|3.98|2.43|
|0.21 |Premium |E |SI1 |59.8 |61.0 |326.0|3.89|3.84|2.31|
|0.23 |Good |E |VS1 |56.9 |65.0 |327.0|4.05|4.07|2.31|
|0.29 |Premium |I |VS2 |62.4 |58.0 |334.0|4.2 |4.23|2.63|
|0.31 |Good |J |SI2 |63.3 |58.0 |335.0|4.34|4.35|2.75|
|0.24 |Very Good|J |VVS2 |62.8 |57.0 |336.0|3.94|3.96|2.48|
|0.24 |Very Good|I |VVS1 |62.3 |57.0 |336.0|3.95|3.98|2.47|
|0.26 |Very Good|H |SI1 |61.9 |55.0 |337.0|4.07|4.11|2.53|
|0.22 |Fair |E |VS2 |65.1 |61.0 |337.0|3.87|3.78|2.49|
|0.23 |Very Good|H |VS1 |59.4 |61.0 |338.0|4.0 |4.05|2.39|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows
// View the DataFrame in Databricks.
// NOTE: 'display' is a Databricks-notebook-specific command that is quite
// powerful for visual interaction with the data; other notebooks such as
// Zeppelin have similar commands for interactive visualisation.
// In the exported single-line form, the leading `//` commented out the call.
display(diamondsDF)
// Project just the diamond-cut column into its own DataFrame. (Shift+Enter)
val cutsDF = diamondsDF.select($"cut")
cutsDF: org.apache.spark.sql.DataFrame = [cut: string]
// Display the first ten cut values. (Ctrl+Enter)
cutsDF.show(numRows = 10)
+---------+
| cut|
+---------+
| Ideal|
| Premium|
| Good|
| Premium|
| Good|
|Very Good|
|Very Good|
|Very Good|
| Fair|
|Very Good|
+---------+
only showing top 10 rows
// View the distinct diamond cuts in the dataset.
// NOTE: in the exported single-line form, the leading `//` commented out the
// `val` definition itself; the comment must be on its own line.
val cutsDistinctDF = diamondsDF.select("cut").distinct()
cutsDistinctDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [cut: string]
SDS-2.2, Scalable Data Science
Last refresh: Never