// Load the diamonds CSV into a DataFrame. Using sqlContext (rather than
// SparkSession) keeps the code backwards-compatible with Spark 1.x.
val diamondsRawDF = sqlContext.read
  .format("com.databricks.spark.csv") // spark-csv data source package
  .options(Map(
    "header"      -> "true", // treat the first line of each file as the header row
    "inferSchema" -> "true"  // scan the data to infer column types automatically
    // "delimiter" -> ","    // comma is already the default delimiter
  ))
  .load(diamondsFilePath)
diamondsRawDF: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 9 more fields]
//There are 10 columns. We will try to predict the price of diamonds, treating the other 9 columns as features.
// Inspect the inferred schema: _c0 (row index) plus carat/cut/color/clarity/depth/table/price/x/y/z.
diamondsRawDF.printSchema()
root
|-- _c0: integer (nullable = true)
|-- carat: double (nullable = true)
|-- cut: string (nullable = true)
|-- color: string (nullable = true)
|-- clarity: string (nullable = true)
|-- depth: double (nullable = true)
|-- table: double (nullable = true)
|-- price: integer (nullable = true)
|-- x: double (nullable = true)
|-- y: double (nullable = true)
|-- z: double (nullable = true)
diamondsRawDF.show(10)
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|_c0|carat| cut|color|clarity|depth|table|price| x| y| z|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 1| 0.23| Ideal| E| SI2| 61.5| 55.0| 326|3.95|3.98|2.43|
| 2| 0.21| Premium| E| SI1| 59.8| 61.0| 326|3.89|3.84|2.31|
| 3| 0.23| Good| E| VS1| 56.9| 65.0| 327|4.05|4.07|2.31|
| 4| 0.29| Premium| I| VS2| 62.4| 58.0| 334| 4.2|4.23|2.63|
| 5| 0.31| Good| J| SI2| 63.3| 58.0| 335|4.34|4.35|2.75|
| 6| 0.24|Very Good| J| VVS2| 62.8| 57.0| 336|3.94|3.96|2.48|
| 7| 0.24|Very Good| I| VVS1| 62.3| 57.0| 336|3.95|3.98|2.47|
| 8| 0.26|Very Good| H| SI1| 61.9| 55.0| 337|4.07|4.11|2.53|
| 9| 0.22| Fair| E| VS2| 65.1| 61.0| 337|3.87|3.78|2.49|
| 10| 0.23|Very Good| H| VS1| 59.4| 61.0| 338| 4.0|4.05|2.39|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows
import org.apache.spark.sql.types.DoubleType
// Cast the price column from Int to Double so downstream ML estimators
// (which expect a numeric Double label) can model, fit and predict on it.
// Dropping the synthetic index column _c0 and casting price in place is
// equivalent to the explicit 10-column select, but does not break if the
// feature columns ever change.
val diamondsDF = diamondsRawDF
  .drop("_c0")                                      // _c0 is just a row index, not a feature
  .withColumn("price", $"price".cast(DoubleType))   // replace price with its Double cast, keeping column order
diamondsDF.cache()       // cache: this DataFrame is reused by several downstream actions
diamondsDF.printSchema() // side-effecting call — keep explicit parentheses per convention
root
|-- carat: double (nullable = true)
|-- cut: string (nullable = true)
|-- color: string (nullable = true)
|-- clarity: string (nullable = true)
|-- depth: double (nullable = true)
|-- table: double (nullable = true)
|-- price: double (nullable = true)
|-- x: double (nullable = true)
|-- y: double (nullable = true)
|-- z: double (nullable = true)
import org.apache.spark.sql.types.DoubleType
diamondsDF: org.apache.spark.sql.DataFrame = [carat: double, cut: string ... 8 more fields]
diamondsDF.show(10,false) // notice that price column has Double values that end in '.0' now
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|cut |color|clarity|depth|table|price|x |y |z |
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|0.23 |Ideal |E |SI2 |61.5 |55.0 |326.0|3.95|3.98|2.43|
|0.21 |Premium |E |SI1 |59.8 |61.0 |326.0|3.89|3.84|2.31|
|0.23 |Good |E |VS1 |56.9 |65.0 |327.0|4.05|4.07|2.31|
|0.29 |Premium |I |VS2 |62.4 |58.0 |334.0|4.2 |4.23|2.63|
|0.31 |Good |J |SI2 |63.3 |58.0 |335.0|4.34|4.35|2.75|
|0.24 |Very Good|J |VVS2 |62.8 |57.0 |336.0|3.94|3.96|2.48|
|0.24 |Very Good|I |VVS1 |62.3 |57.0 |336.0|3.95|3.98|2.47|
|0.26 |Very Good|H |SI1 |61.9 |55.0 |337.0|4.07|4.11|2.53|
|0.22 |Fair |E |VS2 |65.1 |61.0 |337.0|3.87|3.78|2.49|
|0.23 |Very Good|H |VS1 |59.4 |61.0 |338.0|4.0 |4.05|2.39|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows
ScaDaMaLe Course site and book