// Read the diamonds CSV into a DataFrame.
// sqlContext is used (instead of SparkSession) for backwards compatibility with Spark 1.x.
// NOTE: the cell had been collapsed onto one line, so the first inline comment
// commented out the whole option chain — restored to multi-line form.
val diamondsRawDF = sqlContext.read
  .format("com.databricks.spark.csv") // use the spark-csv package
  .option("header", "true")           // use first line of all files as header
  .option("inferSchema", "true")      // automatically infer data types
  //.option("delimiter", ",")         // comma is the DEFAULT delimiter, so this is optional
  .load(diamondsFilePath)
diamondsRawDF: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 9 more fields]
// There are 10 columns. We will try to predict the price of diamonds,
// treating the other 9 columns as features.
// NOTE: in the collapsed one-line form the leading comment swallowed this call.
diamondsRawDF.printSchema()
root
|-- _c0: integer (nullable = true)
|-- carat: double (nullable = true)
|-- cut: string (nullable = true)
|-- color: string (nullable = true)
|-- clarity: string (nullable = true)
|-- depth: double (nullable = true)
|-- table: double (nullable = true)
|-- price: integer (nullable = true)
|-- x: double (nullable = true)
|-- y: double (nullable = true)
|-- z: double (nullable = true)
// Peek at the first 10 rows of the raw data (note the _c0 row-index column
// that was inferred from the CSV — it is dropped in the cast step below).
diamondsRawDF.show(10)
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|_c0|carat| cut|color|clarity|depth|table|price| x| y| z|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 1| 0.23| Ideal| E| SI2| 61.5| 55.0| 326|3.95|3.98|2.43|
| 2| 0.21| Premium| E| SI1| 59.8| 61.0| 326|3.89|3.84|2.31|
| 3| 0.23| Good| E| VS1| 56.9| 65.0| 327|4.05|4.07|2.31|
| 4| 0.29| Premium| I| VS2| 62.4| 58.0| 334| 4.2|4.23|2.63|
| 5| 0.31| Good| J| SI2| 63.3| 58.0| 335|4.34|4.35|2.75|
| 6| 0.24|Very Good| J| VVS2| 62.8| 57.0| 336|3.94|3.96|2.48|
| 7| 0.24|Very Good| I| VVS1| 62.3| 57.0| 336|3.95|3.98|2.47|
| 8| 0.26|Very Good| H| SI1| 61.9| 55.0| 337|4.07|4.11|2.53|
| 9| 0.22| Fair| E| VS2| 65.1| 61.0| 337|3.87|3.78|2.49|
| 10| 0.23|Very Good| H| VS1| 59.4| 61.0| 338| 4.0|4.05|2.39|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows
import org.apache.spark.sql.types.DoubleType

// Cast the price column from int to double so downstream ML model/fit/predict
// steps can use it as a numeric label; the _c0 row-index column is dropped by
// selecting only the 10 real columns.
// NOTE: the collapsed one-line form let the inline comments swallow the select,
// cache and printSchema calls — restored to multi-line form.
val diamondsDF = diamondsRawDF.select(
  $"carat", $"cut", $"color", $"clarity", $"depth", $"table",
  $"price".cast(DoubleType).as("price"),
  $"x", $"y", $"z")
diamondsDF.cache() // let's cache it for reuse
diamondsDF.printSchema // print schema to confirm price is now double
root
|-- carat: double (nullable = true)
|-- cut: string (nullable = true)
|-- color: string (nullable = true)
|-- clarity: string (nullable = true)
|-- depth: double (nullable = true)
|-- table: double (nullable = true)
|-- price: double (nullable = true)
|-- x: double (nullable = true)
|-- y: double (nullable = true)
|-- z: double (nullable = true)
import org.apache.spark.sql.types.DoubleType
diamondsDF: org.apache.spark.sql.DataFrame = [carat: double, cut: string ... 8 more fields]
// Show without truncation (second arg false) — notice that the price column
// now has Double values that end in '.0'.
diamondsDF.show(10,false) // notice that price column has Double values that end in '.0' now
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|cut |color|clarity|depth|table|price|x |y |z |
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|0.23 |Ideal |E |SI2 |61.5 |55.0 |326.0|3.95|3.98|2.43|
|0.21 |Premium |E |SI1 |59.8 |61.0 |326.0|3.89|3.84|2.31|
|0.23 |Good |E |VS1 |56.9 |65.0 |327.0|4.05|4.07|2.31|
|0.29 |Premium |I |VS2 |62.4 |58.0 |334.0|4.2 |4.23|2.63|
|0.31 |Good |J |SI2 |63.3 |58.0 |335.0|4.34|4.35|2.75|
|0.24 |Very Good|J |VVS2 |62.8 |57.0 |336.0|3.94|3.96|2.48|
|0.24 |Very Good|I |VVS1 |62.3 |57.0 |336.0|3.95|3.98|2.47|
|0.26 |Very Good|H |SI1 |61.9 |55.0 |337.0|4.07|4.11|2.53|
|0.22 |Fair |E |VS2 |65.1 |61.0 |337.0|3.87|3.78|2.49|
|0.23 |Very Good|H |VS1 |59.4 |61.0 |338.0|4.0 |4.05|2.39|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows
// View distinct diamond clarities in the dataset.
// NOTE: in the collapsed one-line form the leading comment swallowed the whole
// statement — restored to multi-line form.
val claritiesDistinctDF = diamondsDF.select("clarity").distinct() // .collect() would pull the rows to the driver instead
claritiesDistinctDF.show()
+-------+
|clarity|
+-------+
| VVS2|
| SI1|
| IF|
| I1|
| VVS1|
| VS2|
| SI2|
| VS1|
+-------+
claritiesDistinctDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [clarity: string]
// Re-print the schema of the cast DataFrame.
diamondsDF.printSchema // Ctrl+Enter
root
|-- carat: double (nullable = true)
|-- cut: string (nullable = true)
|-- color: string (nullable = true)
|-- clarity: string (nullable = true)
|-- depth: double (nullable = true)
|-- table: double (nullable = true)
|-- price: double (nullable = true)
|-- x: double (nullable = true)
|-- y: double (nullable = true)
|-- z: double (nullable = true)
// NOTE(review): diamondsDColoredDF is not defined in this chunk — presumably a
// filter of diamondsDF to color 'D' created in an earlier cell; verify upstream.
diamondsDColoredDF.show(10) // Ctrl+Enter
+-----+-----+-----+
|carat|color|price|
+-----+-----+-----+
| 0.23| D|357.0|
| 0.23| D|402.0|
| 0.26| D|403.0|
| 0.26| D|403.0|
| 0.26| D|403.0|
| 0.22| D|404.0|
| 0.3| D|552.0|
| 0.3| D|552.0|
| 0.3| D|552.0|
| 0.24| D|553.0|
+-----+-----+-----+
only showing top 10 rows
// List the tables registered in the metastore (e.g. the 'diamonds' table
// queried with SQL below).
sqlContext.tables.show() // Ctrl+Enter to see available tables
+--------+--------------------+-----------+
|database| tableName|isTemporary|
+--------+--------------------+-----------+
| default| adult| false|
| default| business_csv_csv| false|
| default| checkin_table| false|
| default| diamonds| false|
| default| inventory| false|
| default|item_merchant_cat...| false|
| default| items_left_csv| false|
| default| logistic_detail| false|
| default| merchant_ratings| false|
| default| order_data| false|
| default| order_ids_left_csv| false|
| default| repeat_csv| false|
| default| review_2019_csv| false|
| default|sample_logistic_t...| false|
| default| sentimentlex_csv| false|
| default| simple_range| false|
| default| social_media_usage| false|
| default| tip_json| false|
| default| tips_csv_csv| false|
| default| users_csv| false|
+--------+--------------------+-----------+
only showing top 20 rows
// Duplicate of the previous cell — lists the same metastore tables again.
sqlContext.tables.show() // Ctrl+Enter to see available tables
+--------+--------------------+-----------+
|database| tableName|isTemporary|
+--------+--------------------+-----------+
| default| adult| false|
| default| business_csv_csv| false|
| default| checkin_table| false|
| default| diamonds| false|
| default| inventory| false|
| default|item_merchant_cat...| false|
| default| items_left_csv| false|
| default| logistic_detail| false|
| default| merchant_ratings| false|
| default| order_data| false|
| default| order_ids_left_csv| false|
| default| repeat_csv| false|
| default| review_2019_csv| false|
| default|sample_logistic_t...| false|
| default| sentimentlex_csv| false|
| default| simple_range| false|
| default| social_media_usage| false|
| default| tip_json| false|
| default| tips_csv_csv| false|
| default| users_csv| false|
+--------+--------------------+-----------+
only showing top 20 rows
// Or, if you like, use upper case for SQL keywords — this is equivalent.
// NOTE: in the collapsed one-line form the leading comment commented out the
// entire val definition — restored to multi-line form.
val diamondsDColoredDF_FromTable =
  sqlContext.sql("SELECT carat, color, price FROM diamonds WHERE color='D'") // Shift+Enter
diamondsDColoredDF_FromTable: org.apache.spark.sql.DataFrame = [carat: double, color: string ... 1 more field]
// From version 2.x onwards you can call sql from SparkSession — the pre-made
// `spark` in spark-shell or a Databricks notebook.
// NOTE: in the collapsed one-line form the leading comment commented out the
// entire val definition — restored to multi-line form.
val diamondsDColoredDF_FromTable =
  spark.sql("SELECT carat, color, price FROM diamonds WHERE color='D'") // Shift+Enter
diamondsDColoredDF_FromTable: org.apache.spark.sql.DataFrame = [carat: double, color: string ... 1 more field]
diamondsDF.printSchema // since price is double in the DF that was turned into a table we can rely on a descending sort on doubles
root
|-- carat: double (nullable = true)
|-- cut: string (nullable = true)
|-- color: string (nullable = true)
|-- clarity: string (nullable = true)
|-- depth: double (nullable = true)
|-- table: double (nullable = true)
|-- price: double (nullable = true)
|-- x: double (nullable = true)
|-- y: double (nullable = true)
|-- z: double (nullable = true)
SDS-2.x, Scalable Data Engineering Science
Last refresh: Never