// 2. Range of numbers, note that Spark automatically names the column "id"
val range = spark.range(0, 10)

// In order to get a preview of the data in a DataFrame use "show()"
range.show(3)
+---+
| id|
+---+
| 0|
| 1|
| 2|
+---+
only showing top 3 rows
range: org.apache.spark.sql.Dataset[Long] = [id: bigint]
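Since range is an ordinary Dataset/DataFrame, the usual column operations apply straight away. For example (a minimal sketch, not part of the original run):

// sketch: derive a new column from "id" and preview it
range.withColumn("id_squared", $"id" * $"id").show(3)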
// Let's find out what tables are already available for loading
spark.catalog.listTables.show()
+--------------------+--------+-----------+---------+-----------+
| name|database|description|tableType|isTemporary|
+--------------------+--------+-----------+---------+-----------+
| adult| default| null| EXTERNAL| false|
| business_csv_csv| default| null| EXTERNAL| false|
| checkin_table| default| null| MANAGED| false|
| diamonds| default| null| EXTERNAL| false|
| inventory| default| null| MANAGED| false|
|item_merchant_cat...| default| null| MANAGED| false|
| items_left_csv| default| null| EXTERNAL| false|
| logistic_detail| default| null| MANAGED| false|
| merchant_ratings| default| null| MANAGED| false|
| order_data| default| null| MANAGED| false|
| order_ids_left_csv| default| null| EXTERNAL| false|
| repeat_csv| default| null| MANAGED| false|
| review_2019_csv| default| null| EXTERNAL| false|
|sample_logistic_t...| default| null| EXTERNAL| false|
| sentimentlex_csv| default| null| EXTERNAL| false|
| simple_range| default| null| MANAGED| false|
| social_media_usage| default| null| MANAGED| false|
| tip_json| default| null| EXTERNAL| false|
| tips_csv_csv| default| null| EXTERNAL| false|
| users_csv| default| null| EXTERNAL| false|
+--------------------+--------+-----------+---------+-----------+
// File location and type
// You may need to change the file name "social_media_usage-5dbee.csv" depending on your upload's name
//val file_location = "/FileStore/tables/social_media_usage-5dbee.csv"
val file_location = "/FileStore/tables/social_media_usage-5dbee.csv"
val file_type = "csv"

// CSV options
val infer_schema = "true"
val first_row_is_header = "true"
val delimiter = ","

// The applied options are for CSV files. For other file types, these will be ignored.
val socialMediaDF = spark.read.format(file_type)
  .option("inferSchema", infer_schema)
  .option("header", first_row_is_header)
  .option("sep", delimiter)
  .load(file_location)

socialMediaDF.show(10)
+------+----------+--------------------+-------------------+------+
|agency| platform| url| date|visits|
+------+----------+--------------------+-------------------+------+
| OEM| SMS| null|2012-02-17 00:00:00| 61652|
| OEM| SMS| null|2012-11-09 00:00:00| 44547|
| EDC| Flickr|http://www.flickr...|2012-05-09 00:00:00| null|
| NYCHA|Newsletter| null|2012-05-09 00:00:00| null|
| DHS| Twitter|www.twitter.com/n...|2012-06-13 00:00:00| 389|
| DHS| Twitter|www.twitter.com/n...|2012-08-02 00:00:00| 431|
| DOH| Android| Condom Finder|2011-08-08 00:00:00| 5026|
| DOT| Android| You The Man|2011-08-08 00:00:00| null|
| MOME| Android| MiNY Venor app|2011-08-08 00:00:00| 313|
| DOT|Broadcastr| null|2011-08-08 00:00:00| null|
+------+----------+--------------------+-------------------+------+
only showing top 10 rows
file_location: String = /FileStore/tables/social_media_usage-5dbee.csv
file_type: String = csv
infer_schema: String = true
first_row_is_header: String = true
delimiter: String = ,
socialMediaDF: org.apache.spark.sql.DataFrame = [agency: string, platform: string ... 3 more fields]
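In the catalog listing below, social_media_usage appears as a TEMPORARY table, so the DataFrame was presumably registered as a temp view in a cell not shown in this excerpt, roughly:

// assumed step: register the DataFrame so it can be queried by name
socialMediaDF.createOrReplaceTempView("social_media_usage")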
// Let's find out what tables are already available for loading
spark.catalog.listTables.show(100)
+--------------------+--------+-----------+---------+-----------+
| name|database|description|tableType|isTemporary|
+--------------------+--------+-----------+---------+-----------+
| adult| default| null| EXTERNAL| false|
| business_csv_csv| default| null| EXTERNAL| false|
| checkin_table| default| null| MANAGED| false|
| diamonds| default| null| EXTERNAL| false|
| inventory| default| null| MANAGED| false|
|item_merchant_cat...| default| null| MANAGED| false|
| items_left_csv| default| null| EXTERNAL| false|
| logistic_detail| default| null| MANAGED| false|
| merchant_ratings| default| null| MANAGED| false|
| order_data| default| null| MANAGED| false|
| order_ids_left_csv| default| null| EXTERNAL| false|
| repeat_csv| default| null| MANAGED| false|
| review_2019_csv| default| null| EXTERNAL| false|
|sample_logistic_t...| default| null| EXTERNAL| false|
| sentimentlex_csv| default| null| EXTERNAL| false|
| tip_json| default| null| EXTERNAL| false|
| tips_csv_csv| default| null| EXTERNAL| false|
| users_csv| default| null| EXTERNAL| false|
| anonym| null| null|TEMPORARY| true|
| hubots| null| null|TEMPORARY| true|
| percent| null| null|TEMPORARY| true|
| social_media_usage| null| null|TEMPORARY| true|
+--------------------+--------+-----------+---------+-----------+
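In the second listing below, social_media_usage also shows up as a MANAGED table in the default database, which suggests the DataFrame was additionally saved as a table between the two cells, along the lines of:

// assumed step: persist the DataFrame as a managed table
socialMediaDF.write.mode("overwrite").saveAsTable("social_media_usage")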
spark.catalog.listTables.show(100)
+--------------------+--------+-----------+---------+-----------+
| name|database|description|tableType|isTemporary|
+--------------------+--------+-----------+---------+-----------+
| adult| default| null| EXTERNAL| false|
| business_csv_csv| default| null| EXTERNAL| false|
| checkin_table| default| null| MANAGED| false|
| diamonds| default| null| EXTERNAL| false|
| inventory| default| null| MANAGED| false|
|item_merchant_cat...| default| null| MANAGED| false|
| items_left_csv| default| null| EXTERNAL| false|
| logistic_detail| default| null| MANAGED| false|
| merchant_ratings| default| null| MANAGED| false|
| order_data| default| null| MANAGED| false|
| order_ids_left_csv| default| null| EXTERNAL| false|
| repeat_csv| default| null| MANAGED| false|
| review_2019_csv| default| null| EXTERNAL| false|
|sample_logistic_t...| default| null| EXTERNAL| false|
| sentimentlex_csv| default| null| EXTERNAL| false|
| social_media_usage| default| null| MANAGED| false|
| tip_json| default| null| EXTERNAL| false|
| tips_csv_csv| default| null| EXTERNAL| false|
| users_csv| default| null| EXTERNAL| false|
| anonym| null| null|TEMPORARY| true|
| hubots| null| null|TEMPORARY| true|
| percent| null| null|TEMPORARY| true|
| social_media_usage| null| null|TEMPORARY| true|
+--------------------+--------+-----------+---------+-----------+
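The next cell uses a DataFrame named df that is not defined anywhere in this excerpt; presumably it points at the table above, for example:

// assumed definition of df used in the cells that follow
val df = spark.table("social_media_usage")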
// Ctrl+Enter
df.printSchema() // prints the schema of the DataFrame
df.show()        // shows the first n rows (default is 20)
root
|-- agency: string (nullable = true)
|-- platform: string (nullable = true)
|-- url: string (nullable = true)
|-- date: timestamp (nullable = true)
|-- visits: integer (nullable = true)
+----------+----------+--------------------+-------------------+------+
| agency| platform| url| date|visits|
+----------+----------+--------------------+-------------------+------+
| OEM| SMS| null|2012-02-17 00:00:00| 61652|
| OEM| SMS| null|2012-11-09 00:00:00| 44547|
| EDC| Flickr|http://www.flickr...|2012-05-09 00:00:00| null|
| NYCHA|Newsletter| null|2012-05-09 00:00:00| null|
| DHS| Twitter|www.twitter.com/n...|2012-06-13 00:00:00| 389|
| DHS| Twitter|www.twitter.com/n...|2012-08-02 00:00:00| 431|
| DOH| Android| Condom Finder|2011-08-08 00:00:00| 5026|
| DOT| Android| You The Man|2011-08-08 00:00:00| null|
| MOME| Android| MiNY Venor app|2011-08-08 00:00:00| 313|
| DOT|Broadcastr| null|2011-08-08 00:00:00| null|
| DPR|Broadcastr|http://beta.broad...|2011-08-08 00:00:00| null|
| ENDHT| Facebook|http://www.facebo...|2011-08-08 00:00:00| 3|
| VAC| Facebook|https://www.faceb...|2011-08-08 00:00:00| 36|
| PlaNYC| Facebook|http://www.facebo...|2011-08-08 00:00:00| 47|
| DFTA| Facebook|http://www.facebo...|2011-08-08 00:00:00| 90|
| energyNYC| Facebook|http://www.facebo...|2011-08-08 00:00:00| 105|
| MOIA| Facebook|http://www.facebo...|2011-08-08 00:00:00| 123|
|City Store| Facebook|http://www.facebo...|2011-08-08 00:00:00| 119|
| OCDV| Facebook|http://www.facebo...|2011-08-08 00:00:00| 148|
| HIA| Facebook|http://www.facebo...|2011-08-08 00:00:00| 197|
+----------+----------+--------------------+-------------------+------+
only showing top 20 rows
%py
# Ctrl+Enter to evaluate this Python cell; recall that '#' is the comment character in Python
# Using Python to query our "social_media_usage" table
pythonDF = spark.table("social_media_usage").select("platform").distinct()
pythonDF.show(3)
+--------+
|platform|
+--------+
| nyc.gov|
| Flickr|
| Vimeo|
+--------+
only showing top 3 rows
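For comparison, the same query can be written in Scala against the same registered table (a sketch, not part of the transcript):

val platformsDF = spark.table("social_media_usage").select("platform").distinct()
platformsDF.show(3)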
val sms = df.select($"agency", $"platform", $"visits").filter($"platform" === "SMS")
sms.show() // Ctrl+Enter
+------+--------+------+
|agency|platform|visits|
+------+--------+------+
| OEM| SMS| 61652|
| OEM| SMS| 44547|
| DOE| SMS| 382|
| NYCHA| SMS| null|
| OEM| SMS| 61652|
| DOE| SMS| 382|
| NYCHA| SMS| null|
| OEM| SMS| 61652|
| OEM| SMS| null|
| DOE| SMS| null|
| NYCHA| SMS| null|
| OEM| SMS| null|
| DOE| SMS| null|
| NYCHA| SMS| null|
| DOE| SMS| 382|
| NYCHA| SMS| null|
| OEM| SMS| 61652|
| DOE| SMS| 382|
| NYCHA| SMS| null|
| OEM| SMS| 61652|
+------+--------+------+
only showing top 20 rows
sms: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [agency: string, platform: string ... 1 more field]
// Ctrl+Enter
// Note that we are using "platform = 'SMS'" since it will be evaluated as actual SQL
val sms = df.select(df("agency"), df("platform"), df("visits")).filter("platform = 'SMS'")
sms.show(5)
+------+--------+------+
|agency|platform|visits|
+------+--------+------+
| OEM| SMS| 61652|
| OEM| SMS| 44547|
| DOE| SMS| 382|
| NYCHA| SMS| null|
| OEM| SMS| 61652|
+------+--------+------+
only showing top 5 rows
sms: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [agency: string, platform: string ... 1 more field]
// Ctrl+Enter to make fixedDF

// import the needed sql functions
import org.apache.spark.sql.functions.{coalesce, lit}

// make the fixedDF DataFrame
val fixedDF = df
  .select(
    $"agency",
    $"platform",
    $"url",
    $"date",
    coalesce($"visits", lit(0)).as("visits"))
  .filter($"platform" =!= "TOTAL")

fixedDF.printSchema() // print its schema

// and show the top 20 records fully
fixedDF.show(false) // the "false" argument does not truncate the rows, so you will not see something like "anot..."
root
|-- agency: string (nullable = true)
|-- platform: string (nullable = true)
|-- url: string (nullable = true)
|-- date: timestamp (nullable = true)
|-- visits: integer (nullable = false)
+----------+----------+---------------------------------------------------------------------------------------+-------------------+------+
|agency |platform |url |date |visits|
+----------+----------+---------------------------------------------------------------------------------------+-------------------+------+
|OEM |SMS |null |2012-02-17 00:00:00|61652 |
|OEM |SMS |null |2012-11-09 00:00:00|44547 |
|EDC |Flickr |http://www.flickr.com/nycedc |2012-05-09 00:00:00|0 |
|NYCHA |Newsletter|null |2012-05-09 00:00:00|0 |
|DHS |Twitter |www.twitter.com/nycdhs |2012-06-13 00:00:00|389 |
|DHS |Twitter |www.twitter.com/nycdhs |2012-08-02 00:00:00|431 |
|DOH |Android |Condom Finder |2011-08-08 00:00:00|5026 |
|DOT |Android |You The Man |2011-08-08 00:00:00|0 |
|MOME |Android |MiNY Venor app |2011-08-08 00:00:00|313 |
|DOT |Broadcastr|null |2011-08-08 00:00:00|0 |
|DPR |Broadcastr|http://beta.broadcastr.com/Echo.html?audioId=670026-4001 |2011-08-08 00:00:00|0 |
|ENDHT |Facebook |http://www.facebook.com/pages/NYC-Lets-End-Human-Trafficking/125730490795659?sk=wall |2011-08-08 00:00:00|3 |
|VAC |Facebook |https://www.facebook.com/pages/NYC-Voter-Assistance-Commission/110226709012110 |2011-08-08 00:00:00|36 |
|PlaNYC |Facebook |http://www.facebook.com/pages/New-York-NY/PlaNYC/160454173971169?ref=ts |2011-08-08 00:00:00|47 |
|DFTA |Facebook |http://www.facebook.com/pages/NYC-Department-for-the-Aging/109028655823590 |2011-08-08 00:00:00|90 |
|energyNYC |Facebook |http://www.facebook.com/EnergyNYC?sk=wall |2011-08-08 00:00:00|105 |
|MOIA |Facebook |http://www.facebook.com/ihwnyc |2011-08-08 00:00:00|123 |
|City Store|Facebook |http://www.facebook.com/citystorenyc |2011-08-08 00:00:00|119 |
|OCDV |Facebook |http://www.facebook.com/pages/NYC-Healthy-Relationship-Training-Academy/134637829901065|2011-08-08 00:00:00|148 |
|HIA |Facebook |http://www.facebook.com/pages/New-York-City-Health-Insurance-Link/145920551598 |2011-08-08 00:00:00|197 |
+----------+----------+---------------------------------------------------------------------------------------+-------------------+------+
only showing top 20 rows
import org.apache.spark.sql.functions.{coalesce, lit}
fixedDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [agency: string, platform: string ... 3 more fields]
// Ctrl+Enter to evaluate this UDF, which takes an input String called "value"
// and converts it to lower case if it begins with "http", returning null otherwise,
// so that invalid web URLs are effectively removed
val cleanUrl = udf((value: String) => if (value != null && value.startsWith("http")) value.toLowerCase() else null)
cleanUrl: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(StringType)))
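The cell that builds cleanedDF is missing from this excerpt; it was presumably produced by applying cleanUrl to fixedDF, roughly like this (an assumption, not the original code):

// assumed definition: apply the cleanUrl UDF to the url column of fixedDF
val cleanedDF = fixedDF.withColumn("url", cleanUrl($"url"))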
// Shift+Enter
cleanedDF.filter($"url".isNull).show(5)
+------+----------+----+-------------------+------+
|agency| platform| url| date|visits|
+------+----------+----+-------------------+------+
| OEM| SMS|null|2012-02-17 00:00:00| 61652|
| OEM| SMS|null|2012-11-09 00:00:00| 44547|
| NYCHA|Newsletter|null|2012-05-09 00:00:00| 0|
| DHS| Twitter|null|2012-06-13 00:00:00| 389|
| DHS| Twitter|null|2012-08-02 00:00:00| 431|
+------+----------+----+-------------------+------+
only showing top 5 rows
// Ctrl+Enter
cleanedDF.filter($"url".isNotNull).show(5, false) // false in .show(5, false) shows the rows untruncated
+------+----------+------------------------------------------------------------------------------------+-------------------+------+
|agency|platform |url |date |visits|
+------+----------+------------------------------------------------------------------------------------+-------------------+------+
|EDC |Flickr |http://www.flickr.com/nycedc |2012-05-09 00:00:00|0 |
|DPR |Broadcastr|http://beta.broadcastr.com/echo.html?audioid=670026-4001 |2011-08-08 00:00:00|0 |
|ENDHT |Facebook |http://www.facebook.com/pages/nyc-lets-end-human-trafficking/125730490795659?sk=wall|2011-08-08 00:00:00|3 |
|VAC |Facebook |https://www.facebook.com/pages/nyc-voter-assistance-commission/110226709012110 |2011-08-08 00:00:00|36 |
|PlaNYC|Facebook |http://www.facebook.com/pages/new-york-ny/planyc/160454173971169?ref=ts |2011-08-08 00:00:00|47 |
+------+----------+------------------------------------------------------------------------------------+-------------------+------+
only showing top 5 rows
// Ctrl+Enter
// Import the Spark SQL function that gives us a unique id across all the records in this DataFrame
import org.apache.spark.sql.functions.monotonically_increasing_id

// Append an "id" column using the SQL function that creates unique ids across all records in the DataFrame
val agencies = cleanedDF.select(cleanedDF("agency"))
  .distinct()
  .withColumn("id", monotonically_increasing_id())
agencies.show(5)
+--------------------+-----------+
| agency| id|
+--------------------+-----------+
| PlaNYC|34359738368|
| HIA|34359738369|
|NYC Digital: exte...|34359738370|
| NYCGLOBAL|42949672960|
| nycgov|68719476736|
+--------------------+-----------+
only showing top 5 rows
import org.apache.spark.sql.functions.monotonically_increasing_id
agencies: org.apache.spark.sql.DataFrame = [agency: string, id: bigint]
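Note that monotonically_increasing_id guarantees unique, increasing ids but not consecutive ones, which is why the values above jump by billions. If dense ids were ever needed, one option (a sketch, not from the notebook, and it funnels the distinct agencies through a single partition) is a window-based row_number:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number

// sketch: consecutive ids via row_number over a global ordering
val denseAgencies = cleanedDF.select("agency").distinct()
  .withColumn("id", row_number().over(Window.orderBy("agency")))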
// Ctrl+Enter
// And join with the rest of the data; note how the join condition is specified
val anonym = cleanedDF.join(agencies, cleanedDF("agency") === agencies("agency"), "inner")
  .select("id", "platform", "url", "date", "visits")

// We also cache the DataFrame, since recomputing the join can be quite expensive
anonym.cache()

// Display the result
anonym.show(5)
+-------------+----------+--------------------+-------------------+------+
| id| platform| url| date|visits|
+-------------+----------+--------------------+-------------------+------+
|1580547964928| SMS| null|2012-02-17 00:00:00| 61652|
|1580547964928| SMS| null|2012-11-09 00:00:00| 44547|
| 412316860416| Flickr|http://www.flickr...|2012-05-09 00:00:00| 0|
|1649267441664|Newsletter| null|2012-05-09 00:00:00| 0|
|1529008357376| Twitter| null|2012-06-13 00:00:00| 389|
+-------------+----------+--------------------+-------------------+------+
only showing top 5 rows
anonym: org.apache.spark.sql.DataFrame = [id: bigint, platform: string ... 3 more fields]
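The anonym entry that shows up as a TEMPORARY table in the listing below (and that the %sql cell further down queries by name) was presumably registered in a step not shown here:

// assumed step: make anonym queryable by name
anonym.createOrReplaceTempView("anonym")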
spark.catalog.listTables().show() // look at the available tables
+--------------------+--------+-----------+---------+-----------+
| name|database|description|tableType|isTemporary|
+--------------------+--------+-----------+---------+-----------+
| adult| default| null| EXTERNAL| false|
| business_csv_csv| default| null| EXTERNAL| false|
| checkin_table| default| null| MANAGED| false|
| diamonds| default| null| EXTERNAL| false|
| inventory| default| null| MANAGED| false|
|item_merchant_cat...| default| null| MANAGED| false|
| items_left_csv| default| null| EXTERNAL| false|
| logistic_detail| default| null| MANAGED| false|
| merchant_ratings| default| null| MANAGED| false|
| order_data| default| null| MANAGED| false|
| order_ids_left_csv| default| null| EXTERNAL| false|
| repeat_csv| default| null| MANAGED| false|
| review_2019_csv| default| null| EXTERNAL| false|
|sample_logistic_t...| default| null| EXTERNAL| false|
| sentimentlex_csv| default| null| EXTERNAL| false|
| social_media_usage| default| null| MANAGED| false|
| tip_json| default| null| EXTERNAL| false|
| tips_csv_csv| default| null| EXTERNAL| false|
| users_csv| default| null| EXTERNAL| false|
| anonym| null| null|TEMPORARY| true|
+--------------------+--------+-----------+---------+-----------+
only showing top 20 rows
import org.apache.spark.sql.functions.{dayofmonth, month, row_number, sum}
import org.apache.spark.sql.expressions.Window

val coolDF = anonym.select($"id", $"platform", dayofmonth($"date").as("day"), month($"date").as("month"), $"visits")
  .groupBy($"id", $"platform", $"day", $"month").agg(sum("visits").as("visits"))

// Run window aggregation on visits per month and platform
val window = coolDF.select($"id", $"day", $"visits",
  sum($"visits").over(Window.partitionBy("platform", "month")).as("monthly_visits"))

// Create and register the percent table
val percent = window.select($"id", $"day", ($"visits" / $"monthly_visits").as("percent"))
percent.createOrReplaceTempView("percent")
import org.apache.spark.sql.functions.{dayofmonth, month, row_number, sum}
import org.apache.spark.sql.expressions.Window
coolDF: org.apache.spark.sql.DataFrame = [id: bigint, platform: string ... 3 more fields]
window: org.apache.spark.sql.DataFrame = [id: bigint, day: int ... 2 more fields]
percent: org.apache.spark.sql.DataFrame = [id: bigint, day: int ... 1 more field]
%sql
-- You could also just use plain SQL to write the query above; note that you might need to group by id and day as well.
with aggr as (
  select id,
         dayofmonth(date) as day,
         visits / sum(visits) over (partition by (platform, month(date))) as percent
  from anonym
)
select * from aggr where day = 2 and percent > 0.3
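The same query can also be issued from Scala via spark.sql against the registered temp views, for instance (a sketch, not part of the transcript):

spark.sql("select * from percent where day = 2 and percent > 0.3").show(5)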
// Define a case class that will be our schema for the DataFrame
case class Hubot(name: String, year: Int, manufacturer: String, version: Array[Int], details: Map[String, String])

// You could process a text file, for example, converting every row to a Hubot, but here we create the RDD manually
val rdd = sc.parallelize(
  Array(
    Hubot("Jerry", 2015, "LCorp", Array(1, 2, 3), Map("eat" -> "yes", "sleep" -> "yes", "drink" -> "yes")),
    Hubot("Mozart", 2010, "LCorp", Array(1, 2), Map("eat" -> "no", "sleep" -> "no", "drink" -> "no")),
    Hubot("Einstein", 2012, "LCorp", Array(1, 2, 3), Map("eat" -> "yes", "sleep" -> "yes", "drink" -> "no"))
  )
)
defined class Hubot
rdd: org.apache.spark.rdd.RDD[Hubot] = ParallelCollectionRDD[27514] at parallelize at command-112937334110413:5
// In order to convert an RDD into a DataFrame, call toDF()
// (outside a Databricks notebook you may first need "import spark.implicits._")
val hubots = rdd.toDF()

// Display the DataFrame; note how the array and map fields are displayed
hubots.printSchema()
hubots.show()
root
|-- name: string (nullable = true)
|-- year: integer (nullable = false)
|-- manufacturer: string (nullable = true)
|-- version: array (nullable = true)
| |-- element: integer (containsNull = false)
|-- details: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
+--------+----+------------+---------+--------------------+
| name|year|manufacturer| version| details|
+--------+----+------------+---------+--------------------+
| Jerry|2015| LCorp|[1, 2, 3]|[eat -> yes, slee...|
| Mozart|2010| LCorp| [1, 2]|[eat -> no, sleep...|
|Einstein|2012| LCorp|[1, 2, 3]|[eat -> yes, slee...|
+--------+----+------------+---------+--------------------+
hubots: org.apache.spark.sql.DataFrame = [name: string, year: int ... 3 more fields]
// You can query a complex type the same way you query any other column
// By the way, you can use the `sql` function to invoke Spark SQL and create a DataFrame
hubots.createOrReplaceTempView("hubots")
val onesThatEat = sqlContext.sql("select name, details.eat from hubots where details.eat = 'yes'")
onesThatEat.show()
+--------+---+
| name|eat|
+--------+---+
| Jerry|yes|
|Einstein|yes|
+--------+---+
onesThatEat: org.apache.spark.sql.DataFrame = [name: string, eat: string]
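The same lookup can be expressed with the DataFrame API by indexing into the map column with getItem (a sketch, not part of the original notebook):

hubots.select($"name", $"details".getItem("eat").as("eat"))
  .filter($"details".getItem("eat") === "yes")
  .show()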
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

// Let's say we have an RDD of String that we need to convert into a DataFrame with the schema "name", "year", and "manufacturer".
// As you can see, every record is space-separated.
val rdd = sc.parallelize(
  Array(
    "Jerry 2015 LCorp",
    "Mozart 2010 LCorp",
    "Einstein 2012 LCorp"
  )
)

// Create the schema as a StructType
val schema = StructType(
  StructField("name", StringType, false) ::
  StructField("year", IntegerType, false) ::
  StructField("manufacturer", StringType, false) :: Nil
)

// Prepare RDD[Row]
val rows = rdd.map { entry =>
  val arr = entry.split("\\s+")
  val name = arr(0)
  val year = arr(1).toInt
  val manufacturer = arr(2)
  Row(name, year, manufacturer)
}

// Create the DataFrame
val bots = sqlContext.createDataFrame(rows, schema)
bots.printSchema()
bots.show()
root
|-- name: string (nullable = false)
|-- year: integer (nullable = false)
|-- manufacturer: string (nullable = false)
+--------+----+------------+
| name|year|manufacturer|
+--------+----+------------+
| Jerry|2015| LCorp|
| Mozart|2010| LCorp|
|Einstein|2012| LCorp|
+--------+----+------------+
import org.apache.spark.sql.types._
rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[27519] at parallelize at command-112937334110417:5
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,false), StructField(year,IntegerType,false), StructField(manufacturer,StringType,false))
rows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[27520] at map at command-112937334110417:22
bots: org.apache.spark.sql.DataFrame = [name: string, year: int ... 1 more field]
// We can start working with Datasets by using our "hubots" table
// To create a Dataset from a DataFrame, do this (assuming that the case class Hubot exists):
val ds = hubots.as[Hubot]
ds.show()
+--------+----+------------+---------+--------------------+
| name|year|manufacturer| version| details|
+--------+----+------------+---------+--------------------+
| Jerry|2015| LCorp|[1, 2, 3]|[eat -> yes, slee...|
| Mozart|2010| LCorp| [1, 2]|[eat -> no, sleep...|
|Einstein|2012| LCorp|[1, 2, 3]|[eat -> yes, slee...|
+--------+----+------------+---------+--------------------+
ds: org.apache.spark.sql.Dataset[Hubot] = [name: string, year: int ... 3 more fields]
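Unlike a DataFrame, a Dataset also gives you the typed API, so ordinary Scala lambdas work against the Hubot case class, for example (a sketch, not part of the original run):

// sketch: typed filter and map over Dataset[Hubot]
ds.filter(h => h.year >= 2012).map(h => h.name).show()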