// Evaluating this cell with Ctrl+Enter prints the Spark session available in the notebook
spark
res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2d0c6c9
// Spark has some pre-built methods for creating simple Datasets/DataFrames

// 1. Empty Dataset/DataFrame, not really interesting, is it?
println(spark.emptyDataFrame)
println(spark.emptyDataset[Int])
[]
[value: int]
// 2. Range of numbers; note that Spark automatically names the column "id"
val range = spark.range(0, 10)

// To get a preview of the data in a DataFrame, use "show()"
range.show(3)
+---+
| id|
+---+
| 0|
| 1|
| 2|
+---+
only showing top 3 rows
range: org.apache.spark.sql.Dataset[Long] = [id: bigint]
// Let's find out what tables are already available for loading
spark.catalog.listTables.show()
+--------------------+--------+-----------+---------+-----------+
| name|database|description|tableType|isTemporary|
+--------------------+--------+-----------+---------+-----------+
| cities_csv| default| null| EXTERNAL| false|
| cleaned_taxes| default| null| MANAGED| false|
|commdettrumpclint...| default| null| MANAGED| false|
| donaldtrumptweets| default| null| EXTERNAL| false|
| linkage| default| null| EXTERNAL| false|
| nations| default| null| EXTERNAL| false|
| newmplist| default| null| EXTERNAL| false|
| ny_baby_names| default| null| MANAGED| false|
| nzmpsandparty| default| null| EXTERNAL| false|
| pos_neg_category| default| null| EXTERNAL| false|
| rna| default| null| MANAGED| false|
| samh| default| null| EXTERNAL| false|
| social_media_usage| default| null| EXTERNAL| false|
| table1| default| null| EXTERNAL| false|
| test_table| default| null| EXTERNAL| false|
| uscites| default| null| EXTERNAL| false|
+--------------------+--------+-----------+---------+-----------+
// Load the catalog table "social_media_usage" through the notebook's Spark session.
val df = spark.table("social_media_usage") // Ctrl+Enter
df: org.apache.spark.sql.DataFrame = [agency: string, platform: string ... 3 more fields]
// Ctrl+Enter
df.printSchema() // prints the schema of the DataFrame
df.show() // shows the first n (default is 20) rows
root
|-- agency: string (nullable = true)
|-- platform: string (nullable = true)
|-- url: string (nullable = true)
|-- date: string (nullable = true)
|-- visits: integer (nullable = true)
+----------+----------+--------------------+--------------------+------+
| agency| platform| url| date|visits|
+----------+----------+--------------------+--------------------+------+
| OEM| SMS| null|02/17/2012 12:00:...| 61652|
| OEM| SMS| null|11/09/2012 12:00:...| 44547|
| EDC| Flickr|http://www.flickr...|05/09/2012 12:00:...| null|
| NYCHA|Newsletter| null|05/09/2012 12:00:...| null|
| DHS| Twitter|www.twitter.com/n...|06/13/2012 12:00:...| 389|
| DHS| Twitter|www.twitter.com/n...|08/02/2012 12:00:...| 431|
| DOH| Android| Condom Finder|08/08/2011 12:00:...| 5026|
| DOT| Android| You The Man|08/08/2011 12:00:...| null|
| MOME| Android| MiNY Venor app|08/08/2011 12:00:...| 313|
| DOT|Broadcastr| null|08/08/2011 12:00:...| null|
| DPR|Broadcastr|http://beta.broad...|08/08/2011 12:00:...| null|
| ENDHT| Facebook|http://www.facebo...|08/08/2011 12:00:...| 3|
| VAC| Facebook|https://www.faceb...|08/08/2011 12:00:...| 36|
| PlaNYC| Facebook|http://www.facebo...|08/08/2011 12:00:...| 47|
| DFTA| Facebook|http://www.facebo...|08/08/2011 12:00:...| 90|
| energyNYC| Facebook|http://www.facebo...|08/08/2011 12:00:...| 105|
| MOIA| Facebook|http://www.facebo...|08/08/2011 12:00:...| 123|
|City Store| Facebook|http://www.facebo...|08/08/2011 12:00:...| 119|
| OCDV| Facebook|http://www.facebo...|08/08/2011 12:00:...| 148|
| HIA| Facebook|http://www.facebo...|08/08/2011 12:00:...| 197|
+----------+----------+--------------------+--------------------+------+
only showing top 20 rows
// Select only the "platform" column of df; duplicates are kept.
val platforms = df.select("platform") // Shift+Enter
platforms: org.apache.spark.sql.DataFrame = [platform: string]
// Print a preview limited to the first 5 rows of the selected column.
platforms.show(5) // Ctrl+Enter to show top 5 rows
+----------+
| platform|
+----------+
| SMS|
| SMS|
| Flickr|
|Newsletter|
| Twitter|
+----------+
only showing top 5 rows
// Select the "platform" column and drop duplicate rows, leaving one row per distinct platform.
val uniquePlatforms = df.select("platform").distinct() // Shift+Enter
uniquePlatforms: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [platform: string]
// Render the distinct platforms with the notebook's display helper instead of a truncated show().
display(uniquePlatforms) // Ctrl+Enter to show all rows; use the scroll-bar on the right of the display to see all platforms
%py
# Ctrl+Enter to evaluate this python cell; recall that '#' starts a comment in python
# Using Python to query our "social_media_usage" table
pythonDF = spark.table("social_media_usage").select("platform").distinct()
pythonDF.show(3)
+--------+
|platform|
+--------+
| nyc.gov|
| Flickr|
| Vimeo|
+--------+
only showing top 3 rows
SDS-2.2, Scalable Data Science
Last refresh: Never