023_OnTimeFlightPerformance(Scala)

Archived YouTube video of this live unedited lab-lecture:

Archived YouTube video of this live unedited lab-lecture

This is a scala version of the python notebook in the following talk:

Homework:

See https://www.brighttalk.com/webcast/12891/199003 (you need to subscribe freely to Bright Talk first). Then go through this scala version of the notebook from the talk.

On-Time Flight Performance with GraphFrames for Apache Spark

This notebook provides an analysis of On-Time Flight Performance and Departure Delays data using GraphFrames for Apache Spark.

Source Data:

References:

Preparation

Extract the Airports and Departure Delays information from S3 / DBFS

// Set File Paths
val tripdelaysFilePath = "/databricks-datasets/flights/departuredelays.csv"
val airportsnaFilePath = "/databricks-datasets/flights/airport-codes-na.txt"
tripdelaysFilePath: String = /databricks-datasets/flights/departuredelays.csv airportsnaFilePath: String = /databricks-datasets/flights/airport-codes-na.txt
// Obtain airports dataset
// Note that "spark-csv" package is built-in datasource in Spark 2.0
val airportsna = sqlContext.read.format("com.databricks.spark.csv").
  option("header", "true").
  option("inferschema", "true").
  option("delimiter", "\t").
  load(airportsnaFilePath)

airportsna.createOrReplaceTempView("airports_na")

// Obtain departure Delays data
val departureDelays = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").load(tripdelaysFilePath)
departureDelays.createOrReplaceTempView("departureDelays")
departureDelays.cache()

// Available IATA (International Air Transport Association) codes from the departuredelays sample dataset
val tripIATA = sqlContext.sql("select distinct iata from (select distinct origin as iata from departureDelays union all select distinct destination as iata from departureDelays) a")
tripIATA.createOrReplaceTempView("tripIATA")

// Only include airports with atleast one trip from the departureDelays dataset
val airports = sqlContext.sql("select f.IATA, f.City, f.State, f.Country from airports_na f join tripIATA t on t.IATA = f.IATA")
airports.createOrReplaceTempView("airports")
airports.cache()
airportsna: org.apache.spark.sql.DataFrame = [City: string, State: string ... 2 more fields] departureDelays: org.apache.spark.sql.DataFrame = [date: string, delay: string ... 3 more fields] tripIATA: org.apache.spark.sql.DataFrame = [iata: string] airports: org.apache.spark.sql.DataFrame = [IATA: string, City: string ... 2 more fields] res0: airports.type = [IATA: string, City: string ... 2 more fields]
// Build `departureDelays_geo` DataFrame
// Obtain key attributes such as Date of flight, delays, distance, and airport information (Origin, Destination)  
val departureDelays_geo = sqlContext.sql("select cast(f.date as int) as tripid, cast(concat(concat(concat(concat(concat(concat('2014-', concat(concat(substr(cast(f.date as string), 1, 2), '-')), substr(cast(f.date as string), 3, 2)), ' '), substr(cast(f.date as string), 5, 2)), ':'), substr(cast(f.date as string), 7, 2)), ':00') as timestamp) as `localdate`, cast(f.delay as int), cast(f.distance as int), f.origin as src, f.destination as dst, o.city as city_src, d.city as city_dst, o.state as state_src, d.state as state_dst from departuredelays f join airports o on o.iata = f.origin join airports d on d.iata = f.destination") 

// RegisterTempTable
departureDelays_geo.createOrReplaceTempView("departureDelays_geo")

// Cache and Count
departureDelays_geo.cache()
departureDelays_geo.count()
departureDelays_geo: org.apache.spark.sql.DataFrame = [tripid: int, localdate: timestamp ... 8 more fields] res3: Long = 1361141
display(departureDelays_geo)
10111112014-01-01T11:11:00.000+0000-5221MSPINLMinneapolisInternational FallsMNMN
10211112014-01-02T11:11:00.000+00007221MSPINLMinneapolisInternational FallsMNMN
10311112014-01-03T11:11:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
10419252014-01-04T19:25:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
10611152014-01-06T11:15:00.000+000033221MSPINLMinneapolisInternational FallsMNMN
10711152014-01-07T11:15:00.000+000023221MSPINLMinneapolisInternational FallsMNMN
10811152014-01-08T11:15:00.000+0000-9221MSPINLMinneapolisInternational FallsMNMN
10911152014-01-09T11:15:00.000+000011221MSPINLMinneapolisInternational FallsMNMN
11011152014-01-10T11:15:00.000+0000-3221MSPINLMinneapolisInternational FallsMNMN
11120152014-01-11T20:15:00.000+0000-7221MSPINLMinneapolisInternational FallsMNMN
11219252014-01-12T19:25:00.000+0000-5221MSPINLMinneapolisInternational FallsMNMN
11311152014-01-13T11:15:00.000+0000-3221MSPINLMinneapolisInternational FallsMNMN
11411152014-01-14T11:15:00.000+0000-6221MSPINLMinneapolisInternational FallsMNMN
11511152014-01-15T11:15:00.000+0000-7221MSPINLMinneapolisInternational FallsMNMN
11611152014-01-16T11:15:00.000+0000-3221MSPINLMinneapolisInternational FallsMNMN
11711152014-01-17T11:15:00.000+00004221MSPINLMinneapolisInternational FallsMNMN
11820152014-01-18T20:15:00.000+0000-5221MSPINLMinneapolisInternational FallsMNMN
11919252014-01-19T19:25:00.000+0000-7221MSPINLMinneapolisInternational FallsMNMN
12011152014-01-20T11:15:00.000+0000-6221MSPINLMinneapolisInternational FallsMNMN
12111152014-01-21T11:15:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
12211152014-01-22T11:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
12311152014-01-23T11:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
12411152014-01-24T11:15:00.000+0000-3221MSPINLMinneapolisInternational FallsMNMN
12520152014-01-25T20:15:00.000+0000-12221MSPINLMinneapolisInternational FallsMNMN
12619252014-01-26T19:25:00.000+0000-5221MSPINLMinneapolisInternational FallsMNMN
12711152014-01-27T11:15:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
12811152014-01-28T11:15:00.000+0000-8221MSPINLMinneapolisInternational FallsMNMN
12911152014-01-29T11:15:00.000+0000-2221MSPINLMinneapolisInternational FallsMNMN
13011152014-01-30T11:15:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
13111152014-01-31T11:15:00.000+0000-3221MSPINLMinneapolisInternational FallsMNMN
20120152014-02-01T20:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
20220152014-02-02T20:15:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
20311152014-02-03T11:15:00.000+0000-7221MSPINLMinneapolisInternational FallsMNMN
20411152014-02-04T11:15:00.000+0000-6221MSPINLMinneapolisInternational FallsMNMN
20511152014-02-05T11:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
20611152014-02-06T11:15:00.000+0000-2221MSPINLMinneapolisInternational FallsMNMN
20711152014-02-07T11:15:00.000+0000-15221MSPINLMinneapolisInternational FallsMNMN
20820152014-02-08T20:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
20919252014-02-09T19:25:00.000+00001221MSPINLMinneapolisInternational FallsMNMN
21011152014-02-10T11:15:00.000+0000-3221MSPINLMinneapolisInternational FallsMNMN
21111152014-02-11T11:15:00.000+0000-7221MSPINLMinneapolisInternational FallsMNMN
21211152014-02-12T11:15:00.000+0000-2221MSPINLMinneapolisInternational FallsMNMN
21311152014-02-13T11:15:00.000+0000-3221MSPINLMinneapolisInternational FallsMNMN
21411152014-02-14T11:15:00.000+0000-11221MSPINLMinneapolisInternational FallsMNMN
21520152014-02-15T20:15:00.000+000016221MSPINLMinneapolisInternational FallsMNMN
21619252014-02-16T19:25:00.000+0000169221MSPINLMinneapolisInternational FallsMNMN
21711152014-02-17T11:15:00.000+000027221MSPINLMinneapolisInternational FallsMNMN
21811152014-02-18T11:15:00.000+000096221MSPINLMinneapolisInternational FallsMNMN
21911152014-02-19T11:15:00.000+0000-9221MSPINLMinneapolisInternational FallsMNMN
22011152014-02-20T11:15:00.000+0000-6221MSPINLMinneapolisInternational FallsMNMN
22111152014-02-21T11:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
22220152014-02-22T20:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
22319252014-02-23T19:25:00.000+0000-3221MSPINLMinneapolisInternational FallsMNMN
22411152014-02-24T11:15:00.000+0000-2221MSPINLMinneapolisInternational FallsMNMN
22511152014-02-25T11:15:00.000+0000-6221MSPINLMinneapolisInternational FallsMNMN
22611152014-02-26T11:15:00.000+0000-8221MSPINLMinneapolisInternational FallsMNMN
22711152014-02-27T11:15:00.000+0000-8221MSPINLMinneapolisInternational FallsMNMN
22811152014-02-28T11:15:00.000+00005221MSPINLMinneapolisInternational FallsMNMN
30120152014-03-01T20:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
30220002014-03-02T20:00:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
30311152014-03-03T11:15:00.000+000017221MSPINLMinneapolisInternational FallsMNMN
30411152014-03-04T11:15:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
30511152014-03-05T11:15:00.000+0000-7221MSPINLMinneapolisInternational FallsMNMN
30611152014-03-06T11:15:00.000+0000-8221MSPINLMinneapolisInternational FallsMNMN
30711152014-03-07T11:15:00.000+0000-10221MSPINLMinneapolisInternational FallsMNMN
30820002014-03-08T20:00:00.000+0000-11221MSPINLMinneapolisInternational FallsMNMN
30920002014-03-09T20:00:00.000+0000-9221MSPINLMinneapolisInternational FallsMNMN
31011152014-03-10T11:15:00.000+0000-10221MSPINLMinneapolisInternational FallsMNMN
31111152014-03-11T11:15:00.000+0000-8221MSPINLMinneapolisInternational FallsMNMN
31211152014-03-12T11:15:00.000+0000-6221MSPINLMinneapolisInternational FallsMNMN
31311152014-03-13T11:15:00.000+0000-8221MSPINLMinneapolisInternational FallsMNMN
31411152014-03-14T11:15:00.000+0000-5221MSPINLMinneapolisInternational FallsMNMN
31520002014-03-15T20:00:00.000+0000-11221MSPINLMinneapolisInternational FallsMNMN
31620002014-03-16T20:00:00.000+0000-10221MSPINLMinneapolisInternational FallsMNMN
31711152014-03-17T11:15:00.000+000025221MSPINLMinneapolisInternational FallsMNMN
31811152014-03-18T11:15:00.000+00002221MSPINLMinneapolisInternational FallsMNMN
31911152014-03-19T11:15:00.000+0000-5221MSPINLMinneapolisInternational FallsMNMN
32011152014-03-20T11:15:00.000+0000-6221MSPINLMinneapolisInternational FallsMNMN
32111152014-03-21T11:15:00.000+00000221MSPINLMinneapolisInternational FallsMNMN
32220002014-03-22T20:00:00.000+0000-10221MSPINLMinneapolisInternational FallsMNMN
32320002014-03-23T20:00:00.000+0000-9221MSPINLMinneapolisInternational FallsMNMN
32411152014-03-24T11:15:00.000+0000-9221MSPINLMinneapolisInternational FallsMNMN
32511152014-03-25T11:15:00.000+0000-4221MSPINLMinneapolisInternational FallsMNMN
32611152014-03-26T11:15:00.000+0000-5221MSPINLMinneapolisInternational FallsMNMN
32711152014-03-27T11:15:00.000+00009221MSPINLMinneapolisInternational FallsMNMN
32811152014-03-28T11:15:00.000+0000-7221MSPINLMinneapolisInternational FallsMNMN
32920002014-03-29T20:00:00.000+0000-19221MSPINLMinneapolisInternational FallsMNMN
33020002014-03-30T20:00:00.000+0000-10221MSPINLMinneapolisInternational FallsMNMN
33111152014-03-31T11:15:00.000+0000-8221MSPINLMinneapolisInternational FallsMNMN
10115192014-01-01T15:19:00.000+0000-1265IAHMSYHoustonNew OrleansTXLA
10116462014-01-01T16:46:00.000+000071265IAHMSYHoustonNew OrleansTXLA
10107142014-01-01T07:14:00.000+0000-2265IAHMSYHoustonNew OrleansTXLA
10215192014-01-02T15:19:00.000+0000-3265IAHMSYHoustonNew OrleansTXLA
10216542014-01-02T16:54:00.000+0000-2265IAHMSYHoustonNew OrleansTXLA
10207142014-01-02T07:14:00.000+00000265IAHMSYHoustonNew OrleansTXLA
10315192014-01-03T15:19:00.000+0000136265IAHMSYHoustonNew OrleansTXLA
10316462014-01-03T16:46:00.000+0000-10265IAHMSYHoustonNew OrleansTXLA
10307142014-01-03T07:14:00.000+0000-3265IAHMSYHoustonNew OrleansTXLA
10421232014-01-04T21:23:00.000+000037265IAHMSYHoustonNew OrleansTXLA
10416462014-01-04T16:46:00.000+0000-6265IAHMSYHoustonNew OrleansTXLA

Showing the first 1000 rows.

Building the Graph

Now that we've imported our data, we're going to need to build our graph. To do so we're going to do two things. We are going to build the structure of the vertices (or nodes) and we're going to build the structure of the edges. What's awesome about GraphFrames is that this process is incredibly simple.

  • Rename IATA airport code to id in the Vertices Table
  • Start and End airports to src and dst for the Edges Table (flights)

These are required naming conventions for vertices and edges in GraphFrames as of the time of this writing (Feb. 2016).

WARNING: If the graphframes package, required in the cell below, is not installed, follow the instructions here.

// Note, ensure you have already installed the GraphFrames spack-package
import org.apache.spark.sql.functions._
import org.graphframes._

// Create Vertices (airports) and Edges (flights)
val tripVertices = airports.withColumnRenamed("IATA", "id").distinct()
val tripEdges = departureDelays_geo.select("tripid", "delay", "src", "dst", "city_dst", "state_dst")

// Cache Vertices and Edges
tripEdges.cache()
tripVertices.cache()
import org.apache.spark.sql.functions._ import org.graphframes._ tripVertices: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, City: string ... 2 more fields] tripEdges: org.apache.spark.sql.DataFrame = [tripid: int, delay: int ... 4 more fields] res5: tripVertices.type = [id: string, City: string ... 2 more fields]
// Vertices
// The vertices of our graph are the airports
display(tripVertices)
FATFresnoCAUSA
CMHColumbusOHUSA
PHXPhoenixAZUSA
PAHPaducahKYUSA
COSColorado SpringsCOUSA
MYRMyrtle BeachSCUSA
RNORenoNVUSA
SRQSarasotaFLUSA
VLDValdostaGAUSA
PSCPascoWAUSA
BPTBeaumontTXUSA
CAEColumbiaSCUSA
LAXLos AngelesCAUSA
DAYDaytonOHUSA
AVPWilkes-BarrePAUSA
MFRMedfordORUSA
JFKNew YorkNYUSA
LASLas VegasNVUSA
BNANashvilleTNUSA
CLTCharlotteNCUSA
BDLHartfordCTUSA
ILGWilmingtonDEUSA
ACTWacoTXUSA
ATWAppletonWIUSA
RHIRhinelanderWIUSA
PWMPortlandMEUSA
SJTSan AngeloTXUSA
GRBGreen BayWIUSA
APNAlpenaMIUSA
MSYNew OrleansLAUSA
CAKAkronOHUSA
LANLansingMIUSA
FLLFort LauderdaleFLUSA
CODCodyWYUSA
TPATampaFLUSA
SATSan AntonioTXUSA
MODModestoCAUSA
GTRColumbusMSUSA
RDDReddingCAUSA
BTVBurlingtonVTUSA
HLNHelenaMTUSA
IMTIron MountainMIUSA
CPRCasperWYUSA
FWAFort WayneINUSA
DTWDetroitMIUSA
BZNBozemanMTUSA
SBNSouth BendINUSA
SPSWichita FallsTXUSA
BFLBakersfieldCAUSA
HOBHobbsNMUSA
TVCTraverse CityMIUSA
CLEClevelandOHUSA
ABRAberdeenSDUSA
CHSCharlestonSCUSA
GUCGunnisonCOUSA
INDIndianapolisINUSA
SDFLouisvilleKYUSA
RSWFort MyersFLUSA
SANSan DiegoCAUSA
TULTulsaOKUSA
BOSBostonMAUSA
AGSAugustaGAUSA
MOBMobileALUSA
TUSTucsonAZUSA
KTNKetchikanAKUSA
PNSPensacolaFLUSA
BTRBaton RougeLAUSA
ABQAlbuquerqueNMUSA
LGANew YorkNYUSA
MAFMidlandTXUSA
DALDallasTXUSA
JNUJuneauAKUSA
FARFargoNDUSA
RICRichmondVAUSA
MTJMontroseCOUSA
SHVShreveportLAUSA
AMAAmarilloTXUSA
ROCRochesterNYUSA
YAKYakutatAKUSA
DRODurangoCOUSA
CRPCorpus ChristiTXUSA
CSGColumbusGAUSA
ALOWaterlooIAUSA
GSOGreensboroNCUSA
FNTFlintMIUSA
LWSLewistonIDUSA
TOLToledoOHUSA
GTFGreat FallsMTUSA
RKSRock SpringsWYUSA
MKEMilwaukeeWIUSA
STLSt. LouisMOUSA
MHTManchesterNHUSA
CRWCharlestonWVUSA
SLCSalt Lake CityUTUSA
ACVEurekaCAUSA
DFWDallasTXUSA
OMENomeAKUSA
ORFNorfolkVAUSA
ROARoanokeVAUSA
BQKBrunswickGAUSA
// Edges
// The edges of our graph are the flights between airports
display(tripEdges)
1011111-5MSPINLInternational FallsMN
10211117MSPINLInternational FallsMN
10311110MSPINLInternational FallsMN
10419250MSPINLInternational FallsMN
106111533MSPINLInternational FallsMN
107111523MSPINLInternational FallsMN
1081115-9MSPINLInternational FallsMN
109111511MSPINLInternational FallsMN
1101115-3MSPINLInternational FallsMN
1112015-7MSPINLInternational FallsMN
1121925-5MSPINLInternational FallsMN
1131115-3MSPINLInternational FallsMN
1141115-6MSPINLInternational FallsMN
1151115-7MSPINLInternational FallsMN
1161115-3MSPINLInternational FallsMN
11711154MSPINLInternational FallsMN
1182015-5MSPINLInternational FallsMN
1191925-7MSPINLInternational FallsMN
1201115-6MSPINLInternational FallsMN
12111150MSPINLInternational FallsMN
1221115-4MSPINLInternational FallsMN
1231115-4MSPINLInternational FallsMN
1241115-3MSPINLInternational FallsMN
1252015-12MSPINLInternational FallsMN
1261925-5MSPINLInternational FallsMN
12711150MSPINLInternational FallsMN
1281115-8MSPINLInternational FallsMN
1291115-2MSPINLInternational FallsMN
13011150MSPINLInternational FallsMN
1311115-3MSPINLInternational FallsMN
2012015-4MSPINLInternational FallsMN
20220150MSPINLInternational FallsMN
2031115-7MSPINLInternational FallsMN
2041115-6MSPINLInternational FallsMN
2051115-4MSPINLInternational FallsMN
2061115-2MSPINLInternational FallsMN
2071115-15MSPINLInternational FallsMN
2082015-4MSPINLInternational FallsMN
20919251MSPINLInternational FallsMN
2101115-3MSPINLInternational FallsMN
2111115-7MSPINLInternational FallsMN
2121115-2MSPINLInternational FallsMN
2131115-3MSPINLInternational FallsMN
2141115-11MSPINLInternational FallsMN
215201516MSPINLInternational FallsMN
2161925169MSPINLInternational FallsMN
217111527MSPINLInternational FallsMN
218111596MSPINLInternational FallsMN
2191115-9MSPINLInternational FallsMN
2201115-6MSPINLInternational FallsMN
2211115-4MSPINLInternational FallsMN
2222015-4MSPINLInternational FallsMN
2231925-3MSPINLInternational FallsMN
2241115-2MSPINLInternational FallsMN
2251115-6MSPINLInternational FallsMN
2261115-8MSPINLInternational FallsMN
2271115-8MSPINLInternational FallsMN
22811155MSPINLInternational FallsMN
3012015-4MSPINLInternational FallsMN
30220000MSPINLInternational FallsMN
303111517MSPINLInternational FallsMN
30411150MSPINLInternational FallsMN
3051115-7MSPINLInternational FallsMN
3061115-8MSPINLInternational FallsMN
3071115-10MSPINLInternational FallsMN
3082000-11MSPINLInternational FallsMN
3092000-9MSPINLInternational FallsMN
3101115-10MSPINLInternational FallsMN
3111115-8MSPINLInternational FallsMN
3121115-6MSPINLInternational FallsMN
3131115-8MSPINLInternational FallsMN
3141115-5MSPINLInternational FallsMN
3152000-11MSPINLInternational FallsMN
3162000-10MSPINLInternational FallsMN
317111525MSPINLInternational FallsMN
31811152MSPINLInternational FallsMN
3191115-5MSPINLInternational FallsMN
3201115-6MSPINLInternational FallsMN
32111150MSPINLInternational FallsMN
3222000-10MSPINLInternational FallsMN
3232000-9MSPINLInternational FallsMN
3241115-9MSPINLInternational FallsMN
3251115-4MSPINLInternational FallsMN
3261115-5MSPINLInternational FallsMN
32711159MSPINLInternational FallsMN
3281115-7MSPINLInternational FallsMN
3292000-19MSPINLInternational FallsMN
3302000-10MSPINLInternational FallsMN
3311115-8MSPINLInternational FallsMN
1011519-1IAHMSYNew OrleansLA
101164671IAHMSYNew OrleansLA
1010714-2IAHMSYNew OrleansLA
1021519-3IAHMSYNew OrleansLA
1021654-2IAHMSYNew OrleansLA
10207140IAHMSYNew OrleansLA
1031519136IAHMSYNew OrleansLA
1031646-10IAHMSYNew OrleansLA
1030714-3IAHMSYNew OrleansLA
104212337IAHMSYNew OrleansLA
1041646-6IAHMSYNew OrleansLA

Showing the first 1000 rows.

// Build `tripGraph` GraphFrame
// This GraphFrame builds up on the vertices and edges based on our trips (flights)
val tripGraph = GraphFrame(tripVertices, tripEdges)
println(tripGraph)

// Build `tripGraphPrime` GraphFrame
// This graphframe contains a smaller subset of data to make it easier to display motifs and subgraphs (below)
val tripEdgesPrime = departureDelays_geo.select("tripid", "delay", "src", "dst")
val tripGraphPrime = GraphFrame(tripVertices, tripEdgesPrime)
GraphFrame(v:[id: string, City: string ... 2 more fields], e:[src: string, dst: string ... 4 more fields]) tripGraph: org.graphframes.GraphFrame = GraphFrame(v:[id: string, City: string ... 2 more fields], e:[src: string, dst: string ... 4 more fields]) tripEdgesPrime: org.apache.spark.sql.DataFrame = [tripid: int, delay: int ... 2 more fields] tripGraphPrime: org.graphframes.GraphFrame = GraphFrame(v:[id: string, City: string ... 2 more fields], e:[src: string, dst: string ... 2 more fields])

Simple Queries

Let's start with a set of simple graph queries to understand flight performance and departure delays

Determine the number of airports and trips

println(s"Airports: ${tripGraph.vertices.count()}")
println(s"Trips: ${tripGraph.edges.count()}")
Airports: 279 Trips: 1361141

Determining the longest delay in this dataset

// Finding the longest Delay
val longestDelay = tripGraph.edges.groupBy().max("delay")
display(longestDelay)
1642

Determining the number of delayed vs. on-time / early flights

// Determining number of on-time / early flights vs. delayed flights
println(s"On-time / Early Flights: ${tripGraph.edges.filter("delay <= 0").count()}")
println(s"Delayed Flights: ${tripGraph.edges.filter("delay > 0").count()}")
On-time / Early Flights: 780469 Delayed Flights: 580672

What flights departing SFO are most likely to have significant delays

Note, delay can be <= 0 meaning the flight left on time or early

val sfoDelayedTrips = tripGraph.edges.
  filter("src = 'SFO' and delay > 0").
  groupBy("src", "dst").
  avg("delay").
  sort(desc("avg(delay)"))
sfoDelayedTrips: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [src: string, dst: string ... 1 more field]
display(sfoDelayedTrips)
SFOOKC59.073170731707314
SFOJAC57.13333333333333
SFOCOS53.976190476190474
SFOOTH48.09090909090909
SFOSAT47.625
SFOMOD46.80952380952381
SFOSUN46.723404255319146
SFOCIC46.72164948453608
SFOABQ44.8125
SFOASE44.285714285714285
SFOPIT43.875
SFOMIA43.81730769230769
SFOFAT43.23972602739726
SFOMFR43.11848341232228
SFOSBP43.09770114942529
SFOMSP42.766917293233085
SFOBOI42.65482233502538
SFORDM41.98823529411764
SFOAUS41.690677966101696
SFOSLC41.407272727272726
SFOJFK41.01379310344828
SFOPSP40.909909909909906
SFOPHX40.67272727272727
SFOMRY40.61764705882353
SFOACV40.3728813559322
SFOLAS40.107602339181284
SFOTUS39.853658536585364
SFOSAN38.97361809045226
SFOSBA38.758620689655174
SFOBFL38.51136363636363
SFORDU38.170731707317074
SFOSTL38.13513513513514
SFOIND38.114285714285714
SFOEUG37.573913043478264
SFORNO36.81372549019608
SFOBUR36.75675675675676
SFOLGB36.752941176470586
SFOHNL36.25367647058823
SFOLAX36.165543071161046
SFORDD36.11009174311926
SFOMSY35.421052631578945
SFOSMF34.936
SFOMDW34.824742268041234
SFOFLL34.76842105263158
SFOSEA34.68854961832061
SFOMCI34.68571428571428
SFODFW34.36642599277978
SFOOGG34.171875
SFOPDX34.14430894308943
SFOORD33.991130820399114
SFOLIH32.93023255813954
SFODEN32.861491628614914
SFOPSC32.604651162790695
SFOPHL32.440677966101696
SFOBWI31.70212765957447
SFOONT31.49079754601227
SFOSNA31.18426103646833
SFOMCO31.03488372093023
SFOMKE31.03448275862069
SFOCLE30.979591836734695
SFOEWR30.354285714285716
SFOBOS29.623471882640587
SFOLMT29.233333333333334
SFODTW28.34722222222222
SFOIAH28.322105263157894
SFOCVG27.03125
SFOATL26.84860557768924
SFOIAD26.125964010282775
SFOANC25.5
SFOBZN23.964285714285715
SFOCLT22.636363636363637
SFODCA21.896103896103895