ScaDaMaLe Course site and book

SQL Pivoting since Spark 2.4

SQL Pivot: Converting Rows to Columns

This is from the following blogpost: - https://databricks.com/blog/2018/11/01/sql-pivot-converting-rows-to-columns.html

This is a useful trick to know when having to do ETL before exploring datasets that need row to column conversions.

Load Data

Create tables and load temperature data

CREATE OR REPLACE TEMPORARY VIEW high_temps
  USING csv
  OPTIONS (path "/databricks-datasets/weather/high_temps", header "true", mode "FAILFAST")
CREATE OR REPLACE TEMPORARY VIEW low_temps
  USING csv
  OPTIONS (path "/databricks-datasets/weather/low_temps", header "true", mode "FAILFAST")

Pivoting in SQL

Getting the monthly average high temperatures with month as columns and year as rows.

SELECT * FROM (
  SELECT year(date) year, month(date) month, temp
  FROM high_temps
  WHERE date BETWEEN DATE '2015-01-01' AND DATE '2018-08-31'
)
PIVOT (
  CAST(avg(temp) AS DECIMAL(4, 1))
  FOR month in (
    1 JAN, 2 FEB, 3 MAR, 4 APR, 5 MAY, 6 JUN,
    7 JUL, 8 AUG, 9 SEP, 10 OCT, 11 NOV, 12 DEC
  )
)
ORDER BY year DESC
year JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC
2018.0 49.7 45.8 54.0 58.6 70.8 71.9 82.8 79.1 null null null null
2017.0 43.7 46.6 51.5 57.3 67.0 72.1 78.3 81.5 73.8 61.1 51.3 45.5
2016.0 49.1 53.6 56.4 65.9 68.8 73.1 76.0 79.5 69.6 60.5 56.0 41.9
2015.0 50.3 54.5 57.9 59.9 68.0 78.9 82.6 79.0 68.5 63.6 49.4 47.1

Pivoting with Multiple Aggregate Expressions

Getting monthly average and maximum high temperatures with month as columns and year as rows.

SELECT * FROM (
  SELECT year(date) year, month(date) month, temp
  FROM high_temps
  WHERE date BETWEEN DATE '2015-01-01' AND DATE '2018-08-31'
)
PIVOT (
  CAST(avg(temp) AS DECIMAL(4, 1)) avg, max(temp) max
  FOR month in (6 JUN, 7 JUL, 8 AUG, 9 SEP)
)
ORDER BY year DESC
year JUN_avg JUN_max JUL_avg JUL_max AUG_avg AUG_max SEP_avg SEP_max
2018.0 71.9 88 82.8 94 79.1 94 null null
2017.0 72.1 96 78.3 87 81.5 94 73.8 90
2016.0 73.1 93 76.0 89 79.5 95 69.6 78
2015.0 78.9 92 82.6 95 79.0 92 68.5 81

Pivoting with Multiple Grouping Columns

Getting monthly average high and average low temperatures with month as columns and (year, hi/lo) as rows.

SELECT * FROM (
  SELECT year(date) year, month(date) month, temp, flag `H/L`
  FROM (
    SELECT date, temp, 'H' as flag
    FROM high_temps
    UNION ALL
    SELECT date, temp, 'L' as flag
    FROM low_temps
  )
  WHERE date BETWEEN DATE '2015-01-01' AND DATE '2018-08-31'
)
PIVOT (
  CAST(avg(temp) AS DECIMAL(4, 1))
  FOR month in (6 JUN, 7 JUL, 8 AUG, 9 SEP)
)
ORDER BY year DESC, `H/L` ASC
year H/L JUN JUL AUG SEP
2018.0 H 71.9 82.8 79.1 null
2018.0 L 53.4 58.5 58.5 null
2017.0 H 72.1 78.3 81.5 73.8
2017.0 L 53.7 56.3 59.0 55.6
2016.0 H 73.1 76.0 79.5 69.6
2016.0 L 53.9 57.6 57.9 52.9
2015.0 H 78.9 82.6 79.0 68.5
2015.0 L 56.4 59.9 58.5 52.5

Pivoting with Multiple Pivot Columns

Getting monthly average high and average low temperatures with (month, hi/lo) as columns and year as rows.

SELECT * FROM (
  SELECT year(date) year, month(date) month, temp, flag
  FROM (
    SELECT date, temp, 'H' as flag
    FROM high_temps
    UNION ALL
    SELECT date, temp, 'L' as flag
    FROM low_temps
  )
  WHERE date BETWEEN DATE '2015-01-01' AND DATE '2018-08-31'
)
PIVOT (
  CAST(avg(temp) AS DECIMAL(4, 1))
  FOR (month, flag) in (
    (6, 'H') JUN_hi, (6, 'L') JUN_lo,
    (7, 'H') JUL_hi, (7, 'L') JUL_lo,
    (8, 'H') AUG_hi, (8, 'L') AUG_lo,
    (9, 'H') SEP_hi, (9, 'L') SEP_lo
  )
)
ORDER BY year DESC
year JUN_hi JUN_lo JUL_hi JUL_lo AUG_hi AUG_lo SEP_hi SEP_lo
2018.0 71.9 53.4 82.8 58.5 79.1 58.5 null null
2017.0 72.1 53.7 78.3 56.3 81.5 59.0 73.8 55.6
2016.0 73.1 53.9 76.0 57.6 79.5 57.9 69.6 52.9
2015.0 78.9 56.4 82.6 59.9 79.0 58.5 68.5 52.5