ScaDaMaLe Course site and book

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pycountry
import geopandas
from cartogram_geopandas import make_cartogram
import imageio

# Attributes of the ``.dt`` accessor that are copied into same-named columns.
_DATETIME_PARTS = ("year", "month", "day", "dayofweek", "hour", "minute", "second")

# The naturalearth_lowres dataset has missing iso_a3 codes for France, Norway,
# Somaliland, Kosovo and Northern Cyprus, see
# https://github.com/geopandas/geopandas/issues/1041
# These are patched by country name for all but Northern Cyprus, which does not
# have an iso_a3 code. Note that the row named 'Somaliland' is given 'SOM'.
_ISO_A3_PATCHES = {
    "France": "FRA",
    "Norway": "NOR",
    "Somaliland": "SOM",
    "Kosovo": "RKS",
}


def _add_datetime_columns(df, include_date=True):
    """Add columns derived from ``CurrentTweetDate`` to *df* in place.

    Args:
        df: pandas DataFrame with a datetime-like ``CurrentTweetDate`` column.
        include_date: when true, also add a ``date`` column (``.dt.date``)
            ahead of the other derived columns.

    Returns:
        The same DataFrame object, for chaining.
    """
    if include_date:
        df["date"] = df["CurrentTweetDate"].dt.date
    for part in _DATETIME_PARTS:
        df[part] = getattr(df["CurrentTweetDate"].dt, part)
    return df


def load_twitter_geo_data(path):
    """Load country code and timestamp of tweets from a parquet file.

    Reads *path* through the notebook-global ``spark`` session, keeps the
    ``countryCode`` and ``CurrentTweetDate`` columns, converts to pandas and
    adds ``date``, ``year``, ``month``, ``day``, ``dayofweek``, ``hour``,
    ``minute`` and ``second`` columns.

    Args:
        path: location of the parquet data, passed to ``spark.read.parquet``.

    Returns:
        A pandas DataFrame with the two selected and eight derived columns.
    """
    sdf = spark.read.parquet(path).select("countryCode", "CurrentTweetDate")
    return _add_datetime_columns(sdf.toPandas())


def load_twitter_geo_data_with_filter(path, filter_str):
    """Like ``load_twitter_geo_data`` but filter the rows in Spark first.

    Unlike ``load_twitter_geo_data`` this variant does not add a ``date``
    column; that difference is inherited from the original implementation.

    Args:
        path: location of the parquet data, passed to ``spark.read.parquet``.
        filter_str: condition passed to the Spark DataFrame ``filter`` method
            before the columns are selected.

    Returns:
        A pandas DataFrame with ``countryCode``, ``CurrentTweetDate`` and the
        derived ``year`` ... ``second`` columns.
    """
    sdf = spark.read.parquet(path).filter(filter_str)
    sdf = sdf.select("countryCode", "CurrentTweetDate")
    return _add_datetime_columns(sdf.toPandas(), include_date=False)


def country_code_grouping(df):
    """Return one row per country code together with its number of rows.

    Side effect: a ``count`` column is added to *df* itself.

    Args:
        df: pandas DataFrame with a ``countryCode`` column.

    Returns:
        A DataFrame with the columns ``countryCode`` and ``count`` plus the
        column produced by ``reset_index()`` (the index label of the first row
        seen for each country).
    """
    # 'count' here is the pandas group-wise count broadcast back to every row.
    df["count"] = df.groupby("countryCode")["countryCode"].transform("count")
    first_per_country = df.drop_duplicates(subset=["countryCode"])
    return first_per_country.filter(["countryCode", "count"]).reset_index()


def country_code_grouping_extra(df, key):
    """Sum column *key* per country code.

    Args:
        df: pandas DataFrame with ``countryCode`` and *key* columns.
        key: name of the column to sum.

    Returns:
        A DataFrame with one row per country code and the columns
        ``countryCode`` and *key*.
    """
    return df[["countryCode", key]].groupby("countryCode").sum().reset_index()


def add_iso_a3_col(df_cc):
    """Add an ``iso_a3`` column by mapping ``countryCode`` through pycountry.

    The mapping goes from each pycountry entry's ``alpha_2`` to its
    ``alpha_3``; codes without an entry map to NaN. *df_cc* is modified in
    place and also returned.

    Args:
        df_cc: pandas DataFrame with a ``countryCode`` column.

    Returns:
        The same DataFrame with the added ``iso_a3`` column.
    """
    alpha2_to_alpha3 = {c.alpha_2: c.alpha_3 for c in pycountry.countries}
    df_cc["iso_a3"] = df_cc["countryCode"].map(alpha2_to_alpha3)
    return df_cc


def _load_world_frame():
    """Read the naturalearth_lowres world map and patch its iso_a3 codes."""
    # NOTE(review): geopandas.datasets was deprecated in GeoPandas 0.14 and
    # removed in 1.0 -- confirm the GeoPandas version this notebook runs on.
    df_world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
    for country_name, iso_a3 in _ISO_A3_PATCHES.items():
        df_world.loc[df_world["name"] == country_name, "iso_a3"] = iso_a3
    return df_world


def _attach_country_values(df_world, df_cc, source_col, target_col, default):
    """Copy ``df_cc[source_col]`` onto *df_world* as *target_col* via iso_a3.

    Every iso_a3 code present in *df_world* starts at *default*; codes that
    occur in *df_cc* take that frame's value (the last occurrence wins).
    """
    lookup = dict.fromkeys(df_world["iso_a3"], default)
    lookup.update(zip(df_cc["iso_a3"], df_cc[source_col]))
    df_world[target_col] = df_world["iso_a3"].map(lookup)
    return df_world


def create_geo_df(df_cc):
    """Build the world GeoDataFrame with a ``numTweets`` column.

    Antarctica and the other rows of the dataset are kept; no area, density or
    per-capita columns are computed.

    Args:
        df_cc: DataFrame with ``iso_a3`` and ``count`` columns.

    Returns:
        The naturalearth_lowres GeoDataFrame with patched ``iso_a3`` codes and
        a ``numTweets`` column (0 for countries absent from *df_cc*).
    """
    return _attach_country_values(_load_world_frame(), df_cc, "count", "numTweets", 0)


def create_geo_df_extra(df_cc, data_of_interest="count", default_value=0):
    """Build the world GeoDataFrame with an arbitrary per-country column.

    Args:
        df_cc: DataFrame with ``iso_a3`` and *data_of_interest* columns.
        data_of_interest: name of the column to copy; the column keeps this
            name in the returned frame.
        default_value: value used for countries absent from *df_cc*.

    Returns:
        The naturalearth_lowres GeoDataFrame with patched ``iso_a3`` codes and
        a *data_of_interest* column.
    """
    return _attach_country_values(
        _load_world_frame(), df_cc, data_of_interest, data_of_interest, default_value
    )
def _midnight_noon_longitudes(hour):
    """Return ``(midnight, noon)`` longitudes in degrees for a GMT hour.

    360 degrees / 24 hours = 15 degrees per hour.
    """
    if hour < 12:
        midnight = -15 * hour
        noon = midnight + 180
    else:
        midnight = 180 - (hour - 12) * 15
        noon = midnight - 180
    return midnight, noon


def _finish_hour_frame(ax, title, hour, frame_path):
    """Decorate one cartogram frame, save it as PNG and return it as an image.

    Draws dashed vertical lines at the midnight (black) and noon (yellow)
    longitudes of *hour*, sets title, axis labels and the fixed
    -180..180 / -90..90 extent, saves the figure to *frame_path*, closes it
    and reads the file back with imageio.
    """
    midnight, noon = _midnight_noon_longitudes(hour)
    # axvline's ymin/ymax are fractions of the axes height (0..1), not
    # latitudes; the defaults already span the full height.
    ax.axvline(x=midnight, ls="--", c="black")
    ax.axvline(x=noon, ls="--", c="yellow")

    ax.set_title(title, fontsize=24)
    ax.set_xlabel(r"Longitude $^\circ$", fontsize=20)
    ax.set_ylabel(r"Latitude $^\circ$", fontsize=20)
    ax.set_ylim(-90, 90)
    ax.set_xlim(-180, 180)

    fig = ax.get_figure()
    fig.savefig(frame_path)
    plt.close(fig)
    return imageio.imread(frame_path)


def animate_cartogram(df, filterKey, filterList, out_path, nIter, cartogram_key, legend=""):
    """Render one tweet-count cartogram per filter value and combine into a GIF.

    For every value ``i`` in *filterList* the rows with ``filterKey == i`` are
    counted per country, mapped onto the world frame, distorted with
    ``make_cartogram`` and plotted. Each frame is written to
    ``out_path + "<i>.png"`` and the animation to ``out_path + ".gif"``.

    Args:
        df: pandas DataFrame with ``countryCode`` and *filterKey* columns.
        filterKey: name of the integer column to filter on; its values are
            also used as the GMT hour for the midnight/noon markers.
        filterList: iterable of integer values of *filterKey*, one per frame.
        out_path: path prefix for the PNG frames and the GIF.
        nIter: iteration count passed to ``make_cartogram``.
        cartogram_key: column of the world frame that drives both the
            distortion and the colouring.
        legend: text prepended to each frame title.
    """
    # Largest number of rows for a single country within a single filter
    # value; used to keep the colour bar fixed across frames. size() counts
    # rows directly and does not depend on an 'index' column being present.
    vmax = df.groupby([filterKey, "countryCode"]).size().max()

    images = []  # PNG frames for the GIF
    for i in filterList:
        # Per-country counts with ISO a3 codes for this filter value.
        df_filtered = df.query("%s==%d" % (filterKey, i)).reset_index()
        df_cc = add_iso_a3_col(country_code_grouping(df_filtered))
        df_world = create_geo_df(df_cc)

        # make_cartogram can not handle a tweet count of zero, so the count is
        # clipped at 1. Removing countries without tweets instead would make
        # them pop in and out of existence over the animation.
        df_world2 = df_world.copy(deep=True)
        df_world2["numTweets"] = df_world2["numTweets"].clip(lower=1)

        df_cartogram = make_cartogram(df_world2, cartogram_key, nIter, inplace=False)
        ax = df_cartogram.plot(column=cartogram_key, cmap='viridis', figsize=(20, 8),
                               legend=True, vmin=0, vmax=vmax)

        title = legend + "Time of day (GMT): %02d" % i
        images.append(_finish_hour_frame(ax, title, i, out_path + "%d.png" % i))

    imageio.mimsave(out_path + ".gif", images, duration=0.5)


def animate_cartogram_extra(df, filterKey, filterList, out_path, nIter, cartogram_key,
                            default_value, scale_factor=2, vmin=0.0, vmax=1.0):
    """Render cartograms whose distortion is proportional to each country's area.

    For every value ``i`` in *filterList*, *cartogram_key* is summed per
    country and each country's target size is set to
    ``area + (scale_factor - 1) * value * area``, so the target stays positive
    as long as the value is non-negative. Colouring uses *cartogram_key*
    itself. Frames go to ``out_path + "<i>.png"``, the animation to
    ``out_path + ".gif"``.

    Args:
        df: pandas DataFrame with ``countryCode``, *filterKey* and
            *cartogram_key* columns.
        filterKey: name of the integer column to filter on; its values are
            also used as the GMT hour for the midnight/noon markers.
        filterList: iterable of integer values of *filterKey*, one per frame.
        out_path: path prefix for the PNG frames and the GIF.
        nIter: iteration count passed to ``make_cartogram``.
        cartogram_key: column summed per country and used for colouring.
        default_value: value for countries without data.
        scale_factor: multiplier controlling the strength of the distortion.
        vmin: lower colour limit.
        vmax: upper colour limit.
    """
    images = []  # PNG frames for the GIF
    for i in filterList:
        # Per-country sums with ISO a3 codes for this filter value.
        df_filtered = df.query("%s==%d" % (filterKey, i)).reset_index()
        df_cc = add_iso_a3_col(country_code_grouping_extra(df_filtered, cartogram_key))
        df_world = create_geo_df_extra(df_cc, cartogram_key, default_value)

        # Scale by area, then add the area itself to every value.
        area = pd.to_numeric(df_world['geometry'].area)
        df_world["__scaled"] = (scale_factor - 1) * df_world[cartogram_key] * area
        df_world["__scaled"] = area + df_world["__scaled"]

        df_cartogram = make_cartogram(df_world, "__scaled", nIter, inplace=False)
        ax = df_cartogram.plot(column=cartogram_key, cmap='viridis', figsize=(20, 8),
                               legend=True, vmin=vmin, vmax=vmax)

        title = "Time of day (GMT): %02d" % i
        images.append(_finish_hour_frame(ax, title, i, out_path + "%d.png" % i))

    imageio.mimsave(out_path + ".gif", images, duration=0.5)
def load_twitter_geo_data_sentiment(path):
    """Load country code, timestamp and emoji columns of tweets from parquet.

    Reads *path* through the notebook-global ``spark`` session, keeps
    ``countryCode``, ``CurrentTweetDate`` and the columns ``prayer``,
    ``monkey``, ``happy``, ``SK``, ``cat`` and ``notHappy``, converts to
    pandas and adds ``date``, ``year``, ``month``, ``day``, ``dayofweek``,
    ``hour``, ``minute`` and ``second`` columns.

    Args:
        path: location of the parquet data, passed to ``spark.read.parquet``.

    Returns:
        A pandas DataFrame with the selected and derived columns.
    """
    sdf = spark.read.parquet(path)
    sdf = sdf.select('countryCode', "CurrentTweetDate", "prayer", "monkey",
                     "happy", "SK", "cat", "notHappy")
    df = sdf.toPandas()

    # Add some new datetime derived columns.
    df["date"] = df["CurrentTweetDate"].dt.date
    for part in ("year", "month", "day", "dayofweek", "hour", "minute", "second"):
        df[part] = getattr(df["CurrentTweetDate"].dt, part)
    return df


def country_code_grouping_sentiment(df):
    """Return per-country row count and mean sentiment.

    The count is taken per country. Counting per (country, sentiment) pair and
    then keeping the first row of each country would report only the number of
    rows sharing that first row's sentiment value, which depends on row order.
    The caller's frame is left untouched.

    Args:
        df: pandas DataFrame with ``countryCode`` and ``sentiment`` columns.

    Returns:
        A DataFrame with one row per country (in order of first appearance)
        and the columns ``index`` (index label of the country's first row),
        ``countryCode``, ``count`` and ``sentiment`` (the country mean).
    """
    work = df[["countryCode", "sentiment"]].copy()
    by_country = work.groupby("countryCode")
    # transform broadcasts the group-wise result back onto every row.
    work["count"] = by_country["countryCode"].transform("count")
    work["sentiment"] = by_country["sentiment"].transform("mean")
    first_per_country = work.drop_duplicates("countryCode")
    return first_per_country.filter(['countryCode', 'count', "sentiment"]).reset_index()


def create_geo_df_sentiment(df_cc):
    """Build the world GeoDataFrame with ``numTweets`` and ``sentiment`` columns.

    Args:
        df_cc: DataFrame with ``iso_a3``, ``count`` and ``sentiment`` columns.

    Returns:
        The naturalearth_lowres GeoDataFrame with patched ``iso_a3`` codes;
        countries absent from *df_cc* get 0 in both added columns.
    """
    df_world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

    # The dataset has missing iso_a3 codes for France, Norway, Somaliland,
    # Kosovo and Northern Cyprus, see
    # https://github.com/geopandas/geopandas/issues/1041
    # They are patched here for all but Northern Cyprus, which does not have
    # an iso_a3 code. The row named 'Somaliland' is given 'SOM'.
    patches = {"France": "FRA", "Norway": "NOR", "Somaliland": "SOM", "Kosovo": "RKS"}
    for country_name, iso_a3 in patches.items():
        df_world.loc[df_world['name'] == country_name, 'iso_a3'] = iso_a3

    # Every code of the world frame starts at 0 and is overwritten by the
    # value from df_cc where one exists.
    for source_col, target_col in (("count", "numTweets"), ("sentiment", "sentiment")):
        lookup = dict.fromkeys(df_world["iso_a3"], 0)
        lookup.update(zip(df_cc["iso_a3"], df_cc[source_col]))
        df_world[target_col] = df_world["iso_a3"].map(lookup)

    return df_world


def animate_cartogram_sentiment(df, filterKey, filterList, out_path, nIter, cartogram_key,
                                minSamples, cmap, vmin, vmax, legendList):
    """Render one sentiment-coloured cartogram per filter value and build a GIF.

    For every value ``i`` in *filterList* the rows with ``filterKey == i`` are
    aggregated per country, mapped onto the world frame, distorted with
    ``make_cartogram`` and coloured by ``sentiment``. Each frame is written to
    ``out_path + "<i>.png"`` and the animation to ``out_path + ".gif"``.

    Args:
        df: pandas DataFrame with ``countryCode``, ``sentiment`` and
            *filterKey* columns.
        filterKey: name of the integer column to filter on.
        filterList: iterable of integer values of *filterKey*, one per frame.
        out_path: path prefix for the PNG frames and the GIF.
        nIter: iteration count passed to ``make_cartogram``.
        cartogram_key: column of the world frame that drives the distortion.
        minSamples: countries with fewer tweets get a sentiment of
            ``vmin - 1`` so that the colour map can show them as "under".
        cmap: colour map passed to the plot call.
        vmin: lower colour limit.
        vmax: upper colour limit.
        legendList: frame titles, indexed in step with *filterList*.

    Raises:
        IndexError: if *legendList* is shorter than *filterList*.
    """
    images = []  # PNG frames for the GIF
    for frame_no, i in enumerate(filterList):
        # Per-country aggregates with ISO a3 codes for this filter value.
        df_filtered = df.query("%s==%d" % (filterKey, i)).reset_index()
        df_cc = add_iso_a3_col(country_code_grouping_sentiment(df_filtered))
        df_world = create_geo_df_sentiment(df_cc)

        # make_cartogram can not handle a tweet count of zero, so the count is
        # clipped at 1. Removing countries without tweets instead would make
        # them pop in and out of existence over the animation.
        df_world2 = df_world.copy(deep=True)
        df_world2["numTweets"] = df_world2["numTweets"].clip(lower=1)

        # Countries with fewer than minSamples tweets are pushed below vmin
        # (using the unclipped counts) so the colour map can grey them out.
        df_world2.loc[df_world["numTweets"] < minSamples, 'sentiment'] = vmin - 1

        df_cartogram = make_cartogram(df_world2, cartogram_key, nIter, inplace=False)
        ax = df_cartogram.plot(column="sentiment", cmap=cmap, figsize=(20, 8),
                               legend=True, vmin=vmin, vmax=vmax)

        ax.set_title(legendList[frame_no], fontsize=24)
        ax.set_xlabel(r"Longitude $^\circ$", fontsize=20)
        ax.set_ylabel(r"Latitude $^\circ$", fontsize=20)
        ax.set_ylim(-90, 90)
        ax.set_xlim(-180, 180)

        # Save the frame as PNG and read it back for the GIF.
        frame_path = out_path + "%d.png" % i
        fig = ax.get_figure()
        fig.savefig(frame_path)
        plt.close(fig)
        images.append(imageio.imread(frame_path))

    imageio.mimsave(out_path + ".gif", images, duration=1)