import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pycountry
import geopandas
from cartogram_geopandas import make_cartogram
import imageio
def load_twitter_geo_data(path):
    """Load country code and timestamp of every tweet into a pandas DataFrame.

    Relies on a module-level ``spark`` object that is not defined in the
    visible source (presumably a SparkSession supplied by the runtime).

    Args:
        path: Location of the parquet data handed to ``spark.read.parquet``.

    Returns:
        A pandas DataFrame with ``countryCode`` and ``CurrentTweetDate`` plus
        the derived columns ``date``, ``year``, ``month``, ``day``,
        ``dayofweek``, ``hour``, ``minute`` and ``second``.
    """
    tweets = (
        spark.read.parquet(path)
        .select('countryCode', "CurrentTweetDate")
        .toPandas()
    )

    # Each derived column is the identically named attribute of the
    # timestamp column's ``.dt`` accessor.
    timestamps = tweets["CurrentTweetDate"]
    derived_fields = (
        "date", "year", "month", "day",
        "dayofweek", "hour", "minute", "second",
    )
    for field in derived_fields:
        tweets[field] = getattr(timestamps.dt, field)
    return tweets
def load_twitter_geo_data_with_filter(path, filter_str):
    """Load country code and timestamp of the tweets matching a Spark filter.

    Relies on a module-level ``spark`` object that is not defined in the
    visible source (presumably a SparkSession supplied by the runtime).

    Args:
        path: Location of the parquet data handed to ``spark.read.parquet``.
        filter_str: Condition passed to the Spark DataFrame's ``filter``
            before the columns are selected.

    Returns:
        A pandas DataFrame with ``countryCode`` and ``CurrentTweetDate`` plus
        the derived columns ``date``, ``year``, ``month``, ``day``,
        ``dayofweek``, ``hour``, ``minute`` and ``second``.
    """
    df = spark.read.parquet(path)
    df = df.filter(filter_str).select('countryCode', "CurrentTweetDate")
    df = df.toPandas()

    # Derive the same set of datetime columns as the other loaders in this
    # file; "date" used to be missing here, which made the schemas diverge.
    timestamps = df["CurrentTweetDate"]
    for field in ("date", "year", "month", "day",
                  "dayofweek", "hour", "minute", "second"):
        df[field] = getattr(timestamps.dt, field)
    return df
def country_code_grouping(df):
    """Count rows per country code and return one row per country.

    Note:
        ``df`` is modified in place: a ``count`` column holding the per-country
        row count is added to it.

    Args:
        df: DataFrame with a ``countryCode`` column.

    Returns:
        A DataFrame with the columns added by ``reset_index`` (the original
        index label of the first row seen for each country), ``countryCode``
        and ``count``.
    """
    codes_by_country = df.groupby('countryCode')['countryCode']
    # 'count' here resolves to pandas' own count aggregation.
    df['count'] = codes_by_country.transform('count')

    first_row_per_country = df.drop_duplicates(subset=['countryCode'])
    tally = first_row_per_country.filter(['countryCode', 'count'])
    return tally.reset_index()
def country_code_grouping_extra(df, key):
    """Sum one column per country code.

    Args:
        df: DataFrame with a ``countryCode`` column and a ``key`` column.
        key: Name of the column to sum within each country.

    Returns:
        A DataFrame with one row per country and the columns ``countryCode``
        and ``key`` (the per-country sum).
    """
    relevant = df[["countryCode", key]]
    totals = relevant.groupby('countryCode').sum()
    return totals.reset_index()
def add_iso_a3_col(df_cc):
    """Add an ``iso_a3`` column translated from the ``countryCode`` column.

    The translation table is built from ``pycountry.countries`` by pairing
    each entry's ``alpha_2`` with its ``alpha_3``. Codes with no entry in the
    table map to a missing value.

    Note:
        ``df_cc`` is modified in place and also returned.

    Args:
        df_cc: DataFrame with a ``countryCode`` column.

    Returns:
        The same DataFrame, now carrying the ``iso_a3`` column.
    """
    alpha2_to_alpha3 = {
        entry.alpha_2: entry.alpha_3 for entry in pycountry.countries
    }
    df_cc["iso_a3"] = df_cc["countryCode"].map(alpha2_to_alpha3)
    return df_cc
def create_geo_df(df_cc):
    """Attach per-country tweet counts to the Natural Earth world map.

    Args:
        df_cc: DataFrame with one row per country, an ``iso_a3`` column and a
            ``count`` column.

    Returns:
        The ``naturalearth_lowres`` GeoDataFrame with patched ``iso_a3`` codes
        and a ``numTweets`` column; map entries without a matching row in
        ``df_cc`` get 0.
    """
    world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

    # The dataset lacks usable iso_a3 codes for some entries, see
    # https://github.com/geopandas/geopandas/issues/1041
    # Patch the ones we can; Northern Cyprus has no iso_a3 code and is left
    # untouched.
    manual_codes = {
        'France': 'FRA',
        'Norway': 'NOR',
        'Somaliland': 'SOM',
        'Kosovo': 'RKS',
    }
    for country_name, code in manual_codes.items():
        world.loc[world['name'] == country_name, 'iso_a3'] = code

    # Start every map entry at zero, then overwrite with the observed counts.
    tweets_by_iso = dict.fromkeys(world["iso_a3"], 0)
    tweets_by_iso.update(zip(df_cc["iso_a3"], df_cc["count"]))
    world["numTweets"] = world["iso_a3"].map(tweets_by_iso)

    # NOTE: Antarctica and the open-ocean isles are kept. Derived quantities
    # such as tweet density or tweets per capita are not computed here.
    return world
def create_geo_df_extra(df_cc, data_of_interest="count", default_value=0):
    """Attach an arbitrary per-country column to the Natural Earth world map.

    Args:
        df_cc: DataFrame with one row per country, an ``iso_a3`` column and a
            ``data_of_interest`` column.
        data_of_interest: Name of the column to copy from ``df_cc``; the same
            name is used for the new column on the map.
        default_value: Value given to map entries that have no matching row
            in ``df_cc``.

    Returns:
        The ``naturalearth_lowres`` GeoDataFrame with patched ``iso_a3`` codes
        and the new ``data_of_interest`` column.
    """
    world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

    # The dataset lacks usable iso_a3 codes for some entries, see
    # https://github.com/geopandas/geopandas/issues/1041
    # Patch the ones we can; Northern Cyprus has no iso_a3 code and is left
    # untouched.
    manual_codes = {
        'France': 'FRA',
        'Norway': 'NOR',
        'Somaliland': 'SOM',
        'Kosovo': 'RKS',
    }
    for country_name, code in manual_codes.items():
        world.loc[world['name'] == country_name, 'iso_a3'] = code

    # Start every map entry at the default, then overwrite with observed data.
    value_by_iso = dict.fromkeys(world["iso_a3"], default_value)
    value_by_iso.update(zip(df_cc["iso_a3"], df_cc[data_of_interest]))
    world[data_of_interest] = world["iso_a3"].map(value_by_iso)

    # NOTE: Antarctica and the open-ocean isles are kept. Derived quantities
    # such as tweet density or tweets per capita are not computed here.
    return world
def animate_cartogram(df, filterKey, filterList, out_path, nIter, cartogram_key, legend=""):
    """Render one cartogram frame per filter value and stitch them into a GIF.

    For every value ``i`` in ``filterList`` the rows with ``filterKey == i``
    are counted per country, joined onto the world map, distorted with
    ``make_cartogram`` and saved as ``out_path + "<i>.png"``. All frames are
    finally written to ``out_path + ".gif"``.

    The midnight/noon markers and the title treat ``i`` as an hour of day in
    GMT, so ``filterKey`` is presumably ``"hour"`` (confirm against callers).

    Args:
        df: Tweet DataFrame with a ``countryCode`` column and the
            ``filterKey`` column.
        filterKey: Name of the integer column used to select each frame.
        filterList: Iterable of integer values of ``filterKey``, one per frame.
        out_path: Path prefix for the PNG frames and the GIF.
        nIter: Forwarded to ``make_cartogram`` as its third argument
            (presumably the iteration count).
        cartogram_key: Column of the world map used both to distort and to
            colour the countries.
        legend: Text prepended to every frame title.
    """
    # Largest number of tweets in a single country for a single filter value.
    # Used to pin the colour bar across frames. ``size()`` counts rows
    # directly and does not require an incidental "index" column in ``df``.
    vmax = df.groupby([filterKey, "countryCode"]).size().max()

    images = []  # PNG frames that make up the GIF
    for i in filterList:
        # Count the tweets for this frame and add ISO alpha-3 codes.
        df_filtered = df.query("%s==%d" % (filterKey, i)).reset_index()
        df_cc = country_code_grouping(df_filtered)
        df_cc = add_iso_a3_col(df_cc)

        # Join the counts onto the world map.
        df_world = create_geo_df(df_cc)

        # make_cartogram cannot handle a tweet count of zero, so clip at 1.
        # Dropping countries without tweets instead would make them pop in
        # and out of existence between frames.
        df_world2 = df_world.copy(deep=True)
        df_world2["numTweets"] = df_world2["numTweets"].clip(lower=1)
        df_cartogram = make_cartogram(df_world2, cartogram_key, nIter, inplace=False)

        ax = df_cartogram.plot(column=cartogram_key, cmap='viridis', figsize=(20, 8),
                               legend=True, vmin=0, vmax=vmax)

        # Longitudes where it is currently midnight and noon:
        # 360 degrees / 24 hours = 15 degrees per hour.
        if i < 12:
            t_midnight = -15 * i
            t_noon = t_midnight + 180
        else:
            t_midnight = 180 - (i - 12) * 15
            t_noon = t_midnight - 180
        # axvline's ymin/ymax are axes fractions (0..1), not data coordinates;
        # the defaults already span the full height of the axes.
        ax.axvline(x=t_midnight, ls="--", c="black")
        ax.axvline(x=t_noon, ls="--", c="yellow")

        ax.set_title(legend + "Time of day (GMT): %02d" % i, fontsize=24)
        # Raw strings: "\c" is not a valid escape sequence in a normal literal.
        ax.set_xlabel(r"Longitude $^\circ$", fontsize=20)
        ax.set_ylabel(r"Latitude $^\circ$", fontsize=20)
        ax.set_ylim(-90, 90)
        ax.set_xlim(-180, 180)

        # Save the frame, release the figure and read the PNG back for the GIF.
        frame_path = out_path + "%d.png" % i
        fig = ax.get_figure()
        fig.savefig(frame_path)
        plt.close(fig)
        images.append(imageio.imread(frame_path))

    imageio.mimsave(out_path + ".gif", images, duration=0.5)
def animate_cartogram_extra(df, filterKey, filterList, out_path, nIter, cartogram_key, default_value, scale_factor=2, vmin=0.0, vmax=1.0):
    """Render area-scaled cartogram frames for one data column and build a GIF.

    For every value ``i`` in ``filterList`` the ``cartogram_key`` column is
    summed per country over the rows with ``filterKey == i`` and joined onto
    the world map. Each country is then distorted with the weight
    ``area + (scale_factor - 1) * value * area``. Frames are saved as
    ``out_path + "<i>.png"`` and combined into ``out_path + ".gif"``.

    The midnight/noon markers and the title treat ``i`` as an hour of day in
    GMT, so ``filterKey`` is presumably ``"hour"`` (confirm against callers).

    Args:
        df: DataFrame with ``countryCode``, ``filterKey`` and
            ``cartogram_key`` columns.
        filterKey: Name of the integer column used to select each frame.
        filterList: Iterable of integer values of ``filterKey``, one per frame.
        out_path: Path prefix for the PNG frames and the GIF.
        nIter: Forwarded to ``make_cartogram`` as its third argument
            (presumably the iteration count).
        cartogram_key: Column that is summed per country, scaled and used to
            colour the map.
        default_value: Value given to countries without data in a frame.
        scale_factor: Controls how strongly the value distorts the area; a
            value of 1 leaves the weight equal to the original area.
        vmin: Lower bound of the colour scale.
        vmax: Upper bound of the colour scale.
    """
    images = []  # PNG frames that make up the GIF
    for i in filterList:
        # Aggregate the data for this frame and add ISO alpha-3 codes.
        df_filtered = df.query("%s==%d" % (filterKey, i)).reset_index()
        df_cc = country_code_grouping_extra(df_filtered, cartogram_key)
        df_cc = add_iso_a3_col(df_cc)

        # Join the aggregated values onto the world map.
        df_world = create_geo_df_extra(df_cc, cartogram_key, default_value)

        # Weight proportional to the country's original area. Adding the area
        # back keeps the weight positive as long as
        # (scale_factor - 1) * value > -1.
        area = pd.to_numeric(df_world['geometry'].area)
        df_world["__scaled"] = (scale_factor - 1) * df_world[cartogram_key] * area
        df_world["__scaled"] = area + df_world["__scaled"]

        df_cartogram = make_cartogram(df_world, "__scaled", nIter, inplace=False)
        # ``legend`` is a boolean switch; the original passed the (truthy)
        # column name, which had the same effect as True.
        ax = df_cartogram.plot(column=cartogram_key, cmap='viridis', figsize=(20, 8),
                               legend=True, vmin=vmin, vmax=vmax)

        # Longitudes where it is currently midnight and noon:
        # 360 degrees / 24 hours = 15 degrees per hour.
        if i < 12:
            t_midnight = -15 * i
            t_noon = t_midnight + 180
        else:
            t_midnight = 180 - (i - 12) * 15
            t_noon = t_midnight - 180
        # axvline's ymin/ymax are axes fractions (0..1), not data coordinates;
        # the defaults already span the full height of the axes.
        ax.axvline(x=t_midnight, ls="--", c="black")
        ax.axvline(x=t_noon, ls="--", c="yellow")

        ax.set_title("Time of day (GMT): %02d" % i, fontsize=24)
        # Raw strings: "\c" is not a valid escape sequence in a normal literal.
        ax.set_xlabel(r"Longitude $^\circ$", fontsize=20)
        ax.set_ylabel(r"Latitude $^\circ$", fontsize=20)
        ax.set_ylim(-90, 90)
        ax.set_xlim(-180, 180)

        # Save the frame, release the figure and read the PNG back for the GIF.
        frame_path = out_path + "%d.png" % i
        fig = ax.get_figure()
        fig.savefig(frame_path)
        plt.close(fig)
        images.append(imageio.imread(frame_path))

    imageio.mimsave(out_path + ".gif", images, duration=0.5)
def load_twitter_geo_data_sentiment(path):
    """Load tweets with their sentiment marker columns into a pandas DataFrame.

    Relies on a module-level ``spark`` object that is not defined in the
    visible source (presumably a SparkSession supplied by the runtime).

    Args:
        path: Location of the parquet data handed to ``spark.read.parquet``.

    Returns:
        A pandas DataFrame with ``countryCode``, ``CurrentTweetDate`` and the
        columns ``prayer``, ``monkey``, ``happy``, ``SK``, ``cat`` and
        ``notHappy``, plus the derived columns ``date``, ``year``, ``month``,
        ``day``, ``dayofweek``, ``hour``, ``minute`` and ``second``.
    """
    wanted_columns = (
        'countryCode', "CurrentTweetDate",
        "prayer", "monkey", "happy", "SK", "cat", "notHappy",
    )
    tweets = spark.read.parquet(path).select(*wanted_columns).toPandas()

    # Each derived column is the identically named attribute of the
    # timestamp column's ``.dt`` accessor.
    timestamps = tweets["CurrentTweetDate"]
    derived_fields = (
        "date", "year", "month", "day",
        "dayofweek", "hour", "minute", "second",
    )
    for field in derived_fields:
        tweets[field] = getattr(timestamps.dt, field)
    return tweets
def country_code_grouping_sentiment(df):
    """Return the sample count and mean sentiment for every country.

    The count is the number of non-null ``sentiment`` values in the country,
    i.e. the number of samples behind the returned mean. Previously the count
    was grouped by ``(countryCode, sentiment)``, so the value kept for a
    country only covered the tweets that shared the sentiment of its first
    row. The input frame is no longer modified.

    Args:
        df: DataFrame with ``countryCode`` and ``sentiment`` columns.

    Returns:
        A DataFrame with one row per country (in order of first appearance)
        and the columns added by ``reset_index`` (the original index label of
        the country's first row), ``countryCode``, ``count`` and ``sentiment``
        (the per-country mean).
    """
    # Work on a narrow copy so the caller's sentiment values stay intact.
    work = df[['countryCode', 'sentiment']].copy()

    sentiment_by_country = work.groupby('countryCode')['sentiment']
    counts = sentiment_by_country.transform('count')
    means = sentiment_by_country.transform('mean')
    work['count'] = counts
    work['sentiment'] = means

    first_row_per_country = work.drop_duplicates('countryCode')
    df_cc = first_row_per_country.filter(['countryCode', 'count', 'sentiment'])
    return df_cc.reset_index()
def create_geo_df_sentiment(df_cc):
    """Attach tweet counts and sentiment scores to the Natural Earth world map.

    Args:
        df_cc: DataFrame with one row per country and the columns ``iso_a3``,
            ``count`` and ``sentiment``.

    Returns:
        The ``naturalearth_lowres`` GeoDataFrame with patched ``iso_a3`` codes
        and the new columns ``numTweets`` and ``sentiment``; map entries
        without a matching row in ``df_cc`` get 0 in both.
    """
    world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

    # The dataset lacks usable iso_a3 codes for some entries, see
    # https://github.com/geopandas/geopandas/issues/1041
    # Patch the ones we can; Northern Cyprus has no iso_a3 code and is left
    # untouched.
    manual_codes = {
        'France': 'FRA',
        'Norway': 'NOR',
        'Somaliland': 'SOM',
        'Kosovo': 'RKS',
    }
    for country_name, code in manual_codes.items():
        world.loc[world['name'] == country_name, 'iso_a3'] = code

    # Start every map entry at zero, then overwrite with the observed values.
    tweets_by_iso = dict.fromkeys(world["iso_a3"], 0)
    tweets_by_iso.update(zip(df_cc["iso_a3"], df_cc["count"]))
    world["numTweets"] = world["iso_a3"].map(tweets_by_iso)

    sentiment_by_iso = dict.fromkeys(world["iso_a3"], 0)
    sentiment_by_iso.update(zip(df_cc["iso_a3"], df_cc["sentiment"]))
    world["sentiment"] = world["iso_a3"].map(sentiment_by_iso)

    # NOTE: Antarctica and the open-ocean isles are kept. Derived quantities
    # such as tweet density or tweets per capita are not computed here.
    return world
def animate_cartogram_sentiment(df, filterKey, filterList, out_path, nIter, cartogram_key, minSamples, cmap, vmin, vmax, legendList):
    """Render sentiment-coloured cartogram frames and stitch them into a GIF.

    For every value ``i`` in ``filterList`` the rows with ``filterKey == i``
    are aggregated per country, joined onto the world map, distorted with
    ``make_cartogram`` on ``cartogram_key`` and coloured by ``sentiment``.
    Frames are saved as ``out_path + "<i>.png"`` and combined into
    ``out_path + ".gif"``.

    Args:
        df: DataFrame with ``countryCode``, ``sentiment`` and ``filterKey``
            columns.
        filterKey: Name of the integer column used to select each frame.
        filterList: Iterable of integer values of ``filterKey``, one per frame.
        out_path: Path prefix for the PNG frames and the GIF.
        nIter: Forwarded to ``make_cartogram`` as its third argument
            (presumably the iteration count).
        cartogram_key: Column of the world map used to distort the countries.
        minSamples: Countries whose ``numTweets`` is below this value get the
            sentinel sentiment ``vmin - 1``.
        cmap: Colormap used for the sentiment colouring.
        vmin: Lower bound of the colour scale.
        vmax: Upper bound of the colour scale.
        legendList: Frame titles, indexed by frame position; needs at least
            as many entries as ``filterList``.
    """
    images = []  # PNG frames that make up the GIF
    for frame_no, i in enumerate(filterList):
        # Aggregate the data for this frame and add ISO alpha-3 codes.
        df_filtered = df.query("%s==%d" % (filterKey, i)).reset_index()
        df_cc = country_code_grouping_sentiment(df_filtered)
        df_cc = add_iso_a3_col(df_cc)

        # Join counts and sentiment onto the world map.
        df_world = create_geo_df_sentiment(df_cc)

        # make_cartogram cannot handle a tweet count of zero, so clip at 1.
        # Dropping countries without tweets instead would make them pop in
        # and out of existence between frames.
        df_world2 = df_world.copy(deep=True)
        df_world2["numTweets"] = df_world2["numTweets"].clip(lower=1)

        # Push under-sampled countries below vmin so the colormap can render
        # them in its "under" colour (e.g. grey, if ``cmap`` is configured
        # that way). The mask uses the unclipped counts from df_world.
        df_world2.loc[df_world["numTweets"] < minSamples, 'sentiment'] = vmin - 1

        df_cartogram = make_cartogram(df_world2, cartogram_key, nIter, inplace=False)
        ax = df_cartogram.plot(column="sentiment", cmap=cmap, figsize=(20, 8),
                               legend=True, vmin=vmin, vmax=vmax)

        ax.set_title(legendList[frame_no], fontsize=24)
        # Raw strings: "\c" is not a valid escape sequence in a normal literal.
        ax.set_xlabel(r"Longitude $^\circ$", fontsize=20)
        ax.set_ylabel(r"Latitude $^\circ$", fontsize=20)
        ax.set_ylim(-90, 90)
        ax.set_xlim(-180, 180)

        # Save the frame, release the figure and read the PNG back for the GIF.
        frame_path = out_path + "%d.png" % i
        fig = ax.get_figure()
        fig.savefig(frame_path)
        plt.close(fig)
        images.append(imageio.imread(frame_path))

    imageio.mimsave(out_path + ".gif", images, duration=1)