Execute the relevant notebook to load and preprocess the data
%run "./02_DataPreprocess"
display(valid_distinct_features)
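As a quick sanity check before assembling features, the schema can be inspected to confirm the expected numeric columns arrived from the preprocessing notebook (a minimal sketch; valid_distinct_features is defined in 02_DataPreprocess):
// confirm the columns we are about to assemble are present and numeric
valid_distinct_features.printSchema()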
// assemble the input columns into a single feature vector for k-means
import org.apache.spark.ml.feature.VectorAssembler
// define input cols and output (add additional columns here...)
val va = new VectorAssembler()
  .setInputCols(Array(
    "population", "population_density", "median_age", "aged_65_older",
    "aged_70_older", "gdp_per_capita", "cardiovasc_death_rate",
    "diabetes_prevalence", "female_smokers", "male_smokers",
    "hospital_beds_per_thousand", "life_expectancy", "human_development_index"))
  .setOutputCol("features")
// create features
val df_feats = va.transform(valid_distinct_features)
display(df_feats)
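Note that the assembled features span very different scales (population in the hundreds of millions versus human_development_index below 1), so the largest-magnitude columns dominate the Euclidean distances k-means computes. A minimal sketch of standardizing the vector first, assuming a StandardScaler step is acceptable here (the column name scaled_features is illustrative, not from the original):
import org.apache.spark.ml.feature.StandardScaler
// scale each feature to zero mean and unit standard deviation
// so no single large-magnitude column dominates the distance metric
val scaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("scaled_features")  // illustrative name
  .setWithMean(true)
  .setWithStd(true)
val df_scaled = scaler.fit(df_feats).transform(df_feats)
// to use it, point KMeans at the scaled column: .setFeaturesCol("scaled_features")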
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
// number of clusters
val num_clusters: Int = 6
// fixed seed for initialization
val seed: Int = 2
// initialize k-means
val kmeans = new KMeans()
  .setK(num_clusters)
  .setSeed(seed)
  .setFeaturesCol("features")
// train the k-means model
val model = kmeans.fit(df_feats)
// cluster predictions
val preds = model.transform(df_feats)
// evaluate the clustering with the silhouette metric
val cluster_evaluator = new ClusteringEvaluator()
val silhouette_metric = cluster_evaluator.evaluate(preds)
// show evaluation and results
println(s"Silhouette metric: $silhouette_metric")
// cluster centers
println("Cluster centers:")
model.clusterCenters.foreach(println)
Silhouette metric: 0.8522450614906474
Cluster centers:
[4.610053152E7,113.00692000000001,31.668000000000003,9.45524,6.029160000000001,17977.45664,247.31187999999995,6.65,9.020000000000001,29.740000000000002,2.9476000000000004,73.7436,0.7251200000000002]
[1.4096640795E9,299.0465,33.45,8.315,4.6715,10867.693,272.0895,10.065000000000001,1.9,34.5,2.435,73.285,0.696]
[3.02263134E8,90.6665,33.8,10.366,6.3925,32707.095,246.9765,8.555,10.950000000000001,50.349999999999994,1.905,75.28999999999999,0.8089999999999999]
[1.993803743333333E8,515.2163333333333,28.166666666666664,6.048333333333333,3.700666666666666,7554.047999999999,299.66499999999996,8.280000000000001,4.633333333333333,33.1,1.2,71.91333333333333,0.643]
[7294451.1190476185,259.81845238095224,33.12142857142858,10.440523809523814,6.707988095238093,24778.540952380958,246.83450000000002,7.545952380952383,11.430952380952382,31.786904761904758,3.2384761904761903,74.73202380952378,0.7552380952380952]
[1.07767729E8,167.7762,33.06,10.376800000000001,6.874000000000001,19659.705700000002,258.51500000000004,9.284,9.4,35.4,4.029000000000001,75.318,0.7576]
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
num_clusters: Int = 6
seed: Int = 2
kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_46338dc75b58
model: org.apache.spark.ml.clustering.KMeansModel = KMeansModel: uid=kmeans_46338dc75b58, k=6, distanceMeasure=euclidean, numFeatures=13
preds: org.apache.spark.sql.DataFrame = [iso_code: string, location: string ... 15 more fields]
cluster_evaluator: org.apache.spark.ml.evaluation.ClusteringEvaluator = ClusteringEvaluator: uid=cluEval_2943e2a697af, metricName=silhouette, distanceMeasure=squaredEuclidean
silhouette_metric: Double = 0.8522450614906474
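The choice of k = 6 is fixed above; a small sketch for comparing silhouette scores across several candidate values of k, using the same features and seed (the loop bounds are illustrative):
// retrain for a range of k and print each silhouette score
for (k <- 2 to 10) {
  val m = new KMeans().setK(k).setSeed(seed).setFeaturesCol("features").fit(df_feats)
  val s = new ClusteringEvaluator().evaluate(m.transform(df_feats))
  println(s"k=$k silhouette=$s")
}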
// check model parameters
model.extractParamMap
res20: org.apache.spark.ml.param.ParamMap =
{
kmeans_46338dc75b58-distanceMeasure: euclidean,
kmeans_46338dc75b58-featuresCol: features,
kmeans_46338dc75b58-initMode: k-means||,
kmeans_46338dc75b58-initSteps: 2,
kmeans_46338dc75b58-k: 6,
kmeans_46338dc75b58-maxIter: 20,
kmeans_46338dc75b58-predictionCol: prediction,
kmeans_46338dc75b58-seed: 2,
kmeans_46338dc75b58-tol: 1.0E-4
}
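The param map confirms the defaults were used (maxIter=20, tol=1.0E-4, k-means|| initialization). The training summary also exposes the cluster sizes directly, e.g. (a sketch):
// number of training points assigned to each cluster
println(model.summary.clusterSizes.mkString(", "))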
// rename the prediction column for readability downstream
val df_clstr = preds.withColumnRenamed("prediction", "kmeans_class")
display(df_clstr)
Visualization
Based on these features, each country has been assigned to one of the six clusters; the assignments can now be visualized.
// keep only the country code and its cluster assignment for plotting
val df_clstr_filtered = df_clstr.select($"iso_code", $"kmeans_class")
display(df_clstr_filtered)
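Before plotting, a quick aggregation shows how many countries each cluster holds (a sketch):
// countries per cluster, largest first
display(df_clstr_filtered.groupBy($"kmeans_class").count().orderBy($"count".desc))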