"./DataPreprocess"
// Render summary statistics (count, mean, stddev, min, max) for every column of
// valid_distinct_features. The string columns iso_code and location report null
// for mean/stddev, as seen in the output table below.
display(valid_distinct_features.describe())
summary | iso_code | location | population | population_density | median_age | aged_65_older | aged_70_older | gdp_per_capita | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | hospital_beds_per_thousand | life_expectancy | human_development_index |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 |
mean | null | null | 5.448349223809524E7 | 227.40285714285716 | 32.72619047619046 | 10.100476190476186 | 6.477539682539681 | 22517.798674603175 | 249.51723809523807 | 7.5796031746031725 | 10.470634920634918 | 32.03650793650794 | 3.1610476190476198 | 74.50119047619042 | 0.7466904761904759 |
stddev | null | null | 1.8094803349108434E8 | 737.9315775155493 | 8.810413643605422 | 6.507230389299932 | 4.50202071901193 | 21194.388486506883 | 120.66734039269643 | 3.8247238647083845 | 10.346516843539705 | 13.459477571879843 | 2.4548683837864473 | 6.634961896757854 | 0.1496665875490299 |
min | ALB | Albania | 98340.0 | 1.98 | 15.1 | 1.144 | 0.526 | 752.788 | 79.37 | 0.99 | 0.1 | 7.7 | 0.1 | 59.31 | 0.354 |
max | ZWE | Zimbabwe | 1.439323774E9 | 7915.731 | 48.2 | 27.049 | 18.493 | 116935.6 | 724.417 | 22.02 | 44.0 | 78.1 | 13.05 | 84.63 | 0.953 |
// Per-feature views: each call renders one feature column of
// valid_distinct_features alongside iso_code.
// NOTE(review): these look like they were separate notebook cells (one rendered
// output each) — confirm before merging them into a loop.
display(valid_distinct_features.select($"iso_code", $"population"))
display(valid_distinct_features.select($"iso_code", $"population_density"))
display(valid_distinct_features.select($"iso_code", $"median_age"))
display(valid_distinct_features.select($"iso_code", $"aged_65_older"))
display(valid_distinct_features.select($"iso_code", $"aged_70_older"))
display(valid_distinct_features.select($"iso_code", $"gdp_per_capita"))
display(valid_distinct_features.select($"iso_code", $"cardiovasc_death_rate"))
display(valid_distinct_features.select($"iso_code", $"diabetes_prevalence"))
display(valid_distinct_features.select($"iso_code", $"female_smokers"))
display(valid_distinct_features.select($"iso_code", $"male_smokers"))
display(valid_distinct_features.select($"iso_code", $"hospital_beds_per_thousand"))
display(valid_distinct_features.select($"iso_code", $"life_expectancy"))
display(valid_distinct_features.select($"iso_code", $"human_development_index"))
Correlation between invariant features
Some pairs of features are highly correlated, namely: 1. median_age and aged_65_older; 2. median_age and human_development_index; 3. median_age and life_expectancy; 4. gdp_per_capita and human_development_index; 5. gdp_per_capita and life_expectancy; 6. human_development_index and life_expectancy.
// Show valid_distinct_features without the iso_code and location columns, so
// only the remaining feature columns are rendered.
// NOTE(review): presumably this output backs the correlation view described
// above — confirm the plot type configured on the cell.
display(valid_distinct_features.drop("iso_code","location"))
// Render df_cleaned_time_series after removing the columns listed below.
// The names are passed to the varargs drop(colNames: String*) overload, so the
// call is equivalent to listing them directly as arguments.
display(
  df_cleaned_time_series.drop(
    Seq(
      // identifier / key columns
      "iso_code",
      "continent",
      "location",
      "date",
      // ICU and hospital columns
      "icu_patients",
      "icu_patients_per_million",
      "hosp_patients",
      "hosp_patients_per_million",
      "weekly_icu_admissions",
      "weekly_icu_admissions_per_million",
      "weekly_hosp_admissions",
      "weekly_hosp_admissions_per_million",
      // testing columns
      "total_tests",
      "new_tests",
      "total_tests_per_thousand",
      "new_tests_per_thousand",
      "new_tests_smoothed",
      "new_tests_smoothed_per_thousand",
      "positive_rate",
      "tests_per_case",
      "tests_units",
      // remaining excluded columns
      "extreme_poverty",
      "handwashing_facilities"
    ): _*
  )
)