InΒ [Β ]:
# @title
!pip install pandas numpy seaborn matplotlib plotly scikit-learn kaggle
Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)
Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)
Requirement already satisfied: plotly in /usr/local/lib/python3.12/dist-packages (5.24.1)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)
Requirement already satisfied: kaggle in /usr/local/lib/python3.12/dist-packages (1.7.4.5)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.60.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (25.0)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.2.5)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.12/dist-packages (from plotly) (8.5.0)
Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3)
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.5.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)
Requirement already satisfied: bleach in /usr/local/lib/python3.12/dist-packages (from kaggle) (6.3.0)
Requirement already satisfied: certifi>=14.05.14 in /usr/local/lib/python3.12/dist-packages (from kaggle) (2025.11.12)
Requirement already satisfied: charset-normalizer in /usr/local/lib/python3.12/dist-packages (from kaggle) (3.4.4)
Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from kaggle) (3.11)
Requirement already satisfied: protobuf in /usr/local/lib/python3.12/dist-packages (from kaggle) (5.29.5)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.12/dist-packages (from kaggle) (8.0.4)
Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from kaggle) (2.32.4)
Requirement already satisfied: setuptools>=21.0.0 in /usr/local/lib/python3.12/dist-packages (from kaggle) (75.2.0)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.12/dist-packages (from kaggle) (1.17.0)
Requirement already satisfied: text-unidecode in /usr/local/lib/python3.12/dist-packages (from kaggle) (1.3)
Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from kaggle) (4.67.1)
Requirement already satisfied: urllib3>=1.15.1 in /usr/local/lib/python3.12/dist-packages (from kaggle) (2.5.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.12/dist-packages (from kaggle) (0.5.1)
InΒ [Β ]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas/seaborn/scikit-learn — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Use matplotlib's "seaborn-v0_8" style sheet for all figures drawn below.
plt.style.use("seaborn-v0_8")
InΒ [Β ]:
# Load the Gapminder five-year panel (one row per country per survey year).
# pandas is already imported as `pd` in the imports cell at the top.
gap_url = "https://raw.githubusercontent.com/resbaz/r-novice-gapminder-files/master/data/gapminder-FiveYearData.csv"
gap = pd.read_csv(gap_url)

# Fail fast with a readable message if the upstream file changes shape;
# the cells below index these columns by name.
required_cols = {"country", "year", "lifeExp", "gdpPercap"}
missing_cols = required_cols - set(gap.columns)
assert not missing_cols, f"Gapminder CSV is missing columns: {sorted(missing_cols)}"

gap.head()
Out[Β ]:
country year pop continent lifeExp gdpPercap
0 Afghanistan 1952 8425333.0 Asia 28.801 779.445314
1 Afghanistan 1957 9240934.0 Asia 30.332 820.853030
2 Afghanistan 1962 10267083.0 Asia 31.997 853.100710
3 Afghanistan 1967 11537966.0 Asia 34.020 836.197138
4 Afghanistan 1972 13079460.0 Asia 36.088 739.981106
InΒ [Β ]:
# Keep each country's most recent observation: after sorting by year,
# groupby(...).tail(1) returns the last (i.e. latest) row of every country.
#
# happiness_proxy is the plain average of life expectancy and log(1 + GDP per capita).
# np.log1p is applied to the whole column at once instead of row-by-row via .apply.
#
# NOTE(review): the two terms are not standardized. In this data life expectancy
# spans roughly 40-83 while log GDP spans roughly 6-11, so the proxy tracks life
# expectancy almost exactly. Consider z-scoring both terms before averaging.
gap_recent = (
    gap.sort_values("year")
    .groupby("country")
    .tail(1)
    .reset_index(drop=True)
    .assign(
        happiness_proxy=lambda latest: (
            latest["lifeExp"] + np.log1p(latest["gdpPercap"])
        ) / 2
    )
)

gap_recent.head()
Out[Β ]:
country year pop continent lifeExp gdpPercap happiness_proxy
0 Nicaragua 2007 5675356.0 Americas 72.899 2749.320965 40.409236
1 New Zealand 2007 4115771.0 Oceania 80.204 25185.009110 45.169022
2 Norway 2007 4627926.0 Europe 80.196 49357.190170 45.501429
3 Greece 2007 10706290.0 Europe 79.483 27538.411880 44.853187
4 Somalia 2007 9118773.0 Africa 48.159 926.141068 27.495553
InΒ [Β ]:
import requests

# PM2.5 indicator from the World Bank, requested for `country/all` in one large page.
# NOTE(review): `country/all` may also return regional/income aggregates
# (not only sovereign countries) — confirm before joining to country-level data.
pm25_api = "https://api.worldbank.org/v2/country/all/indicator/EN.ATM.PM25.MC.M3?format=json&per_page=20000"

# A timeout keeps "Run All" from hanging on a stalled connection, and
# raise_for_status surfaces HTTP errors here rather than as a confusing
# JSON/Index error further down.
pm_response = requests.get(pm25_api, timeout=30)
pm_response.raise_for_status()
pm_json = pm_response.json()

# The observations are read from element 1 of the payload; make sure it exists
# and is non-empty before indexing into it.
if not isinstance(pm_json, list) or len(pm_json) < 2 or not pm_json[1]:
    raise ValueError(f"Unexpected World Bank API response: {str(pm_json)[:200]}")

# One [country, year, value] row per observation that actually carries a value.
# The API delivers the year as a string; store it as an int so it sorts
# numerically and matches the integer `year` column of the Gapminder frame.
pm_list = [
    [entry["country"]["value"], int(entry["date"]), entry["value"]]
    for entry in pm_json[1]
    if entry["value"] is not None
]

# Keep only the latest available reading per country.
pm25 = (
    pd.DataFrame(pm_list, columns=["country", "year", "pm25"])
    .sort_values("year")
    .groupby("country")
    .tail(1)
    .reset_index(drop=True)
)

pm25.head()
Out[Β ]:
country year pm25
0 Azerbaijan 2020 21.727000
1 Belgium 2020 11.216066
2 Argentina 2020 14.908174
3 West Bank and Gaza 2020 26.363626
4 Venezuela, RB 2020 15.256580
InΒ [Β ]:
# Basic summary statistics

# describe() is computed once; `df_stats` keeps the default layout
# (statistics as rows, indicators as columns).
df_stats = gap_recent[["lifeExp", "gdpPercap", "happiness_proxy"]].describe()

# Transposed view (one row per indicator) with the coefficient of variation,
# cv = std / mean, appended as a unit-free measure of dispersion.
# Only this table is displayed, since a cell renders just its last expression.
stats = df_stats.T.assign(cv=lambda table: table["std"] / table["mean"])
stats
Out[Β ]:
count mean std min 25% 50% 75% max cv
lifeExp 142.0 67.007423 12.073021 39.613000 57.160250 71.935500 76.413250 82.603000 0.180174
gdpPercap 142.0 11680.071820 12859.937337 277.551859 1624.842248 6124.371108 18008.835640 49357.190170 1.101015
happiness_proxy 142.0 37.811815 6.596834 24.014023 32.339989 40.375318 42.826808 46.482858 0.174465
InΒ [Β ]:
# Scatter-matrix of the three indicators with KDE curves on the diagonal.
# seaborn (`sns`) and pyplot (`plt`) come from the imports cell at the top.
indicator_cols = ["lifeExp", "gdpPercap", "happiness_proxy"]

pair_grid = sns.pairplot(
    gap_recent[indicator_cols],
    diag_kind="kde",
    plot_kws=dict(alpha=0.7),
)

# Put the title on the grid's own figure, nudged slightly above the top row.
pair_grid.figure.suptitle("Pairwise Relationships Between Key Indicators", y=1.02)
plt.show()
No description has been provided for this image
InΒ [Β ]:
# Pairwise correlations between the three indicators (DataFrame.corr default method).
corr = gap_recent[["lifeExp", "gdpPercap", "happiness_proxy"]].corr()

# Boolean mask that is True on and above the diagonal; the heatmap hides those
# cells so each pair appears only once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix (Masked Upper Triangle)")
plt.show()
No description has been provided for this image
InΒ [Β ]:
# One histogram (with a KDE overlay) per indicator, laid out side by side.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

for ax, col in zip(axes, ["lifeExp", "gdpPercap", "happiness_proxy"]):
    sns.histplot(gap_recent[col], kde=True, ax=ax)
    ax.set_title(f"Distribution: {col}")

fig.tight_layout()
plt.show()
No description has been provided for this image

Key Insights

Life expectancy and GDP per capita show a strong positive correlation (~0.68), consistent with global development theory.

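The happiness proxy correlates almost perfectly with life expectancy (1.00) and strongly with GDP per capita (~0.71). This is expected by construction rather than an independent finding: the proxy is the average of life expectancy and log GDP per capita, and life expectancy (roughly 40–83) spans a much wider numeric range than log GDP (roughly 6–11), so it dominates the proxy. These correlations therefore cannot, on their own, confirm that longevity and economic prosperity jointly drive wellbeing.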

GDP per capita has the highest coefficient of variation (CV ≈ 1.10), indicating extreme inequality across countries.

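Life expectancy has a low CV (≈ 0.18), far below that of GDP per capita; the lowest CV actually belongs to the happiness proxy (≈ 0.17). This low dispersion is consistent with global convergence in longevity, although a single-year cross-section cannot by itself show convergence over time or attribute it to medical advancements.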

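Distributions show GDP per capita is strongly right-skewed (mean ≈ 11,680 vs. median ≈ 6,124), while life expectancy and the happiness proxy are closer to symmetric but mildly left-skewed (their medians exceed their means).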