In [ ]:
start = "1970-01-01"
end = "2020-10-01"

series = ["UNRATE", "T10Y2Y", "INDPRO", "CPIAUCSL", "FEDFUNDS", "USREC"]

dfs = {}

# Fetch & resample EACH SERIES separately
for s in series:
    print("Fetching:", s)
    data = pdr.DataReader(s, "fred", start, end)

    # Convert all to MONTHLY, forward-fill internally
    data = data.resample("ME").ffill()

    dfs[s] = data

# MERGE all clean monthly series
df = pd.concat(dfs.values(), axis=1)
df.columns = series

# Drop pre-1976 where T10Y2Y is missing
df = df.dropna(subset=["T10Y2Y"])

print(df.shape)
print(df.head(10))
print(df.tail(10))
print(df.isna().sum())
Fetching: UNRATE
Fetching: T10Y2Y
Fetching: INDPRO
Fetching: CPIAUCSL
Fetching: FEDFUNDS
Fetching: USREC
(524, 6)
            UNRATE  T10Y2Y   INDPRO  CPIAUCSL  FEDFUNDS  USREC
DATE                                                          
1976-06-30     7.6    0.84  43.8775      56.7      5.48      0
1976-07-31     7.8    1.12  44.1248      57.0      5.31      0
1976-08-31     7.8    1.16  44.4262      57.3      5.29      0
1976-09-30     7.6    1.22  44.5711      57.6      5.25      0
1976-10-31     7.7    1.39  44.5600      57.9      5.02      0
1976-11-30     7.8    1.59  45.2454      58.1      4.95      0
1976-12-31     7.8    1.47  45.7310      58.4      4.65      0
1977-01-31     7.5    1.22  45.4156      58.7      4.61      0
1977-02-28     7.6    1.37  46.1358      59.3      4.68      0
1977-03-31     7.4    1.41  46.7158      59.6      4.69      0
            UNRATE  T10Y2Y    INDPRO  CPIAUCSL  FEDFUNDS  USREC
DATE                                                           
2020-01-31     3.6    0.18  101.3372   259.127      1.55      0
2020-02-29     3.5    0.27  101.6718   259.250      1.58      0
2020-03-31     4.4    0.47   97.6060   258.076      0.65      1
2020-04-30    14.8    0.44   84.6812   256.032      0.05      1
2020-05-31    13.2    0.49   86.0108   255.802      0.05      0
2020-06-30    11.0    0.50   91.6745   257.042      0.08      0
2020-07-31    10.2    0.44   95.0037   258.352      0.09      0
2020-08-31     8.4    0.58   95.9294   259.316      0.10      0
2020-09-30     7.8    0.56   95.8914   259.997      0.09      0
2020-10-31     6.9    0.54   96.5256   260.319      0.09      0
UNRATE      0
T10Y2Y      0
INDPRO      0
CPIAUCSL    0
FEDFUNDS    0
USREC       0
dtype: int64
In [ ]:
fig, ax1 = plt.subplots()

ax1.plot(df.index, df["UNRATE"], label="Unemployment Rate (%)")
ax1.set_ylabel("Unemployment Rate")

ax2 = ax1.twinx()
ax2.fill_between(df.index, 0, df["USREC"], color="grey", alpha=0.3, label="Recession (USREC)")

ax1.set_title("US Unemployment vs NBER Recessions")
ax1.legend(loc="upper left")
ax2.legend(loc="upper right")
plt.show()
No description has been provided for this image
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve
In [ ]:
df_ml = df.copy()

# Year-on-year % changes
df_ml["INDPRO_yoy"] = df_ml["INDPRO"].pct_change(12) * 100
df_ml["CPI_yoy"] = df_ml["CPIAUCSL"].pct_change(12) * 100

# Drop initial rows with no YoY data
df_ml = df_ml.dropna()

feature_cols = ["UNRATE", "T10Y2Y", "INDPRO_yoy", "CPI_yoy", "FEDFUNDS"]
target = "USREC"

X = df_ml[feature_cols]
y = df_ml[target]
In [ ]:
# ADD LAGGED FEATURES
lags = [1, 3, 6, 12]  # you can adjust

for col in feature_cols:
    for lag in lags:
        df_ml[f"{col}_lag{lag}"] = df_ml[col].shift(lag)

# Drop rows with NaNs created by lagging
df_ml = df_ml.dropna()
In [ ]:
split_date = "2010-01-01"

X_train = X[X.index < split_date]
X_test = X[X.index >= split_date]
y_train = y[y.index < split_date]
y_test = y[y.index >= split_date]
In [ ]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [ ]:
# UPDATED LOGISTIC REGRESSION
logit = LogisticRegression(
    C=0.5,                   # slightly stronger regularization
    penalty="l2",
    class_weight={0:1, 1:12},   # give recession months more weight
    solver="lbfgs",
    max_iter=3000
)

logit.fit(X_train_scaled, y_train)

log_proba = logit.predict_proba(X_test_scaled)[:, 1]
log_pred = (log_proba > 0.35).astype(int)      # threshold tuning

print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, log_proba))
print(classification_report(y_test, log_pred))
Logistic Regression ROC-AUC: 0.86
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       125
           1       0.11      0.50      0.18         2

    accuracy                           0.93       127
   macro avg       0.55      0.72      0.57       127
weighted avg       0.98      0.93      0.95       127

In [ ]:
# UPDATED RANDOM FOREST
rf = RandomForestClassifier(
    n_estimators=600,
    max_depth=6,
    min_samples_leaf=8,           # smoother splits
    min_samples_split=10,
    class_weight={0:1, 1:20},
    max_features="sqrt",
    random_state=42
)

rf.fit(X_train, y_train)

rf_proba = rf.predict_proba(X_test)[:, 1]
rf_pred = (rf_proba > 0.30).astype(int)        # tuned threshold

print("Random Forest ROC-AUC:", roc_auc_score(y_test, rf_proba))
print(classification_report(y_test, rf_pred))
Random Forest ROC-AUC: 0.808
              precision    recall  f1-score   support

           0       1.00      0.58      0.74       125
           1       0.04      1.00      0.07         2

    accuracy                           0.59       127
   macro avg       0.52      0.79      0.40       127
weighted avg       0.98      0.59      0.73       127

In [ ]:
# UPDATED XGBOOST
xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1.5,
    reg_lambda=1.5,
    scale_pos_weight=18,       # force XGB to care about rare recessions
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train, y_train)

xgb_proba = xgb.predict_proba(X_test)[:, 1]
xgb_pred = (xgb_proba > 0.30).astype(int)      # tuned threshold

print("XGBoost ROC-AUC:", roc_auc_score(y_test, xgb_proba))
print(classification_report(y_test, xgb_pred))
XGBoost ROC-AUC: 0.854
              precision    recall  f1-score   support

           0       1.00      0.70      0.83       125
           1       0.05      1.00      0.10         2

    accuracy                           0.71       127
   macro avg       0.53      0.85      0.46       127
weighted avg       0.99      0.71      0.81       127

In [ ]:
fpr_log, tpr_log, _ = roc_curve(y_test, log_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_proba)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_proba)

plt.figure(figsize=(8,5))
plt.plot(fpr_log, tpr_log, label="Logit")
plt.plot(fpr_rf, tpr_rf, label="Random Forest")
plt.plot(fpr_xgb, tpr_xgb, label="XGBoost")
plt.plot([0,1], [0,1], "--", color="grey")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.title("ROC Curves — Recession Prediction")
plt.show()
No description has been provided for this image
In [ ]:
pd.Series(rf.feature_importances_, index=feature_cols).sort_values().plot(kind="barh")
plt.title("Random Forest Feature Importance")
plt.show()

pd.Series(xgb.feature_importances_, index=feature_cols).sort_values().plot(kind="barh")
plt.title("XGBoost Feature Importance")
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]:
# --- RECESSION PROBABILITY OVER TIME ---

# Align the predicted probabilities with the index
probs_df = pd.DataFrame({
    "Logit": log_proba,
    "RF": rf_proba,
    "XGB": xgb_proba
}, index=X_test.index)

plt.figure(figsize=(14,5))
plt.plot(probs_df.index, probs_df["Logit"], label="Logit", linewidth=2)
plt.plot(probs_df.index, probs_df["RF"], label="Random Forest", alpha=0.7)
plt.plot(probs_df.index, probs_df["XGB"], label="XGBoost", alpha=0.7)

# recession shading
plt.fill_between(df.index, 0, df["USREC"], color="gray", alpha=0.2, label="Recession")

plt.title("Recession Probability (Test Period: 2010–2020)")
plt.ylabel("Probability")
plt.legend()
plt.show()
No description has been provided for this image
In [ ]:
from sklearn.decomposition import PCA

# Scale all features
from sklearn.preprocessing import StandardScaler
scaled = StandardScaler().fit_transform(df_ml[feature_cols])

pca = PCA()
pca.fit(scaled)

plt.plot(np.cumsum(pca.explained_variance_ratio_), marker="o")
plt.title("PCA Explained Variance")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Variance Explained")
plt.show()
No description has been provided for this image
In [ ]:
import shap
shap.initjs()

explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)
No description has been provided for this image
No description has been provided for this image

Project Summary¶

In this project, I built a practical machine learning workflow to understand how well macroeconomic data can signal upcoming recessions. I focused on indicators that analysts and economists rely on every day: unemployment, industrial production, inflation, interest rates, and the yield curve. The goal was simple. Can these variables tell us something useful before the economy turns?

I pulled data from FRED, cleaned and aligned everything to monthly frequency, and engineered features that actually matter, such as year-over-year changes. I trained three models: Logistic Regression, Random Forest, and XGBoost, using a time-based split so the models were tested on unseen data from 2010 to 2020. I evaluated performance using ROC-AUC, precision, recall, and feature interpretation tools like PCA and SHAP.

Logistic Regression ended up being the most reliable model. It produced steady and believable recession probabilities. Tree-based models picked up recession signals faster but created more noise, which is expected with rare economic events.