# Download each FRED series, align everything to a common monthly index,
# and merge into a single DataFrame.
start = "1970-01-01"
end = "2020-10-01"
series = ["UNRATE", "T10Y2Y", "INDPRO", "CPIAUCSL", "FEDFUNDS", "USREC"]

dfs = {}
# Fetch & resample EACH SERIES separately — daily series (e.g. T10Y2Y)
# must be converted to monthly before merging with native-monthly ones.
for s in series:
    print("Fetching:", s)
    data = pdr.DataReader(s, "fred", start, end)
    # Convert all to MONTHLY (month-end stamps), forward-fill internally
    data = data.resample("ME").ffill()
    dfs[s] = data

# MERGE all clean monthly series column-wise on the shared date index
df = pd.concat(dfs.values(), axis=1)
df.columns = series

# Drop pre-1976 rows where T10Y2Y is missing (series starts mid-1976)
df = df.dropna(subset=["T10Y2Y"])

print(df.shape)
print(df.head(10))
print(df.tail(10))
print(df.isna().sum())
Fetching: UNRATE
Fetching: T10Y2Y
Fetching: INDPRO
Fetching: CPIAUCSL
Fetching: FEDFUNDS
Fetching: USREC
(524, 6)
UNRATE T10Y2Y INDPRO CPIAUCSL FEDFUNDS USREC
DATE
1976-06-30 7.6 0.84 43.8775 56.7 5.48 0
1976-07-31 7.8 1.12 44.1248 57.0 5.31 0
1976-08-31 7.8 1.16 44.4262 57.3 5.29 0
1976-09-30 7.6 1.22 44.5711 57.6 5.25 0
1976-10-31 7.7 1.39 44.5600 57.9 5.02 0
1976-11-30 7.8 1.59 45.2454 58.1 4.95 0
1976-12-31 7.8 1.47 45.7310 58.4 4.65 0
1977-01-31 7.5 1.22 45.4156 58.7 4.61 0
1977-02-28 7.6 1.37 46.1358 59.3 4.68 0
1977-03-31 7.4 1.41 46.7158 59.6 4.69 0
UNRATE T10Y2Y INDPRO CPIAUCSL FEDFUNDS USREC
DATE
2020-01-31 3.6 0.18 101.3372 259.127 1.55 0
2020-02-29 3.5 0.27 101.6718 259.250 1.58 0
2020-03-31 4.4 0.47 97.6060 258.076 0.65 1
2020-04-30 14.8 0.44 84.6812 256.032 0.05 1
2020-05-31 13.2 0.49 86.0108 255.802 0.05 0
2020-06-30 11.0 0.50 91.6745 257.042 0.08 0
2020-07-31 10.2 0.44 95.0037 258.352 0.09 0
2020-08-31 8.4 0.58 95.9294 259.316 0.10 0
2020-09-30 7.8 0.56 95.8914 259.997 0.09 0
2020-10-31 6.9 0.54 96.5256 260.319 0.09 0
UNRATE 0
T10Y2Y 0
INDPRO 0
CPIAUCSL 0
FEDFUNDS 0
USREC 0
dtype: int64
# Unemployment rate with NBER recession months shaded via the 0/1 USREC flag.
fig, main_ax = plt.subplots()
shade_ax = main_ax.twinx()  # secondary axis so shading doesn't rescale UNRATE

main_ax.plot(df.index, df["UNRATE"], label="Unemployment Rate (%)")
main_ax.set_ylabel("Unemployment Rate")
main_ax.set_title("US Unemployment vs NBER Recessions")

# USREC is 0/1, so filling between 0 and the series draws recession bands.
shade_ax.fill_between(df.index, 0, df["USREC"], color="grey", alpha=0.3, label="Recession (USREC)")

main_ax.legend(loc="upper left")
shade_ax.legend(loc="upper right")
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve
df_ml = df.copy()

# --- Feature engineering ---------------------------------------------------
# Year-on-year % changes turn trending level series into growth rates.
df_ml["INDPRO_yoy"] = df_ml["INDPRO"].pct_change(12) * 100
df_ml["CPI_yoy"] = df_ml["CPIAUCSL"].pct_change(12) * 100

base_features = ["UNRATE", "T10Y2Y", "INDPRO_yoy", "CPI_yoy", "FEDFUNDS"]
target = "USREC"

# ADD LAGGED FEATURES *before* defining X/y so the models actually see them.
# (Previously X was built first, so the lag columns were created and then
# silently ignored by every model.)
lags = [1, 3, 6, 12]  # you can adjust
for col in base_features:
    for lag in lags:
        df_ml[f"{col}_lag{lag}"] = df_ml[col].shift(lag)

# Single pass: drop rows with NaNs from both the YoY change and the lagging.
df_ml = df_ml.dropna()

# Full feature set = contemporaneous indicators + their lags. Keeping the
# list in `feature_cols` keeps downstream feature-importance plots aligned
# with the columns the models are trained on.
feature_cols = base_features + [f"{c}_lag{l}" for c in base_features for l in lags]
X = df_ml[feature_cols]
y = df_ml[target]

# --- Time-based split: train pre-2010, test 2010+ (no shuffling, so no
# future information leaks into the training window) -----------------------
split_date = "2010-01-01"
X_train = X[X.index < split_date]
X_test = X[X.index >= split_date]
y_train = y[y.index < split_date]
y_test = y[y.index >= split_date]

# Standardize with statistics fitted on the training window only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# UPDATED LOGISTIC REGRESSION — L2-regularized, recession months up-weighted
logit_params = dict(
    C=0.5,                       # slightly stronger regularization
    penalty="l2",
    class_weight={0: 1, 1: 12},  # give recession months more weight
    solver="lbfgs",
    max_iter=3000,
)
logit = LogisticRegression(**logit_params)
logit.fit(X_train_scaled, y_train)

# Probability of the positive (recession) class, then a tuned cutoff.
log_proba = logit.predict_proba(X_test_scaled)[:, 1]
log_pred = (log_proba > 0.35).astype(int)  # threshold tuning

print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, log_proba))
print(classification_report(y_test, log_pred))
Logistic Regression ROC-AUC: 0.86
precision recall f1-score support
0 0.99 0.94 0.96 125
1 0.11 0.50 0.18 2
accuracy 0.93 127
macro avg 0.55 0.72 0.57 127
weighted avg 0.98 0.93 0.95 127
# UPDATED RANDOM FOREST — shallow, smoothed trees with heavy minority weighting
rf_params = {
    "n_estimators": 600,
    "max_depth": 6,
    "min_samples_leaf": 8,   # smoother splits
    "min_samples_split": 10,
    "class_weight": {0: 1, 1: 20},
    "max_features": "sqrt",
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# P(recession) per test month, converted to labels at a tuned threshold.
rf_proba = rf.predict_proba(X_test)[:, 1]
rf_pred = (rf_proba > 0.30).astype(int)  # tuned threshold

print("Random Forest ROC-AUC:", roc_auc_score(y_test, rf_proba))
print(classification_report(y_test, rf_pred))
Random Forest ROC-AUC: 0.808
precision recall f1-score support
0 1.00 0.58 0.74 125
1 0.04 1.00 0.07 2
accuracy 0.59 127
macro avg 0.52 0.79 0.40 127
weighted avg 0.98 0.59 0.73 127
# UPDATED XGBOOST — slow learning rate, mild subsampling, L1/L2 regularization
xgb_params = dict(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1.5,
    reg_lambda=1.5,
    scale_pos_weight=18,  # force XGB to care about rare recessions
    eval_metric="logloss",
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Recession-class probability, thresholded at a tuned cutoff.
xgb_proba = xgb.predict_proba(X_test)[:, 1]
xgb_pred = (xgb_proba > 0.30).astype(int)  # tuned threshold

print("XGBoost ROC-AUC:", roc_auc_score(y_test, xgb_proba))
print(classification_report(y_test, xgb_pred))
XGBoost ROC-AUC: 0.854
precision recall f1-score support
0 1.00 0.70 0.83 125
1 0.05 1.00 0.10 2
accuracy 0.71 127
macro avg 0.53 0.85 0.46 127
weighted avg 0.99 0.71 0.81 127
# ROC curves for all three models on the held-out test window.
fpr_log, tpr_log, _ = roc_curve(y_test, log_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_proba)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_proba)

plt.figure(figsize=(8,5))
for fpr, tpr, name in [
    (fpr_log, tpr_log, "Logit"),
    (fpr_rf, tpr_rf, "Random Forest"),
    (fpr_xgb, tpr_xgb, "XGBoost"),
]:
    plt.plot(fpr, tpr, label=name)

# Diagonal = performance of a random classifier.
plt.plot([0,1], [0,1], "--", color="grey")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.title("ROC Curves — Recession Prediction")
plt.show()
# Feature importances, indexed by the columns the models were actually
# trained on (X_train.columns) rather than a hard-coded list — the lengths
# stay matched and labels stay correct even if the feature set changes
# (e.g. lagged features are added).
pd.Series(rf.feature_importances_, index=X_train.columns).sort_values().plot(kind="barh")
plt.title("Random Forest Feature Importance")
plt.show()

pd.Series(xgb.feature_importances_, index=X_train.columns).sort_values().plot(kind="barh")
plt.title("XGBoost Feature Importance")
plt.show()
# --- RECESSION PROBABILITY OVER TIME ---
# Align the predicted probabilities with the test-period date index.
probs_df = pd.DataFrame(
    {"Logit": log_proba, "RF": rf_proba, "XGB": xgb_proba},
    index=X_test.index,
)

plt.figure(figsize=(14,5))
for col, style in [
    ("Logit", dict(label="Logit", linewidth=2)),
    ("RF", dict(label="Random Forest", alpha=0.7)),
    ("XGB", dict(label="XGBoost", alpha=0.7)),
]:
    plt.plot(probs_df.index, probs_df[col], **style)

# recession shading (full-sample USREC indicator drawn as grey bands)
plt.fill_between(df.index, 0, df["USREC"], color="gray", alpha=0.2, label="Recession")

plt.title("Recession Probability (Test Period: 2010–2020)")
plt.ylabel("Probability")
plt.legend()
plt.show()
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# How much feature variance do the leading principal components capture?
# Scale all features first so no single indicator dominates the components.
scaled = StandardScaler().fit_transform(df_ml[feature_cols])
pca = PCA().fit(scaled)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance, marker="o")
plt.title("PCA Explained Variance")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Variance Explained")
plt.show()
import shap

# Enable SHAP's JS-based visualizations (needed for interactive plots).
shap.initjs()
# TreeExplainer computes exact SHAP values for tree ensembles like XGBoost.
explainer = shap.TreeExplainer(xgb)
# Per-feature contribution of each test-set observation to the prediction.
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)
Project Summary
In this project, I built a practical machine learning workflow to understand how well macroeconomic data can signal upcoming recessions. I focused on indicators that analysts and economists rely on every day: unemployment, industrial production, inflation, interest rates, and the yield curve. The goal was simple. Can these variables tell us something useful before the economy turns?
I pulled data from FRED, cleaned and aligned everything to monthly frequency, and engineered features that actually matter, such as year-over-year changes. I trained three models: Logistic Regression, Random Forest, and XGBoost, using a time-based split so the models were tested on unseen data from 2010 to 2020. I evaluated performance using ROC-AUC, precision, recall, and feature interpretation tools like PCA and SHAP.
Logistic Regression ended up being the most reliable model. It produced steady and believable recession probabilities. Tree-based models picked up recession signals faster but created more noise, which is expected with rare economic events.