Cross-Validation in Machine Learning
Cross-validation assesses model performance by training and testing a model on several different splits of the data. It gives a more reliable performance estimate than a single train/test split and helps you detect overfitting before you trust a model.
K-Fold Cross-Validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
# Create sample data
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
# Create model
model = LogisticRegression()
# 5-fold cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Scores: {scores}")
print(f"Mean accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Output: Mean accuracy: 0.885 (+/- 0.012)
How K-Fold Works
# Example with k=5:
# 1. Split data into 5 folds
# 2. Train on folds 1,2,3,4 → Test on fold 5
# 3. Train on folds 1,2,3,5 → Test on fold 4
# 4. Train on folds 1,2,4,5 → Test on fold 3
# 5. Train on folds 1,3,4,5 → Test on fold 2
# 6. Train on folds 2,3,4,5 → Test on fold 1
# 7. Average the 5 test scores
# Every sample used for testing exactly once!
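To make this concrete, here is a minimal sketch (the tiny 10-sample array and variable names are illustrative) that prints which samples land in each train/test split:
from sklearn.model_selection import KFold
import numpy as np

X_small = np.arange(10).reshape(-1, 1)  # 10 samples, 1 feature
kf = KFold(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(kf.split(X_small), start=1):
    print(f"Fold {fold}: train={train_idx}, test={test_idx}")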
Stratified K-Fold (For Imbalanced Data)
from sklearn.model_selection import StratifiedKFold
# Maintains the class distribution in each fold
# (for classifiers, an integer cv in cross_val_score already uses stratified folds,
#  but without shuffling; an explicit StratifiedKFold lets you shuffle and set a seed)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
print(f"Stratified CV scores: {scores.mean():.3f}")
Different Scoring Metrics
# Accuracy
scores_acc = cross_val_score(model, X, y, cv=5, scoring='accuracy')
# Precision
scores_prec = cross_val_score(model, X, y, cv=5, scoring='precision')
# Recall
scores_rec = cross_val_score(model, X, y, cv=5, scoring='recall')
# F1 Score
scores_f1 = cross_val_score(model, X, y, cv=5, scoring='f1')
# ROC AUC
scores_auc = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print(f"Accuracy: {scores_acc.mean():.3f}")
print(f"Precision: {scores_prec.mean():.3f}")
print(f"Recall: {scores_rec.mean():.3f}")
print(f"F1: {scores_f1.mean():.3f}")
print(f"AUC: {scores_auc.mean():.3f}")
Leave-One-Out CV (LOOCV)
from sklearn.model_selection import LeaveOneOut
# Use n-1 samples for training, 1 for testing
# Repeat n times (each sample as test once)
loo = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=loo)
print(f"LOOCV score: {scores.mean():.3f}")
# Very thorough but computationally expensive
# Use for small datasets only
Time Series Cross-Validation
from sklearn.model_selection import TimeSeriesSplit
# For time-ordered data (no future data in training!)
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Training always uses past data
    # Testing uses future data
# Expanding window approach
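To see the expanding window, a short sketch that prints the split sizes (the exact numbers depend on n_samples and n_splits):
for fold, (train_index, test_index) in enumerate(tscv.split(X), start=1):
    # The training window grows with each fold; the test window stays the same size
    print(f"Fold {fold}: train size={len(train_index)}, test size={len(test_index)}")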
Cross-Val with Multiple Metrics
from sklearn.model_selection import cross_validate
# Get multiple metrics at once
scoring = ['accuracy', 'precision', 'recall', 'f1']
results = cross_validate(model, X, y, cv=5, scoring=scoring)
print("Test accuracy:", results['test_accuracy'].mean())
print("Test precision:", results['test_precision'].mean())
print("Test recall:", results['test_recall'].mean())
print("Test f1:", results['test_f1'].mean())
Hyperparameter Tuning with CV
from sklearn.model_selection import GridSearchCV
# Test different hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}
grid_search = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X, y)
print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)
print("Best estimator:", grid_search.best_estimator_)
Nested Cross-Validation
# For unbiased model performance estimation
from sklearn.model_selection import cross_val_score, GridSearchCV
# Outer loop: Assess model performance
# Inner loop: Tune hyperparameters
outer_cv = 5
inner_cv = 3
# Grid search with inner CV
model = GridSearchCV(
    LogisticRegression(),
    param_grid={'C': [0.1, 1, 10]},
    cv=inner_cv
)
# Outer CV for final performance estimate
scores = cross_val_score(model, X, y, cv=outer_cv)
print(f"Nested CV score: {scores.mean():.3f} (+/- {scores.std():.3f})")
Common Mistakes to Avoid
- Data leakage: Don't fit preprocessing (scalers, encoders, feature selection) on the entire dataset; fit it inside each training fold
- Wrong CV for time series: Use TimeSeriesSplit, not random splits, so the model never trains on future data
- Not stratifying: Use StratifiedKFold for imbalanced classes
- Too few folds: k=5 or k=10 is a sensible default for most problems
- Testing on training data: Always hold out a final test set, as in the sketch below
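For the last point, a minimal sketch (the 80/20 split is an arbitrary illustrative choice): hold out a final test set, cross-validate on the training portion only, and touch the held-out set just once at the end.
from sklearn.model_selection import train_test_split

# Keep a final test set that cross-validation never sees
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
cv_scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
final_model = LogisticRegression().fit(X_train, y_train)
print("CV estimate on training data:", cv_scores.mean())
print("Held-out test accuracy:", final_model.score(X_test, y_test))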
Best Practices
# Correct preprocessing pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Preprocessing inside cross-validation
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
])
# Scaling happens separately for each fold
scores = cross_val_score(pipeline, X, y, cv=5)
# This prevents data leakage!
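The same pipeline can be tuned with GridSearchCV; step parameters are addressed as '<step name>__<parameter>' (the C values below are illustrative):
# Tune the model step of the pipeline; scaling is still refit per fold
pipe_param_grid = {'model__C': [0.1, 1, 10]}
search = GridSearchCV(pipeline, pipe_param_grid, cv=5, scoring='accuracy')
search.fit(X, y)
print("Best pipeline parameters:", search.best_params_)
print("Best CV accuracy:", search.best_score_)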
When to Use Which CV Method
| Method | When to Use |
|---|---|
| K-Fold | Default choice, balanced classes |
| Stratified K-Fold | Imbalanced classes |
| TimeSeriesSplit | Time-ordered data |
| LOOCV | Small datasets (<100 samples) |
| Nested CV | Hyperparameter tuning + performance estimation |
Pro Tip: Always use cross-validation for model evaluation. A single train/test split can be misleading. Use 5 or 10 folds for most cases, and always put preprocessing inside your CV pipeline to avoid data leakage!