Feature Selection in Data Science
Feature selection identifies the most important variables for your model. It improves performance, reduces overfitting, and makes models faster and more interpretable.
Why Feature Selection Matters
Problems with too many features:
- Overfitting (model learns noise)
- Slow training
- Hard to interpret
- Curse of dimensionality

Benefits of feature selection:
- Better performance
- Faster training
- Simpler models
- Easier interpretation
Method 1: Filter Methods (Statistical)
Correlation with Target
import pandas as pd
import numpy as np
# Absolute correlation of every numeric feature with the target
# (drop the target itself so it isn't "selected" as a feature)
correlations = df.corr(numeric_only=True)['target'].drop('target').abs().sort_values(ascending=False)
print(correlations)
# Select features with absolute correlation > 0.5
selected_features = correlations[correlations > 0.5].index.tolist()
print(f"Selected: {selected_features}")
Chi-Square Test (Categorical Features)
from sklearn.feature_selection import chi2, SelectKBest
# Note: chi2 requires non-negative feature values (e.g. counts or one-hot encodings)
# Select top 10 features
selector = SelectKBest(chi2, k=10)
X_new = selector.fit_transform(X, y)
# Get selected feature names
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]
print(f"Selected features: {selected_features.tolist()}")
Mutual Information
from sklearn.feature_selection import mutual_info_classif
# Calculate mutual information
mi_scores = mutual_info_classif(X, y)
# Create series with feature names
mi_scores = pd.Series(mi_scores, index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)
print("Top 5 features by mutual information:")
print(mi_scores.head())
Method 2: Wrapper Methods (Model-Based)
Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Create model
model = LogisticRegression()
# Select top 5 features
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X, y)
# Get selected features
selected_features = X.columns[rfe.support_]
print(f"RFE selected: {selected_features.tolist()}")
# Get feature rankings
rankings = pd.Series(rfe.ranking_, index=X.columns)
print("\nFeature rankings:")
print(rankings.sort_values())
Forward Selection
# mlxtend is a separate package: pip install mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector
# Forward selection: start with no features and greedily add the one that helps most
sfs = SequentialFeatureSelector(
    model,
    k_features=5,
    forward=True,
    scoring='accuracy',
    cv=5
)
sfs.fit(X, y)
print(f"Selected features: {list(sfs.k_feature_names_)}")
Method 3: Embedded Methods (Built-in)
Lasso (L1 Regularization)
from sklearn.linear_model import LassoCV
# Lasso shrinks unimportant coefficients to exactly zero.
# LassoCV assumes a continuous target; for the classification target used elsewhere
# in this guide, LogisticRegression(penalty='l1', solver='liblinear') plays the same role.
# Standardizing features first (e.g. StandardScaler) keeps the penalty fair across features.
lasso = LassoCV(cv=5)
lasso.fit(X, y)
# Keep features with non-zero coefficients
importance = np.abs(lasso.coef_)
selected_features = X.columns[importance > 0]
print(f"Lasso selected {len(selected_features)} features:")
print(selected_features.tolist())
Tree-Based Feature Importance
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
# Train random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
# Get feature importances
importances = pd.Series(rf.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)
# Plot
plt.figure(figsize=(10, 6))
importances.head(10).plot(kind='barh')
plt.xlabel('Importance')
plt.title('Top 10 Features by Random Forest')
plt.show()
# Select features with importance > threshold
threshold = 0.05
selected_features = importances[importances > threshold].index
print(f"Selected {len(selected_features)} features")
Removing Low Variance Features
from sklearn.feature_selection import VarianceThreshold
# Drop near-constant features (variance below 0.1; note the threshold is scale-dependent)
selector = VarianceThreshold(threshold=0.1)
X_high_variance = selector.fit_transform(X)
# Get feature names
selected_features = X.columns[selector.get_support()]
print(f"Kept {len(selected_features)} features with sufficient variance")
Handling Multicollinearity
# Remove highly correlated features
correlation_matrix = X.corr().abs()
# Get upper triangle
upper_tri = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)
# Find features with correlation > 0.95
to_drop = [column for column in upper_tri.columns
           if any(upper_tri[column] > 0.95)]
print(f"Dropping {len(to_drop)} highly correlated features:")
print(to_drop)
X_reduced = X.drop(columns=to_drop)
Complete Feature Selection Pipeline
# Combine multiple selection methods into one helper
def select_features(X, y, k=10):
    # 1. Drop near-constant features, keeping the original column names
    var_selector = VarianceThreshold(threshold=0.1)
    var_selector.fit(X)
    X_var = X.loc[:, var_selector.get_support()]
    # 2. Drop one feature from each highly correlated pair
    corr_matrix = X_var.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
    X_uncorr = X_var.drop(columns=to_drop)
    # 3. Keep the top k features by random forest importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_uncorr, y)
    importances = pd.Series(rf.feature_importances_, index=X_uncorr.columns)
    top_k = importances.nlargest(k).index
    return X_uncorr[top_k]
X_selected = select_features(X, y, k=10)
print(f"Final feature set: {X_selected.shape[1]} features")
Evaluating Feature Selection
from sklearn.model_selection import cross_val_score
# Compare performance before and after
model = LogisticRegression()
# Before selection
scores_before = cross_val_score(model, X, y, cv=5)
print(f"Before: {scores_before.mean():.3f} (+/- {scores_before.std():.3f})")
# After selection
scores_after = cross_val_score(model, X_selected, y, cv=5)
print(f"After: {scores_after.mean():.3f} (+/- {scores_after.std():.3f})")
Comparison of Methods
| Method | Speed | Accuracy |
|---|---|---|
| Filter (Correlation) | Fast | Good |
| Wrapper (RFE) | Slow | Best |
| Embedded (Lasso) | Medium | Very Good |
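The three families often disagree at the margins, so it can be worth running more than one and seeing where they overlap. Below is a rough sketch of that idea, assuming X is a numeric feature DataFrame and y a classification target as in the examples above; the helper names (top_k_by_mi, top_k_by_rfe, top_k_by_rf) are purely illustrative.

from sklearn.feature_selection import RFE, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

def top_k_by_mi(X, y, k=5):
    # Filter method: rank features by mutual information with the target
    scores = pd.Series(mutual_info_classif(X, y), index=X.columns)
    return set(scores.nlargest(k).index)

def top_k_by_rfe(X, y, k=5):
    # Wrapper method: recursively eliminate features with a logistic regression
    rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=k)
    rfe.fit(X, y)
    return set(X.columns[rfe.support_])

def top_k_by_rf(X, y, k=5):
    # Embedded method: rank features by random forest importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    scores = pd.Series(rf.feature_importances_, index=X.columns)
    return set(scores.nlargest(k).index)

filter_set = top_k_by_mi(X, y)
wrapper_set = top_k_by_rfe(X, y)
embedded_set = top_k_by_rf(X, y)
print("Agreed on by all three:", filter_set & wrapper_set & embedded_set)
print("Chosen by at least one:", filter_set | wrapper_set | embedded_set)

Features that survive all three methods are usually safe keeps; features that only one method likes deserve a closer look, or a domain-knowledge check.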
Best Practices
- Start with filter methods for quick wins
- Use domain knowledge - don't rely only on algorithms
- Validate with cross-validation
- Try multiple methods and compare
- Consider feature engineering before selection
Pro Tip: Feature selection should be part of your cross-validation pipeline to avoid data leakage. Don't select features on the entire dataset and then evaluate; select within each CV fold!
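One way to follow that advice with scikit-learn is to make the selector a step in a Pipeline, so it is re-fit on the training portion of every fold. A minimal sketch, assuming X and y as above and using SelectKBest as the selection step (any selector with fit/transform works the same way):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# The selector is fit inside each CV fold, so test folds never influence which features are kept
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('select', SelectKBest(mutual_info_classif, k=10)),
    ('model', LogisticRegression(max_iter=1000)),
])
scores = cross_val_score(pipe, X, y, cv=5)
print(f"Leakage-free CV accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")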