Feature Selection in Data Science


Feature selection identifies the most informative variables for your model. Done well, it reduces overfitting, speeds up training, and makes models simpler and easier to interpret.

Why Feature Selection Matters

# Problems with too many features:
# - Overfitting (model learns noise)
# - Slow training
# - Hard to interpret
# - Curse of dimensionality

# Benefits of feature selection:
# - Better performance
# - Faster training
# - Simpler models
# - Easier interpretation

Method 1: Filter Methods (Statistical)

Correlation with Target

import pandas as pd
import numpy as np

# Absolute correlation of each feature with the target
# (drop the target itself, which always correlates perfectly with itself)
correlations = df.corr()['target'].abs().drop('target').sort_values(ascending=False)
print(correlations)

# Select features with absolute correlation > 0.5
selected_features = correlations[correlations > 0.5].index.tolist()
print(f"Selected: {selected_features}")

Chi-Square Test (Categorical Features)

from sklearn.feature_selection import chi2, SelectKBest

# Select top 10 features
selector = SelectKBest(chi2, k=10)
X_new = selector.fit_transform(X, y)

# Get selected feature names
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]
print(f"Selected features: {selected_features.tolist()}")

Mutual Information

from sklearn.feature_selection import mutual_info_classif

# Estimate mutual information between each feature and the target
# (fix random_state: the estimator uses a randomized nearest-neighbor method)
mi_scores = mutual_info_classif(X, y, random_state=42)

# Create series with feature names
mi_scores = pd.Series(mi_scores, index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)

print("Top 5 features by mutual information:")
print(mi_scores.head())

Method 2: Wrapper Methods (Model-Based)

Recursive Feature Elimination (RFE)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Base model (raise max_iter to avoid convergence warnings)
model = LogisticRegression(max_iter=1000)

# Select top 5 features
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X, y)

# Get selected features
selected_features = X.columns[rfe.support_]
print(f"RFE selected: {selected_features.tolist()}")

# Get feature rankings
rankings = pd.Series(rfe.ranking_, index=X.columns)
print("\nFeature rankings:")
print(rankings.sort_values())

Forward Selection

from mlxtend.feature_selection import SequentialFeatureSelector

# Forward selection
sfs = SequentialFeatureSelector(
    model,
    k_features=5,
    forward=True,
    scoring='accuracy',
    cv=5
)

sfs.fit(X, y)
print(f"Selected features: {list(sfs.k_feature_names_)}")

Method 3: Embedded Methods (Built-in)

Lasso (L1 Regularization)

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Standardize first: the L1 penalty is scale-sensitive
X_scaled = StandardScaler().fit_transform(X)

# LassoCV picks the regularization strength by cross-validation
# and zeros out the coefficients of unimportant features
lasso = LassoCV(cv=5)
lasso.fit(X_scaled, y)

# Get non-zero coefficients
importance = np.abs(lasso.coef_)
selected_features = X.columns[importance > 0]

print(f"Lasso selected {len(selected_features)} features:")
print(selected_features.tolist())
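
One caveat: Lasso is a regression estimator, so the snippet above assumes a continuous target. For classification, L1-regularized logistic regression gives the same sparsity effect; a minimal sketch via SelectFromModel:

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# The L1 penalty zeros out coefficients, just like Lasso in regression
l1_model = LogisticRegression(penalty='l1', solver='liblinear')
selector = SelectFromModel(l1_model)
selector.fit(X_scaled, y)

selected_features = X.columns[selector.get_support()]
print(f"L1 logistic regression kept {len(selected_features)} features")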

Tree-Based Feature Importance

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Train random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importances
importances = pd.Series(rf.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)

# Plot
plt.figure(figsize=(10, 6))
importances.head(10).plot(kind='barh')
plt.xlabel('Importance')
plt.title('Top 10 Features by Random Forest')
plt.show()

# Select features with importance > threshold
threshold = 0.05
selected_features = importances[importances > threshold].index
print(f"Selected {len(selected_features)} features")

Removing Low-Variance Features

from sklearn.feature_selection import VarianceThreshold

# Remove features whose variance is below 0.1 (an absolute value, not a percentage)
selector = VarianceThreshold(threshold=0.1)
X_high_variance = selector.fit_transform(X)

# Get feature names
selected_features = X.columns[selector.get_support()]
print(f"Kept {len(selected_features)} features with sufficient variance")

Handling Multicollinearity

# Remove highly correlated features
correlation_matrix = X.corr().abs()

# Get upper triangle
upper_tri = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)

# Find features with correlation > 0.95
to_drop = [column for column in upper_tri.columns
           if any(upper_tri[column] > 0.95)]

print(f"Dropping {len(to_drop)} highly correlated features:")
print(to_drop)

X_reduced = X.drop(columns=to_drop)

Complete Feature Selection Pipeline

# Combine multiple selection methods
def select_features(X, y, k=10):
    # 1. Remove low-variance features (keeping column names)
    var_selector = VarianceThreshold(threshold=0.1)
    var_selector.fit(X)
    X_var = X.loc[:, var_selector.get_support()]

    # 2. Drop one feature from each highly correlated pair
    corr_matrix = X_var.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
    X_uncorr = X_var.drop(columns=to_drop)

    # 3. Keep the top k features by random forest importance
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_uncorr, y)
    importances = pd.Series(rf.feature_importances_, index=X_uncorr.columns)
    return X_uncorr[importances.nlargest(k).index]

X_selected = select_features(X, y, k=10)
print(f"Final feature set: {X_selected.shape[1]} features")

Evaluating Feature Selection

from sklearn.model_selection import cross_val_score

# Compare performance before and after
model = LogisticRegression()

# Before selection
scores_before = cross_val_score(model, X, y, cv=5)
print(f"Before: {scores_before.mean():.3f} (+/- {scores_before.std():.3f})")

# After selection
scores_after = cross_val_score(model, X_selected, y, cv=5)
print(f"After: {scores_after.mean():.3f} (+/- {scores_after.std():.3f})")

Comparison of Methods

Method                Speed    Accuracy
Filter (Correlation)  Fast     Good
Wrapper (RFE)         Slow     Best
Embedded (Lasso)      Medium   Very Good

Best Practices

Pro Tip: Feature selection should be part of your cross-validation pipeline to avoid data leakage. Don't select features using the entire dataset and then evaluate; select within each CV fold, as sketched below.
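
A minimal sketch of a leakage-free setup: wrap the selector and the model in a single Pipeline, so selection is refit from scratch inside every fold:

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# The selector only ever sees the training portion of each fold
pipe = Pipeline([
    ('select', SelectKBest(mutual_info_classif, k=10)),
    ('model', LogisticRegression(max_iter=1000)),
])

scores = cross_val_score(pipe, X, y, cv=5)
print(f"Leakage-free CV accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")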
