Essential Data Quality Checks
Data quality checks identify issues before analysis. Bad data leads to bad insights. Always validate completeness, accuracy, consistency, and validity before drawing conclusions.
1. Check for Missing Values
import pandas as pd
import numpy as np
# Load data
df = pd.read_csv('data.csv')
# Count missing values
missing = df.isnull().sum()
missing_pct = (df.isnull().sum() / len(df)) * 100
# Create summary
missing_summary = pd.DataFrame({
    'Missing': missing,
    'Percent': missing_pct
})
print(missing_summary[missing_summary['Missing'] > 0])
# Visualize
import matplotlib.pyplot as plt
import seaborn as sns
sns.heatmap(df.isnull(), cbar=False, yticklabels=False)
plt.title('Missing Data Pattern')
plt.show()
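Flagging missing values is only half the job; decide what to do with them before moving on. A minimal sketch, assuming an illustrative 50% drop threshold and median imputation for numeric columns (both are judgment calls, not rules):
# Drop columns that are mostly empty (threshold is a judgment call)
threshold = 0.5
cols_to_drop = missing_pct[missing_pct > threshold * 100].index
df = df.drop(columns=cols_to_drop)
# Fill remaining numeric gaps with each column's median (one simple strategy)
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())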
2. Detect Duplicates
# Find duplicate rows
duplicates = df.duplicated()
print(f"Total duplicates: {duplicates.sum()}")
# Show duplicate rows
print(df[duplicates])
# Remove duplicates
df_clean = df.drop_duplicates()
print(f"Removed {len(df) - len(df_clean)} duplicate rows")
# Find duplicates based on specific columns
id_duplicates = df.duplicated(subset=['customer_id'], keep='first')
print(f"Duplicate customer IDs: {id_duplicates.sum()}")
3. Identify Outliers
Using the IQR Method
def find_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers
outliers = find_outliers_iqr(df, 'price')
print(f"Found {len(outliers)} outliers in price column")
print(f"Range: {outliers['price'].min()} to {outliers['price'].max()}")
Using the Z-Score Method
from scipy import stats
def find_outliers_zscore(df, column, threshold=3):
    # Compute z-scores on non-null values so the mask stays aligned with df's index
    values = df[column].dropna()
    z_scores = np.abs(stats.zscore(values))
    return df.loc[values.index[z_scores > threshold]]
outliers_z = find_outliers_zscore(df, 'price')
print(f"Z-score outliers: {len(outliers_z)}")
4. Validate Data Types
# Check data types
print(df.dtypes)
# Validate specific columns
issues = []
# Check if IDs are numeric
if df['customer_id'].dtype != 'int64':
    issues.append("customer_id should be integer")
# Check if dates are datetime
if df['order_date'].dtype != 'datetime64[ns]':
    issues.append("order_date should be datetime")
# Check if amounts are numeric
if not pd.api.types.is_numeric_dtype(df['amount']):
    issues.append("amount should be numeric")
if issues:
    print("Data type issues found:")
    for issue in issues:
        print(f" - {issue}")
5. Check Value Ranges
# Validate logical ranges
issues = []
# Age should be 0-120
invalid_age = df[(df['age'] < 0) | (df['age'] > 120)]
if len(invalid_age) > 0:
    issues.append(f"{len(invalid_age)} invalid ages")
# Percentage should be 0-100
invalid_pct = df[(df['discount_pct'] < 0) | (df['discount_pct'] > 100)]
if len(invalid_pct) > 0:
    issues.append(f"{len(invalid_pct)} invalid percentages")
# Prices should be positive
negative_prices = df[df['price'] < 0]
if len(negative_prices) > 0:
    issues.append(f"{len(negative_prices)} negative prices")
if issues:
    print("Range validation issues:")
    for issue in issues:
        print(f" - {issue}")
6. Verify Categorical Values
# Check for unexpected categories
expected_statuses = ['pending', 'shipped', 'delivered', 'cancelled']
actual_statuses = df['status'].unique()
unexpected = set(actual_statuses) - set(expected_statuses)
if unexpected:
    print(f"Unexpected status values: {unexpected}")
# Check value frequencies
print("Status distribution:")
print(df['status'].value_counts())
# Find rare categories (< 1% of data)
value_counts = df['status'].value_counts()
rare_categories = value_counts[value_counts < len(df) * 0.01]
print(f"\nRare categories: {rare_categories.index.tolist()}")
7. Check for Consistency
# Cross-field validation
issues = []
# End date should be after start date
invalid_dates = df[df['end_date'] < df['start_date']]
if len(invalid_dates) > 0:
    issues.append(f"{len(invalid_dates)} rows with end_date before start_date")
# Total should equal sum of parts
df['calculated_total'] = df['subtotal'] + df['tax'] + df['shipping']
mismatch = df[abs(df['total'] - df['calculated_total']) > 0.01]
if len(mismatch) > 0:
    issues.append(f"{len(mismatch)} rows with total != sum of parts")
if issues:
    print("Consistency issues:")
    for issue in issues:
        print(f" - {issue}")
8. Validate Uniqueness
# Check if IDs are unique
id_counts = df['customer_id'].value_counts()
duplicate_ids = id_counts[id_counts > 1]
if len(duplicate_ids) > 0:
    print(f"Warning: {len(duplicate_ids)} duplicate customer IDs")
    print(duplicate_ids.head())
# Check for expected unique combinations
combo_duplicates = df.duplicated(subset=['customer_id', 'product_id'], keep=False)
if combo_duplicates.sum() > 0:
    print(f"Warning: {combo_duplicates.sum()} duplicate customer-product combinations")
Complete Data Quality Report
def data_quality_report(df):
    report = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'missing_values': df.isnull().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
        'memory_usage': df.memory_usage(deep=True).sum() / 1024**2  # MB
    }
    print("=" * 50)
    print("DATA QUALITY REPORT")
    print("=" * 50)
    print(f"Total Rows: {report['total_rows']:,}")
    print(f"Total Columns: {report['total_columns']}")
    print(f"Missing Values: {report['missing_values']:,}")
    print(f"Duplicate Rows: {report['duplicate_rows']:,}")
    print(f"Memory Usage: {report['memory_usage']:.2f} MB")
    print("\n" + "=" * 50)
    print("COLUMN SUMMARY")
    print("=" * 50)
    for col in df.columns:
        print(f"\n{col}:")
        print(f" Type: {df[col].dtype}")
        print(f" Missing: {df[col].isnull().sum()} ({df[col].isnull().sum()/len(df)*100:.1f}%)")
        print(f" Unique: {df[col].nunique()}")
        if pd.api.types.is_numeric_dtype(df[col]):
            print(f" Range: {df[col].min()} to {df[col].max()}")
            print(f" Mean: {df[col].mean():.2f}")
data_quality_report(df)
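Printed reports are useful interactively, but for tracking quality over time it helps to capture the same per-column numbers in a table that can be saved after each run; the file name below is just an example:
# Per-column summary that can be stored alongside each data delivery
column_summary = pd.DataFrame({
    'dtype': df.dtypes.astype(str),
    'missing': df.isnull().sum(),
    'missing_pct': (df.isnull().sum() / len(df) * 100).round(1),
    'unique': df.nunique()
})
column_summary.to_csv('data_quality_summary.csv')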
Automated Quality Checks
# Using the Great Expectations library (pip install great-expectations)
# Note: the exact API differs between versions, so the mapping below is a
# plan of which expectations to define per column, not runnable GX code.
expectation_suite = {
    'customer_id': [
        'expect_column_values_to_be_unique',
        'expect_column_values_to_not_be_null'
    ],
    'age': [
        'expect_column_values_to_be_between(min_value=0, max_value=120)'
    ],
    'email': [
        'expect_column_values_to_match_regex(regex="^[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+$")'
    ]
}
# Build the suite with your installed version's API, validate the DataFrame
# against it, and fail the pipeline run when any expectation is not met.
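Because the suite above is only a plan, it helps to have a runnable fallback. The same expectations can be expressed as plain pandas checks that raise when new data violates them; the columns and email pattern are carried over from the sketch above:
def run_basic_expectations(df):
    # Minimal stand-in for an expectation suite using plain pandas
    failures = []
    if df['customer_id'].duplicated().any():
        failures.append('customer_id is not unique')
    if df['customer_id'].isnull().any():
        failures.append('customer_id contains nulls')
    if not df['age'].dropna().between(0, 120).all():
        failures.append('age outside 0-120')
    if not df['email'].dropna().str.match(r'^[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+$').all():
        failures.append('email format violations')
    return failures
problems = run_basic_expectations(df)
if problems:
    raise ValueError(f"Data quality checks failed: {problems}")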
Data Quality Checklist
- Missing values identified and handled
- Duplicates removed or justified
- Outliers detected and investigated
- Data types correct for each column
- Values within expected ranges
- Categorical values match expected lists
- Cross-field logic validated
- Uniqueness constraints verified
Pro Tip: Create automated data quality checks that run every time new data arrives. Document all data quality issues and decisions. Bad data in = bad insights out!