Learn StatClean through practical examples and real-world use cases.
Get started with basic outlier detection and removal:
import pandas as pd
from statclean import StatClean
# Sample data: 'income' contains one obvious outlier (500000).
df = pd.DataFrame({
    'income': [25000, 30000, 35000, 40000, 500000, 45000, 50000],
    'age': [25, 30, 35, 40, 35, 45, 50],
})
print("Original data:")
print(df)

# Initialize StatClean
cleaner = StatClean(df)

# Basic outlier removal using the Z-score method.
# NOTE(review): with only 7 rows no z-score can exceed about 2.45, so a
# threshold of 3.0 would remove nothing here -- confirm the default
# threshold used by remove_outliers_zscore().
cleaner.remove_outliers_zscore('income')
cleaned_df = cleaner.clean_df

# A single backslash ("\n") emits a real blank line; the doubled form
# printed the two characters backslash-n literally.
print(f"\nOriginal shape: {df.shape}")
print(f"Cleaned shape: {cleaned_df.shape}")
print("\nCleaned data:")
print(cleaned_df)
Use formal statistical tests to validate outlier detection:
# Formal statistical testing with p-values
cleaner = StatClean(df)

# Grubbs' test for outliers with statistical significance
grubbs_result = cleaner.grubbs_test('income', alpha=0.05)
print("Grubbs' Test Results:")
print(f" Test statistic: {grubbs_result['statistic']:.3f}")
print(f" P-value: {grubbs_result['p_value']:.6f}")
print(f" Outlier detected: {grubbs_result['is_outlier']}")
print(f" Outlier value: {grubbs_result['outlier_value']}")

# Dixon's Q-test for small samples (recommended for n < 30)
dixon_result = cleaner.dixon_q_test('age', alpha=0.05)
# "\n" (single backslash) so a blank line separates the two reports
# instead of a literal backslash-n being printed.
print("\nDixon's Q-Test Results:")
print(f" Q statistic: {dixon_result['statistic']:.3f}")
print(f" Critical value: {dixon_result['critical_value']:.3f}")
print(f" P-value: {dixon_result['p_value']:.6f}")
print(f" Outlier detected: {dixon_result['is_outlier']}")
Detect outliers using relationships between multiple variables:
# Multivariate outlier detection using Mahalanobis distance: income and
# age are evaluated together rather than one column at a time.
cleaner = StatClean(df)
joint_columns = ['income', 'age']

# First pass with chi2_threshold=0.95
joint_mask = cleaner.detect_outliers_mahalanobis(
    joint_columns, chi2_threshold=0.95
)
flagged_count = joint_mask.sum()
print(f"Multivariate outliers detected: {flagged_count}")
flagged_indices = joint_mask[joint_mask].index.tolist()
print("Outlier indices:", flagged_indices)

# Second pass: more conservative threshold (99th percentile), with a
# shrinkage covariance estimate for stability
strict_mask = cleaner.detect_outliers_mahalanobis(
    joint_columns,
    chi2_threshold=0.99,
    use_shrinkage=True,
)
strict_count = strict_mask.sum()
print(f"Conservative outliers: {strict_count}")

# Drop the multivariate outliers and report the resulting shape
cleaner.remove_outliers_mahalanobis(joint_columns)
cleaned_df = cleaner.clean_df
print(f"Shape after multivariate cleaning: {cleaned_df.shape}")
Apply statistical transformations to normalize skewed data:
# Automatic transformation recommendation
cleaner = StatClean(df)
recommendation = cleaner.recommend_transformation('income')
print("Transformation Recommendation:")
print(f" Recommended method: {recommendation['recommended_method']}")
print(f" Current skewness: {recommendation['current_skewness']:.3f}")
print(f" Expected improvement: {recommendation['expected_improvement']:.3f}")

# Apply Box-Cox transformation with automatic lambda estimation.
# The first element of the returned pair is not needed here.
_, transform_info = cleaner.transform_boxcox('income')
# "\n" (single backslash) prints a blank line; the doubled form printed
# a literal backslash-n. No f-prefix needed: there are no placeholders.
print("\nBox-Cox transformation applied:")
print(f" Optimal lambda: {transform_info['lambda']:.3f}")
print(f" Skewness before: {transform_info['skewness_before']:.3f}")
print(f" Skewness after: {transform_info['skewness_after']:.3f}")

# Alternative transformations, each on its own copy of the data so the
# results can be compared independently
cleaner_log = StatClean(df.copy())
_, log_info = cleaner_log.transform_log('income', base='natural')
print(f"\nLog transformation skewness: {log_info['skewness_after']:.3f}")

cleaner_sqrt = StatClean(df.copy())
_, sqrt_info = cleaner_sqrt.transform_sqrt('income')
print(f"Square root transformation skewness: {sqrt_info['skewness_after']:.3f}")
Use the fluent API to chain multiple operations:
# Method chaining for complex workflows
cleaner = StatClean(df)

# Chain multiple operations together.
# NOTE(review): elsewhere in these examples transform_boxcox() is
# unpacked as a 2-tuple (`_, info = cleaner.transform_boxcox(...)`). If
# it really returns a tuple it cannot sit in the middle of a chain --
# confirm its return type and, if needed, call it as a separate
# statement before continuing the chain.
result_df = (
    cleaner
    .set_thresholds(zscore_threshold=2.5, iqr_lower_factor=2.0)
    .add_zscore_columns(['income'])  # Add Z-score columns for analysis
    .transform_boxcox('income')  # Apply transformation
    .winsorize_outliers_iqr('income', lower_factor=1.5, upper_factor=1.5)
    .remove_outliers_modified_zscore('age', threshold=3.0)
    .clean_df
)
print("Method chaining result:")
print(result_df)
print(f"Final shape: {result_df.shape}")

# Access outlier information.
# "\n" (single backslash) prints a blank line rather than a literal
# backslash-n.
print("\nOutlier details:")
print(cleaner.outlier_info)
Analyze data distribution and compare detection methods:
# Distribution analysis with automatic recommendations
cleaner = StatClean(df)
analysis = cleaner.analyze_distribution('income')
print("Distribution Analysis for 'income':")
print(f" Skewness: {analysis['skewness']:.3f}")
print(f" Kurtosis: {analysis['kurtosis']:.3f}")
print(f" Normality test p-value: {analysis['normality_test']['p_value']:.6f}")
print(f" Recommended method: {analysis['recommended_method']}")
print(f" Recommendation reason: {analysis['recommendation_reason']}")

# Compare different detection methods
comparison = cleaner.compare_methods(
    ['income'],
    methods=['iqr', 'zscore', 'modified_zscore'],
)
# "\n" (single backslash) prints a blank line; the doubled form printed
# a literal backslash-n.
print("\nMethod Comparison Results:")
# The three prints form the loop body; the summary is printed once after
# the loop.
for method, stats in comparison['income']['method_stats'].items():
    print(f" {method.upper()}:")
    print(f" Outliers detected: {stats['outliers_detected']}")
    print(f" Outlier indices: {stats['outlier_indices']}")
print(f"\nAgreement summary: {comparison['income']['summary']}")

# Get detailed outlier statistics without removing data
stats_df = cleaner.get_outlier_stats(['income', 'age'], include_indices=True)
print("\nDetailed outlier statistics:")
print(stats_df)
Create comprehensive plots for outlier analysis:
import matplotlib.pyplot as plt
from statclean.utils import plot_outliers, plot_distribution, plot_qq
# Comprehensive analysis plots for multiple columns
cleaner = StatClean(df)
figures = cleaner.plot_outlier_analysis(['income', 'age'])

# The figures dictionary contains matplotlib figure objects.
# You can save or display them as needed; here each one is written to
# "<column>_analysis.png". The savefig call must be indented as the loop
# body, otherwise the snippet is an IndentationError.
for column, fig in figures.items():
    fig.savefig(f'{column}_analysis.png', dpi=300, bbox_inches='tight')

# Individual visualization components
outliers = cleaner.detect_outliers_zscore('income', threshold=3.0)

# Create individual plots side by side in a 1x3 grid
plt.figure(figsize=(15, 4))
plt.subplot(1, 3, 1)
plot_outliers(df['income'], outliers, title='Income Outlier Detection')
plt.subplot(1, 3, 2)
plot_distribution(df['income'], outliers, title='Income Distribution')
plt.subplot(1, 3, 3)
plot_qq(df['income'], outliers, title='Income Q-Q Plot')
plt.tight_layout()
plt.show()

# Visualize specific outliers: select the rows flagged by `outliers`
print("Detected outliers:")
outlier_data = df[outliers]
print(outlier_data)
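Note: in headless environments (for example CI servers without a display), set the environment variable MPLBACKEND=Agg before importing matplotlib.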
Complete workflow with the California Housing dataset:
from sklearn.datasets import fetch_california_housing
import pandas as pd
from statclean import StatClean
# Load California Housing dataset into a DataFrame, with the target
# appended as a 'PRICE' column
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['PRICE'] = housing.target
print(f"Dataset shape: {df.shape}")
print("Features:", list(df.columns))

# Initialize with index preservation for tracking
cleaner = StatClean(df, preserve_index=True)

# Analyze key features for skewness and outliers
features = ['MedInc', 'AveRooms', 'Population', 'PRICE']
# "\n" (single backslash) prints a blank line; the doubled form printed
# a literal backslash-n.
print("\nDistribution analysis:")
for feature in features:
    analysis = cleaner.analyze_distribution(feature)
    print(f"{feature}:")
    print(f" Skewness: {analysis['skewness']:.3f}")
    print(f" Recommended method: {analysis['recommended_method']}")
    # Apply formal testing only for highly skewed features (|skew| > 1)
    if abs(analysis['skewness']) > 1:
        grubbs_result = cleaner.grubbs_test(feature, alpha=0.01)
        print(f" Grubbs test p-value: {grubbs_result['p_value']:.6f}")
# Comprehensive cleaning with progress tracking
cleaned_df, cleaning_info = cleaner.clean_columns(
    columns=features,
    method='auto',  # Automatic method selection based on distribution
    show_progress=True,
    include_indices=True,
)
# "\n" (single backslash) prints a blank line; the doubled form printed
# a literal backslash-n. No f-prefix needed: there are no placeholders.
print("\nCleaning Results:")
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {cleaned_df.shape}")

# Detailed results for each feature
for feature, info in cleaning_info.items():
    print(f"\n{feature}:")
    print(f" Method used: {info['method_used']}")
    print(f" Outliers removed: {info['outliers_removed']}")
    print(f" Percentage removed: {info['percentage_removed']:.2f}%")
    # A p-value is reported only when the entry carries one
    if 'p_value' in info:
        print(f" Statistical significance: p = {info['p_value']:.6f}")
# Generate comprehensive visualizations for cleaned data
figures = cleaner.plot_outlier_analysis(features)

# Method comparison to validate cleaning decisions
comparison = cleaner.compare_methods(features)
# "\n" (single backslash) prints a blank line; the doubled form printed
# a literal backslash-n.
print("\nMethod agreement analysis:")
for feature in features:
    print(f"{feature}: {comparison[feature]['summary']}")

# Generate final summary report
summary_report = cleaner.get_summary_report()
print("\nSummary Report:")
print(summary_report)
Advanced techniques for research and publication-quality analysis:
# Advanced statistical analysis with custom strategies
import numpy as np
# Build a 500-trial synthetic dataset; the fixed seed makes the example
# reproducible.
np.random.seed(42)
n_trials = 500
# The draws are made in the same order (gamma, beta, normal) so the
# seeded random stream yields the same values.
reaction_time = np.random.gamma(2, 0.15, n_trials)  # Skewed distribution
accuracy = np.random.beta(8, 2, n_trials) * 100  # Bounded data
confidence = np.random.normal(7, 1.5, n_trials)  # Normal-ish data
df_advanced = pd.DataFrame({
    'reaction_time': reaction_time,
    'accuracy': accuracy,
    'confidence': confidence,
    'trial_number': range(1, n_trials + 1),
})

# Inject some realistic anomalies (.loc label slices include both ends)
df_advanced.loc[50:52, 'reaction_time'] *= 5  # Participant distraction
df_advanced.loc[100, 'accuracy'] = 30  # Data entry error
df_advanced.loc[200:205, 'confidence'] = np.nan  # Missing responses

# Rows with missing values are dropped before handing the data over
complete_rows = df_advanced.dropna()
cleaner = StatClean(complete_rows, preserve_index=True)
# Custom cleaning strategy for different data types. Each entry names
# the removal method, its parameters, and a human-readable reason that
# is only used for the progress message.
strategy = {
    'reaction_time': {
        'method': 'modified_zscore',
        'threshold': 3.0,
        'reason': 'Skewed distribution, robust method needed',
    },
    'accuracy': {
        'method': 'iqr',
        'lower_factor': 2.0,
        'upper_factor': 2.0,
        'reason': 'Bounded data, IQR appropriate',
    },
    'confidence': {
        'method': 'zscore',
        'threshold': 2.5,
        'reason': 'Approximately normal distribution',
    },
}

# Apply custom strategy
print("Applying custom cleaning strategy:")
for column, params in strategy.items():
    method = params['method']
    # "\n" (single backslash) prints a blank line before each column;
    # the doubled form printed a literal backslash-n.
    print(f"\n{column}: Using {method} - {params['reason']}")
    if method == 'modified_zscore':
        cleaner.remove_outliers_modified_zscore(
            column, threshold=params['threshold']
        )
    elif method == 'iqr':
        cleaner.remove_outliers_iqr(
            column,
            lower_factor=params['lower_factor'],
            upper_factor=params['upper_factor'],
        )
    elif method == 'zscore':
        cleaner.remove_outliers_zscore(
            column, threshold=params['threshold']
        )

# Get cleaned data and comprehensive statistics
cleaned_df = cleaner.clean_df
print(f"\nFinal shape: {cleaned_df.shape}")
# Publication-quality statistical testing
features = ['reaction_time', 'accuracy', 'confidence']
# Multiple testing with Bonferroni correction: the corrected alpha does
# not depend on the feature, so it is computed once before the loop.
alpha_corrected = 0.05 / len(features)
# "\n" (single backslash) prints a blank line; the doubled form printed
# a literal backslash-n.
print("\nStatistical Test Results:")
for feature in features:
    grubbs_result = cleaner.grubbs_test(feature, alpha=alpha_corrected)
    # NOTE(review): Dixon's Q-test is described earlier in this page as a
    # small-sample test (n < 30); this dataset has roughly 500 rows --
    # confirm dixon_q_test() is meaningful at this size.
    dixon_result = cleaner.dixon_q_test(feature, alpha=alpha_corrected)
    print(f"\n{feature}:")
    print(f" Grubbs: p = {grubbs_result['p_value']:.6f}, " +
          f"significant = {grubbs_result['is_outlier']}")
    print(f" Dixon: p = {dixon_result['p_value']:.6f}, " +
          f"significant = {dixon_result['is_outlier']}")

# Generate comprehensive report for publication
final_report = cleaner.get_summary_report()
print("\n" + "=" * 60)
print("PUBLICATION SUMMARY REPORT")
print("=" * 60)
print(final_report)
Explore advanced topics and detailed documentation: