Outlier Detection
IQR method
Z-score method
# outlier treatment
def remove_outliers_zscore(df, threshold=2, columns=None):
    """
    Remove rows that contain a Z-score outlier in any of the checked columns.

    Each checked column is standardised as ``(value - mean) / std`` using
    that column's own mean and standard deviation (pandas' default ``std``).
    A row is dropped when the absolute Z-score of at least one checked
    column is greater than ``threshold``. NaN Z-scores (e.g. from missing
    values) never compare greater than the threshold, so they never cause
    a row to be dropped.

    Parameters:
    df (DataFrame): The input DataFrame. It is only read, never modified.
    threshold (float): The Z-score threshold for identifying outliers.
        Observations whose absolute Z-score is greater than this value are
        considered outliers. The default of 2 keeps values within two
        standard deviations of the mean (approximately 95% of the data).
    columns (str or list-like, optional): Column(s) to check. When omitted,
        the module-level ``numerical_cols`` is used if it is defined (the
        behaviour of earlier versions of this function); otherwise every
        numeric column of ``df`` is checked.

    Returns:
    DataFrame: The rows of ``df`` that have no outlier in the checked
        columns. All columns of ``df`` are retained.
    """
    if columns is None:
        try:
            # Legacy behaviour: honour the column list defined at module level
            # so existing callers keep getting exactly the same result.
            columns = numerical_cols
        except NameError:
            # No such global: derive the numeric columns from df itself so the
            # function also works on its own.
            columns = df.select_dtypes(include="number").columns
    elif isinstance(columns, str):
        # A bare column name would select a Series; wrap it so the row-wise
        # reduction below always operates on a DataFrame.
        columns = [columns]

    selected = df[columns]

    # Calculate Z-scores for the checked columns
    z_scores = (selected - selected.mean()) / selected.std()

    # Identify outliers (far from the mean in either direction)
    is_outlier = z_scores.abs() > threshold

    # Keep only the rows where no checked column is an outlier
    return df[~is_outlier.any(axis=1)]
# Filter df1 with the default threshold and report the (rows, columns) that remain.
cleaned_df = remove_outliers_zscore(df1)
print(cleaned_df.shape)
# NOTE(review): the words "Last updated" were fused onto the print line above and
# made it a syntax error; presumably export residue, kept here as a comment.