# Your code
#==============Your Name=====================
#Dont remove the following snippet and follow the same

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from scipy.stats import zscore


class lsZScoreOutlierRemover(BaseEstimator, TransformerMixin):
    """Add engineered features and blank out z-score outliers with NaN.

    ``transform`` expects a pandas DataFrame containing the columns
    ``"Proximity to Star"``, ``"Surface Temperature"``,
    ``"Mineral Abundance"`` and ``"Atmospheric Density"``. It returns a copy
    with three derived columns added (``"Proximity_Temp"``,
    ``"Surface Density"``, ``"log_Density"``), every numeric column cast to
    float, and every numeric value whose absolute z-score is at or above
    ``threshold`` replaced by NaN.

    Parameters
    ----------
    threshold : float, default=3.0
        Absolute z-score at or above which a value is treated as an outlier.
    """

    # The constructor must be spelled ``__init__`` (double underscores);
    # otherwise ``threshold`` is never stored on the instance and
    # ``transform`` fails when it reads ``self.threshold``.
    def __init__(self, threshold=3.0):
        self.threshold = threshold

    def fit(self, Xls, yls=None):
        """Return ``self`` unchanged; nothing is learned from ``Xls``."""
        return self  # stateless

    def transform(self, Xls, yls=None):
        """Return a copy of ``Xls`` with new features and outliers set to NaN.

        Z-scores are computed per column from the frame passed in here, not
        from data seen in ``fit``.
        NOTE(review): confirm that per-call statistics are intended.

        Raises
        ------
        KeyError
            If any of the required source columns is missing from ``Xls``.
        """
        Xls = Xls.copy()

        # Feature engineering first (to include them in outlier handling).
        # The 1e-5 offset keeps a temperature of exactly 0 from dividing by
        # zero.
        Xls["Proximity_Temp"] = Xls["Proximity to Star"] / (Xls["Surface Temperature"] + 1e-5)
        Xls["Surface Density"] = Xls["Mineral Abundance"] * (Xls["Surface Temperature"] + 1e-5)
        # Shift so the minimum maps to 1 before taking the log. log1p adds a
        # further 1, so this evaluates log(x - min + 2).
        # NOTE(review): confirm the double offset is intended.
        Xls["log_Density"] = np.log1p(Xls["Atmospheric Density"] - Xls["Atmospheric Density"].min() + 1)

        # Select numeric columns
        num_colsls = Xls.select_dtypes(include=[np.number]).columns.tolist()

        # Ensure all numeric data is float so the columns can hold NaN.
        Xls[num_colsls] = Xls[num_colsls].astype(float)

        # Column-wise absolute z-scores; NaNs are ignored when computing the
        # mean and standard deviation.
        z_scoresls = np.abs(zscore(Xls[num_colsls], nan_policy='omit'))

        # Handle 1D zscore output (edge case)
        if z_scoresls.ndim == 1:
            z_scoresls = z_scoresls[:, np.newaxis]

        # Replace outliers with NaN. Only values whose z-score is actually at
        # or above the threshold are masked; an undefined z-score (NaN, e.g.
        # from a zero-variance column) compares False and leaves the value
        # untouched instead of blanking the whole column.
        Xls[num_colsls] = Xls[num_colsls].mask(z_scoresls >= self.threshold)

        return Xls