Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions custom_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,41 @@
# Your code
#==============Your Name=====================
# Don't remove the following snippet; follow the same pattern.

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from scipy.stats import zscore

class lsZScoreOutlierRemover(BaseEstimator, TransformerMixin):
    """Add engineered features, then replace z-score outliers with NaN.

    The transformer is stateless: ``fit`` learns nothing, and every statistic
    (the column minimum used for ``log_Density`` and the per-column mean and
    standard deviation behind the z-scores) is computed from the frame handed
    to ``transform``.

    Parameters
    ----------
    threshold : float, default=3.0
        A numeric value whose absolute z-score is greater than or equal to
        this threshold is replaced with NaN.
    """

    def __init__(self, threshold=3.0):
        # Must be spelled ``__init__`` (double underscores): otherwise Python
        # never calls it, ``self.threshold`` is never set, and scikit-learn's
        # ``get_params``/``clone`` cannot see the hyperparameter.
        self.threshold = threshold

    def fit(self, Xls, yls=None):
        """Do nothing and return ``self``; the transformer keeps no state."""
        return self

    def transform(self, Xls, yls=None):
        """Return a copy of ``Xls`` with new features and outliers set to NaN.

        Parameters
        ----------
        Xls : pandas.DataFrame
            Must contain the columns ``"Proximity to Star"``,
            ``"Surface Temperature"``, ``"Mineral Abundance"`` and
            ``"Atmospheric Density"``. The input frame is not modified.
        yls : ignored
            Accepted for signature compatibility only.

        Returns
        -------
        pandas.DataFrame
            Copy of ``Xls`` with ``"Proximity_Temp"``, ``"Surface Density"``
            and ``"log_Density"`` added (or overwritten), every numeric column
            cast to float, and each numeric value whose absolute z-score is
            ``>= self.threshold`` replaced with NaN.

        Raises
        ------
        KeyError
            If one of the required columns is missing.
        """
        Xls = Xls.copy()

        # Feature engineering first, so the new columns are also screened for
        # outliers. The 1e-5 offset guards against a zero temperature.
        surface_templs = Xls["Surface Temperature"] + 1e-5
        Xls["Proximity_Temp"] = Xls["Proximity to Star"] / surface_templs
        Xls["Surface Density"] = Xls["Mineral Abundance"] * surface_templs
        # Shifting by the column minimum keeps the log argument >= 1.
        # NOTE: log1p adds another 1, so this evaluates log(x - min + 2).
        Xls["log_Density"] = np.log1p(
            Xls["Atmospheric Density"] - Xls["Atmospheric Density"].min() + 1
        )

        # Only numeric columns take part in outlier handling.
        num_colsls = Xls.select_dtypes(include=[np.number]).columns.tolist()

        # Integer columns cannot hold NaN, so cast everything to float first.
        Xls[num_colsls] = Xls[num_colsls].astype(float)

        # Column-wise absolute z-scores, ignoring NaNs that are already there.
        # Zero-variance columns produce 0/0; silence the floating-point
        # warnings because the resulting NaN z-scores are handled below.
        with np.errstate(divide="ignore", invalid="ignore"):
            z_scoresls = np.abs(
                zscore(Xls[num_colsls].to_numpy(), nan_policy="omit")
            )

        # Flag only values whose z-score is a real number at or above the
        # threshold. A NaN z-score (missing value, or a zero-variance column)
        # compares False, so those cells keep their original value instead of
        # the whole column being wiped out.
        is_outlierls = z_scoresls >= self.threshold
        Xls[num_colsls] = Xls[num_colsls].mask(is_outlierls)

        return Xls