Detection of univariate outliers
Univariate outliers are those observations which deviate from the population only along one feature. An example would be a worker that has a similar education and experience as other members in a team, but that has an unusually high salary.
Two widely used methods for univariate outlier detection are the IQR and z-score methods. Scikit-learn, the popular Python package for ML, however, does not implement them. See, for example, this discussion on the sklearn page on GitHub. The reason seems to be that the core team maintaining sklearn would like to avoid including transformers that change the number of instances in the data (which would be the case if outliers were detected and removed).
Below is an implementation of the IQR and z-score methods that, instead of deleting instances, sets the outlier values for specific features to np.nan. This gives one the choice to handle them further in any preferred way — for example, impute these values, possibly within the preprocessing pipeline, or delete the instances.
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import validate_data
class UnivariateOutlierDetector(TransformerMixin, BaseEstimator):
    """Detect univariate outliers and replace them with ``np.nan``.

    Outlier thresholds are learned per feature in :meth:`fit` using either
    the IQR or the z-score (mean +/- factor * std) method.  Rather than
    deleting instances, :meth:`transform` sets outlying values to
    ``np.nan`` so they can be imputed (or dropped) in a later pipeline
    step.

    Parameters
    ----------
    factor : float, default=1.5
        The factor to multiply the IQR or the standard deviation by to
        determine the outlier thresholds.
    method : str, default="iqr"
        The method to use for outlier detection. Either "iqr" or
        "zscore".
    """

    def __init__(self, factor=1.5, method="iqr"):
        # Per sklearn convention, __init__ only stores hyperparameters;
        # all validation happens in fit().
        self.factor = factor
        self.method = method

    def fit(self, X, y=None):
        """Learn per-feature lower/upper outlier thresholds from X.

        Parameters
        ----------
        X : array-like or pd.Series of shape (n_samples, n_features)
            Training data; must not contain missing values.
        y : ignored
            Present for pipeline-API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X contains NaN or if ``method`` is not "iqr"/"zscore".
        """
        # A Series is 1-D; reshape to the (n_samples, 1) shape sklearn
        # validation expects.
        if isinstance(X, pd.Series):
            X = X.values.reshape(-1, 1)
        X = validate_data(self, X)
        # validate_data returns an ndarray, so a plain NaN check suffices
        # (no need for the pandas isnull() fallback). Raise instead of
        # assert so validation survives `python -O`.
        if np.isnan(X).any():
            raise ValueError("X must not contain missing values (NaN).")
        if self.method == "iqr":
            q25 = np.percentile(X, 25, axis=0)
            q75 = np.percentile(X, 75, axis=0)
            cut_off = (q75 - q25) * self.factor
            self._lower = q25 - cut_off
            self._upper = q75 + cut_off
        elif self.method == "zscore":
            mean = np.mean(X, axis=0)
            std = np.std(X, axis=0)
            self._lower = mean - self.factor * std
            self._upper = mean + self.factor * std
        else:
            # Fail loudly here: silently leaving thresholds unset would
            # surface later as a confusing comparison-with-None error.
            raise ValueError(
                f"Unknown method {self.method!r}; expected 'iqr' or 'zscore'."
            )
        self._is_fitted = True
        return self

    def predict(self, X, y=None):
        """Return a boolean mask: True marks outliers, False inliers."""
        check_is_fitted(self)
        if isinstance(X, pd.Series):
            X = X.values.reshape(-1, 1)
        X = validate_data(self, X, reset=False)
        # Values inside [lower, upper] are inliers (False); everything
        # else, per feature, is an outlier (True).
        return np.where((X >= self._lower) & (X <= self._upper), False, True)

    def transform(self, X, y=None):
        """Detect outliers and set them to ``np.nan``.

        Returns a float copy of X with outlying entries replaced by NaN;
        the number of instances is unchanged.
        """
        check_is_fitted(self)
        if isinstance(X, pd.Series):
            X = X.values.reshape(-1, 1)
        X = validate_data(self, X, reset=False)
        # Cast to float so NaN assignment works even for integer input.
        X = X.copy().astype(float)
        outlier_mask = self.predict(X)
        X[outlier_mask] = np.nan
        return X

    def fit_transform(self, X, y=None):
        """Fit on X, then transform it (outliers -> np.nan)."""
        return self.fit(X, y).transform(X)

    def __sklearn_is_fitted__(self):
        """Check fitted status and return a Boolean value."""
        return hasattr(self, "_is_fitted") and self._is_fitted
The implementation conforms to sklearn’s requirements for custom estimator classes, and as such can be included in an sklearn or sktime Pipeline, along with other preprocessing steps. In the pipeline, it can be followed by an imputer:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.datasets import make_regression
# Demo: synthetic regression data, then detect z-score outliers and
# impute the resulting NaNs with the feature mean.
X, y = make_regression(
    n_samples=500,
    n_features=10,
    noise=0.1,
    random_state=42,
)

pipeline = Pipeline([
    ("outlier_detector", UnivariateOutlierDetector(method="zscore")),
    ("imputer", SimpleImputer(strategy="mean")),
])

# Pipeline.transform returns the cleaned feature matrix, not model
# predictions, so name the result accordingly (the original `y_pred`
# was misleading). fit_transform fits and transforms in one call.
X_clean = pipeline.fit_transform(X, y)