Source code for ai4water.preprocessing.imputation


from typing import Union

from ai4water.backend import imputations, np, pd, plt


# use LGBM imputation method
# https://www.kaggle.com/robikscube/handling-with-missing-data-youtube-stream#Level-4:-LightGBM-Imputer!!
# plot imputation distribution

class Imputation(object):
    """
    Implements imputation of missing values using a range of methods.

    The data handed to the constructor is copied into a DataFrame and kept
    untouched in ``self.data``. Every call of the instance imputes that
    original data again and stores the result in ``self.new_data``, so the same
    instance can be used to try several methods one after another.

    Imputation Methods
    ------------------
    - pandas:
        Pandas library provides two methods for filling input data.
        `interpolate`: filling by interpolation
          Example of imputer_args can be {'method': 'spline', 'order': 2}
          For detailed args to be passed see interpolate_
        `fillna`:
          example of imputer_args can be {'method': 'ffill'}
          For detailed args to be passed see fillna_
    - sklearn:
        scikit-learn library provides 3 different imputation methods.
        `SimpleImputer`:
          For details see SimpleImputer_
        `IterativeImputer`:
          imputer_args example: {'n_nearest_features': 2}
          For details see IterativeImputer_
        `KNNImputer`:
          All the args accepted by KNNImputer of sklearn can be passed as in
          imputer_args. imputer_args example: {'n_neighbors': 3}.
          For details KNNImputer_
    - fancyimpute:
        knn:
        NuclearNormMinimization
        SoftImpute
        Biscaler
    - transdim:

    Methods
    --------
    - :py:meth:`ai4water.preprocessing.imputation.Imputation.plot`
        plots the imputed values.
    - :py:meth:`ai4water.preprocessing.imputation.Imputation.missing_indices`
        indices of missing data.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> from ai4water.preprocessing import Imputation
        >>> df = pd.DataFrame([1,3,np.nan,  np.nan, 9, np.nan, 11])
        >>> imputer = Imputation(df, method='fillna', imputer_args={'method': 'ffill'})
        >>> imputer()
        # change the imputation method
        >>> imputer.method = 'interpolate'
        >>> imputer(method='cubic')
        # Now try with KNN imputation
        >>> imputer.method = 'KNNImputer'
        >>> imputer(n_neighbors=3)

    .. _fillna:
        https://pandas.pydata.org/pandas-docs/version/0.22.0/generated/pandas.DataFrame.fillna.html

    .. _interpolate:
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html

    .. _SimpleImputer:
        https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer

    .. _IterativeImputer:
        https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html#sklearn.impute.IterativeImputer

    .. _KNNImputer:
        https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html
    """

    def __init__(
            self,
            data: Union[pd.DataFrame, np.ndarray, list],
            method: str = 'KNNImputer',
            features=None,
            imputer_args: dict = None
    ):
        """
        Arguments:
            data:
                the data which contains missing values
            method:
                the method to apply for missing
            features:
                the features on which imputation is to be applied. A single
                column name or any sequence of column names (list, pandas
                Index, numpy array). ``None`` or an empty value selects all
                columns.
            imputer_args:
                arguments for underlying imputer function
        """
        self.data = self.maybe_make_df(data)
        self.method = method

        # `features or columns` would raise for Index/ndarray inputs because
        # their truth value is ambiguous, so emptiness is tested explicitly.
        if features is None:
            features = self.data.columns
        elif isinstance(features, str):
            # a bare string would otherwise be matched by substring
            features = [features] if features else self.data.columns
        elif len(features) == 0:
            features = self.data.columns
        self.features = features

        self.imputer_args = {} if imputer_args is None else imputer_args
        # imputed copy of `self.data`; populated by `__call__`
        self.new_data = None

    @property
    def method(self):
        """Name of the imputation method applied by `__call__`."""
        return self._method

    @method.setter
    def method(self, x):
        self._method = x

    def call(self, *args, **kwargs):
        """Hook for subclasses: used when `method` is neither a pandas method
        nor a key of `imputations`."""
        raise NotImplementedError(f"You must ovewrite the `call` method to implement {self.method} method")

    def __call__(self, *args, **kwargs):
        """
        Imputes the original data and returns the result.

        If kwargs are provided they will overwrite self.imputer_args. This
        helps to use same instance of Imputation class with different args.

        The result is stored as a DataFrame in ``self.new_data`` while
        ``self.data`` keeps the original (un-imputed) values, so that the
        instance can be called again and `plot`/`missing_indices` keep working.

        Returns:
            the imputed data in the container type of the input: a list for
            list input, a numpy array for array input, otherwise a DataFrame.
        """
        kwargs = kwargs or self.imputer_args

        # work on a copy so that the original data and the features which are
        # not imputed remain intact
        imputed = self.data.copy()

        if self.method in ['fillna', 'interpolate']:  # it is a pandas based
            for col in imputed.columns:
                if col in self.features:
                    imputed[col] = getattr(imputed[col], self.method)(**kwargs)

        elif self.method in imputations:
            imputer = imputations[self.method](**kwargs)

            filled = imputer.fit_transform(imputed[self.features].values)

            if isinstance(filled, np.ndarray):
                filled = pd.DataFrame(filled, columns=self.features, index=imputed.index)

            imputed[self.features] = filled

        else:
            return self.call()

        self.new_data = imputed

        # hand the result back in the same container type the user gave us
        if self._dtype == 'list':
            return imputed.values.reshape(-1,).tolist()
        if self._dtype == 'ndarray':
            return imputed.values
        return imputed

    def plot(self, cols=None, st=0, en=None):
        """
        Plots original and imputed values. The imputation must have been
        carried out (by calling the instance) before plotting.

        Arguments:
            cols:
                columns to plot from data. A single column name or a list
                of column names. By default all columns are plotted.
            st: int
                position of first row to plot
            en: int
                position at which to stop plotting. Defaults to length of data.

        Raises:
            RuntimeError: if the instance has not been called yet.

        Example
        -------
            >>> imputer.plot(cols=['in1', 'in2'], st=0, en=25)
        """
        if self.new_data is None:
            raise RuntimeError("No imputed data found. Call the imputer before plotting.")

        if cols is None:
            cols = list(self.new_data.columns)
        elif not isinstance(cols, list):
            assert isinstance(cols, str) and cols in self.data, f"unknown column {cols}"
            cols = [cols]

        if en is None:
            en = len(self.data)

        plt.close('all')
        _, axis = plt.subplots(len(cols), sharex='all')

        # plt.subplots returns a bare Axes when only one subplot is requested
        if not isinstance(axis, np.ndarray):
            axis = [axis]

        indices = self.missing_indices()

        for col, ax in zip(cols, axis):
            # restrict to the requested window first and only then pick the
            # imputed points, so both curves cover the same rows
            mask = indices[col][st:en]
            original = self.data[col].iloc[st:en]
            imputed = self.new_data[col].iloc[st:en][mask]

            ax.plot(original, linestyle='-', color='k', marker='o', fillstyle='full',
                    label="Original")
            ax.plot(imputed, linestyle='--', marker='*', color='aqua', label="Imputed")

            ax.set_title(col)
            ax.legend()

        plt.show()

    def missing_indices(self) -> dict:
        """
        Finds the missing values in the original data.

        Returns:
            a dictionary whose keys are column names and values are boolean
            numpy arrays which are True where the original value is missing.
        """
        # isna works for every dtype whereas casting to float raises on
        # non-numeric columns
        return {col: self.data[col].isna().to_numpy() for col in self.data.columns}

    def maybe_make_df(self, data):
        """
        Returns a DataFrame holding a copy of `data` and records the type name
        of the input in ``self._dtype`` so that `__call__` can return the
        result in the same container type.

        One dimensional input becomes a single column. Columns of non-DataFrame
        input are named 'data0', 'data1', ...
        """
        self._dtype = data.__class__.__name__

        if isinstance(data, pd.DataFrame):
            return data.copy()

        # np.array copies its input, so the caller's object is never modified
        data = np.array(data)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        return pd.DataFrame(data, columns=['data' + str(i) for i in range(data.shape[1])])