# Source code for ai4water.preprocessing.imputation


from typing import Union

from ai4water.backend import imputations, np, pd, plt


# TODO: add a LightGBM-based imputation method, see
# https://www.kaggle.com/robikscube/handling-with-missing-data-youtube-stream#Level-4:-LightGBM-Imputer!!
# TODO: plot the distribution of the imputed values

class Imputation(object):

    """
    Implements imputation of missing values using a range of methods.

    The data given at construction time is copied into a DataFrame and kept
    untouched on the ``data`` attribute. Every call of the instance imputes a
    fresh copy of that data, stores the imputed DataFrame on ``new_data`` and
    returns it in the container type of the input (DataFrame, ndarray or
    list). Because the original data is never overwritten, the same instance
    can be called repeatedly with different methods or arguments.

    Imputation Methods
    ------------------

    - pandas:
        Pandas library provides two methods for filling input data.
        `interpolate`: filling by interpolation
          Example of imputer_args can be
              {'method': 'spline', 'order': 2}
          For detailed args to be passed see interpolate_

        `fillna`:
          example of imputer_args can be
             {'method': 'ffill'}
          For detailed args to be passed see fillna_

    - sklearn:
        scikit-learn library provides 3 different imputation methods.
        `SimpleImputer`:
          For details see SimpleImputer_
        `IterativeImputer`:
          imputer_args example: {'n_nearest_features': 2}
          For details see IterativeImputer_
        `KNNImputer`:
          All the args accepted by KNNImputer of sklearn can be passed as in imputer_args.
          imputer_args example: {'n_neighbors': 3}.
          For details see KNNImputer_
    - fancyimpute:
        knn:
        NuclearNormMinimization
        SoftImpute
        Biscaler
    - transdim:

    Methods
    --------
    - :py:meth:`ai4water.preprocessing.imputation.Imputation.plot` plots the imputed values.

    - :py:meth:`ai4water.preprocessing.imputation.Imputation.missing_indices` indices of missing data.


    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> from ai4water.preprocessing import Imputation
        >>> df = pd.DataFrame([1,3,np.nan,  np.nan, 9, np.nan, 11])
        >>> imputer = Imputation(df, method='fillna', imputer_args={'method': 'ffill'})
        >>> imputer()
        # change the imputation method
        >>> imputer.method = 'interpolate'
        >>> imputer(method='cubic')
        # Now try with KNN imputation
        >>> imputer.method = 'KNNImputer'
        >>> imputer(n_neighbors=3)

    .. _fillna:
        https://pandas.pydata.org/pandas-docs/version/0.22.0/generated/pandas.DataFrame.fillna.html
    .. _interpolate:
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html
    .. _SimpleImputer:
        https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer
    .. _IterativeImputer:
        https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html#sklearn.impute.IterativeImputer
    .. _KNNImputer:
        https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html
    """

    # `fillna(method=...)` spellings and the dedicated pandas method that
    # performs the same directional fill.
    _DIRECTIONAL_FILLS = {
        'ffill': 'ffill',
        'pad': 'ffill',
        'bfill': 'bfill',
        'backfill': 'bfill',
    }

    def __init__(
            self,
            data: Union[pd.DataFrame, np.ndarray, list],
            method: str = 'KNNImputer',
            features=None,
            imputer_args: dict = None
    ):
        """
        Arguments:
            data:
                the data which contains missing values. It is copied, so the
                object passed in is never modified.
            method:
                the method to apply for missing
            features:
                the features (column names) on which imputation is to be
                applied. A single name or any sequence of names is accepted.
                If None or empty, all columns are used.
            imputer_args:
                arguments for underlying imputer function
        """
        self.data = self.maybe_make_df(data)
        self.method = method

        # `features or columns` would raise for array-likes (ambiguous truth
        # value), so emptiness is tested explicitly.
        if features is None or len(features) == 0:
            self.features = self.data.columns
        elif isinstance(features, str):
            # a bare string would otherwise turn `col in features` into a
            # substring test
            self.features = [features]
        else:
            self.features = list(features)

        self.imputer_args = {} if imputer_args is None else imputer_args
        # imputed DataFrame of the most recent call; None until first call
        self.new_data = None

    @property
    def method(self):
        """Name of the imputation method used by the next call."""
        return self._method

    @method.setter
    def method(self, x):
        self._method = x

    def call(self, *args, **kwargs):
        """Hook for custom methods; subclasses must override it.

        It is invoked by ``__call__`` when ``method`` is neither a pandas
        method nor a key of ``imputations``.
        """
        raise NotImplementedError(f"You must ovewrite the `call` method to implement {self.method} method")

    def __call__(self, *args, **kwargs):
        """
        Imputes the missing values and returns the imputed data.

        If kwargs are provided they will overwrite self.imputer_args. This helps to use same instance of
        Imputation class with different args.

        Returns:
            the imputed data as a list, an ndarray or a DataFrame depending
            on the type of the data given at construction time. For list
            input the values are flattened. The imputed DataFrame is also
            stored on ``new_data``; ``data`` keeps the original values.
        """
        kwargs = kwargs or self.imputer_args

        if self.method in ('fillna', 'interpolate'):  # it is a pandas based
            imputed = self.data.copy()
            for col in self.data.columns:
                if col in self.features:
                    imputed[col] = self._impute_with_pandas(self.data[col], kwargs)

        elif self.method in imputations:
            imputer = imputations[self.method](**kwargs)

            # working on a copy so that non-imputed features remain intact
            imputed = self.data.copy()
            values = imputer.fit_transform(self.data[self.features].values)

            if isinstance(values, np.ndarray):
                values = pd.DataFrame(values, columns=self.features, index=self.data.index)

            imputed[self.features] = values

        else:
            # NOTE: the hook is deliberately called without arguments, as
            # before, so that existing overrides keep working.
            return self.call()

        # `self.data` is left untouched; overwriting it would make any later
        # call a no-op and break `plot`/`missing_indices`.
        self.new_data = imputed

        if self._dtype == 'list':
            return imputed.values.reshape(-1,).tolist()
        if self._dtype == 'ndarray':
            return imputed.values
        return imputed

    def _impute_with_pandas(self, column, kwargs):
        """Apply the pandas method named by ``self.method`` to one column.

        ``fillna(method='ffill')`` and its siblings are routed to
        ``Series.ffill``/``Series.bfill`` because recent pandas versions
        deprecate the ``method`` argument of ``fillna``.
        """
        if self.method == 'fillna' and kwargs.get('value') is None:
            direction = self._DIRECTIONAL_FILLS.get(kwargs.get('method'))
            if direction is not None:
                remaining = {k: v for k, v in kwargs.items() if k not in ('method', 'value')}
                return getattr(column, direction)(**remaining)

        return getattr(column, self.method)(**kwargs)

    def plot(self, cols=None, st=0, en=None):
        """
        Plots the original data together with the imputed values.

        The imputer must have been called before, otherwise there are no
        imputed values to show.

        cols: column or columns to plot from data. If None, all columns
            are plotted.
        st: int, position of first row to plot
        en: int, position of the row at which to stop. If None, rows till
            the end of data are plotted.

        Raises:
            RuntimeError: if the imputer has not been called yet.
            ValueError: if any of `cols` is not a column of the data.

        Example
        -------
            >>> imputer.plot(cols=['in1', 'in2'], st=0, en=25)
        """
        if self.new_data is None:
            raise RuntimeError("Nothing to plot yet: call the imputer first to compute imputed values.")

        if cols is None:
            cols = list(self.new_data.columns)
        elif isinstance(cols, str):
            cols = [cols]
        else:
            cols = list(cols)

        unknown = [col for col in cols if col not in self.data.columns]
        if unknown:
            raise ValueError(f"columns {unknown} are not present in the data")

        if en is None:
            en = len(self.data)
        plt.close('all')
        _, axis = plt.subplots(len(cols), sharex='all')

        # a single subplot is returned as a bare Axes object
        if not isinstance(axis, np.ndarray):
            axis = [axis]

        for col, ax in zip(cols, axis):
            ax.plot(self.data[col].iloc[st:en], linestyle='-', color='k', marker='o', fillstyle='full',
                    label="Original")
            ax.plot(self._imputed_points(col, st, en), linestyle='--', marker='*', color='aqua',
                    label="Imputed")

            ax.set_title(col)
            ax.legend()

        plt.show()

        return

    def _imputed_points(self, col, st=0, en=None):
        """Return the imputed values of `col` lying within rows ``st:en``.

        The row window is taken first and the missing-value mask is applied
        afterwards, so the points line up with the window of original data.
        """
        was_missing = self.data[col].isna().values[st:en]
        return self.new_data[col].iloc[st:en][was_missing]

    def missing_indices(self) -> dict:
        """Returns, for every column, a boolean array marking missing values.

        The arrays are computed from the original data, so the result does
        not change after imputation.
        """
        # https://github.com/scikit-learn/scikit-learn/blob/7cc3dbcbe/sklearn/impute/_base.py#L556
        return {col: self.data[col].isna().values for col in self.data.columns}

    def maybe_make_df(self, data):
        """Returns a DataFrame copy of `data` and records the input type.

        The class name of `data` is stored on ``_dtype``; ``__call__`` uses
        it to return the result in the same kind of container. Anything that
        is not a DataFrame is converted with ``np.array``; one-dimensional
        input becomes a single column. Generated columns are named
        ``data0``, ``data1``, ...
        """
        self._dtype = data.__class__.__name__

        if isinstance(data, pd.DataFrame):
            return data.copy()

        # np.array always copies, so the caller's object is never shared
        array = np.array(data)
        if array.ndim == 1:
            array = array.reshape(-1, 1)
        return pd.DataFrame(array, columns=[f'data{i}' for i in range(array.shape[1])])