[ ]:
%matplotlib inline


Building Machine Learning Models for Regression

This notebook shows how to build machine learning models for a regression problem using the ai4water framework.
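
At a glance, the workflow is: prepare the data, wrap an estimator in Model, fit it, and then predict/evaluate. The sketch below condenses the calls demonstrated in the rest of the notebook into a single cell (with default arguments, MtropicsLaos() will download the data if it is not already available locally):

[ ]:
# minimal end-to-end sketch of the workflow covered below
from ai4water import Model
from ai4water.datasets import MtropicsLaos

data = MtropicsLaos().make_regression(lookback_steps=1)  # prepare input/target data
model = Model(model="RandomForestRegressor")             # build the model
model.fit(data=data)                                     # train on the training split
model.evaluate(data=data, metrics=["r2", "rmse"])        # evaluate on the test split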

[ ]:
try:
    import ai4water
except (ImportError, ModuleNotFoundError):
    !pip install ai4water[ml]
[15]:
import site
# make a local AI4Water checkout importable (author's local setup)
site.addsitedir("D:\\mytools\\AI4Water")


from ai4water import Model
from ai4water.datasets import MtropicsLaos
from ai4water.utils.utils import get_version_info

[16]:


for k,v in get_version_info().items(): print(f"{k} version: {v}")
python version: 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:16) [MSC v.1916 64 bit (AMD64)]
os version: nt
ai4water version: 1.07
lightgbm version: 3.3.1
tcn version: 3.4.0
catboost version: 0.26
xgboost version: 1.5.0
easy_mpl version: 0.21.3
SeqMetrics version: 1.3.3
tensorflow version: 2.7.0
keras.api._v2.keras version: 2.7.0
numpy version: 1.21.0
pandas version: 1.3.4
matplotlib version: 3.4.3
h5py version: 3.5.0
sklearn version: 1.0.1
shapefile version: 2.3.0
fiona version: 1.8.22
xarray version: 0.20.1
netCDF4 version: 1.5.7
optuna version: 2.10.1
skopt version: 0.9.0
hyperopt version: 0.2.7
plotly version: 5.3.1
lime version: NotDefined
seaborn version: 0.11.2
[18]:


dataset = MtropicsLaos(
    save_as_nc=True,  # if set to True, then netcdf must be installed
    convert_to_csv=False,
    path="F:\\data\\MtropicsLaos",  # path where the data is saved
)
data = dataset.make_regression(lookback_steps=1)
data.shape

    Not downloading the data since the directory
    F:\data\MtropicsLaos already exists.
    Use overwrite=True to remove previously saved files and download again
Value based partial slicing on non-monotonic DatetimeIndexes with non-existing keys is deprecated and will raise a KeyError in a future Version.
[18]:
(258, 9)
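
make_regression returns a pandas DataFrame with 258 samples and 9 columns; by default ai4water treats the last column as the target and the preceding 8 columns as input features. A quick way to inspect it (a minimal sketch, no particular column names are assumed):

[ ]:
print(data.columns.tolist())  # 8 input feature names followed by the target name
data.head()                   # first few rows of inputs and target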

Random Forest

[19]:
model = Model(model="RandomForestRegressor")

            building ML model for
            regression problem using RandomForestRegressor

Training

[20]:

model.fit(data=data)
***** Training *****
input_x shape:  (144, 8)
target shape:  (144, 1)
[20]:
RandomForestRegressor(random_state=313)
[21]:
model.fit_on_all_training_data(data=data)
***** Training *****
input_x shape:  (144, 8)
target shape:  (144, 1)
***** Validation *****
input_x shape:  (36, 8)
target shape:  (36, 1)
[21]:
RandomForestRegressor(random_state=313)
[22]:
x, y = model.training_data()

output = model.fit(x=x, y=y)
***** Training *****
input_x shape:  (144, 8)
target shape:  (144, 1)
[23]:
type(output)
[23]:
sklearn.ensemble._forest.RandomForestRegressor
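
Since fit with explicit arrays returns the underlying scikit-learn estimator, its attributes are available directly, for example the feature importances of the fitted forest (a minimal sketch; feature_importances_ is a standard attribute of scikit-learn's RandomForestRegressor):

[ ]:
# importances of the 8 input features as estimated by the fitted forest
print(output.feature_importances_)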

Prediction

Once the model is trained, we can use it to make predictions.

[24]:
_ = model.predict(data=data)
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[six prediction-result plots]
[17]:
_ = model.predict(data=data, process_results=False)
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[18]:
_ = model.predict(data=data, plots=["regression"])
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[regression plot]
[25]:
t,p = model.predict(data=data, process_results=False, return_true=True)
t.shape, p.shape
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[25]:
((78, 1), (78,))
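
With both the true and predicted arrays in hand, metrics can also be computed outside the Model class, for example with the SeqMetrics package listed in the version info above (a minimal sketch; the arrays are flattened so their shapes match):

[ ]:
from SeqMetrics import RegressionMetrics

# compute a few metrics directly from the arrays returned by predict
errors = RegressionMetrics(t.reshape(-1,), p.reshape(-1,))
print(errors.r2(), errors.r2_score(), errors.rmse())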
[26]:
_ = model.predict_on_training_data(data=data, plots=["regression"])
***** Training *****
input_x shape:  (144, 8)
target shape:  (144, 1)
[regression plot]

Evaluation

[27]:
model.evaluate(data=data)
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[27]:
0.1064417278288089
[28]:
model.evaluate(data=data, metrics=["r2", "r2_score", "rmse"])
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[28]:
{'r2': 0.15480087060681375,
 'r2_score': 0.1064417278288089,
 'rmse': 24752.12321159926}
[29]:
model.evaluate_on_test_data(data=data, metrics=["r2", "r2_score", "rmse"])
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[29]:
{'r2': 0.15480087060681375,
 'r2_score': 0.1064417278288089,
 'rmse': 24752.12321159926}
[30]:
model.evaluate_on_training_data(data=data, metrics=["r2", "r2_score", "rmse"])
***** Training *****
input_x shape:  (144, 8)
target shape:  (144, 1)
[30]:
{'r2': 0.9148494793552957,
 'r2_score': 0.8699813684402202,
 'rmse': 2495.2543044816152}
[31]:
model.evaluate(x=x, y=y, metrics=["r2", "r2_score", "rmse"])
[31]:
{'r2': 0.9148494793552957,
 'r2_score': 0.8699813684402202,
 'rmse': 2495.2543044816152}
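
For comparison, the same arrays can be scored with plain scikit-learn, since the object returned by fit above is an ordinary RandomForestRegressor (a minimal sketch; r2_score here is scikit-learn's function, which corresponds to the "r2_score" metric name used above):

[ ]:
from sklearn.metrics import r2_score

# score the training arrays with the fitted scikit-learn estimator directly
r2_score(y, output.predict(x))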
[36]:
model = Model(model={"RandomForestRegressor": {"n_estimators": 200}})


            building ML model for
            regression problem using RandomForestRegressor
[37]:
model.fit(data=data)
model.evaluate_on_test_data(data=data, metrics=["r2", "r2_score", "rmse"])
***** Training *****
input_x shape:  (144, 8)
target shape:  (144, 1)
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[37]:
{'r2': 0.17442335408602663,
 'r2_score': 0.11857515027421384,
 'rmse': 24583.497124708025}

XGBoost

[38]:
model = Model(model={"XGBRegressor": {"learning_rate": 0.01}})

            building ML model for
            regression problem using XGBRegressor
[39]:
model.fit(data=data)
model.evaluate_on_test_data(data=data, metrics=["r2", "r2_score", "rmse"])
***** Training *****
input_x shape:  (144, 8)
target shape:  (144, 1)
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[39]:
{'r2': 0.006633469985602984,
 'r2_score': -0.016195787844924148,
 'rmse': 26396.0968745532}

CatBoost

[42]:
model = Model(model={"CatBoostRegressor": {"learning_rate": 0.01}}, verbosity=0)
[43]:
model.fit(data=data)
model.evaluate_on_test_data(data=data, metrics=["r2", "r2_score", "rmse"])
[43]:
{'r2': 0.14617658996808958,
 'r2_score': 0.07440578685241073,
 'rmse': 25191.923784722494}

Light Gradient Boosting Machine (LightGBM)

[44]:
model = Model(model={"LGBMRegressor": {"num_leaves": 45}})

            building ML model for
            regression problem using LGBMRegressor
[45]:
model.fit(data=data)
model.evaluate_on_test_data(data=data, metrics=["r2", "r2_score", "rmse"])
***** Training *****
input_x shape:  (144, 8)
target shape:  (144, 1)
***** Test *****
input_x shape:  (78, 8)
target shape:  (78, 1)
[45]:
{'r2': 0.0028383594363326804,
 'r2_score': -0.01702525035421454,
 'rmse': 26406.86748933602}
[7]:
model._model
[7]:
LGBMRegressor(num_leaves=45, random_state=313)

Custom model/estimator/algorithm

uninitialized

[9]:
from sklearn.ensemble import RandomForestRegressor

class MyRF(RandomForestRegressor):
    pass


model = Model(model=MyRF,
              ts_args={'lookback': 1},
              mode="regression")

            building ML model for
            regression problem using <class '__main__.MyRF'>

uninitialized with arguments

[10]:
model = Model(model={MyRF: {"n_estimators": 10}},
              ts_args={'lookback': 1},
              mode="regression")

            building ML model for
            regression problem using <class '__main__.MyRF'>

initialized

[ ]:
model = Model(model=MyRF(), mode="regression")
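
An initialized custom estimator is then trained and evaluated in exactly the same way as the built-in ones (a minimal sketch reusing the data prepared earlier in this notebook):

[ ]:
model.fit(data=data)
model.evaluate_on_test_data(data=data, metrics=["r2", "rmse"])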
[ ]: