Open In Colab

View Source on GitHub

Data Preparation for Regression task

This notebook describes how to prepare data for a single machine learning regression problem.

[ ]:
try:
    import ai4water
except ModuleNotFoundError:
    !pip install ai4water
[104]:
import os
import site

# Optional local development checkout of AI4Water. The location can be
# overridden with the AI4WATER_DIR environment variable, and the path is only
# registered when the directory actually exists, so the notebook also runs on
# machines (or Colab) where ai4water was installed with pip.
AI4WATER_DIR = os.environ.get("AI4WATER_DIR", "D:\\mytools\\AI4Water")
if os.path.isdir(AI4WATER_DIR):
    site.addsitedir(AI4WATER_DIR)

from ai4water.datasets import busan_beach
from ai4water.preprocessing import DataSet
from ai4water.utils.utils import get_version_info

[51]:
# Print every (library, version) pair reported by get_version_info(), so the
# environment that produced the outputs below is on record.
for lib, ver in get_version_info().items():
    print(lib, ver)
python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:16) [MSC v.1916 64 bit (AMD64)]
os nt
ai4water 1.06
lightgbm 3.3.1
tcn 3.4.0
catboost 0.26
xgboost 1.5.0
easy_mpl 0.21.3
SeqMetrics 1.3.3
tensorflow 2.7.0
keras.api._v2.keras 2.7.0
numpy 1.21.0
pandas 1.3.4
matplotlib 3.4.3
h5py 3.5.0
sklearn 1.0.1
shapefile 2.3.0
fiona 1.8.22
xarray 0.20.1
netCDF4 1.5.7
optuna 2.10.1
skopt 0.9.0
hyperopt 0.2.7
plotly 5.3.1
lime NotDefined
seaborn 0.11.2
[52]:
data = busan_beach()
[53]:
data.shape
[53]:
(1446, 14)
[54]:
data.columns
[54]:
Index(['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm',
       'pcp6_mm', 'pcp12_mm', 'wind_dir_deg', 'wind_speed_mps', 'air_p_hpa',
       'mslp_hpa', 'rel_hum', 'tetx_coppml'],
      dtype='object')
[55]:
data.head()
[55]:
tide_cm wat_temp_c sal_psu air_temp_c pcp_mm pcp3_mm pcp6_mm pcp12_mm wind_dir_deg wind_speed_mps air_p_hpa mslp_hpa rel_hum tetx_coppml
index
2018-06-19 00:00:00 36.407149 19.321232 33.956058 19.780000 0.0 0.0 0.0 0.0 159.533333 0.960000 1002.856667 1007.256667 95.000000 NaN
2018-06-19 00:30:00 35.562515 19.320124 33.950508 19.093333 0.0 0.0 0.0 0.0 86.596667 0.163333 1002.300000 1006.700000 95.000000 NaN
2018-06-19 01:00:00 34.808016 19.319666 33.942532 18.733333 0.0 0.0 0.0 0.0 2.260000 0.080000 1001.973333 1006.373333 95.000000 NaN
2018-06-19 01:30:00 30.645216 19.320406 33.931263 18.760000 0.0 0.0 0.0 0.0 62.710000 0.193333 1001.776667 1006.120000 95.006667 NaN
2018-06-19 02:00:00 26.608980 19.326729 33.917961 18.633333 0.0 0.0 0.0 0.0 63.446667 0.510000 1001.743333 1006.103333 95.006667 NaN
[56]:
data.tail()
[56]:
tide_cm wat_temp_c sal_psu air_temp_c pcp_mm pcp3_mm pcp6_mm pcp12_mm wind_dir_deg wind_speed_mps air_p_hpa mslp_hpa rel_hum tetx_coppml
index
2019-09-07 22:00:00 -3.989912 20.990612 33.776449 23.700000 0.0 0.0 0.0 0.5 203.760000 6.506667 1003.446667 1007.746667 88.170000 NaN
2019-09-07 22:30:00 -2.807042 21.012014 33.702310 23.620000 0.0 0.0 0.0 0.0 205.353333 5.633333 1003.520000 1007.820000 88.256667 NaN
2019-09-07 23:00:00 -3.471326 20.831739 33.726177 23.666667 0.0 0.0 0.0 0.0 202.540000 4.480000 1003.610000 1007.910000 87.833333 NaN
2019-09-07 23:30:00 0.707771 21.006086 33.716274 23.633333 0.0 0.0 0.0 0.0 207.206667 4.946667 1003.633333 1007.933333 88.370000 NaN
2019-09-08 00:00:00 1.011731 20.896149 33.729773 23.600000 0.0 0.0 0.0 0.0 210.200000 4.400000 1003.700000 1008.000000 87.700000 NaN
[57]:
data.isna().sum()
[57]:
tide_cm              0
wat_temp_c           0
sal_psu              0
air_temp_c           0
pcp_mm               0
pcp3_mm              0
pcp6_mm              0
pcp12_mm             0
wind_dir_deg         0
wind_speed_mps       0
air_p_hpa            0
mslp_hpa             0
rel_hum              0
tetx_coppml       1228
dtype: int64
[58]:
data.dropna().shape
[58]:
(218, 14)
[59]:
data.dropna().head()
[59]:
tide_cm wat_temp_c sal_psu air_temp_c pcp_mm pcp3_mm pcp6_mm pcp12_mm wind_dir_deg wind_speed_mps air_p_hpa mslp_hpa rel_hum tetx_coppml
index
2018-06-20 09:00:00 -22.245026 19.457182 34.004292 24.280000 0.0 0.0 0.0 6.0 205.006667 1.653333 998.613333 1002.913333 75.100000 444866.9004
2018-06-20 12:00:00 10.906243 19.511044 34.044975 26.076667 0.0 0.0 0.0 0.0 201.593333 2.993333 998.830000 1003.130000 67.423333 193368.2195
2018-06-20 15:00:00 15.025008 19.582047 34.134964 25.043333 0.0 0.0 0.0 0.0 188.976667 2.010000 998.190000 1002.490000 67.136667 287920.3535
2018-06-20 18:00:00 -7.755828 19.579559 34.106552 22.826667 0.0 0.0 0.0 0.0 209.493333 1.480000 998.416667 1002.716667 77.413333 246005.6510
2018-06-20 21:00:00 -18.817711 19.570045 34.100220 20.910000 0.0 0.0 0.0 0.0 260.616667 1.080000 999.843333 1004.143333 79.093333 273757.5439
[60]:
data.dropna().tail()
[60]:
tide_cm wat_temp_c sal_psu air_temp_c pcp_mm pcp3_mm pcp6_mm pcp12_mm wind_dir_deg wind_speed_mps air_p_hpa mslp_hpa rel_hum tetx_coppml
index
2019-09-06 11:00:00 15.146028 19.247823 33.746046 27.666667 0.0 0.0 0.0 0.0 71.336667 1.666667 1006.450000 1010.750000 75.393333 1.320332e+07
2019-09-06 12:00:00 24.810148 20.357189 33.778996 27.383333 0.0 0.0 0.0 0.0 49.626667 1.386667 1006.106667 1010.406667 75.896667 2.437392e+06
2019-09-06 13:00:00 25.666843 19.362318 33.810041 27.533333 0.0 0.0 0.0 0.0 43.590000 2.076667 1005.316667 1009.616667 76.056667 2.927098e+06
2019-09-06 14:00:00 25.712396 19.317668 33.727930 28.213333 0.0 0.0 0.0 0.0 42.160000 2.603333 1004.246667 1008.546667 71.943333 4.699929e+06
2019-09-06 15:00:00 18.448916 20.592932 33.831501 27.896667 0.0 0.0 0.0 0.0 29.850000 2.743333 1003.846667 1008.146667 72.740000 3.506092e+06
[61]:

# Wrap the whole DataFrame without naming input or output features; the
# outputs below show 13 input columns and a single target column.
ds = DataSet(data=data)
[62]:
train_x, train_y = ds.training_data()

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)
[63]:
train_x.shape, train_y.shape
[63]:
((121, 13), (121, 1))
[64]:
train_x[0]
[64]:
array([ -22.245026 ,   19.457182 ,   34.00429  ,   24.28     ,
          0.       ,    0.       ,    0.       ,    6.       ,
        205.00667  ,    1.6533333,  998.61334  , 1002.9133   ,
         75.1      ], dtype=float32)
[65]:
train_y[0]
[65]:
array([444866.9004])
[66]:
train_x[1]
[66]:
array([  10.906243 ,   19.511044 ,   34.044975 ,   26.076666 ,
          0.       ,    0.       ,    0.       ,    0.       ,
        201.59334  ,    2.9933333,  998.83     , 1003.13     ,
         67.42333  ], dtype=float32)
[67]:
train_y[1]
[67]:
array([193368.2195])
[68]:
test_x, test_y = ds.test_data()

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
[69]:
test_x[-1]
[69]:
array([  18.448915 ,   20.592932 ,   33.8315   ,   27.896667 ,
          0.       ,    0.       ,    0.       ,    0.       ,
         29.85     ,    2.7433333, 1003.8467   , 1008.14667  ,
         72.74     ], dtype=float32)
[70]:
test_y[-1]
[70]:
array([3506092.003])

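Defining inputs

When only `input_features` is specified, the remaining 8 columns are all treated as targets, as the target shape `(121, 8)` below shows.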

[71]:
ds = DataSet(
    data=data,
    input_features=['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm']
)
[72]:
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 6)
target shape:  (121, 8)
[72]:
((121, 6), (121, 8))
[73]:
train_x[0]
[73]:
array([-22.245026,  19.457182,  34.00429 ,  24.28    ,   0.      ,
         0.      ], dtype=float32)

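Defining outputs

Specifying `output_features` as well restricts the target to the listed column(s); the target shape below becomes `(121, 1)`.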

[74]:
ds = DataSet(
    data=data,
    input_features=['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm'],
    output_features=["tetx_coppml"]
)
[75]:
train_x, train_y = ds.training_data()

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 6)
target shape:  (121, 1)
[76]:
train_x[0]
[76]:
array([-22.245026,  19.457182,  34.00429 ,  24.28    ,   0.      ,
         0.      ], dtype=float32)
[77]:
train_y[0]
[77]:
array([444866.9004])
[78]:
train_x[1]
[78]:
array([10.906243, 19.511044, 34.044975, 26.076666,  0.      ,  0.      ],
      dtype=float32)
[79]:
train_y[1]
[79]:
array([193368.2195])
[80]:
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 6)
target shape:  (66, 1)
[80]:
((66, 6), (66, 1))
[81]:
test_x[-1]
[81]:
array([18.448915, 20.592932, 33.8315  , 27.896667,  0.      ,  0.      ],
      dtype=float32)
[82]:
test_y[-1]
[82]:
array([3506092.003])
[83]:
# Reload the dataset, this time requesting two target columns, to demonstrate
# data preparation for a multi-output problem. `data` is rebound here, so the
# frame now has 15 columns instead of 14.
data = busan_beach(target=["blaTEM_coppml", "tetx_coppml"])
data.shape
[83]:
(1446, 15)
[84]:
data.columns
[84]:
Index(['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm',
       'pcp6_mm', 'pcp12_mm', 'wind_dir_deg', 'wind_speed_mps', 'air_p_hpa',
       'mslp_hpa', 'rel_hum', 'blaTEM_coppml', 'tetx_coppml'],
      dtype='object')
[85]:
data.dropna().head()
[85]:
tide_cm wat_temp_c sal_psu air_temp_c pcp_mm pcp3_mm pcp6_mm pcp12_mm wind_dir_deg wind_speed_mps air_p_hpa mslp_hpa rel_hum blaTEM_coppml tetx_coppml
index
2018-06-20 09:00:00 -22.245026 19.457182 34.004292 24.280000 0.0 0.0 0.0 6.0 205.006667 1.653333 998.613333 1002.913333 75.100000 9.665350e+05 444866.9004
2018-06-20 12:00:00 10.906243 19.511044 34.044975 26.076667 0.0 0.0 0.0 0.0 201.593333 2.993333 998.830000 1003.130000 67.423333 3.834816e+05 193368.2195
2018-06-20 15:00:00 15.025008 19.582047 34.134964 25.043333 0.0 0.0 0.0 0.0 188.976667 2.010000 998.190000 1002.490000 67.136667 1.673262e+06 287920.3535
2018-06-20 18:00:00 -7.755828 19.579559 34.106552 22.826667 0.0 0.0 0.0 0.0 209.493333 1.480000 998.416667 1002.716667 77.413333 5.645747e+06 246005.6510
2018-06-20 21:00:00 -18.817711 19.570045 34.100220 20.910000 0.0 0.0 0.0 0.0 260.616667 1.080000 999.843333 1004.143333 79.093333 1.630322e+06 273757.5439
[88]:
ds = DataSet(
    data=data,
    output_features=["blaTEM_coppml", "tetx_coppml"],
    verbosity=0
)
[90]:
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape
[90]:
((121, 13), (121, 2))
[91]:
train_x[0]
[91]:
array([ -22.245026 ,   19.457182 ,   34.00429  ,   24.28     ,
          0.       ,    0.       ,    0.       ,    6.       ,
        205.00667  ,    1.6533333,  998.61334  , 1002.9133   ,
         75.1      ], dtype=float32)
[92]:
train_y[0]
[92]:
array([966535.0042, 444866.9004])
[93]:
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape
[93]:
((66, 13), (66, 2))
[94]:
test_x[-1]
[94]:
array([  18.448915 ,   20.592932 ,   33.8315   ,   27.896667 ,
          0.       ,    0.       ,    0.       ,    0.       ,
         29.85     ,    2.7433333, 1003.8467   , 1008.14667  ,
         72.74     ], dtype=float32)
[95]:
test_y[-1]
[95]:
array([8473063.881, 3506092.003])

Saving prepared data in h5 file

[99]:
# With save=True the training, validation and test splits are prepared as soon
# as the DataSet is constructed (see the output below).
# NOTE(review): the prepared data is presumably written to "data.h5" in the
# current working directory, since that is the file loaded in the next
# section -- confirm against the ai4water documentation.
ds = DataSet(
    data=data,
    output_features=["blaTEM_coppml", "tetx_coppml"],
    save=True
)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 2)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 2)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 2)

Loading from h5 file

[100]:
ds = DataSet.from_h5("data.h5")
[102]:
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape
[102]:
((121, 13), (121, 2))
[103]:
# Fetch the test split from the DataSet restored from the h5 file. It is bound
# to test_x/test_y so that the training arrays from the previous cell are not
# overwritten and the names match the split they hold.
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape
[103]:
((66, 13), (66, 2))

Multiple Inputs

Higher dimensional features

[ ]: