Data Preparation for Regression task

This notebook describes how to prepare data for a single machine learning regression problem.

[ ]:

try:
    import ai4water
except ModuleNotFoundError:
    !pip install ai4water

[104]:

import os
import site

# Optional: make a local development checkout of AI4Water importable.
# The location can be overridden through the AI4WATER_SRC environment variable;
# the default is the original author's local path. The directory is only
# registered when it actually exists on this machine.
AI4WATER_SRC = os.environ.get("AI4WATER_SRC", "D:\\mytools\\AI4Water")
if os.path.isdir(AI4WATER_SRC):
    site.addsitedir(AI4WATER_SRC)

from ai4water.datasets import busan_beach
from ai4water.preprocessing import DataSet
from ai4water.utils.utils import get_version_info

[51]:

# Print one "<library> <version>" line per entry reported by get_version_info(),
# so the environment used to produce the outputs below is on record.
versions = get_version_info()
for name in versions:
    print(name, versions[name])

python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:16) [MSC v.1916 64 bit (AMD64)]
os nt
ai4water 1.06
lightgbm 3.3.1
tcn 3.4.0
catboost 0.26
xgboost 1.5.0
easy_mpl 0.21.3
SeqMetrics 1.3.3
tensorflow 2.7.0
keras.api._v2.keras 2.7.0
numpy 1.21.0
pandas 1.3.4
matplotlib 3.4.3
h5py 3.5.0
sklearn 1.0.1
shapefile 2.3.0
fiona 1.8.22
xarray 0.20.1
netCDF4 1.5.7
optuna 2.10.1
skopt 0.9.0
hyperopt 0.2.7
plotly 5.3.1
lime NotDefined
seaborn 0.11.2

[52]:

# Load the Busan beach dataset with its default target; the cells below show
# 1446 rows and 14 columns, the last one being the target `tetx_coppml`.
data = busan_beach()

[53]:

data.shape

[53]:

(1446, 14)

[54]:

data.columns

[54]:

Index(['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm',
       'pcp6_mm', 'pcp12_mm', 'wind_dir_deg', 'wind_speed_mps', 'air_p_hpa',
       'mslp_hpa', 'rel_hum', 'tetx_coppml'],
      dtype='object')

[55]:

data.head()

[55]:

	tide_cm	wat_temp_c	sal_psu	air_temp_c	pcp_mm	pcp3_mm	pcp6_mm	pcp12_mm	wind_dir_deg	wind_speed_mps	air_p_hpa	mslp_hpa	rel_hum	tetx_coppml
index
2018-06-19 00:00:00	36.407149	19.321232	33.956058	19.780000	0.0	0.0	0.0	0.0	159.533333	0.960000	1002.856667	1007.256667	95.000000	NaN
2018-06-19 00:30:00	35.562515	19.320124	33.950508	19.093333	0.0	0.0	0.0	0.0	86.596667	0.163333	1002.300000	1006.700000	95.000000	NaN
2018-06-19 01:00:00	34.808016	19.319666	33.942532	18.733333	0.0	0.0	0.0	0.0	2.260000	0.080000	1001.973333	1006.373333	95.000000	NaN
2018-06-19 01:30:00	30.645216	19.320406	33.931263	18.760000	0.0	0.0	0.0	0.0	62.710000	0.193333	1001.776667	1006.120000	95.006667	NaN
2018-06-19 02:00:00	26.608980	19.326729	33.917961	18.633333	0.0	0.0	0.0	0.0	63.446667	0.510000	1001.743333	1006.103333	95.006667	NaN

[56]:

data.tail()

[56]:

	tide_cm	wat_temp_c	sal_psu	air_temp_c	pcp_mm	pcp3_mm	pcp6_mm	pcp12_mm	wind_dir_deg	wind_speed_mps	air_p_hpa	mslp_hpa	rel_hum	tetx_coppml
index
2019-09-07 22:00:00	-3.989912	20.990612	33.776449	23.700000	0.0	0.0	0.0	0.5	203.760000	6.506667	1003.446667	1007.746667	88.170000	NaN
2019-09-07 22:30:00	-2.807042	21.012014	33.702310	23.620000	0.0	0.0	0.0	0.0	205.353333	5.633333	1003.520000	1007.820000	88.256667	NaN
2019-09-07 23:00:00	-3.471326	20.831739	33.726177	23.666667	0.0	0.0	0.0	0.0	202.540000	4.480000	1003.610000	1007.910000	87.833333	NaN
2019-09-07 23:30:00	0.707771	21.006086	33.716274	23.633333	0.0	0.0	0.0	0.0	207.206667	4.946667	1003.633333	1007.933333	88.370000	NaN
2019-09-08 00:00:00	1.011731	20.896149	33.729773	23.600000	0.0	0.0	0.0	0.0	210.200000	4.400000	1003.700000	1008.000000	87.700000	NaN

[57]:

data.isna().sum()

[57]:

tide_cm              0
wat_temp_c           0
sal_psu              0
air_temp_c           0
pcp_mm               0
pcp3_mm              0
pcp6_mm              0
pcp12_mm             0
wind_dir_deg         0
wind_speed_mps       0
air_p_hpa            0
mslp_hpa             0
rel_hum              0
tetx_coppml       1228
dtype: int64

[58]:

data.dropna().shape

[58]:

(218, 14)

[59]:

data.dropna().head()

[59]:

	tide_cm	wat_temp_c	sal_psu	air_temp_c	pcp_mm	pcp3_mm	pcp6_mm	pcp12_mm	wind_dir_deg	wind_speed_mps	air_p_hpa	mslp_hpa	rel_hum	tetx_coppml
index
2018-06-20 09:00:00	-22.245026	19.457182	34.004292	24.280000	0.0	0.0	0.0	6.0	205.006667	1.653333	998.613333	1002.913333	75.100000	444866.9004
2018-06-20 12:00:00	10.906243	19.511044	34.044975	26.076667	0.0	0.0	0.0	0.0	201.593333	2.993333	998.830000	1003.130000	67.423333	193368.2195
2018-06-20 15:00:00	15.025008	19.582047	34.134964	25.043333	0.0	0.0	0.0	0.0	188.976667	2.010000	998.190000	1002.490000	67.136667	287920.3535
2018-06-20 18:00:00	-7.755828	19.579559	34.106552	22.826667	0.0	0.0	0.0	0.0	209.493333	1.480000	998.416667	1002.716667	77.413333	246005.6510
2018-06-20 21:00:00	-18.817711	19.570045	34.100220	20.910000	0.0	0.0	0.0	0.0	260.616667	1.080000	999.843333	1004.143333	79.093333	273757.5439

[60]:

data.dropna().tail()

[60]:

	tide_cm	wat_temp_c	sal_psu	air_temp_c	pcp_mm	pcp3_mm	pcp6_mm	pcp12_mm	wind_dir_deg	wind_speed_mps	air_p_hpa	mslp_hpa	rel_hum	tetx_coppml
index
2019-09-06 11:00:00	15.146028	19.247823	33.746046	27.666667	0.0	0.0	0.0	0.0	71.336667	1.666667	1006.450000	1010.750000	75.393333	1.320332e+07
2019-09-06 12:00:00	24.810148	20.357189	33.778996	27.383333	0.0	0.0	0.0	0.0	49.626667	1.386667	1006.106667	1010.406667	75.896667	2.437392e+06
2019-09-06 13:00:00	25.666843	19.362318	33.810041	27.533333	0.0	0.0	0.0	0.0	43.590000	2.076667	1005.316667	1009.616667	76.056667	2.927098e+06
2019-09-06 14:00:00	25.712396	19.317668	33.727930	28.213333	0.0	0.0	0.0	0.0	42.160000	2.603333	1004.246667	1008.546667	71.943333	4.699929e+06
2019-09-06 15:00:00	18.448916	20.592932	33.831501	27.896667	0.0	0.0	0.0	0.0	29.850000	2.743333	1003.846667	1008.146667	72.740000	3.506092e+06

[61]:

# Wrap the frame with default settings (no explicit input/output features);
# the next cell's output reports 13 input columns and 1 target column.
ds = DataSet(data=data)

[62]:

# Fetch the training inputs and targets; rows whose target is NaN are removed
# (see the "Removing Examples with nan in labels" message printed below).
train_x, train_y = ds.training_data()


********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

[63]:

train_x.shape, train_y.shape

[63]:

((121, 13), (121, 1))

[64]:

train_x[0]

[64]:

array([ -22.245026 ,   19.457182 ,   34.00429  ,   24.28     ,
          0.       ,    0.       ,    0.       ,    6.       ,
        205.00667  ,    1.6533333,  998.61334  , 1002.9133   ,
         75.1      ], dtype=float32)

[65]:

train_y[0]

[65]:

array([444866.9004])

[66]:

train_x[1]

[66]:

array([  10.906243 ,   19.511044 ,   34.044975 ,   26.076666 ,
          0.       ,    0.       ,    0.       ,    0.       ,
        201.59334  ,    2.9933333,  998.83     , 1003.13     ,
         67.42333  ], dtype=float32)

[67]:

train_y[1]

[67]:

array([193368.2195])

[68]:

# Fetch the test inputs and targets from the same DataSet; NaN-target rows are
# removed here as well (see the message printed below).
test_x, test_y = ds.test_data()


********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

[69]:

test_x[-1]

[69]:

array([  18.448915 ,   20.592932 ,   33.8315   ,   27.896667 ,
          0.       ,    0.       ,    0.       ,    0.       ,
         29.85     ,    2.7433333, 1003.8467   , 1008.14667  ,
         72.74     ], dtype=float32)

[70]:

test_y[-1]

[70]:

array([3506092.003])

Defining inputs

[71]:

# Only input_features is specified here; output_features is left unset.
# NOTE(review): the next cell's output reports a target shape of (121, 8),
# i.e. the 8 columns not listed as inputs appear to be used as targets --
# confirm this is intended before training on this DataSet.
ds = DataSet(
    data=data,
    input_features=['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm']
)

[72]:

# Training arrays for the inputs-only DataSet; the trailing expression displays
# both shapes as the cell output.
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape


********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 6)
target shape:  (121, 8)

[72]:

((121, 6), (121, 8))

[73]:

train_x[0]

[73]:

array([-22.245026,  19.457182,  34.00429 ,  24.28    ,   0.      ,
         0.      ], dtype=float32)

Defining outputs

[74]:

# Name both the six input columns and the single target column explicitly,
# so the target is exactly `tetx_coppml` (target shape (121, 1) in the output below).
ds = DataSet(
    data=data,
    input_features=['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm'],
    output_features=["tetx_coppml"]
)

[75]:

# Training arrays for the explicit 6-input / 1-output configuration.
train_x, train_y = ds.training_data()


********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 6)
target shape:  (121, 1)

[76]:

train_x[0]

[76]:

array([-22.245026,  19.457182,  34.00429 ,  24.28    ,   0.      ,
         0.      ], dtype=float32)

[77]:

train_y[0]

[77]:

array([444866.9004])

[78]:

train_x[1]

[78]:

array([10.906243, 19.511044, 34.044975, 26.076666,  0.      ,  0.      ],
      dtype=float32)

[79]:

train_y[1]

[79]:

array([193368.2195])

[80]:

# Test arrays for the explicit 6-input / 1-output configuration; the trailing
# expression displays both shapes as the cell output.
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape


********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 6)
target shape:  (66, 1)

[80]:

((66, 6), (66, 1))

[81]:

test_x[-1]

[81]:

array([18.448915, 20.592932, 33.8315  , 27.896667,  0.      ,  0.      ],
      dtype=float32)

[82]:

test_y[-1]

[82]:

array([3506092.003])

[83]:

# Multi-output example: reload the dataset with two target columns.
# NOTE: this rebinds `data`, replacing the single-target frame used above;
# the frame now has 15 columns (see the output below).
data = busan_beach(target=["blaTEM_coppml", "tetx_coppml"])
data.shape

[83]:

(1446, 15)

[84]:

data.columns

[84]:

Index(['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm',
       'pcp6_mm', 'pcp12_mm', 'wind_dir_deg', 'wind_speed_mps', 'air_p_hpa',
       'mslp_hpa', 'rel_hum', 'blaTEM_coppml', 'tetx_coppml'],
      dtype='object')

[85]:

data.dropna().head()

[85]:

	tide_cm	wat_temp_c	sal_psu	air_temp_c	pcp_mm	pcp3_mm	pcp6_mm	pcp12_mm	wind_dir_deg	wind_speed_mps	air_p_hpa	mslp_hpa	rel_hum	blaTEM_coppml	tetx_coppml
index
2018-06-20 09:00:00	-22.245026	19.457182	34.004292	24.280000	0.0	0.0	0.0	6.0	205.006667	1.653333	998.613333	1002.913333	75.100000	9.665350e+05	444866.9004
2018-06-20 12:00:00	10.906243	19.511044	34.044975	26.076667	0.0	0.0	0.0	0.0	201.593333	2.993333	998.830000	1003.130000	67.423333	3.834816e+05	193368.2195
2018-06-20 15:00:00	15.025008	19.582047	34.134964	25.043333	0.0	0.0	0.0	0.0	188.976667	2.010000	998.190000	1002.490000	67.136667	1.673262e+06	287920.3535
2018-06-20 18:00:00	-7.755828	19.579559	34.106552	22.826667	0.0	0.0	0.0	0.0	209.493333	1.480000	998.416667	1002.716667	77.413333	5.645747e+06	246005.6510
2018-06-20 21:00:00	-18.817711	19.570045	34.100220	20.910000	0.0	0.0	0.0	0.0	260.616667	1.080000	999.843333	1004.143333	79.093333	1.630322e+06	273757.5439

[88]:

# Two target columns are passed as output_features; input_features is left unset.
# With verbosity=0 the following cells show no "Removing Examples ..." progress
# messages, unlike the earlier DataSet instances.
ds = DataSet(
    data=data,
    output_features=["blaTEM_coppml", "tetx_coppml"],
    verbosity=0
)

[90]:

# Training arrays for the two-target DataSet; train_y has one column per target
# (shape (121, 2) in the output below).
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape

[90]:

((121, 13), (121, 2))

[91]:

train_x[0]

[91]:

array([ -22.245026 ,   19.457182 ,   34.00429  ,   24.28     ,
          0.       ,    0.       ,    0.       ,    6.       ,
        205.00667  ,    1.6533333,  998.61334  , 1002.9133   ,
         75.1      ], dtype=float32)

[92]:

train_y[0]

[92]:

array([966535.0042, 444866.9004])

[93]:

# Test arrays for the two-target DataSet; test_y has one column per target
# (shape (66, 2) in the output below).
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape

[93]:

((66, 13), (66, 2))

[94]:

test_x[-1]

[94]:

array([  18.448915 ,   20.592932 ,   33.8315   ,   27.896667 ,
          0.       ,    0.       ,    0.       ,    0.       ,
         29.85     ,    2.7433333, 1003.8467   , 1008.14667  ,
         72.74     ], dtype=float32)

[95]:

test_y[-1]

[95]:

array([8473063.881, 3506092.003])

Saving prepared data in an h5 file

[99]:

# save=True: as the output below shows, constructing the DataSet immediately
# prepares the training, validation and test splits.
# NOTE(review): presumably these are written to "data.h5" in the working
# directory, which is the file the next section loads -- confirm the location.
ds = DataSet(
    data=data,
    output_features=["blaTEM_coppml", "tetx_coppml"],
    save=True
)


********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 2)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 2)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 2)

Loading from h5 file

[100]:

# Rebuild the DataSet from an h5 file instead of from the raw frame.
# The path is relative, so this assumes "data.h5" is in the notebook's current
# working directory -- TODO confirm this is where the previous cell saved it.
ds = DataSet.from_h5("data.h5")

[102]:

# Training arrays from the reloaded DataSet; the shapes match those printed
# when the data was prepared with save=True.
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape

[102]:

((121, 13), (121, 2))

[103]:

# Test arrays from the reloaded DataSet. They are bound to test_x/test_y so the
# training arrays fetched in the previous cell are not overwritten by test data.
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape

[103]:

((66, 13), (66, 2))

Multiple Inputs

Higher dimensional features

[ ]: