Data Preparation for a Regression Task
This notebook describes how to prepare data for a single machine learning regression problem.
[ ]:
try:
import ai4water
except ModuleNotFoundError:
!pip install ai4water
[104]:
import os
import site

# Optional local development checkout of AI4Water. The location can be
# overridden with the AI4WATER_DEV_DIR environment variable; the default is
# the path originally hard-coded in this notebook. The directory is only
# registered when it actually exists, so on other machines the installed
# package is imported instead.
AI4WATER_DEV_DIR = os.environ.get("AI4WATER_DEV_DIR", "D:\\mytools\\AI4Water")
if os.path.isdir(AI4WATER_DEV_DIR):
    site.addsitedir(AI4WATER_DEV_DIR)

from ai4water.datasets import busan_beach
from ai4water.preprocessing import DataSet
from ai4water.utils.utils import get_version_info
[51]:
# Print one "<name> <version>" line per entry reported by get_version_info(),
# so the environment behind the recorded outputs is documented.
for entry in get_version_info().items():
    print(*entry)
python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:16) [MSC v.1916 64 bit (AMD64)]
os nt
ai4water 1.06
lightgbm 3.3.1
tcn 3.4.0
catboost 0.26
xgboost 1.5.0
easy_mpl 0.21.3
SeqMetrics 1.3.3
tensorflow 2.7.0
keras.api._v2.keras 2.7.0
numpy 1.21.0
pandas 1.3.4
matplotlib 3.4.3
h5py 3.5.0
sklearn 1.0.1
shapefile 2.3.0
fiona 1.8.22
xarray 0.20.1
netCDF4 1.5.7
optuna 2.10.1
skopt 0.9.0
hyperopt 0.2.7
plotly 5.3.1
lime NotDefined
seaborn 0.11.2
[52]:
# Load the Busan beach dataset with the default arguments; the cells below
# inspect the resulting frame.
data = busan_beach()
[53]:
# (rows, columns) of the raw frame.
data.shape
[53]:
(1446, 14)
[54]:
# Names of all columns in the frame.
data.columns
[54]:
Index(['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm',
'pcp6_mm', 'pcp12_mm', 'wind_dir_deg', 'wind_speed_mps', 'air_p_hpa',
'mslp_hpa', 'rel_hum', 'tetx_coppml'],
dtype='object')
[55]:
# First rows of the frame; in the recorded output the index is a timestamp
# and 'tetx_coppml' is NaN for these rows.
data.head()
[55]:
tide_cm | wat_temp_c | sal_psu | air_temp_c | pcp_mm | pcp3_mm | pcp6_mm | pcp12_mm | wind_dir_deg | wind_speed_mps | air_p_hpa | mslp_hpa | rel_hum | tetx_coppml | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||
2018-06-19 00:00:00 | 36.407149 | 19.321232 | 33.956058 | 19.780000 | 0.0 | 0.0 | 0.0 | 0.0 | 159.533333 | 0.960000 | 1002.856667 | 1007.256667 | 95.000000 | NaN |
2018-06-19 00:30:00 | 35.562515 | 19.320124 | 33.950508 | 19.093333 | 0.0 | 0.0 | 0.0 | 0.0 | 86.596667 | 0.163333 | 1002.300000 | 1006.700000 | 95.000000 | NaN |
2018-06-19 01:00:00 | 34.808016 | 19.319666 | 33.942532 | 18.733333 | 0.0 | 0.0 | 0.0 | 0.0 | 2.260000 | 0.080000 | 1001.973333 | 1006.373333 | 95.000000 | NaN |
2018-06-19 01:30:00 | 30.645216 | 19.320406 | 33.931263 | 18.760000 | 0.0 | 0.0 | 0.0 | 0.0 | 62.710000 | 0.193333 | 1001.776667 | 1006.120000 | 95.006667 | NaN |
2018-06-19 02:00:00 | 26.608980 | 19.326729 | 33.917961 | 18.633333 | 0.0 | 0.0 | 0.0 | 0.0 | 63.446667 | 0.510000 | 1001.743333 | 1006.103333 | 95.006667 | NaN |
[56]:
# Last rows of the frame.
data.tail()
[56]:
tide_cm | wat_temp_c | sal_psu | air_temp_c | pcp_mm | pcp3_mm | pcp6_mm | pcp12_mm | wind_dir_deg | wind_speed_mps | air_p_hpa | mslp_hpa | rel_hum | tetx_coppml | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||
2019-09-07 22:00:00 | -3.989912 | 20.990612 | 33.776449 | 23.700000 | 0.0 | 0.0 | 0.0 | 0.5 | 203.760000 | 6.506667 | 1003.446667 | 1007.746667 | 88.170000 | NaN |
2019-09-07 22:30:00 | -2.807042 | 21.012014 | 33.702310 | 23.620000 | 0.0 | 0.0 | 0.0 | 0.0 | 205.353333 | 5.633333 | 1003.520000 | 1007.820000 | 88.256667 | NaN |
2019-09-07 23:00:00 | -3.471326 | 20.831739 | 33.726177 | 23.666667 | 0.0 | 0.0 | 0.0 | 0.0 | 202.540000 | 4.480000 | 1003.610000 | 1007.910000 | 87.833333 | NaN |
2019-09-07 23:30:00 | 0.707771 | 21.006086 | 33.716274 | 23.633333 | 0.0 | 0.0 | 0.0 | 0.0 | 207.206667 | 4.946667 | 1003.633333 | 1007.933333 | 88.370000 | NaN |
2019-09-08 00:00:00 | 1.011731 | 20.896149 | 33.729773 | 23.600000 | 0.0 | 0.0 | 0.0 | 0.0 | 210.200000 | 4.400000 | 1003.700000 | 1008.000000 | 87.700000 | NaN |
[57]:
# Count missing values per column; in the recorded output only
# 'tetx_coppml' has any.
data.isna().sum()
[57]:
tide_cm 0
wat_temp_c 0
sal_psu 0
air_temp_c 0
pcp_mm 0
pcp3_mm 0
pcp6_mm 0
pcp12_mm 0
wind_dir_deg 0
wind_speed_mps 0
air_p_hpa 0
mslp_hpa 0
rel_hum 0
tetx_coppml 1228
dtype: int64
[58]:
# Shape of the frame once every row containing a NaN is dropped.
data.dropna().shape
[58]:
(218, 14)
[59]:
# First complete rows (no NaN in any column); useful for comparing with the
# first training example further down.
data.dropna().head()
[59]:
tide_cm | wat_temp_c | sal_psu | air_temp_c | pcp_mm | pcp3_mm | pcp6_mm | pcp12_mm | wind_dir_deg | wind_speed_mps | air_p_hpa | mslp_hpa | rel_hum | tetx_coppml | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||
2018-06-20 09:00:00 | -22.245026 | 19.457182 | 34.004292 | 24.280000 | 0.0 | 0.0 | 0.0 | 6.0 | 205.006667 | 1.653333 | 998.613333 | 1002.913333 | 75.100000 | 444866.9004 |
2018-06-20 12:00:00 | 10.906243 | 19.511044 | 34.044975 | 26.076667 | 0.0 | 0.0 | 0.0 | 0.0 | 201.593333 | 2.993333 | 998.830000 | 1003.130000 | 67.423333 | 193368.2195 |
2018-06-20 15:00:00 | 15.025008 | 19.582047 | 34.134964 | 25.043333 | 0.0 | 0.0 | 0.0 | 0.0 | 188.976667 | 2.010000 | 998.190000 | 1002.490000 | 67.136667 | 287920.3535 |
2018-06-20 18:00:00 | -7.755828 | 19.579559 | 34.106552 | 22.826667 | 0.0 | 0.0 | 0.0 | 0.0 | 209.493333 | 1.480000 | 998.416667 | 1002.716667 | 77.413333 | 246005.6510 |
2018-06-20 21:00:00 | -18.817711 | 19.570045 | 34.100220 | 20.910000 | 0.0 | 0.0 | 0.0 | 0.0 | 260.616667 | 1.080000 | 999.843333 | 1004.143333 | 79.093333 | 273757.5439 |
[60]:
# Last complete rows; useful for comparing with the last test example
# further down.
data.dropna().tail()
[60]:
tide_cm | wat_temp_c | sal_psu | air_temp_c | pcp_mm | pcp3_mm | pcp6_mm | pcp12_mm | wind_dir_deg | wind_speed_mps | air_p_hpa | mslp_hpa | rel_hum | tetx_coppml | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||
2019-09-06 11:00:00 | 15.146028 | 19.247823 | 33.746046 | 27.666667 | 0.0 | 0.0 | 0.0 | 0.0 | 71.336667 | 1.666667 | 1006.450000 | 1010.750000 | 75.393333 | 1.320332e+07 |
2019-09-06 12:00:00 | 24.810148 | 20.357189 | 33.778996 | 27.383333 | 0.0 | 0.0 | 0.0 | 0.0 | 49.626667 | 1.386667 | 1006.106667 | 1010.406667 | 75.896667 | 2.437392e+06 |
2019-09-06 13:00:00 | 25.666843 | 19.362318 | 33.810041 | 27.533333 | 0.0 | 0.0 | 0.0 | 0.0 | 43.590000 | 2.076667 | 1005.316667 | 1009.616667 | 76.056667 | 2.927098e+06 |
2019-09-06 14:00:00 | 25.712396 | 19.317668 | 33.727930 | 28.213333 | 0.0 | 0.0 | 0.0 | 0.0 | 42.160000 | 2.603333 | 1004.246667 | 1008.546667 | 71.943333 | 4.699929e+06 |
2019-09-06 15:00:00 | 18.448916 | 20.592932 | 33.831501 | 27.896667 | 0.0 | 0.0 | 0.0 | 0.0 | 29.850000 | 2.743333 | 1003.846667 | 1008.146667 | 72.740000 | 3.506092e+06 |
[61]:
# Wrap the frame in a DataSet without naming inputs or outputs. The recorded
# outputs of the following cells show 13 input columns and 1 target column.
ds = DataSet(data=data)
[62]:
# Fetch the training split as an (inputs, targets) pair. The printed message
# reports that examples with NaN in the labels are removed.
train_x, train_y = ds.training_data()
********** Removing Examples with nan in labels **********
***** Training *****
input_x shape: (121, 13)
target shape: (121, 1)
[63]:
# Shapes of the training inputs and targets.
train_x.shape, train_y.shape
[63]:
((121, 13), (121, 1))
[64]:
# First training input row; in the recorded output it matches the first row
# of data.dropna() (as float32).
train_x[0]
[64]:
array([ -22.245026 , 19.457182 , 34.00429 , 24.28 ,
0. , 0. , 0. , 6. ,
205.00667 , 1.6533333, 998.61334 , 1002.9133 ,
75.1 ], dtype=float32)
[65]:
# Target value belonging to the first training row.
train_y[0]
[65]:
array([444866.9004])
[66]:
# Second training input row.
train_x[1]
[66]:
array([ 10.906243 , 19.511044 , 34.044975 , 26.076666 ,
0. , 0. , 0. , 0. ,
201.59334 , 2.9933333, 998.83 , 1003.13 ,
67.42333 ], dtype=float32)
[67]:
# Target value belonging to the second training row.
train_y[1]
[67]:
array([193368.2195])
[68]:
# Fetch the test split as an (inputs, targets) pair.
test_x, test_y = ds.test_data()
********** Removing Examples with nan in labels **********
***** Test *****
input_x shape: (66, 13)
target shape: (66, 1)
[69]:
# Last test input row; in the recorded output it matches the last row of
# data.dropna() (as float32).
test_x[-1]
[69]:
array([ 18.448915 , 20.592932 , 33.8315 , 27.896667 ,
0. , 0. , 0. , 0. ,
29.85 , 2.7433333, 1003.8467 , 1008.14667 ,
72.74 ], dtype=float32)
[70]:
# Target value belonging to the last test row.
test_y[-1]
[70]:
array([3506092.003])
Defining inputs
[71]:
# Name only the input columns. In the recorded output of the next cell the
# target then has 8 columns, i.e. the 8 columns not listed here.
ds = DataSet(
    data=data,
    input_features=[
        'tide_cm', 'wat_temp_c', 'sal_psu',
        'air_temp_c', 'pcp_mm', 'pcp3_mm',
    ],
)
[72]:
# Fetch the training split and show its shapes: 6 input columns, and a
# target with 8 columns because no output features were named.
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape
********** Removing Examples with nan in labels **********
***** Training *****
input_x shape: (121, 6)
target shape: (121, 8)
[72]:
((121, 6), (121, 8))
[73]:
# First training input row, now restricted to the six selected features.
train_x[0]
[73]:
array([-22.245026, 19.457182, 34.00429 , 24.28 , 0. ,
0. ], dtype=float32)
Defining outputs
[74]:
# Name both the input columns and the single output column, so the target
# has one column (see the recorded shapes in the next cell).
ds = DataSet(
    data=data,
    input_features=[
        'tide_cm', 'wat_temp_c', 'sal_psu',
        'air_temp_c', 'pcp_mm', 'pcp3_mm',
    ],
    output_features=["tetx_coppml"],
)
[75]:
# Fetch the training split for the explicitly selected inputs and output.
train_x, train_y = ds.training_data()
********** Removing Examples with nan in labels **********
***** Training *****
input_x shape: (121, 6)
target shape: (121, 1)
[76]:
# First training input row (six selected features).
train_x[0]
[76]:
array([-22.245026, 19.457182, 34.00429 , 24.28 , 0. ,
0. ], dtype=float32)
[77]:
# 'tetx_coppml' value for the first training row.
train_y[0]
[77]:
array([444866.9004])
[78]:
# Second training input row (six selected features).
train_x[1]
[78]:
array([10.906243, 19.511044, 34.044975, 26.076666, 0. , 0. ],
dtype=float32)
[79]:
# 'tetx_coppml' value for the second training row.
train_y[1]
[79]:
array([193368.2195])
[80]:
# Fetch the test split for the selected inputs and output and show its shapes.
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape
********** Removing Examples with nan in labels **********
***** Test *****
input_x shape: (66, 6)
target shape: (66, 1)
[80]:
((66, 6), (66, 1))
[81]:
# Last test input row (six selected features).
test_x[-1]
[81]:
array([18.448915, 20.592932, 33.8315 , 27.896667, 0. , 0. ],
dtype=float32)
[82]:
# 'tetx_coppml' value for the last test row.
test_y[-1]
[82]:
array([3506092.003])
[83]:
# Multiple outputs: reload the dataset with two target columns. Note that
# `data` is rebound here, so from this cell on the frame has 15 columns.
data = busan_beach(target=["blaTEM_coppml", "tetx_coppml"])
data.shape
[83]:
(1446, 15)
[84]:
# Column names of the reloaded frame; the two targets are the last two columns.
data.columns
[84]:
Index(['tide_cm', 'wat_temp_c', 'sal_psu', 'air_temp_c', 'pcp_mm', 'pcp3_mm',
'pcp6_mm', 'pcp12_mm', 'wind_dir_deg', 'wind_speed_mps', 'air_p_hpa',
'mslp_hpa', 'rel_hum', 'blaTEM_coppml', 'tetx_coppml'],
dtype='object')
[85]:
# First rows in which no column (including both targets) is NaN.
data.dropna().head()
[85]:
tide_cm | wat_temp_c | sal_psu | air_temp_c | pcp_mm | pcp3_mm | pcp6_mm | pcp12_mm | wind_dir_deg | wind_speed_mps | air_p_hpa | mslp_hpa | rel_hum | blaTEM_coppml | tetx_coppml | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||
2018-06-20 09:00:00 | -22.245026 | 19.457182 | 34.004292 | 24.280000 | 0.0 | 0.0 | 0.0 | 6.0 | 205.006667 | 1.653333 | 998.613333 | 1002.913333 | 75.100000 | 9.665350e+05 | 444866.9004 |
2018-06-20 12:00:00 | 10.906243 | 19.511044 | 34.044975 | 26.076667 | 0.0 | 0.0 | 0.0 | 0.0 | 201.593333 | 2.993333 | 998.830000 | 1003.130000 | 67.423333 | 3.834816e+05 | 193368.2195 |
2018-06-20 15:00:00 | 15.025008 | 19.582047 | 34.134964 | 25.043333 | 0.0 | 0.0 | 0.0 | 0.0 | 188.976667 | 2.010000 | 998.190000 | 1002.490000 | 67.136667 | 1.673262e+06 | 287920.3535 |
2018-06-20 18:00:00 | -7.755828 | 19.579559 | 34.106552 | 22.826667 | 0.0 | 0.0 | 0.0 | 0.0 | 209.493333 | 1.480000 | 998.416667 | 1002.716667 | 77.413333 | 5.645747e+06 | 246005.6510 |
2018-06-20 21:00:00 | -18.817711 | 19.570045 | 34.100220 | 20.910000 | 0.0 | 0.0 | 0.0 | 0.0 | 260.616667 | 1.080000 | 999.843333 | 1004.143333 | 79.093333 | 1.630322e+06 | 273757.5439 |
[88]:
# Use both target columns as outputs; inputs are not named. With verbosity=0
# the split summaries printed by earlier cells do not appear in the recorded
# outputs of the following cells.
ds = DataSet(
    data=data,
    output_features=[
        "blaTEM_coppml",
        "tetx_coppml",
    ],
    verbosity=0,
)
[90]:
# Fetch the training split; the target now has two columns.
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape
[90]:
((121, 13), (121, 2))
[91]:
# First training input row (13 input columns).
train_x[0]
[91]:
array([ -22.245026 , 19.457182 , 34.00429 , 24.28 ,
0. , 0. , 0. , 6. ,
205.00667 , 1.6533333, 998.61334 , 1002.9133 ,
75.1 ], dtype=float32)
[92]:
# Both target values for the first training row, in the order given in
# output_features (blaTEM_coppml, tetx_coppml) per the recorded output.
train_y[0]
[92]:
array([966535.0042, 444866.9004])
[93]:
# Fetch the test split for the two-output setup and show its shapes.
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape
[93]:
((66, 13), (66, 2))
[94]:
# Last test input row (13 input columns).
test_x[-1]
[94]:
array([ 18.448915 , 20.592932 , 33.8315 , 27.896667 ,
0. , 0. , 0. , 0. ,
29.85 , 2.7433333, 1003.8467 , 1008.14667 ,
72.74 ], dtype=float32)
[95]:
# Both target values for the last test row.
test_y[-1]
[95]:
array([8473063.881, 3506092.003])
Saving prepared data in an h5 file
[99]:
# Build the two-output DataSet with save=True. In the recorded output the
# training, validation and test summaries are printed right at construction.
# NOTE(review): presumably this writes "data.h5" to the working directory,
# since the next section loads that file - confirm against the AI4Water docs.
ds = DataSet(
    data=data,
    output_features=[
        "blaTEM_coppml",
        "tetx_coppml",
    ],
    save=True,
)
********** Removing Examples with nan in labels **********
***** Training *****
input_x shape: (121, 13)
target shape: (121, 2)
********** Removing Examples with nan in labels **********
***** Validation *****
input_x shape: (31, 13)
target shape: (31, 2)
********** Removing Examples with nan in labels **********
***** Test *****
input_x shape: (66, 13)
target shape: (66, 2)
Loading from the h5 file
[100]:
# Rebuild the DataSet from the HDF5 file instead of from the DataFrame.
# NOTE(review): assumes "data.h5" in the working directory is the file
# produced by the save=True cell above - confirm.
ds = DataSet.from_h5("data.h5")
[102]:
# Training split from the reloaded DataSet; the recorded shapes equal those
# obtained before saving.
train_x, train_y = ds.training_data()
train_x.shape, train_y.shape
[102]:
((121, 13), (121, 2))
[103]:
# Test split from the reloaded DataSet. It is bound to test_x/test_y so the
# training arrays loaded in the previous cell are not overwritten by test
# data under a misleading name.
test_x, test_y = ds.test_data()
test_x.shape, test_y.shape
[103]:
((66, 13), (66, 2))
Multiple Inputs
Higher dimensional features
[ ]: