# camels #
# https://confluence.ecmwf.int/display/COPSRV/GloFAS+Climate+Data+Store
# https://www.sciencedirect.com/journal/data-in-brief
# https://data.mendeley.com/datasets/5vzp6svhwh/4
# https://zenodo.org/record/4218413#.YA6w7BZS-Uk
# https://www.sciencedirect.com/search?qs=time%20series&pub=Data%20in%20Brief&cid=311593
# https://doi.pangaea.de/10.1594/PANGAEA.898217
# https://doi.pangaea.de/10.1594/PANGAEA.811992 # protected
# https://doi.pangaea.de/10.1594/PANGAEA.905446
# https://doi.pangaea.de/10.1594/PANGAEA.900958
# https://doi.pangaea.de/10.1594/PANGAEA.831193
# https://data.world/datagov-uk/223a8f60-e3ac-4a25-987d-587cc3a92fa1
# https://www.bafg.de/GRDC/EN/04_spcldtbss/41_ARDB/arcticHycos.html?nn=201574 flow dataset
# https://doi.pangaea.de/10.1594/PANGAEA.924561 Air temp Tianjin, China
# https://zenodo.org/record/3712407#.YExYDtyRWUk
# https://zenodo.org/record/3844201#.YExYi9yRWUk
# https://zenodo.org/record/1471322#.YExYftyRWUk
# https://zenodo.org/record/3961605#.YExYcdyRWUk
# https://zenodo.org/record/1452383#.YExZRdyRWUk
# https://zenodo.org/record/4428151#.YExZPdyRWUk
# https://zenodo.org/record/3903238#.YExZItyRWUk
# https://zenodo.org/record/3670864#.YExZFdyRWUk
# https://zenodo.org/record/3834623#.YExY7tyRWUk
# https://zenodo.org/record/4029572#.YExY5NyRWUk
# https://zenodo.org/record/4552842#.YExaR9yRWUk
# https://zenodo.org/record/3466097#.YExaQ9yRWUk
# https://zenodo.org/record/4327078#.YExaCdyRWUk
# https://zenodo.org/record/3712397#.YExbsdyRWUk
# https://zenodo.org/record/3560706#.YExeztyRWUk
# https://zenodo.org/record/3698998#.YExekdyRWUk
# https://zenodo.org/record/3564237#.YExlh9yRWUl
# https://zenodo.org/record/581145#.YExeV9yRWUk
# https://zenodo.org/record/3978225#.YExeEtyRWUk
# https://zenodo.org/record/3763766#.YExdntyRWUk
# https://zenodo.org/record/3744217#.YExdi9yRWUk
# https://zenodo.org/record/3948568#.YExdeNyRWUk
# https://zenodo.org/record/3538207#.YExdbtyRWUk
# https://zenodo.org/record/1486058#.YExc-dyRWUk
# https://zenodo.org/record/3561032#.YExc7tyRWUk
# https://zenodo.org/record/1466038#.YExc3dyRWUk
# https://zenodo.org/record/581186#.YExcz9yRWUk
# https://zenodo.org/record/4572636#.YExcwNyRWUk
# https://zenodo.org/record/1267837#.YExcZNyRWUk
# https://zenodo.org/record/3808223#.YExcX9yRWUk
# https://zenodo.org/record/4447435#.YExcWtyRWUk
# https://zenodo.org/record/1300354#.YExcVdyRWUk
# https://zenodo.org/record/4308036#.YExcJdyRWUk
# https://zenodo.org/record/3459610#.YExhNNyRWUk
# https://zenodo.org/record/3763342#.YExhCdyRWUk
# https://zenodo.org/record/4559571#.YExhBNyRWUk
# https://zenodo.org/record/3663630#.YExg89yRWUk
# https://zenodo.org/record/4382937#.YExg7dyRWUk
# https://zenodo.org/record/3876148#.YExgUdyRWUk
# https://zenodo.org/record/3982962#.YExgTdyRWUk
# https://zenodo.org/record/2559480#.YExvWtyRWUk
# https://zenodo.org/record/4094684#.YExvS9yRWUk
# https://zenodo.org/record/2596929#.YExvP9yRWUk
# https://zenodo.org/record/977773#.YExvEtyRWUk
# https://zenodo.org/record/3520146#.YExu_tyRWUk
# https://zenodo.org/record/3836648#.YExu09yRWUk
# https://zenodo.org/record/4290294#.YExo5tyRWUk
# https://zenodo.org/record/2728636#.YEx4EdyRWUk
# https://zenodo.org/record/3581187#.YEx5CNyRWUk
# https://zenodo.org/record/3946242#.YEx5FtyRWUk
# https://zenodo.org/record/883100#.YEx5L9yRWUk
# https://zenodo.org/record/3239401#.YEx5gtyRWUk
# https://zenodo.org/record/4183611#.YEx5vNyRWUk
# https://zenodo.org/record/4559696#.YEx5xdyRWUk
# https://zenodo.org/record/3776011#.YEx6YdyRWUk
# https://zenodo.org/record/4315647#.YEx6v9yRWUk
# https://zenodo.org/record/1185084#.YEx77NyRWUk
# https://zenodo.org/record/4271209#.YEx7z9yRWUk
# https://zenodo.org/record/4570780#.YEx7y9yRWUk
# https://zenodo.org/record/3593395#.YEx7x9yRWUk
# https://zenodo.org/record/3632501#.YEx7qtyRWUk
# https://zenodo.org/record/1122635#.YEx7ndyRWUk
# https://zenodo.org/record/3893897#.YEx7gNyRWUk
# https://zenodo.org/record/4395737#.YEx7a9yRWUk
# https://zenodo.org/record/3779473#.YEx7aNyRWUk
# https://zenodo.org/record/1226394#.YEx7O9yRWUk
# https://zenodo.org/record/4391461#.YEx7MtyRWUk
# https://zenodo.org/record/4247833#.YEx7HtyRWUk
# https://zenodo.org/record/1486058#.YEx7G9yRWUk
# https://zenodo.org/record/3928587#.YEx7E9yRWUk
# https://zenodo.org/record/4341521#.YEx7DdyRWUk
# https://zenodo.org/record/3974871#.YEx7CdyRWUk
# https://zenodo.org/record/1298526#.YEx7B9yRWUk
# https://zenodo.org/record/57293#.YEx6_dyRWUk
# https://zenodo.org/record/4268711#.YEx6-9yRWUk
# https://zenodo.org/record/322827#.YEx69tyRWUk
# https://zenodo.org/record/1050301#.YEx6y9yRWUk
# https://zenodo.org/record/4734372#.YKc9QKGRWUk
# https://www.nature.com/articles/s41597-019-0288-y#Abs1
# https://catalogue-imos.aodn.org.au/geonetwork/srv/api/records/9e2ba32a-5da3-4ea5-b750-e6279680dd71
# https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=1566
# https://doi.pangaea.de/10.1594/PANGAEA.885860
# https://doi.pangaea.de/10.1594/PANGAEA.913939
# https://www.nature.com/articles/s41597-019-0346-5#Sec8
# ETP
# https://zenodo.org/record/4038399#.YEx6INyRWUk
# https://zenodo.org/record/4601596#.YEx6M9yRWUk
# https://zenodo.org/record/3981919#.YEx6ONyRWUk
# https://zenodo.org/record/4271331#.YEx6PdyRWUk
# https://zenodo.org/record/3726856#.YEx6RdyRWUk
# https://zenodo.org/record/4580292#.YEx6TtyRWUk
# https://zenodo.org/record/1044306#.YEx6UNyRWUk
# https://zenodo.org/record/3891936#.YEx7S9yRWUk
# https://zenodo.org/record/4060319#.YEx7QtyRWUk
# rr
# https://zenodo.org/record/3341592#.YEx5RtyRWUk
# https://zenodo.org/record/3931582#.YEx5W9yRWUk
# https://zenodo.org/record/3528098#.YEx64NyRWUk
# https://hess.copernicus.org/articles/25/3105/2021/
# https://www.nature.com/articles/s41597-019-0282-4#Sec12
# https://www.nature.com/articles/sdata201880#Tab3
# https://edg.epa.gov/metadata/catalog/search/resource/details.page?uuid=https://doi.org/10.23719/1378947
# air
# https://zenodo.org/record/4311854#.YExpwNyRWUk
# https://zenodo.org/record/4281271#.YExpYNyRWUk
# ocean
# https://zenodo.org/record/4600696#.YExpSdyRWUk
# Water Quality
# https://zenodo.org/record/1495558#.YExqFtyRWUk
# https://www.nature.com/articles/sdata201798#Sec18
# https://www.nature.com/articles/s41597-020-0455-1#Sec11
# Flow
# https://zenodo.org/record/3941890#.YExp5NyRWUk
# https://zenodo.org/record/1206188#.YExn-dyRWUk
# https://zenodo.org/record/4394503#.YEx6ndyRWUk
# https://zenodo.org/record/3240954#.YEx6s9yRWUk
# Groundwater
# https://zenodo.org/record/3887120#.YExq1tyRWUk
# https://zenodo.org/record/3928587#.YExnztyRWUk
# https://zenodo.org/record/1158631#.YEx7ZdyRWUk
# https://zenodo.org/record/4139912#.YEx7XdyRWUk
# Weather
# https://zenodo.org/record/3678799#.YExsP9yRWUk
# https://zenodo.org/record/3679247#.YExsOdyRWUk
# https://zenodo.org/record/3678789#.YExsN9yRWUk
# https://zenodo.org/record/4567325#.YExqjtyRWUk
# https://zenodo.org/record/3549899#.YExqNdyRWUk
# https://zenodo.org/record/4319773#.YExoq9yRWUk
# https://zenodo.org/record/4319770#.YExooNyRWUk
# https://zenodo.org/record/4319756#.YExnl9yRWUk
# https://zenodo.org/record/854619#.YExnityRWUk
# https://essd.copernicus.org/articles/13/1289/2021/
# https://essd.copernicus.org/articles/13/1307/2021/
# https://www.tr32db.uni-koeln.de/search/view.php?dataID=1786
# https://doi.org/10.3334/ORNLDAAC/1840
#DWD
# https://opendata.dwd.de/climate_environment/CDC/observations_germany/
# geologic
# https://zenodo.org/record/4536561#.YExpQNyRWUk
# https://zenodo.org/record/2549499#.YExo09yRWUk
# 2D time series datasets
# https://zenodo.org/record/1135230#.YExYotyRWUk
# https://zenodo.org/record/2630456#.YExb4tyRWUk
# https://zenodo.org/record/4559368#.YExd1NyRWUk
# https://zenodo.org/record/4542076#.YExuxtyRWUk
# https://zenodo.org/record/4489056#.YExoBtyRWUk
# https://zenodo.org/record/1157344#.YExnqNyRWUk
# https://www.nature.com/articles/s41597-020-0450-6
# https://www.nature.com/articles/sdata201542#Abs1
# https://www.nature.com/articles/s41597-019-0228-x
# https://zenodo.org/record/4058167
# https://www.nature.com/articles/sdata2018224#Sec10
# soil
# https://www.tr32db.uni-koeln.de/search/view.php?dataID=1839
# https://www.tr32db.uni-koeln.de/search/view.php?dataID=1838
# https://www.tr32db.uni-koeln.de/search/view.php?dataID=1837
# https://www.tr32db.uni-koeln.de/search/view.php?dataID=1760
# https://www.tr32db.uni-koeln.de/search/view.php?dataID=1761
import glob
import warnings
from typing import Union, Tuple, Any, Optional, List
try:
from shapely.geometry import shape, mapping
from shapely.ops import unary_union
except (ModuleNotFoundError, OSError):
shape, mapping, unary_union = None, None, None
from ai4water.backend import os, random, np, pd
from ai4water.backend import netCDF4
from ai4water.backend import xr
from .download_pangaea import PanDataSet
from .utils import download_all_http_directory
from .utils import maybe_download, download_and_unzip, unzip_all_in_dir, download
from .utils import check_attributes, check_st_en
from .utils import encode_column, LabelEncoder, OneHotEncoder
SEP = os.sep
# TODO, add visualization
# TODO all available datasets should be available using a single interface instead of importing each separately
DATASETS = [
'ISWDC',
'SEBAL_ET_CHINA',
'GeoChemMatane',
'PrecipBerlin',
'HydroChemJava',
'WaterChemVictoriaLakes',
'WaterChemEcuador',
'HydrocarbonsGabes',
'SedimentAmersee',
'FlowTetRiver',
'HoloceneTemp',
'RiverTempEroo',
'StreamTempSpain',
'FlowSedDenmark',
'FlowSamoylov',
'EtpPcpSamoylov',
'RiverIsotope',
'WQCantareira',
'RiverTempSpain',
'HydrometricParana',
'FlowBenin',
'YamaguchiClimateJp',
'WQJordan2',
'WQJordan',
'Weisssee'
]
class Datasets(object):
"""
This is the base class for datasets
Note:
We don't host datasets. Each dataset is downloaded from the target remote
server and saved to local disk.
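Examples
--------
A minimal sketch of how a concrete dataset class is typically wired up.
The class name and record URL below are illustrative only, not a real
dataset:
>>> class MyDataset(Datasets):
...     url = "https://zenodo.org/record/0000000"
...     def __init__(self, path=None, **kwargs):
...         super().__init__(path=path, **kwargs)
...         self.ds_dir = path
...         self._download()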
"""
def __init__(self,
name=None,
units=None,
path:str = None
):
"""
Arguments:
name : str (default=None)
name of dataset
units : str, (default=None)
the unit system being used
path : str (default=None)
path where the data is available (manually downloaded).
If None, it will be downloaded
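Examples
--------
A usage sketch for data that has already been downloaded manually; the
path below is hypothetical:
>>> from ai4water.datasets import Weisssee
>>> dataset = Weisssee(path="path/to/manually/downloaded/Weisssee")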
"""
if name is None:
name = self.__class__.__name__
if units is not None:
assert units in ['si', 'imperial', 'metric']
self.units = units
self.name = name
@property
def url(self):
raise NotImplementedError("url must be defined.")
@property
def base_ds_dir(self):
"""Base datasets directory"""
return os.path.join(os.path.dirname(__file__), 'data')
@property
def ds_dir(self):
return self._ds_dir
@ds_dir.setter
def ds_dir(self, path=None):
if path is None:
_dir = os.path.join(self.base_ds_dir, self.__class__.__name__)
else:
_dir = path
if not os.path.exists(_dir):
os.makedirs(_dir)
self._ds_dir = _dir
return
def _download(self, overwrite=False, **kwargs):
"""Downloads the dataset. If already downloaded, then
Parameters
-----------
overwrite : bool
whether to download the data again even if it already exists locally
**kwargs :
any keyword arguments for maybe_download function
"""
maybe_download(self.ds_dir, overwrite=overwrite,
url=self.url, name=self.name, **kwargs)
return
def _download_and_unzip(self):
download_and_unzip(self.ds_dir, self.url)
return
def download_from_pangaea(self, overwrite=False):
if os.path.exists(self.ds_dir):
if overwrite:
print("removing previously downloaded data and downloading again")
else:
print(f"The path {self.ds_dir} already exists.")
self.data_files = [f for f in os.listdir(self.ds_dir) if f.endswith('.txt')]
self.metadata_files = [f for f in os.listdir(self.ds_dir) if f.endswith('.json')]
if len(self.data_files) == 0:
print(f"The path {self.ds_dir} is empty so downloading the files again")
self._download_from_pangaea()
else:
self._download_from_pangaea()
return
def _download_from_pangaea(self):
self.data_files = []
self.metadata_files = []
ds = PanDataSet(self.url)
kids = ds.children()
if len(kids) > 1:
for kid in kids:
kid_ds = PanDataSet(kid)
fname = kid_ds.download(self.ds_dir)
self.metadata_files.append(fname + '._metadata.json')
self.data_files.append(fname + '.txt')
else:
fname = ds.download(self.ds_dir)
self.metadata_files.append(fname + '._metadata.json')
self.data_files.append(fname + '.txt')
return
class Weisssee(Datasets):
dynamic_attributes = ['Precipitation_measurements',
'long_wave_upward_radiation',
'snow_density_at_30cm',
'long_wave_downward_radiation'
]
url = '10.1594/PANGAEA.898217'
def __init__(self, path=None, overwrite=False, **kwargs):
super(Weisssee, self).__init__(path=path, **kwargs)
self.ds_dir = path
self.download_from_pangaea(overwrite=overwrite)
def fetch(self, **kwargs):
"""
Examples
--------
>>> from ai4water.datasets import Weisssee
>>> dataset = Weisssee()
>>> data = dataset.fetch()
"""
data = {}
for f in self.data_files:
fpath = os.path.join(self.ds_dir, f)
df = pd.read_csv(fpath, **kwargs)
if 'index_col' in kwargs:
df.index = pd.to_datetime(df.index)
data[f.split('.txt')[0]] = df
return data
class ETP_CHN_SEBAL(Datasets):
url = "https://zenodo.org/record/4218413#.YBNhThZS-Ul"
class ISWDC(Datasets):
url = "https://zenodo.org/record/2616035#.YBNl5hZS-Uk"
class WQJordan(Weisssee):
"""Jordan River water quality data of 9 variables for two variables."""
url = 'https://doi.pangaea.de/10.1594/PANGAEA.919103'
class WQJordan2(Weisssee):
"""Stage and Turbidity data of Jordan River"""
url = '10.1594/PANGAEA.919104'
class YamaguchiClimateJp(Weisssee):
"""Daily climate and flow data of Japan from 2006 2018"""
url = "https://doi.pangaea.de/10.1594/PANGAEA.909880"
class FlowBenin(Weisssee):
"""Flow data"""
url = "10.1594/PANGAEA.831196"
class HydrometricParana(Weisssee):
"""Daily and monthly water level and flow data of Parana river Argentina
from 1875 to 2017."""
url = "https://doi.pangaea.de/10.1594/PANGAEA.882613"
class RiverTempSpain(Weisssee):
"""Daily mean stream temperatures in Central Spain for different periods."""
url = "https://doi.pangaea.de/10.1594/PANGAEA.879494"
class WQCantareira(Weisssee):
"""Water quality and quantity primary data from field campaigns in the Cantareira Water Supply System,
period Oct. 2013 - May 2014"""
url = "https://doi.pangaea.de/10.1594/PANGAEA.892384"
class RiverIsotope(Weisssee):
"""399 δ18O and δD values in river surface waters of Indian River"""
url = "https://doi.pangaea.de/10.1594/PANGAEA.912582"
class EtpPcpSamoylov(Weisssee):
"""Evpotranspiration and Precipitation at station TOWER on Samoylov Island Russia
from 20110524 to 20110819 with 30 minute frequency"""
url = "10.1594/PANGAEA.811076"
class FlowSamoylov(Weisssee):
"""Net lateral flow at station INT2 on Samoylov Island Russia
from 20110612 to 20110819 with 30 minute frequency"""
url = "10.1594/PANGAEA.811072"
class FlowSedDenmark(Weisssee):
"""Flow and suspended sediment concentration fields over tidal bedforms, ADCP profile"""
url = "10.1594/PANGAEA.841977"
class StreamTempSpain(Weisssee):
"""Daily Mean Stream Temperature at station Tormes3, Central Spain from 199711 to 199906."""
url = "https://doi.pangaea.de/10.1594/PANGAEA.879507"
class RiverTempEroo(Weisssee):
"""Water temperature records in the Eroo River and some tributaries (Selenga River basin, Mongolia, 2011-2012)"""
url = "10.1594/PANGAEA.890070"
class HoloceneTemp(Weisssee):
"""Holocene temperature reconstructions for northeastern North America and the northwestern Atlantic,
core Big_Round_Lake."""
url = "10.1594/PANGAEA.905446"
class FlowTetRiver(Weisssee):
"""Daily mean river discharge at meteorological station Perpignan upstream, Têt basin France from 1980
to 2000."""
url = "10.1594/PANGAEA.226925"
class SedimentAmersee(Weisssee):
"""Occurence of flood laminae in sediments of Ammersee"""
url = "10.1594/PANGAEA.746240"
class HydrocarbonsGabes(Weisssee):
"""Concentration and geological parameters of n-alkanes and n-alkenes in surface sediments from the Gulf of Gabes,
Tunisia"""
url = "10.1594/PANGAEA.774595"
class WaterChemEcuador(Weisssee):
"""weekly and biweekly Water chemistry of cloud forest streams at baseflow conditions,
Rio San Francisco, Ecuador"""
url = "10.1594/PANGAEA.778629"
class WaterChemVictoriaLakes(Weisssee):
"""Surface water chemistry of northern Victoria Land lakes"""
url = "10.1594/PANGAEA.807883"
class HydroChemJava(Weisssee):
"""Hydrochemical data from subsurface rivers, coastal and submarine springsin a karstic region
in southern Java."""
url = "10.1594/PANGAEA.882178"
class PrecipBerlin(Weisssee):
"""Sub-hourly Berlin Dahlem precipitation time-series 2001-2013"""
url = "10.1594/PANGAEA.883587"
class GeoChemMatane(Weisssee):
"""Geochemical data collected in shallow groundwater and river water in a subpolar environment
(Matane river, QC, Canada)."""
url = "10.1594/PANGAEA.908290"
class HydroMeteorAndes(Datasets):
"""Hydrometeriological dataset of tropical Andes region"""
url = ["https://springernature.figshare.com/ndownloader/files/10514506",
"https://springernature.figshare.com/ndownloader/files/10514509"]
class WeatherJena(Datasets):
"""
10 minute weather dataset of Jena, Germany hosted at https://www.bgc-jena.mpg.de/wetter/index.html
from 2002 onwards.
Examples
--------
>>> from ai4water.datasets import WeatherJena
>>> dataset = WeatherJena()
>>> data = dataset.fetch()
>>> data.sum()
"""
url = "https://www.bgc-jena.mpg.de/wetter/weather_data.html"
def __init__(self,
path=None,
obs_loc='roof'):
"""
The data is collected at three different locations, i.e. roof, soil and saale (hall).
Parameters
----------
obs_loc : str, optional (default=roof)
location of observation. It can be one of the following:
- roof
- soil
- saale
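Examples
--------
A small usage sketch for selecting a non-default observation location;
the data for that location is downloaded on first use:
>>> from ai4water.datasets import WeatherJena
>>> dataset = WeatherJena(obs_loc='soil')
>>> data = dataset.fetch()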
"""
if obs_loc not in ['roof', 'soil', 'saale']:
raise ValueError(f"obs_loc must be one of 'roof', 'soil' or 'saale' but it is {obs_loc}")
self.obs_loc = obs_loc
super().__init__(path=path)
self.ds_dir = path
sub_dir = os.path.join(self.ds_dir, self.obs_loc)
if not os.path.exists(sub_dir):
os.makedirs(sub_dir)
if xr is None:
warnings.warn("""
loading data from csv files is slow.
Try installing xarray and netcdf for faster loading
""")
download_all_http_directory(self.url, sub_dir, match_name=self.obs_loc)
unzip_all_in_dir(sub_dir, 'zip')
else:
nc_path = os.path.join(sub_dir, "data.nc")
if not os.path.exists(nc_path):
download_all_http_directory(self.url, sub_dir, match_name=self.obs_loc)
unzip_all_in_dir(sub_dir, 'zip')
print("converting data to netcdf file. This will happen only once.")
df = self._read_as_df()
ndf = pd.DataFrame()
for _col in df.columns:
col = _col.replace("/", "_")
ndf[col] = df[_col].copy()
ndf = ndf.reset_index()
ndf.to_xarray().to_netcdf(nc_path)
@property
def dynamic_features(self)->list:
"""returns names of features available"""
return self.fetch().columns.tolist()
def fetch(
self,
st: Union[str, int, pd.DatetimeIndex] = None,
en: Union[str, int, pd.DatetimeIndex] = None
) -> pd.DataFrame:
"""
Fetches the time series data for the given period as a pandas DataFrame.
Parameters
----------
st : Optional
start of data to be fetched. If None, the data will be returned from
the start (2003-01-01).
en : Optional
end of data to be fetched. If None, the data will be returned till the
end (2021-12-31).
Returns
-------
pd.DataFrame
a pandas dataframe of shape (972111, 21)
Examples
--------
>>> from ai4water.datasets import WeatherJena
>>> dataset = WeatherJena()
>>> data = dataset.fetch()
>>> data.shape
(972111, 21)
... # get data between specific period
>>> data = dataset.fetch("20110101", "20201231")
>>> data.shape
(525622, 21)
"""
sub_dir = os.path.join(self.ds_dir, self.obs_loc)
if xr is None:
df = self._read_as_df()
else:
nc_path = os.path.join(sub_dir, "data.nc")
df = xr.load_dataset(nc_path).to_dataframe()
if 'Date Time' in df:
df.index = pd.to_datetime(df.pop('Date Time'))
if isinstance(st, int):
if en is None:
en = len(df)
assert isinstance(en, int)
return df.iloc[st:en]
elif st is not None:
return df.loc[st:en]
return df
def _read_as_df(self)->pd.DataFrame:
sub_dir = os.path.join(self.ds_dir, self.obs_loc)
all_files = glob.glob(f"{sub_dir}/*.csv")
df = pd.DataFrame()
for fpath in all_files:
f_df = pd.read_csv(fpath, index_col='Date Time',
encoding='unicode_escape', na_values=-9999)
f_df.index = pd.DatetimeIndex(f_df.index)
df = pd.concat([df, f_df]) # todo, such concatenation is slow.
return df.sort_index()
class SWECanada(Datasets):
"""
Daily Canadian historical Snow Water Equivalent dataset from 1928 to 2020
from Brown_ et al., 2019.
Examples
--------
>>> from ai4water.datasets import SWECanada
>>> swe = SWECanada()
... # get names of all available stations
>>> stns = swe.stations()
>>> len(stns)
2607
... # get data of one station
>>> df1 = swe.fetch('SCD-NS010')
>>> df1['SCD-NS010'].shape
(33816, 3)
... # get data of 5 stations
>>> df5 = swe.fetch(5, st='20110101')
>>> df5.keys()
['YT-10AA-SC01', 'ALE-05CA805', 'SCD-NF078', 'SCD-NF086', 'INA-07RA01B']
>>> [v.shape for v in df5.values()]
[(3500, 3), (3500, 3), (3500, 3), (3500, 3), (3500, 3)]
... # get data of 0.1% of stations
>>> df2 = swe.fetch(0.001, st='20110101')
... # get data of one station starting from 2011
>>> df3 = swe.fetch('ALE-05AE810', st='20110101')
>>> df3.keys()
['ALE-05AE810']
>>> df4 = swe.fetch(stns[0:10], st='20110101')
.. _Brown:
https://doi.org/10.1080/07055900.2019.1598843
"""
url = "https://doi.org/10.5194/essd-2021-160"
features = ['snw', 'snd', 'den']
q_flags = ['data_flag_snw', 'data_flag_snd', 'qc_flag_snw', 'qc_flag_snd']
def __init__(self, path=None, **kwargs):
super().__init__(path=path, **kwargs)
self.ds_dir = path
self._download()
def stations(self) -> list:
nc = netCDF4.Dataset(os.path.join(self.ds_dir, 'CanSWE-CanEEN_1928-2020_v1.nc'))
s = nc['station_id'][:]
return s.tolist()
@property
def start(self):
return '19280101'
@property
def end(self):
return '20200731'
def fetch(
self,
station_id: Union[None, str, float, int, list] = None,
features: Union[None, str, list] = None,
q_flags: Union[None, str, list] = None,
st=None,
en=None
) -> dict:
"""
Fetches time series data from selected stations.
Parameters
----------
station_id :
station/stations to be retrieved. If None, then data
from all stations will be returned.
features :
Names of features to be retrieved. Following features
are allowed:
- ``snw`` snow water equivalent kg/m2
- ``snd`` snow depth m
- ``den`` snowpack bulk density kg/m3
If None, then all three features will be retrieved.
q_flags :
If None, then no qflags will be returned. Following q_flag
values are available.
- ``data_flag_snw``
- ``data_flag_snd``
- ``qc_flag_snw``
- ``qc_flag_snd``
st :
start of data to be retrieved
en :
end of data to be retrieved.
Returns
-------
dict
a dictionary of dataframes of shape (st:en, features + q_flags) whose
length is equal to the number of stations being considered.
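Examples
--------
A sketch of fetching selected features for a single station; the station
name is taken from the class-level examples above:
>>> from ai4water.datasets import SWECanada
>>> swe = SWECanada()
>>> data = swe.fetch('SCD-NS010', features=['snw', 'snd'], st='20000101')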
"""
# todo, q_flags not working
if station_id is None:
station_id = self.stations()
elif isinstance(station_id, str):
station_id = [station_id]
elif isinstance(station_id, list):
pass
elif isinstance(station_id, int):
station_id = random.sample(self.stations(), station_id)
elif isinstance(station_id, float):
num_stations = int(len(self.stations()) * station_id)
station_id = random.sample(self.stations(), num_stations)
stns = self.stations()
stn_id_dict = {k: v for k, v in zip(stns, np.arange(len(stns)))}
stn_id_dict_inv = {v: k for k, v in stn_id_dict.items()}
stn_ids = [stn_id_dict[i] for i in station_id]
features = check_attributes(features, self.features)
qflags = []
if q_flags is not None:
qflags = check_attributes(q_flags, self.q_flags)
features_to_fetch = features + qflags
all_stn_data = {}
for stn in stn_ids:
stn_df = self.fetch_station_attributes(stn, features_to_fetch, st=st, en=en)
all_stn_data[stn_id_dict_inv[stn]] = stn_df
return all_stn_data
def fetch_station_attributes(self,
stn,
features_to_fetch,
st=None,
en=None,
) -> pd.DataFrame:
"""fetches attributes of one station"""
# st, en = self._check_length(st, en)
nc = netCDF4.Dataset(os.path.join(self.ds_dir, 'CanSWE-CanEEN_1928-2020_v1.nc'))
stn_df = pd.DataFrame(columns=features_to_fetch)
for var in nc.variables:
if var in features_to_fetch:
ma = np.array(nc[var][:])
ma[ma == nc[var]._FillValue] = np.nan
ta = ma[stn, :]  # target array of one station
s = pd.Series(ta, index=pd.date_range(self.start, self.end, freq='D'), name=var)
stn_df[var] = s[st:en]
nc.close()
return stn_df
class RRLuleaSweden(Datasets):
"""
Rainfall runoff data for an urban catchment from 2016-2019 following the work
of Broekhuizen et al., 2020 [11]_.
.. [11] https://doi.org/10.5194/hess-24-869-2020
"""
url = "https://zenodo.org/record/3931582"
def __init__(self, path=None, **kwargs):
super().__init__(path=path, **kwargs)
self.ds_dir = path
self._download()
def fetch(
self,
st: Union[str, int, pd.DatetimeIndex] = None,
en: Union[str, int, pd.DatetimeIndex] = None
):
"""fetches rainfall runoff data
Parameters
----------
st : optional
start of data to be fetched. By default the data starts from
2016-06-16 20:50:00
en : optional
end of data to be fetched. By default the end is 2019-09-15 18:41
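Examples
--------
A minimal usage sketch; fetch() simply returns the flow and precipitation
dataframes together:
>>> from ai4water.datasets import RRLuleaSweden
>>> dataset = RRLuleaSweden()
>>> flow, pcp = dataset.fetch()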
"""
flow = self.fetch_flow(st,en)
pcp = self.fetch_pcp(st, en)
return flow, pcp
def fetch_flow(
self,
st: Union[str, int, pd.DatetimeIndex] = None,
en: Union[str, int, pd.DatetimeIndex] = None
)->pd.DataFrame:
"""fetches flow data
Parameters
----------
st : optional
start of data to be fetched. By default the data starts from
2016-06-16 20:50:00
en : optional
end of data to be fetched. By default the end is 2019-09-15 18:35:00
Returns
-------
pd.DataFrame
a dataframe of shape (37_618, 3) where the columns are velocity,
level and flow rate
Examples
--------
>>> from ai4water.datasets import RRLuleaSweden
>>> dataset = RRLuleaSweden()
>>> flow = dataset.fetch_flow()
>>> flow.shape
(37618, 3)
"""
fname = os.path.join(self.ds_dir, "flow_2016_2019.csv")
df = pd.read_csv(fname, sep=";")
df.index = pd.to_datetime(df.pop("time"))
return check_st_en(df, st, en)
def fetch_pcp(
self,
st: Union[str, int, pd.DatetimeIndex] = None,
en: Union[str, int, pd.DatetimeIndex] = None
)->pd.DataFrame:
"""fetches precipitation data
Parameters
----------
st : optional
start of data to be fetched. By default the data starts from
2016-06-16 19:48:00
en : optional
end of data to be fetched. By default the end is 2019-10-26 23:59:00
Returns
-------
pd.DataFrame
a dataframe of shape (967_080, 1)
Examples
--------
>>> from ai4water.datasets import RRLuleaSweden
>>> dataset = RRLuleaSweden()
>>> pcp = dataset.fetch_pcp()
>>> pcp.shape
(967080, 1)
"""
fname = os.path.join(self.ds_dir, "prec_2016_2019.csv")
df = pd.read_csv(fname, sep=";")
df.index = pd.to_datetime(df.pop("time"))
return check_st_en(df, st, en)
class RRAlpineCatchments(Datasets):
"""
Modelled runoff in contrasting Alpine catchments in Austria from 1981 to 2100
using 14 models following the work of Hanus et al., 2021 [12]_.
past : 1981 - 2010
future : up to 2100
.. [12] https://hess.copernicus.org/preprints/hess-2021-92/
"""
url = "https://zenodo.org/record/4539986"
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._download()
class ETPAgroForestGermany(Datasets):
"""
Evapotranspiration over agroforestry sites in Germany
https://doi.org/10.5194/bg-17-5183-2020
SiteName_Landuse_Content_Figures_Tables.csv
"""
url = "https://zenodo.org/record/4038399"
class ETPTelesinaItaly(Datasets):
"""
Daily rain and reference evapotranspiration for three years 2002-2004
"""
url = "https://zenodo.org/record/3726856"
def mg_photodegradation(
inputs: list = None,
target: str = "Efficiency (%)",
encoding:str = None
)->Tuple[pd.DataFrame,
Union[LabelEncoder, OneHotEncoder, Any],
Union[LabelEncoder, OneHotEncoder, Any]]:
"""
This data is about photocatalytic degradation of malachite green dye using
noble metal doped BiFeO3. For further description of this data see
`Jafari et al., 2023 <https://doi.org/10.1016/j.jhazmat.2022.130031>`_ and
for the use of this data for removal efficiency prediction `see <https://github.com/ZeeshanHJ/Photocatalytic_Performance_Prediction>`_ .
This dataset consists of 1200 points collected during ~135 experiments.
Parameters
----------
inputs : list, optional
features to use as input. By default following features are used as input
- ``Catalyst_type``
- ``Surface area``
- ``Pore Volume``
- ``Catalyst_loading (g/L)``
- ``Light_intensity (W)``
- ``time (min)``
- ``solution_pH``
- ``HA (mg/L)``
- ``Anions``
- ``Ci (mg/L)``
- ``Cf (mg/L)``
target : str, optional, default="Efficiency (%)"
features to use as target. By default ``Efficiency (%)`` is used as target
which is photodegradation removal efficiency of dye from wastewater. Following
are valid target names
- ``Efficiency (%)``
- ``k_first``
- ``k_2nd``
encoding : str, default=None
type of encoding to use for the two categorical features i.e., ``Catalyst_type``
and ``Anions``, to convert them into numerical. Available options are ``ohe``,
``le`` and None. If ``ohe`` is selected, the original input columns are replaced
with one hot encoded columns. This will result in 6 columns for Anions and
15 columns for Catalyst_type.
Returns
-------
data : pd.DataFrame
a pandas dataframe consisting of input and output features. The default
setting will result in dataframe shape of (1200, 12)
cat_encoder :
catalyst encoder
an_encoder :
encoder for anions
Examples
--------
>>> from ai4water.datasets import mg_photodegradation
>>> mg_data, catalyst_encoder, anion_encoder = mg_photodegradation()
>>> mg_data.shape
(1200, 12)
... # the default encoding is None, but if we want to use one hot encoder
>>> mg_data_ohe, cat_enc, an_enc = mg_photodegradation(encoding="ohe")
>>> mg_data_ohe.shape
(1200, 31)
>>> cat_enc.inverse_transform(mg_data_ohe.iloc[:, 9:24].values)
>>> an_enc.inverse_transform(mg_data_ohe.iloc[:, 24:30].values)
... # if we want to use label encoder
>>> mg_data_le, cat_enc, an_enc = mg_photodegradation(encoding="le")
>>> mg_data_le.shape
(1200, 12)
>>> cat_enc.inverse_transform(mg_data_le.iloc[:, 9].values.astype(int))
>>> an_enc.inverse_transform(mg_data_le.iloc[:, 10].values.astype(int))
... # By default the target is efficiency but if we want
... # to use first order k as target
>>> mg_data_k, _, _ = mg_photodegradation(target="k_first")
... # if we want to use 2nd order k as target
>>> mg_data_k2, _, _ = mg_photodegradation(target="k_2nd")
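... # a sketch of using only a subset of the input features; the feature
... # names below are taken from the default input list documented above
>>> mg_sub, _, _ = mg_photodegradation(inputs=["time (min)", "solution_pH", "Ci (mg/L)"])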
"""
df = pd.read_csv(
"https://raw.githubusercontent.com/ZeeshanHJ/Photocatalytic_Performance_Prediction/main/Raw%20data.csv"
)
default_inputs = ['Surface area', 'Pore Volume', 'Catalyst_loading (g/L)',
'Light_intensity (W)', 'time (min)', 'solution_pH', 'HA (mg/L)',
'Ci (mg/L)', 'Cf (mg/L)', 'Catalyst_type', 'Anions',
]
default_targets = ['Efficiency (%)', 'k_first', 'k_2nd']
# first order
df["k_first"] = np.log(df["Ci (mg/L)"] / df["Cf (mg/L)"]) / df["time (min)"]
# k second order
df["k_2nd"] = ((1 / df["Cf (mg/L)"]) - (1 / df["Ci (mg/L)"])) / df["time (min)"]
if inputs is None:
inputs = default_inputs
if not isinstance(target, list):
if isinstance(target, str):
target = [target]
elif isinstance(target, list):
pass
else:
target = default_targets
assert isinstance(target, list)
assert all(trgt in default_targets for trgt in target)
df = df[inputs + target]
# consider encoding of categorical features
cat_encoder, an_encoder = None, None
if encoding:
df, cols_added, cat_encoder = encode_column(df, "Catalyst_type", encoding)
df, an_added, an_encoder = encode_column(df, "Anions", encoding)
# move the target to the end
for t in target:
df[t] = df.pop(t)
return df, cat_encoder, an_encoder
def gw_punjab(
data_type:str = "full",
country:str = None,
)->pd.DataFrame:
"""
Groundwater level (meters below ground level) dataset from Punjab region
(Pakistan and north-west India) following the study of MacAllister_ et al., 2022.
Parameters
----------
data_type : str (default="full")
either ``full`` or ``LTS``. The ``full`` contains the
full dataset, there are 68783 rows of observed groundwater level data from
4028 individual sites. In ``LTS`` there are 7547 rows of groundwater
level observations from 130 individual sites, which have water level data available
for a period of more than 40 years and from which at least two thirds of the
annual observations are available.
country : str (default=None)
the country for which data to retrieve. Either ``PAK`` or ``IND``.
Returns
-------
pd.DataFrame
a pandas DataFrame with datetime index
Examples
---------
>>> from ai4water.datasets import gw_punjab
>>> full_data = gw_punjab()
... # find out the earliest observation
>>> print(full_data.sort_index().head(1))
>>> full_data.shape
(68782, 4)
>>> df_pak = gw_punjab(country="PAK")
>>> df_pak.sort_index().dropna().head(1)
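... # a sketch of fetching only the long term (LTS) subset of the data
>>> lts = gw_punjab(data_type="LTS")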
.. _MacAllister: https://doi.org/10.1038/s41561-022-00926-1
"""
f = 'https://webservices.bgs.ac.uk/accessions/download/167240?fileName=India_Pakistan_WL_NGDC.xlsx'
ds_dir =os.path.join(os.path.dirname(__file__), "data", 'gw_punjab')
if not os.path.exists(ds_dir):
os.makedirs(ds_dir)
fname = os.path.join(ds_dir, "gw_punjab.xlsx")
if not os.path.exists(fname):
print(f"downloading {fname}")
download(f, fname)
assert data_type in ("full", "LTS")
if data_type == "full":
sheet_name = "Full_dataset"
else:
sheet_name = "LTS"
df = pd.read_excel(fname, sheet_name=sheet_name)
if sheet_name == "LTS":
df.iloc[5571, 3] = '01/10/1887'
df.iloc[5572, 3] = '01/10/1892'
df.iloc[6227, 3] = '01/10/1887'
df.iloc[5511, 3] = '01/10/1887'
df.iloc[5512, 3] = '01/10/1892'
df.iloc[6228, 3] = '01/10/1892'
df.index = pd.to_datetime(df.pop("DATE"))
if country:
if country == "PAK":
pak_stations = [st for st in df['OW_ID'].unique() if st.startswith("PAK")]
df = df[df['OW_ID'].isin(pak_stations)]
else:
ind_stations = [st for st in df['OW_ID'].unique() if st.startswith("IND")]
df = df[df['OW_ID'].isin(ind_stations)]
return df
def qe_biochar_ec(
input_features:List[str]=None,
encoding:str = None
)->tuple:
"""
Data of adsorption capacity for removal of emerging pollutants from wastewater
using biochar. For a more detailed description of this data see
`Jaffari et al., 2023 <>`_ .
Parameters
----------
input_features :
By default the following features are used as input
- ``Adsorbent``
- ``Pyrolysis temperature``
- ``Pyrolysis time``
- ``C``
- ``H``
- ``O``
- ``N``
- ``(O+N)/C``
- ``Ash``
- ``H/C``
- ``O/C``
- ``Surface area``
- ``Pore volume``
- ``Average pore size``
- ``Pollutant``
- ``Adsorption time``
- ``Initial concentration``
- ``Solution pH``
- ``RPM``
- ``Volume``
- ``Adsorbent dosage``
- ``Adsorption temperature``
- ``Ion concentration``
- ``Humic acid``
- ``Wastewater type``
- ``Adsorption type``
encoding : str, default=None
the type of encoding to use for categorical features. If not None, it should
be either ``ohe`` or ``le``.
Returns
--------
tuple
Examples
--------
>>> from ai4water.datasets import qe_biochar_ec
>>> data, *_ = qe_biochar_ec()
>>> data.shape
(3757, 27)
>>> data, ads_enc, pol_enc, wwt_enc, adspt_enc = qe_biochar_ec(encoding="le")
>>> data.shape
(3757, 27)
>>> ads_enc.inverse_transform(data.iloc[:, 22].values.astype(int))
>>> pol_enc.inverse_transform(data.iloc[:, 23].values.astype(int))
>>> wwt_enc.inverse_transform(data.iloc[:, 24].values.astype(int))
>>> adspt_enc.inverse_transform(data.iloc[:, 25].values.astype(int))
>>> data, adsp_enc, polt_enc, wwt_enc, adspt_enc = qe_biochar_ec(encoding="ohe")
>>> data.shape
(3757, 58)
>>> adsp_enc.inverse_transform(data.iloc[:, 22:37].values)
>>> polt_enc.inverse_transform(data.iloc[:, 37:51].values)
>>> wwt_enc.inverse_transform(data.iloc[:, 51:55].values)
>>> adspt_enc.inverse_transform(data.iloc[:, 55:-1].values)
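... # a sketch of using only a subset of the input features; the feature
... # names below are taken from the default input list documented above
>>> data_sub, *_ = qe_biochar_ec(input_features=['Surface area', 'Pore volume', 'Solution pH'])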
"""
fpath = os.path.join(os.path.dirname(__file__), 'qe_biochar_ec.csv')
url = 'https://raw.githubusercontent.com/ZeeshanHJ/Adsorption-capacity-prediction-for-ECs/main/Raw_data.csv'
if os.path.exists(fpath):
data = pd.read_csv(fpath)
else:
data = pd.read_csv(url)
# remove space in 'Pyrolysis temperature '
data['Pyrolysis temperature'] = data.pop('Pyrolysis temperature ')
data['Adsorbent'] = data.pop('Adsorbent')
data['Pollutant'] = data.pop('Pollutant')
data['Wastewater type'] = data.pop('Wastewater type')
data['Adsorption type'] = data.pop('Adsorption type')
data['Capacity'] = data.pop('Capacity')
data.to_csv(fpath, index=False)
def_inputs = [
'Pyrolysis temperature',
'Pyrolysis time',
'C',
'H',
'O',
'N',
'(O+N)/C',
'Ash',
'H/C',
'O/C',
'Surface area',
'Pore volume',
'Average pore size',
'Adsorption time',
'Initial concentration',
'Solution pH',
'RPM',
'Volume',
'Adsorbent dosage',
'Adsorption temperature',
'Ion concentration',
'Humic acid',
'Adsorbent',
'Pollutant',
'Wastewater type',
'Adsorption type',
]
if input_features is not None:
assert isinstance(input_features, list)
assert all([feature in def_inputs for feature in input_features])
else:
input_features = def_inputs
data = data[input_features + ['Capacity']]
ads_enc, pol_enc, wwt_enc, adspt_enc = None, None, None, None
if encoding:
data, _, ads_enc = encode_column(data, 'Adsorbent', encoding)
data, _, pol_enc = encode_column(data, 'Pollutant', encoding)
data, _, wwt_enc = encode_column(data, 'Wastewater type', encoding)
data, _, adspt_enc = encode_column(data, 'Adsorption type', encoding)
data['Capacity'] = data.pop('Capacity')
return data, ads_enc, pol_enc, wwt_enc, adspt_enc