# Source code for ai4water.datasets._grqa


# public names exported by this module on ``from ... import *``
__all__ = ["GRQA"]

from typing import Union, List

from ai4water.backend import pd, os

from ._datasets import Datasets
from .utils import check_st_en


class GRQA(Datasets):
    """
    Global River Water Quality Archive following the work of
    Virro et al., 2021 [21]_.

    The data of every water quality parameter is read from its own
    semicolon separated file named ``<parameter>_GRQA.csv``.

    .. [21] https://essd.copernicus.org/articles/13/5483/2021/
    """

    url = 'https://zenodo.org/record/7056647#.YzBzDHZByUk'

    def __init__(
            self,
            download_source: bool = False,
            path=None,
            **kwargs):
        """
        Parameters
        ----------
        download_source : bool
            whether to download source data or not
        path :
            forwarded to the parent class and also stored as ``ds_dir``
        **kwargs :
            any further keyword arguments for the parent class
        """
        super().__init__(path=path, **kwargs)
        self.ds_dir = path

        files = ['GRQA_data_v1.3.zip', 'GRQA_meta.zip']
        if download_source:
            files += ['GRQA_source_data.zip']
        self._download(include=files)

    @property
    def _grqa_csv_dir(self) -> str:
        """directory in which the per-parameter csv files are looked up"""
        return os.path.join(self.ds_dir, "GRQA_data_v1.3", "GRQA_data_v1.3")

    @property
    def files(self):
        """names of all entries found in the csv directory"""
        return os.listdir(self._grqa_csv_dir)

    @property
    def parameters(self):
        """parameter names i.e. the part of every file name before the
        first underscore"""
        return [f.split('_')[0] for f in self.files]

    def fetch_parameter(
            self,
            parameter: str = "COD",
            site_name: Union[List[str], str, None] = None,
            country: Union[List[str], str, None] = None,
            st: Union[int, str, pd.DatetimeIndex, None] = None,
            en: Union[int, str, pd.DatetimeIndex, None] = None,
    ) -> pd.DataFrame:
        """
        Fetches the data of a single parameter, optionally restricted to
        certain sites and/or countries.

        Parameters
        ----------
        parameter : str, optional
            name of parameter. It must be one of :py:attr:`parameters`.
        site_name : str/list, optional
            location for which data is to be fetched.
        country : str/list optional (default=None)
            country (or countries) for which data is to be fetched.
        st : str
            starting date date or index
        en : str
            end date or index

        Returns
        -------
        pd.DataFrame
            a pandas dataframe whose index is built from the ``obs_date``
            and ``obs_time`` columns. These two columns are removed from
            the returned dataframe.

        Raises
        ------
        AssertionError
            if ``parameter`` is not a string or is not among the
            available parameters.

        Example
        --------
        >>> from ai4water.datasets import GRQA
        >>> dataset = GRQA()
        >>> df = dataset.fetch_parameter()

        fetch data for only one country

        >>> cod_pak = dataset.fetch_parameter("COD", country="Pakistan")

        fetch data for only one site

        >>> cod_kotri = dataset.fetch_parameter("COD", site_name="Indus River - at Kotri")

        we can find out the number of data points and sites available for a
        specific country as below

        >>> for para in dataset.parameters:
        ...     data = dataset.fetch_parameter(para, country="Germany")
        ...     if len(data)>0:
        ...         print(f"{para}, {data.shape}, {len(data['site_name'].unique())}")
        """
        assert isinstance(parameter, str), (
            f"parameter must be a string but it is of type {type(parameter)}")
        assert parameter in self.parameters, (
            f"{parameter} is not among the available parameters")

        # a single name is treated as a list with one entry
        if isinstance(site_name, str):
            site_name = [site_name]

        if isinstance(country, str):
            country = [country]

        df = self._load_df(parameter)

        if site_name is not None:
            assert isinstance(site_name, list), (
                f"site_name must be a string or a list but it is of type {type(site_name)}")
            df = df[df['site_name'].isin(site_name)]

        if country is not None:
            assert isinstance(country, list), (
                f"country must be a string or a list but it is of type {type(country)}")
            df = df[df['site_country'].isin(country)]

        # date and time are stored as two separate text columns
        df.index = pd.to_datetime(df.pop("obs_date") + " " + df.pop("obs_time"))

        return check_st_en(df, st, en)

    def _load_df(self, parameter):
        """reads the csv file of ``parameter``. If the class has a method
        named ``_load_<parameter>``, that method is used instead of the
        generic reader."""
        loader = getattr(self, f"_load_{parameter}", None)
        if loader is not None:
            return loader()

        fname = os.path.join(self._grqa_csv_dir, f"{parameter}_GRQA.csv")
        return pd.read_csv(fname, sep=";")

    def _load_DO(self):
        """reads ``DO_GRQA.csv`` line by line. All values in the returned
        dataframe are strings because no type conversion is done."""
        # read_csv is causing mysterious errors
        fpath = os.path.join(self._grqa_csv_dir, "DO_GRQA.csv")

        with open(fpath, 'r', encoding='utf-8') as fp:
            # the line terminator is removed before splitting, otherwise it
            # ends up in the name and in every value of the last column
            rows = [line.rstrip("\r\n").split(';') for line in fp]

        return pd.DataFrame(rows[1:], columns=rows[0])