Source code for hydropandas.io.knmi

"""Module with functions to read or download time series with observations from knmi.

The main function to download time series are:

- get_knmi_timeseries_xy: obtain a knmi station based on the xy location
- get_knmi_timeseries_stn: obtain a knmi station based on the station number

The main function to read time series txt files is:

 - read_knmi_timeseries_file: read a knmi txt file

"""

import datetime as dt
import logging
import os
import re
import tempfile
from functools import lru_cache
from io import StringIO

import numpy as np
import pandas as pd
import requests

from .. import util

logger = logging.getLogger(__name__)

URL_DAILY_PREC = "https://www.daggegevens.knmi.nl/klimatologie/monv/reeksen"
URL_DAILY_METEO = "https://www.daggegevens.knmi.nl/klimatologie/daggegevens"
URL_HOURLY_METEO = "https://www.daggegevens.knmi.nl/klimatologie/uurgegevens"


[docs]def get_knmi_obs(
    stn=None,
    fname=None,
    xy=None,
    meteo_var=None,
    start=None,
    end=None,
    **kwargs,
):
    """get knmi observation from stn, fname or nearest xy coordinates.

    Parameters
    ----------
    stn : int, str or None, optional
        measurement station e.g. 829. The default is None.
    fname : str, path object, file-like object or None, optional
        filename of a knmi file. The default is None.
    xy : list, tuple or None, optional
        RD coördinates of a location in the Netherlands. The station nearest
        to this location used. The Default is None.
    meteo_var : str or None, optional
        meteo variable e.g. "RH" or "EV24". See list with all options in the
        hydropandas documentation.
    start : str, datetime or None, optional
        start date of observations. The default is None.
    end : str, datetime or None, optional
        end date of observations. The default is None.
    **kwargs:
        fill_missing_obs : bool, optional
            if True nan values in time series are filled with nearby time series.
            The default is False.
        interval : str, optional
            desired time interval for observations. Options are 'daily' and
            'hourly'. The default is 'daily'.
        use_api : bool, optional
            if True the api is used to obtain the data, API documentation is here:
                https://www.knmi.nl/kennis-en-datacentrum/achtergrond/data-ophalen-vanuit-een-script
            if False a text file is downloaded into a temporary folder and the
            data is read from there. Default is True since the api is back
            online (July 2021).
        raise_exceptions : bool, optional
            if True you get errors when no data is returned. The default is False.

    Returns
    -------
    pd.DataFrame, measurements
    dict, metadata

    Raises
    ------
    ValueError
        if no meteo_var is given or stn, fname and xy are all None.
    """
    if meteo_var is None and fname is None:
        raise ValueError("To get knmi data a meteo_var should be specified")
    elif meteo_var is not None:
        if not isinstance(meteo_var, str):
            raise (TypeError(f"meteo var should be string not {type(meteo_var)}"))

    settings = _get_default_settings(kwargs)

    start, end = _start_end_to_datetime(start, end)

    if stn is not None:
        stn = int(stn)

        start_str = str(start).replace(" 00:00:00", "")
        end_str = str(end).replace(" 00:00:00", "")

        logger.info(
            f"get data from station {stn} and variable {meteo_var} "
            f"from {start_str} to {end_str}"
        )
        ts, meta = get_knmi_timeseries_stn(stn, meteo_var, settings, start, end)
    elif fname is not None:
        logger.info(f"get KNMI data from file {fname} and meteo variable {meteo_var}")
        ts, meta = get_knmi_timeseries_fname(
            str(fname), meteo_var, settings, start, end
        )
    elif xy is not None:
        logger.info(
            f"get KNMI data from station nearest to coordinates {xy} and meteo"
            f"variable {meteo_var}"
        )
        stns = get_n_nearest_stations_xy(xy, meteo_var, n=1)
        ts, meta = get_knmi_timeseries_stn(stns[0], meteo_var, settings, start, end)
    else:
        raise ValueError(
            "specify KNMI station (stn), filename (fname) or coordinates (xy)"
        )

    return ts, meta


[docs]def get_knmi_timeseries_fname(fname, meteo_var, settings, start, end):
    # first try to figure out filetype by it's name
    if "neerslaggeg" in fname:
        # neerslagstation
        ts, meta = read_knmi_daily_rainfall_file(fname, start, end)
        meteo_var = "RD"
    elif "etmgeg" in fname:
        # meteo station
        ts, meta = read_knmi_daily_meteo_file(fname, meteo_var, start, end)
    elif "uurgeg" in fname:
        # hourly station
        ts, meta = read_knmi_hourly(fname, meteo_var, start, end)
    # if that doesn't work try to figure out by the meteo_var and settings
    elif meteo_var is None or meteo_var == "RD":
        # neerslagstation
        ts, meta = read_knmi_daily_rainfall_file(fname, start, end)
        meteo_var = "RD"
    elif settings["interval"] == "daily":
        # meteo station
        ts, meta = read_knmi_daily_meteo_file(fname, meteo_var, start, end)
    elif settings["interval"] == "hourly":
        # uurlijks station
        ts, meta = read_knmi_hourly(fname, meteo_var, start, end)
    else:
        raise ValueError(
            "please indicate how to read the file by specifying a meteo_var and"
            " an interval"
        )
    stn = meta["station"]
    stations = get_stations(meteo_var=meteo_var)
    stn_name = get_station_name(stn, stations)

    # set metadata
    meta.update(
        {
            "x": stations.loc[stn, "x"],
            "y": stations.loc[stn, "y"],
            "name": f"{meteo_var}_{stn_name}",
            "source": "KNMI",
            "filename": fname,
        }
    )

    return ts, meta


def _get_default_settings(settings=None):
    """adds the default settings to a dictinary with settings. If settings
    is None all the settings are default. If there are already settings given
    only the non-existing settings are added with their default value.

    The default settings are:
    fill_missing_obs = False
        nan values in time series are filled with nearby time series.
    interval = 'daily'
        desired time interval for observations. Can be 'daily' or 'hourly'.
        'hourly' is only for precipitation ('RH') data from meteo stations.
    use_api : bool, optional
        if True the api is used to obtain the data, API documentation is here:
        https://www.knmi.nl/kennis-en-datacentrum/achtergrond/data-ophalen-vanuit-een-script
        if False a text file is downloaded into a temporary folder and the data is read
        from that file
        Default is True since the api is back online (July 2021).
    raise_exceptions : bool, optional
        if True you get errors when no data is returned. The default is False.

    Parameters
    ----------
    settings : dict or None
        settings dictionary without default values.

    Returns
    -------
    settings : dict
        settings dictionary with default values of no values was given in
        the input dictionary.
    """

    default_settings = {
        "fill_missing_obs": False,
        "interval": "daily",
        "use_api": True,
        "raise_exceptions": True,
    }

    if settings is None:
        settings = {}

    if "fill_missing_obs" in settings.keys():
        if "raise_exceptions" in settings.keys():
            logger.debug("set raise_exceptions=False because fill_missing_obs is True")
            if settings["fill_missing_obs"] and settings["raise_exceptions"]:
                settings["raise_exceptions"] = False
        else:
            settings["raise_exceptions"] = False

    for key, value in default_settings.items():
        if key not in settings.keys():
            settings[key] = value

    return settings


def _start_end_to_datetime(start, end):
    """convert start and endtime to datetime.

    Parameters
    ----------
    start : str, datetime, None
        start time
    end : str, datetime, None
        start time

    Returns
    -------
    start : pd.TimeStamp or None
        start time
    end : pd.TimeStamp or None
        end time
    """

    if start is not None:
        start = pd.to_datetime(start)

    if end is not None:
        end = pd.to_datetime(end)

    return start, end


[docs]def get_knmi_timeseries_stn(stn, meteo_var, settings, start=None, end=None):
    """Get a knmi time series and metadata.

    Parameters
    ----------
    stn : int
        measurement station e.g. 829.
    meteo_var : str
        observation type e.g. "RH" or "EV24". See list with all options in the
        hpd.read_knmi function.
    settings : dict
        settings for obtaining the right time series, see _get_default_settings
        for more information
    start : pd.TimeStamp or None, optional
        start date of observations. The default is None.
    end : pd.TimeStamp or None, optional
        end date of observations. The default is None.

    Returns
    -------
    knmi_df : pandas DataFrame
        time series with measurements.
    meta : dictionary
        metadata from the measurement station.
    """

    # get station
    stations = get_stations(meteo_var=meteo_var)
    stn_name = get_station_name(stn=stn, stations=stations)

    # raise error if hourly neerslag station data is requested
    if (meteo_var == "RD") and settings["interval"].startswith("hour"):
        message = (
            "No hourly precipitation data available at precipitation station, "
            "use meteo_var 'RH' to obtain hourly precipitation data from "
            "meteo stations."
        )
        raise ValueError(message)
    elif (
        (meteo_var == "EV24")
        and settings["interval"].startswith("hour")
        and settings["use_api"]
    ):
        message = (
            "No hourly evaporation data available through the api, "
            "set use_api=False."
        )
        raise ValueError(message)
    elif settings["fill_missing_obs"]:
        # download data
        knmi_df, meta = fill_missing_measurements(
            stn, meteo_var, start, end, settings, stn_name
        )

        return knmi_df, meta
    elif meteo_var in ["penman", "makkink", "hargreaves"]:
        # compute evaporation from data
        knmi_df, meta = get_evaporation(meteo_var, stn, start, end, settings)
    else:
        knmi_df, variables, station_meta = download_knmi_data(
            stn, meteo_var, start, end, settings, stn_name
        )

        if str(stn) in station_meta.index:
            meta = station_meta.loc[f"{stn}"].to_dict()
        else:
            meta = {}

        # set metadata
        x = stations.at[stn, "x"] if stn in stations.index else np.nan
        y = stations.at[stn, "y"] if stn in stations.index else np.nan
        meta.update(
            {
                "x": x,
                "y": y,
                "station": stn,
                "name": f"{meteo_var}_{stn_name}",
                "source": "KNMI",
            }
        )
        meta.update(variables)

    return knmi_df, meta


[docs]def get_stations(meteo_var):
    """get knmi stations from json files according to variable.

    Parameters
    ----------
    meteo_var : str, optional
        type of meteodata, by default 'RH'

    Returns
    -------
    pandas DataFrame with stations, names and coordinates (Lat/Lon & RD)
    """

    dir_path = os.path.dirname(os.path.realpath(__file__))

    if meteo_var == "RD":
        # read precipitation station data only
        fname = "../data/knmi_neerslagstation.json"
        stations = pd.read_json(os.path.join(dir_path, fname))
    else:
        fname = "../data/knmi_meteostation.json"
        stations = pd.read_json(os.path.join(dir_path, fname))

    return stations


[docs]def get_station_name(stn, stations=None):
    """Returns the station name from a KNMI station.

    Modifies the station name in such a way that a valid url can be obtained.

    Parameters
    ----------
    stn : int
        station number.
    stations : pandas DataFrame
        DataFrame with the station metadata.

    Returns
    -------
    str or None
        Name of the station or None if station is not found.
    """
    if stations is None:
        stations = pd.concat([get_stations("RD"), get_stations("EV24")], axis=0)

    if stn not in stations.index:
        logger.warning(f"station {stn} not found")
        return None

    stn_name = stations.at[stn, "naam"]
    stn_name = stn_name.upper().replace(" ", "-").replace("(", "").replace(")", "")
    return stn_name


[docs]def fill_missing_measurements(stn, meteo_var, start, end, settings, stn_name=None):
    """fill missing measurements in knmi data.

    Parameters
    ----------
    stn : int
        measurement station.
    meteo_var : str
        observation type.
    start : pd.TimeStamp
        start date of observations.
    end : pd.TimeStamp
        end date of observations.
    settings : dict
        settings for obtaining data.
    stn_name : str, optional
        station name. If None the name is obtained form te stn number. The
        default is None.

    Returns
    -------
    knmi_df : pandas DataFrame
        data from one station from one type of observation, with additional
        column to see which station is used to fill the value
    variables : dictionary
        information about the observerd variables
    stations : pandas DataFrame
        information about the measurement station.
    """
    if settings["interval"] == "hourly":
        raise NotImplementedError("cannot yet fill missing values in hourly data")

    if (start is None) or (end is None):
        raise ValueError(
            "a start and end date have to be specified if fill_missing_obs is True"
        )

    # get the location of the stations
    stations = get_stations(meteo_var=meteo_var)
    if stn_name is None:
        stn_name = get_station_name(stn=stn, stations=stations)

    # download data from station
    knmi_df, variables, station_meta = download_knmi_data(
        stn, meteo_var, start, end, settings, stn_name
    )

    # if the first station cannot be read, read another station as the first
    ignore = [stn]
    while knmi_df.empty:
        logger.info(f"station {stn} has no measurements between {start} and {end}")
        logger.info("trying to get measurements from nearest station")

        stn_lst = get_nearest_station_df(
            stations.loc[[stn]], meteo_var=meteo_var, ignore=ignore
        )
        if stn_lst is None:
            logger.warning(
                f"there is no station with measurements of {meteo_var} between "
                f"{start} and {end}"
            )
            return pd.DataFrame(), dict()

        stn = stn_lst[0]
        stn_name = get_station_name(stn=stn, stations=stations)
        knmi_df, variables, station_meta = download_knmi_data(
            stn, meteo_var, start, end, settings, stn_name
        )
        ignore.append(stn)

    if end > knmi_df.index[-1]:
        # check latest date at which measurements are available at De Bilt
        new_end = _check_latest_measurement_date_de_bilt(
            meteo_var,
            use_api=settings["use_api"],
            start=start if knmi_df.empty else knmi_df.index[-1],
            end=end,
        )
        if new_end < end:
            end = new_end
            logger.warning(f'changing end_date to {end.strftime("%Y-%m-%d")}')

    # find missing values
    knmi_df = _add_missing_indices(knmi_df, stn, start, end)

    missing = knmi_df[meteo_var].isna()
    logger.debug(f"station {stn} has {missing.sum()} missing measurements")

    knmi_df.loc[~missing, "station"] = str(stn)

    # fill missing values
    while np.any(missing) and not np.all(missing):
        stn_comp = get_nearest_station_df(
            stations.loc[[stn]], meteo_var=meteo_var, ignore=ignore
        )

        if stn_comp is None:
            logger.info(
                "could not fill all missing measurements as there are "
                "no stations left to check"
            )

            missing[:] = False
            break
        else:
            stn_comp = stn_comp[0]

        n_missing = missing.sum()
        logger.info(
            f"trying to fill {n_missing} missing measurements with station {stn_comp}"
        )

        stn_name_comp = get_station_name(stn_comp, stations)
        knmi_df_comp, _, __ = download_knmi_data(
            stn_comp, meteo_var, start, end, settings, stn_name_comp
        )

        if knmi_df_comp.empty:
            logger.warning(f"station {stn_comp} cannot be downloaded")

        else:
            # dropnans from new data
            knmi_df_comp = knmi_df_comp.loc[~knmi_df_comp[meteo_var].isna(), :]
            # get index of missing data in original timeseries
            missing_idx = missing.loc[missing].index
            # if any missing are in the new data, update
            if missing_idx.isin(knmi_df_comp.index).any():
                # index for missing but in newly downloaded data
                ix_idx = missing_idx.intersection(knmi_df_comp.index)
                # update missing data
                knmi_df.loc[ix_idx, meteo_var] = knmi_df_comp.loc[ix_idx, meteo_var]
                # add source station number
                knmi_df.loc[ix_idx, "station"] = str(stn_comp)

        missing = knmi_df[meteo_var].isna()
        ignore.append(stn_comp)

    if str(stn) in station_meta.index:
        meta = station_meta.loc[f"{stn}"].to_dict()
    else:
        meta = {}

    # set metadata
    meta.update(
        {
            "x": stations.loc[stn, "x"],
            "y": stations.loc[stn, "y"],
            "station": stn,
            "name": f"{meteo_var}_{stn_name}",
            "source": "KNMI",
        }
    )

    meta.update(variables)

    return knmi_df, meta


[docs]def download_knmi_data(stn, meteo_var, start, end, settings, stn_name=None):
    """download knmi data of a measurements station for certain observation
    type.

    Parameters
    ----------
    stn : int
        measurement station.
    meteo_var : str
        observation type.
    start : pd.TimeStamp or None
        start date of observations.
    end : pd.TimeStamp or None
        end date of observations.
    settings : dict
        settings for obtaining data
    stn_name : str, optional
        station name. If None the name is obtained form te stn number. The
        default is None.

    Raises
    ------
    ValueError
        if the data from knmi cannot not be read a ValueError is raised.
        Unless raise_exceptions is False

    Returns
    -------
    knmi_df : pandas DataFrame
        data from one station from one type of observation
    variables : dictionary
        information about the observerd variables
    stations : pandas DataFrame
        information about the measurement station.
    """

    logger.debug(
        f"download KNMI {meteo_var} data from station "
        f"{stn}-{stn_name} between {start} and {end}"
    )

    # define variables
    knmi_df = pd.DataFrame()
    variables = {}
    stations = pd.DataFrame()

    # download and read data
    try:
        try:
            if settings["use_api"]:
                if settings["interval"].startswith("hour"):
                    # hourly data from meteorological stations
                    knmi_df, variables = get_knmi_hourly_api(stn, meteo_var, start, end)
                elif meteo_var == "RD":
                    # daily data from rainfall-stations
                    knmi_df, variables = get_knmi_daily_rainfall_api(stn, start, end)
                else:
                    # daily data from meteorological stations
                    knmi_df, variables, stations = get_knmi_daily_meteo_api(
                        stn, meteo_var, start, end
                    )

        except (RuntimeError, requests.ConnectionError) as e:
            logger.warning(
                "KNMI API failed, try setting the 'use_api' argument to 'False'"
            )
            if settings["raise_exceptions"]:
                raise e
            logger.info("Try non api method")
            settings = settings.copy()
            settings["use_api"] = False

        if not settings["use_api"]:
            if settings["interval"].startswith("hour"):
                # hourly data from meteorological stations
                raise NotImplementedError()
            elif meteo_var == "RD":
                # daily data from rainfall-stations
                knmi_df, variables = get_knmi_daily_rainfall_url(
                    stn, stn_name, start, end
                )
            else:
                # daily data from meteorological stations
                knmi_df, variables = get_knmi_daily_meteo_url(
                    stn, meteo_var, start, end
                )
    except (ValueError, KeyError) as e:
        logger.error(e)
        if settings["raise_exceptions"]:
            raise ValueError(e)

    if knmi_df.empty:
        logger.debug(
            "no measurements found for station "
            f"{stn}-{stn_name} between {start} and {end}"
        )

    return knmi_df, variables, stations


[docs]@lru_cache()
def get_knmi_daily_rainfall_api(stn, start=None, end=None):
    """download and read knmi daily rainfall.

    Parameters
    ----------
    stn : int
        station number.
    start : pd.TimeStamp or None
        start time of observations.
    end : pd.TimeStamp or None
        end time of observations.

    Raises
    ------
    ValueError
        if there is no data for the provided stn an error is raised.

    Returns
    -------
    pandas DataFrame
        measurements.
    variables : dictionary
        additional information about the variables
    """
    meteo_var = "RD"
    url = URL_DAILY_PREC
    params = {"vars": meteo_var, "stns": str(stn)}

    if start is not None:
        params["start"] = start.strftime("%Y%m%d")
    if end is not None:
        params["end"] = end.strftime("%Y%m%d")

    result = requests.get(url, params=params)

    if result.status_code != 200:
        raise requests.ConnectionError(f"Cannot connect to {url}")

    result_str = result.text

    if result_str.startswith("<!DOCTYPE html>"):
        raise RuntimeError("KNMI API down")

    f = StringIO(result_str)
    knmi_df, variables = read_knmi_daily_rainfall(f, meteo_var)
    if stn not in knmi_df["STN"].unique():
        return pd.DataFrame(), variables

    return knmi_df[[meteo_var]], variables


[docs]@lru_cache()
def get_knmi_daily_rainfall_url(stn, stn_name, start=None, end=None):
    """download and read knmi daily rainfall.

    Parameters
    ----------
    stn : int
        station number.
    stn_name : str
        the name of the station in capital letters, can be tricky
    start : pd.TimeStamp or None
        start time of observations.
    end : pd.TimeStamp or None
        end time of observations.

    Raises
    ------
    ValueError
        if there is no data for the provided stn an error is raised.

    Returns
    -------
    pandas DataFrame
        measurements.
    variables : dictionary
        additional information about the variables
    """
    if end is not None:
        end = end + dt.timedelta(days=1)

    stn = str(stn)
    url = (
        "https://cdn.knmi.nl/knmi/map/page/klimatologie/"
        f"gegevens/monv_reeksen/neerslaggeg_{stn_name}_{stn}.zip"
    )

    # get file name
    basedir = os.path.join(tempfile.gettempdir(), "knmi")
    if not os.path.isdir(basedir):
        os.mkdir(basedir)
    fname_zip = os.path.join(tempfile.gettempdir(), "knmi", f"neerslaggeg_{stn}.zip")
    fname_dir = os.path.join(tempfile.gettempdir(), "knmi", f"neerslaggeg_{stn}")
    fname_txt = os.path.join(fname_dir, f"neerslaggeg_{stn_name}_{stn}.txt")

    # download zip file
    r = requests.get(url, stream=True)
    if r.status_code != 200:
        raise ValueError(f"invalid url {url} please check station name {stn_name}")
    with open(fname_zip, "wb") as fd:
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)

    # unzip file
    util.unzip_file(fname_zip, fname_dir, force=True, preserve_datetime=True)

    return read_knmi_daily_rainfall_file(fname_txt, start=start, end=end)


[docs]def read_knmi_daily_rainfall_file(fname_txt, start=None, end=None):
    """read a knmi file with daily rainfall data.

    Parameters
    ----------
    path : str
        file path of a knmi .txt file.
    start : pd.TimeStamp or None
        start time of observations.
    end : pd.TimeStamp or None
        end time of observations.

    Returns
    -------
    pandas DataFrame
        measurements.
    variables : dictionary
        additional information about the variables

    """
    meteo_var = "RD"

    with open(fname_txt, "r") as f:
        line = f.readline()
        # get meteo var
        for _ in range(50):
            if "RD   " in line:  # RD komt te vaak voor vandaar de spaties
                key, item = line.split("=")
                variables = {key.strip(): item.strip()}
                break
            line = f.readline()

        # get dataframe
        for _ in range(50):
            if "STN" in line:
                columns = line.strip("# ").strip("\n").split(",")
                columns = [x.strip(" ") for x in columns]
                values = f.readline()
                if values == "\n":
                    values = f.readline()
            line = f.readline()

        df = pd.read_csv(f, header=None, names=columns, na_values="     ")

        df.set_index(pd.to_datetime(df.YYYYMMDD, format="%Y%m%d"), inplace=True)
        df = df.drop("YYYYMMDD", axis=1)

        if df.index.duplicated().sum() > 0:
            df = df.loc[~df.index.duplicated(keep="first")]
            logger.info("duplicate indices removed from RD measurements")

        # sometimes the last row is empty, check for that and remove it
        if not df.empty:
            if df.iloc[-1].isna().any():
                logger.debug("duplicate indices removed from RD measurements")
                df = df.drop(index=df.index[-1])
                df.loc[:, meteo_var] = df[meteo_var].astype(float)

        df, variables = _transform_variables(df, variables)
        variables["unit"] = "m"

        # add station to variables
        unique_stn = df["STN"].unique()
        if len(unique_stn) > 1:
            raise ValueError(f"multiple stations in single file {unique_stn}")
        else:
            variables["station"] = df["STN"].iloc[0]

    return df.loc[start:end, [meteo_var]], variables


def _read_knmi_header(f, meteo_var):
    variables = {}
    line = f.readline()
    for iline in range(500):
        if " = " in line or " : " in line:
            line = line.lstrip(" #").strip("\n")
            if " = " in line:
                varDes = line.split(" = ")
            else:
                varDes = line.split(" : ")
            if varDes[0].strip() == meteo_var:
                variables[varDes[0].strip()] = varDes[1].strip()

        if "STN,YY" in line:
            header = line.replace("#", "").split(",")
            header = [item.lstrip().rstrip() for item in header]
            break

        line = f.readline()

    if iline > 498:
        raise ValueError("cannot read measurements from file")

    return f, variables, header


def _transform_variables(df, variables):
    """Transforms the timeseries to default units and settings.

    Does 3 things:
        1. all values equal to -1 are converted to zero
        2. the units are changed from 0.1 mm to 1 mm.
        3. the units are changed from mm to m.

    Parameters
    ----------
    df : pandas DataFrame
        time series.
    variables : dictionary
        description of variables in time series.

    Raises
    ------
    NameError
        if there are columns in the DataFrame and no matching key in the
        variables dictionary.

    Returns
    -------
    df : pandas DataFrame
        time series.
    variables : dictionary
        description of variables in time series.
    """

    for key, value in variables.items():
        # test if key existst in data
        if key not in df.columns:
            if key == "YYYYMMDD" or key == "HH":
                pass
            elif key == "T10N":
                variables.pop(key)
                key = "T10"
            else:
                raise NameError(key + " does not exist in data")

        if "(-1 voor <0.05 mm)" in value:
            # remove -1 for precipitation smaller than <0.05 mm
            df.loc[df.loc[:, key] == -1, key] = 0.0
            value = value.replace("(-1 voor <0.05 mm)", "(0 voor <0.05mm)").replace(
                "(-1 for <0.05 mm)", "(0 for <0.05mm)"
            )
            logger.debug(f"transform {key}, {value} lower than 0.05 mm to 0 mm")

        if "0.1 " in value:
            logger.debug(f"transform {key}, {value} from 0.1 to 1")

            df[key] = df[key] * 0.1
            value = value.replace("0.1 ", "")
        if " tiende " in value:
            logger.debug(f"transform {key}, {value} from 0.1 to 1")

            df[key] = df[key] * 0.1
            value = value.replace(" tiende ", " ")
        if " mm" in value:
            logger.debug(f"transform {key}, {value} from mm to m")

            df[key] = df[key] * 0.001
            value = value.replace(" mm", " m")
        if " millimeters" in value:
            logger.debug(f"transform {key}, {value} from mm to m")

            df[key] = df[key] * 0.001
            value = value.replace(" millimeters", " m")
        if "08.00 UTC" in value:
            logger.debug(f"transform {key}, {value} from UTC to UTC+1")

            # over the period 08.00 preceding day - 08.00 UTC present day
            df.index = df.index + pd.to_timedelta(8, unit="h")

            # from UT to UT+1 (standard-time in the Netherlands)
            df.index = df.index + pd.to_timedelta(1, unit="h")

            value = value.replace("08.00", "09.00").replace("UTC", "UTC+1")

        # Store new variable
        variables[key] = value

    return df, variables


[docs]def read_knmi_daily_rainfall(f, meteo_var):
    """
    Read daily rainfall data from a KNMI file.

    Parameters
    ----------
    f : file-like object
        The file object containing the KNMI data.
    meteo_var : str
        The meteorological variable to extract.

    Returns
    -------
    pandas.DataFrame
        The DataFrame containing the extracted daily rainfall data.
    dict
        A dictionary containing information about the variables in the DataFrame.

    Notes
    -----
    This function assumes that the file object `f` is already open and
    positioned at the start of the data. The file is expected to have a header
    with variable names and a corresponding data table.

    The DataFrame returned by this function has the following modifications:
    - The index is set to the datetime values derived from the 'YYYYMMDD' column.
    - The 'YYYYMMDD' column is dropped.
    - Duplicate indices are removed, keeping the first occurrence.
    - If the last row has missing values, it is removed.
    - The 'meteo_var' column is cast to float data type.
    - Variables are transformed using an internal function `_transform_variables`.
    - The unit of measurement for the 'meteo_var' variable is set to 'm'.

    """
    f, variables, header = _read_knmi_header(f, meteo_var)

    df = pd.read_csv(f, header=None, names=header, na_values="     ")
    f.close()

    df.set_index(pd.to_datetime(df.YYYYMMDD, format="%Y%m%d"), inplace=True)
    df = df.drop("YYYYMMDD", axis=1)

    if df.index.duplicated().sum() > 0:
        df = df.loc[~df.index.duplicated(keep="first")]
        logger.debug("duplicate indices removed from RD measurements")

    # sometimes the last row is messed up, check for that and remove it
    if not df.empty:
        if df.iloc[-1].isna().any():
            logger.debug("last row contains no data, remove last row")

            df = df.drop(index=df.index[-1])
            df.loc[:, meteo_var] = df[meteo_var].astype(float)

    df, variables = _transform_variables(df, variables)
    variables["unit"] = "m"

    return df, variables


def _read_station_location(f):
    stations = None

    line = f.readline()
    for _ in range(30):
        if "STN" in line:
            titels = line.strip("# ").split()
            titels = [x.replace("(", "_") for x in titels]
            titels = [x.replace(r")", "") for x in titels]

            values = f.readline().strip("# ").strip().replace(":", "")
            values = re.split(r"\s{2,}", values)

            # Create pd.DataFrame for station data
            stations = pd.DataFrame(columns=titels, data=[values])
            stations.set_index(["STN"], inplace=True)
            for col in stations.columns:
                try:
                    stations[col] = stations[col].astype(float)
                except ValueError:
                    pass

            break

        line = f.readline()

    if stations is None:
        logger.warning("could not find stations")

    return f, stations


[docs]@lru_cache()
def get_knmi_daily_meteo_api(stn, meteo_var, start=None, end=None):
    """download and read knmi daily meteo data.

    Parameters
    ----------
    stn : int
        station number.
    meteo_var : str
        e.g. 'EV24'.
    start : pd.TimeStamp or None
        start time of observations.
    end : pd.TimeStamp or None
        end time of observations.

    Returns
    -------
    pandas DataFrame
        measurements.
    variables : dictionary
        additional information about the variables
    stations : pandas DataFrame
        additional data about the measurement station
    """

    url = URL_DAILY_METEO

    params = {
        "vars": meteo_var,
        "stns": str(stn),
    }

    # modify start and end date for the api call because the dates are later
    # transformed, see example notebook 02_knmi_observations.
    if start is not None:
        start = start - dt.timedelta(days=1)
        params["start"] = start.strftime("%Y%m%d")
    if end is not None:
        end = end - dt.timedelta(days=1)
        params["end"] = end.strftime("%Y%m%d")

    result = requests.get(url, params=params)

    if result.status_code != 200:
        raise requests.ConnectionError(f"Cannot connect to {url}")

    result_str = result.text

    if result_str.startswith("<!DOCTYPE html>"):
        raise RuntimeError("KNMI API down")

    f = StringIO(result_str)
    knmi_df, variables, stations = read_knmi_daily_meteo(f, meteo_var)

    knmi_df.dropna(subset=[meteo_var], inplace=True)

    return knmi_df[[meteo_var]], variables, stations


[docs]@lru_cache()
def get_knmi_daily_meteo_url(stn, meteo_var, start=None, end=None):
    """download and read knmi daily meteo data.

    Parameters
    ----------
    stn : str
        station number.
    meteo_var : str
        e.g. 'EV24'.
    start : pd.TimeStamp or None
        start time of observations.
    end : pd.TimeStamp or None
        end time of observations.

    Returns
    -------
    pandas DataFrame
        measurements.
    variables : dictionary
        additional information about the variables
    stations : pandas DataFrame
        additional data about the measurement station
    """
    url = (
        "https://cdn.knmi.nl/knmi/map/page/klimatologie"
        f"/gegevens/daggegevens/etmgeg_{stn}.zip"
    )

    # get file name
    basedir = os.path.join(tempfile.gettempdir(), "knmi")
    if not os.path.isdir(basedir):
        os.mkdir(basedir)
    fname_zip = os.path.join(tempfile.gettempdir(), "knmi", f"etmgeg_{stn}.zip")
    fname_dir = os.path.join(tempfile.gettempdir(), "knmi", f"etmgeg_{stn}")
    fname_txt = os.path.join(fname_dir, f"etmgeg_{stn}.txt")

    # download zip file
    r = requests.get(url, stream=True)
    with open(fname_zip, "wb") as fd:
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)

    # unzip file
    util.unzip_file(fname_zip, fname_dir, force=True, preserve_datetime=True)

    return read_knmi_daily_meteo_file(fname_txt, meteo_var, start, end)


[docs]def read_knmi_daily_meteo_file(path, meteo_var, start=None, end=None):
    """read knmi daily meteo data from a file

    Parameters
    ----------
    path : str
        file path of .txt file.
    meteo_var : str
        e.g. 'EV24'.
    start : pd.TimeStamp or None
        start time of observations.
    end : pd.TimeStamp or None
        end time of observations.

    Raises
    ------
    ValueError
        If the meteo var is not in the file.

    Returns
    -------
    pandas DataFrame
        measurements.
    variables : dictionary
        additional information about the variables
    """
    variables = None
    with open(path, "r") as f:
        line = f.readline()
        # get meteo var
        for _ in range(50):
            if meteo_var in line:
                key, item = line.split("=")
                var_name = key.strip()
                variables = {var_name: item.strip()}
                if var_name == meteo_var:
                    break
            line = f.readline()

        if variables is None:
            raise ValueError(f"could not find {meteo_var} in file {path}")

        # get dataframe
        for _ in range(50):
            if "STN" in line:
                columns = line.strip("# ").strip("\n").split(",")
                columns = [x.strip(" ") for x in columns]
                values = f.readline()
                if values == "\n":
                    values = f.readline()
                df = pd.read_csv(f, header=None, names=columns, na_values="     ")
                df.set_index(pd.to_datetime(df.YYYYMMDD, format="%Y%m%d"), inplace=True)
                df = df.drop("YYYYMMDD", axis=1)

                df = df.loc[df.index.notnull(), :]

                # add a full day for meteorological data, so that the
                # timestamp is at the end of the period in the data
                df.index = df.index + pd.to_timedelta(1, unit="d")

                # from UT to UT+1 (standard-time in the Netherlands)
                df.index = df.index + pd.to_timedelta(1, unit="h")

                # add station to variables
                unique_stn = df["STN"].unique()
                if len(unique_stn) > 1:
                    raise ValueError(f"multiple stations in single file {unique_stn}")
                else:
                    station = df["STN"].iloc[0]

                df = df.loc[start:end, [meteo_var]]
                df = df.dropna()
                df, variables = _transform_variables(df, variables)
                variables["unit"] = ""
                variables["station"] = station
                break

            line = f.readline()

    return df, variables


[docs]def read_knmi_daily_meteo(f, meteo_var):
    f, stations = _read_station_location(f)
    f, variables, header = _read_knmi_header(f, meteo_var)
    header[0] = header[0].lstrip("# ")
    df = pd.read_csv(f, header=None, names=header, na_values="     ")
    f.close()

    df.set_index(pd.to_datetime(df.YYYYMMDD, format="%Y%m%d"), inplace=True)
    df = df.drop("YYYYMMDD", axis=1)

    df = df.loc[df.index.notnull(), :]

    # add a full day for meteorological data, so that the
    # timestamp is at the end of the period in the data
    df.index = df.index + pd.to_timedelta(1, unit="d")

    # from UT to UT+1 (standard-time in the Netherlands)
    df.index = df.index + pd.to_timedelta(1, unit="h")

    df, variables = _transform_variables(df, variables)
    variables["unit"] = ""

    return df, variables, stations


[docs]@lru_cache()
def get_knmi_hourly_api(stn, meteo_var, start, end):
    """Retrieve hourly meteorological data from the KNMI API.

    Parameters
    ----------
    stn : int
        The station number.
    meteo_var : str
        The meteorological variable to retrieve.
    start : pd.Timestamp or None
        The start date and time of the data.
    end : pd.Timestamp or None
        The end date and time of the data.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the requested meteorological variable data.
    dict
        A dictionary containing information about the variables in the DataFrame.

    Raises
    ------
    requests.ConnectionError
        If there is a connection error while accessing the KNMI API.
    RuntimeError
        If the KNMI API is down or returns unexpected data.
    """
    url = URL_HOURLY_METEO

    params = {
        "vars": meteo_var,
        "stns": str(stn),
    }

    # It looks like the API has an option to select the start and end hour that
    # you want but in reality it does not have this option.
    if start is None:
        raise ValueError("A start date is required when using hourly interval")
    if end is None:
        raise ValueError("An end date is required when using hourly interval")

    if (end - start).days > 4150:
        raise ValueError("time span for hourly data cannot be greater than 10 years")
    if (end - start).days < 1:
        raise ValueError("time span should be more than 1 day")

    params["end"] = end.strftime("%Y%m%d") + "24"

    s = start - pd.Timedelta(1, unit="H")
    params["start"] = s.strftime("%Y%m%d") + "01"

    result = requests.get(url, params=params)

    if result.status_code != 200:
        raise requests.ConnectionError(f"Cannot connect to {url}")

    result_str = result.text

    if result_str.startswith("<!DOCTYPE html>"):
        raise RuntimeError("KNMI API down")

    f = StringIO(result_str)

    return read_knmi_hourly(f, meteo_var, start, end)


[docs]def read_knmi_hourly(f, meteo_var, start=None, end=None):
    """Read hourly KNMI file.

    Parameters
    ----------
    f : str or filelike object
        path to file or filelike object

    Returns
    -------
    df : pd.DataFrame
        DataFrame containing data
    variables : dict
        dictionary containing metadata about the variables
    """

    ispath = False
    if isinstance(f, str):
        f = open(f, "r")
        ispath = True

    f, variables, header = _read_knmi_header(f, meteo_var)
    df = pd.read_csv(f, header=None, names=header, na_values="     ")

    if ispath:
        f.close()

    # convert 24 to 0
    if "H" in df.columns:
        hour_col = "H"
    elif "HH" in df.columns:
        hour_col = "HH"
    else:
        raise ValueError("could not find column with hour")

    df.loc[df[hour_col] == 24, hour_col] = 0
    datetime = pd.to_datetime(
        df.YYYYMMDD.astype(str) + df[hour_col].astype(str).str.zfill(2),
        format="%Y%m%d%H",
    )
    datetime.loc[datetime.dt.hour == 0] = datetime + dt.timedelta(days=1)

    # add station to variables
    unique_stn = df["STN"].unique()
    if len(unique_stn) > 1:
        raise ValueError(f"multiple stations in single file {unique_stn}")
    else:
        station = df["STN"].iloc[0]

    # set index
    df.set_index(datetime, inplace=True)

    df = df[[meteo_var]]
    variables = variables

    df, variables = _transform_variables(df, variables)
    variables["unit"] = "m"
    variables["station"] = station

    return df.loc[start:end, [meteo_var]], variables


def _check_latest_measurement_date_de_bilt(
    meteo_var, use_api=True, start=None, end=None
):
    """According to the website of the knmi it can take up to 3 weeks before
    precipitation data is updated. If you use the fill_missing_measurements
    method to fill a time series untill today, it will keep looking at all
    stations to find the data of these last days that probably does not exist.

    To prevent this, this function will find the last day there are measure-
    ments at knmi station de Bilt. It is assumed that if de Bilt doesn't have
    recent measurements, no station will have measurements for these dates.

    website knmi: https://www.knmi.nl/nederland-nu/klimatologie/monv/reeksen

    Parameters
    ----------
    meteo_var : str
        meteo variable.
    use_api : bool, optional
        if True the api is used to obtain the data, API documentation is here:
            https://www.knmi.nl/kennis-en-datacentrum/achtergrond/data-ophalen-vanuit-een-script
        Default is True.
    start : pd.TimeStamp or None, optional
        start date of observations. Set to 365 days before today when None. The default
        is None.
    end : pd.TimeStamp or None, optional
        end date of observations. Set to 10 days after today when None. The default is
        None.

    Returns
    -------
    last_measurement_date_debilt : pd.TimeStamp
        last date with measurements at station de Bilt
    """
    if start is None:
        start = dt.datetime.now() - pd.Timedelta(365, unit="D")
    if end is None:
        end = dt.datetime.now() + pd.Timedelta(10, unit="D")
    if meteo_var == "RD":
        if use_api:
            try:
                knmi_df, _ = get_knmi_daily_rainfall_api(550, start, end)
            except (RuntimeError, requests.ConnectionError):
                logger.warning("KNMI API failed, switching to non-API method")
                knmi_df, _ = get_knmi_daily_rainfall_url(550, "DE-BILT", start, end)
        else:
            knmi_df, _ = get_knmi_daily_rainfall_url(550, "DE-BILT", start, end)
    else:
        if use_api:
            try:
                knmi_df, _, _ = get_knmi_daily_meteo_api(
                    260, meteo_var, start=start, end=end
                )
            except (RuntimeError, requests.ConnectionError):
                logger.warning("KNMI API failed, switching to non-API method")
                knmi_df, _, _ = get_knmi_daily_meteo_url(260, meteo_var, start, end)
        else:
            knmi_df, _, _ = get_knmi_daily_meteo_url(260, meteo_var, start, end)

    knmi_df = knmi_df.dropna()
    end_str = end.strftime("%Y-%m-%d")
    if knmi_df.empty:
        start_str = start.strftime("%Y-%m-%d")
        raise ValueError(
            "knmi station de Bilt has no measurements between "
            f"{start_str} and {end_str} for variable {meteo_var}."
        )

    last_measurement_date_debilt = knmi_df.index[-1]

    logger.debug(
        f"last {meteo_var} measurement available at the Bilt until {end_str} is from"
        f' {last_measurement_date_debilt.strftime("%Y-%m-%d")}'
    )
    logger.debug(
        f"assuming no {meteo_var} measurements are available at "
        "other stations after this date"
    )

    return last_measurement_date_debilt


[docs]def get_nearest_station_df(
    locations, xcol="x", ycol="y", stations=None, meteo_var="RH", ignore=None
):
    """Find the KNMI stations that measure 'meteo_var' closest to the
    coordinates in 'locations'.

    Parameters
    ----------
    locations : pandas.DataFrame
        DataFrame containing x and y coordinates
    xcol : str
        name of the column in the locations dataframe with the x values
    ycol : str
        name of the column in the locations dataframe with the y values
    stations : pandas DataFrame, optional
        if None stations will be obtained using the get_stations function.
        The default is None.
    meteo_var : str
        measurement variable e.g. 'RH' or 'EV24'
    ignore : list, optional
        list of stations to ignore. The default is None.

    Returns
    -------
    stns : list
        station numbers.
    """
    if stations is None:
        stations = get_stations(meteo_var=meteo_var)
    if ignore is not None:
        stations.drop(ignore, inplace=True)
        if stations.empty:
            return None

    xo = pd.to_numeric(locations[xcol])
    xt = pd.to_numeric(stations.x)
    yo = pd.to_numeric(locations[ycol])
    yt = pd.to_numeric(stations.y)

    xh, xi = np.meshgrid(xt, xo)
    yh, yi = np.meshgrid(yt, yo)

    distances = pd.DataFrame(
        np.sqrt((xh - xi) ** 2 + (yh - yi) ** 2),
        index=locations.index,
        columns=stations.index,
    )

    stns = distances.idxmin(axis=1).to_list()

    return stns


[docs]def get_nearest_station_xy(xy, stations=None, meteo_var="RH", ignore=None):
    """find the KNMI stations that measure 'meteo_var' closest to the given
    x and y coordinates.

    Parameters
    ----------
    xy : list or numpy array, optional
        xy coordinates of the locations. e.g. [[10,25], [5,25]]
    stations : pandas DataFrame, optional
        if None stations will be obtained using the get_stations function.
        The default is None.
    meteo_var : str
        measurement variable e.g. 'RH' or 'EV24'
    ignore : list, optional
        list of stations to ignore. The default is None.

    Returns
    -------
    stns : list
        station numbers.

    Notes
    -----
    assumes you have a structured rectangular grid.
    """

    locations = pd.DataFrame(data=xy, columns=["x", "y"])

    stns = get_nearest_station_df(
        locations,
        xcol="x",
        ycol="y",
        stations=stations,
        meteo_var=meteo_var,
        ignore=ignore,
    )

    return stns


[docs]def get_n_nearest_stations_xy(xy, meteo_var, n=1, stations=None, ignore=None):
    """Find the N nearest KNMI stations that measure variable 'meteo_var' to
    the x, y coordinates.

    Parameters
    ----------
    xy : list, tuple or numpy.array of int or float
        sinlge pair of xy coordinates. e.g. (150_000., 400_000.)
    meteo_var : str
        measurement variable e.g. 'RH' or 'EV24'
    n : int, optional
        number of stations you want to return. The default is 1.
    stations : pandas DataFrame, optional
        if None stations will be obtained using the get_stations function.
        The default is None.
    ignore : list, optional
        list of stations to ignore. The default is None.

    Returns
    -------
    list
        station numbers.
    """

    if stations is None:
        stations = get_stations(meteo_var=meteo_var)
    if ignore is not None:
        stations.drop(ignore, inplace=True)
        if stations.empty:
            return None

    distance = np.sqrt((stations.x - xy[0]) ** 2 + (stations.y - xy[1]) ** 2)
    stns = distance.nsmallest(n).index.to_list()

    return stns


def _add_missing_indices(knmi_df, stn, start, end):
    """When downloading KNMI data you don't always get a DataFrame with the
    periods that you provided in your request. Thus the index does not cover
    the complete period that you are interested in. This function adds the
    missing period to the index of the DataFrame.

    Parameters
    ----------
    knmi_df : pandas DataFrame
        data from one station from one type of observation, with additional
        column to see which station is used to fill the value
    stn : int or str
        measurement station.
    start : pd.TimeStamp
        start time of observations.
    end : pd.TimeStamp
        end time of observations.

    Returns
    -------
    knmi_df : pandas DataFrame
        data from one station from one type of observation
    """
    # check if given dates are more or less similar than measurement dates
    if (knmi_df.index[0] - start).days < 2:
        new_start = knmi_df.index[0]
    else:
        new_start = pd.Timestamp(
            year=start.year,
            month=start.month,
            day=start.day,
            hour=knmi_df.index[0].hour,
            minute=knmi_df.index[0].minute,
            second=knmi_df.index[0].second,
        )
        logger.info(f"station {stn} has no measurements before {knmi_df.index[0]}")

    if (end - knmi_df.index[-1]).days < 2:
        new_end = knmi_df.index[-1]
    else:
        new_end = pd.Timestamp(
            year=end.year,
            month=end.month,
            day=end.day,
            hour=knmi_df.index[-1].hour,
            minute=knmi_df.index[-1].minute,
            second=knmi_df.index[-1].second,
        )
        logger.info(f"station {stn} has no measurements after {knmi_df.index[-1]}")

    # add missing indices
    new_index = pd.date_range(new_start, new_end, freq="D")
    knmi_df = knmi_df.reindex(new_index)

    return knmi_df


[docs]def get_knmi_obslist(
    locations=None,
    stns=None,
    xy=None,
    meteo_vars=("RH",),
    starts=None,
    ends=None,
    ObsClasses=None,
    **kwargs,
):
    """Get a list of observations of knmi stations. Either specify a list of
    knmi stations (stns) or a dataframe with x, y coordinates (locations).

    Parameters
    ----------
    locations : pandas DataFrame or None
        dataframe with x and y coordinates. The default is None
    stns : list of str or None
        list of knmi stations. The default is None
    xy : list or numpy array, optional
        xy coordinates of the locations. e.g. [[10,25], [5,25]]
    meteo_vars : list or tuple of str
        meteo variables e.g. ["RH", "EV24"]. The default is ("RH")
    starts : None, str, datetime or list, optional
        start date of observations per meteo variable. The start date is
        included in the time series.
        If start is None the start date will be January 1st of the
        previous year.
        if start is str it will be converted to datetime
        if start is a list it should be the same length as meteo_vars and
        the start time for each variable. The default is None
    ends : list of str, datetime or None
        end date of observations per meteo variable. The end date is
        included in the time series.
        If end is None the start date will be January 1st of the
        previous year.
        if end is a str it will be converted to datetime
        if end is a list it should be the same length as meteo_vars and
        the end time for each meteo variable. The default is None
    ObsClasses : list of type or None
        class of the observations, can be PrecipitationObs or
        EvaporationObs. The default is None.
    **kwargs:
        fill_missing_obs : bool, optional
            if True nan values in time series are filled with nearby time series.
            The default is False.
        interval : str, optional
            desired time interval for observations. Options are 'daily' and
            'hourly'. The default is 'daily'.
        use_api : bool, optional
            if True the api is used to obtain the data, API documentation is here:
                https://www.knmi.nl/kennis-en-datacentrum/achtergrond/data-ophalen-vanuit-een-script
            if False a text file is downloaded into a temporary folder and the
            data is read from there. Default is True since the api is back
            online (July 2021).
        raise_exceptions : bool, optional
            if True you get errors when no data is returned. The default is False.

    Returns
    -------
    obs_list : list of obsevation objects
        collection of multiple point observations
    """

    settings = _get_default_settings(kwargs)

    if isinstance(meteo_vars, str):
        meteo_vars = [meteo_vars]

    if starts is None:
        starts = [None] * len(meteo_vars)
    elif isinstance(starts, (str, dt.datetime)):
        starts = [starts] * len(meteo_vars)
    elif isinstance(starts, list):
        pass
    else:
        raise TypeError("must be None, str, dt.datetime or list")

    if ends is None:
        ends = [None] * len(meteo_vars)
    elif isinstance(ends, (str, dt.datetime)):
        ends = [ends] * len(meteo_vars)
    elif isinstance(ends, list):
        pass
    else:
        raise TypeError("must be None, str, dt.datetime or list")

    if not isinstance(xy, (tuple, list, np.ndarray, type(None))):
        raise TypeError("must be tuple, list or numpy.ndarray")

    obs_list = []
    for meteo_var, start, end, ObsClass in zip(meteo_vars, starts, ends, ObsClasses):
        start, end = _start_end_to_datetime(start, end)

        # get stations
        if stns is None:
            stations = get_stations(meteo_var=meteo_var)
            if (locations is None) and (xy is not None):
                _stns = get_nearest_station_xy(
                    xy, stations=stations, meteo_var=meteo_var
                )
            elif locations is not None:
                _stns = get_nearest_station_df(
                    locations, stations=stations, meteo_var=meteo_var
                )
            else:
                raise ValueError(
                    "stns, location and xy are all None. "
                    "Please specify one of these arguments."
                )
            _stns = np.unique(_stns)

            if locations is not None:
                xy = locations[["x", "y"]].values
        else:
            _stns = stns

        for stn in _stns:
            o = ObsClass.from_knmi(
                meteo_var=meteo_var,
                stn=stn,
                start=start,
                end=end,
                **settings,
            )

            obs_list.append(o)

    return obs_list


[docs]def get_evaporation(meteo_var, stn=260, start=None, end=None, settings=None):
    """Collect different types of (reference) evaporation
    from KNMI weather stations

    Parameters
    ----------
    meteo_var : str
        Choice between 'penman', 'makkink' or 'hargraves'.
    stn : str
        station number, defaults to 260 De Bilt
    start : pd.TimeStamp
        start time of observations.
    end : pd.TimeStamp
        end time of observations.
    settings : dict or None, optional
        settings for the time series

    Returns
    ------
    pandas.DataFrame

    """
    if meteo_var == "hargreaves":
        if not settings["use_api"]:
            raise NotImplementedError(
                "cannot use hargreaves without the api because hargreaves needs "
                "the lattitude"
            )
        d = {}
        mvs = ["TG", "TN", "TX"]
        for mv in mvs:
            ts, meta = get_knmi_timeseries_stn(
                stn, meteo_var=mv, start=start, end=end, settings=settings
            )
            d[mv] = ts.squeeze()
        et = hargreaves(
            d["TG"],
            d["TN"],
            d["TX"],
            d["TG"].index,
            meta["LAT_north"],
        ).to_frame(name=meteo_var)
    elif meteo_var == "makkink":
        d = {}
        mvs = ["TG", "Q"]
        for mv in mvs:
            ts, meta = get_knmi_timeseries_stn(
                stn, meteo_var=mv, start=start, end=end, settings=settings
            )
            d[mv] = ts.squeeze()
        et = makkink(d["TG"], d["Q"]).to_frame(name=meteo_var)
    elif meteo_var == "penman":
        if not settings["use_api"]:
            raise NotImplementedError(
                "cannot use penman without the api because penman needs the "
                "lattitude and height of the meteo station"
            )
        d = {}
        mvs = ["TG", "TN", "TX", "Q", "FG", "UG"]
        for mv in mvs:
            ts, meta = get_knmi_timeseries_stn(
                stn, meteo_var=mv, start=start, end=end, settings=settings
            )
            d[mv] = ts.squeeze()
        et = penman(
            d["TG"],
            d["TN"],
            d["TX"],
            d["Q"],
            d["FG"],
            d["UG"],
            d["TG"].index,
            meta["LAT_north"],
            meta["ALT_m"],
        ).to_frame(name=meteo_var)
    else:
        raise ValueError(
            "Provide valid argument for meteo_var -> 'hargreaves', 'makkink' or "
            "'penman'"
        )

    stn_name = get_station_name(meta["station"], stations=get_stations(meteo_var))
    meta["name"] = f"{meteo_var}_{stn_name}"
    meta["unit"] = "m"

    return et, meta


[docs]def makkink(tmean, K):
    """Estimate of Makkink reference evaporation
    according to KNMI.

    Parameters
    ----------
    tmean : tmean : pandas.Series
        Daily mean temperature in Celsius
    K : pandas.Series
        Global radiation estimate in J/cm2

    Returns
    -------
    pandas.Series

    """

    a = 0.646 + 0.0006 * tmean
    b = 1 + tmean / 237.3
    c = 7.5 * np.log(10) * 6.107 * 10 ** (7.5 * (1 - 1 / b))
    et = 0.0065 * (1 - a / (c / (237.3 * b * b) + a)) / (2501 - 2.38 * tmean) * K
    return et


[docs]def penman(
    tmean, tmin, tmax, K, wind, rh, dates, z=1.0, lat=52.1, G=0.0, wh=10.0, tdew=None
):
    """Estimate of Penman reference evaporation
    according to Allen et al 1990.

    Parameters
    ----------
    tmean : pandas.Series
        Daily mean temperature in Celsius
    tmin : pandas.Series
        Daily minimum temperature in Celsius
    tmax : pandas.Series
        Daily maximum temperature in Celsius
    K : pandas.Series
        Global radiation estimate in J/cm2
    wind : pandas.Series
        Daily mean wind speed in m/s
    rh : pandas.Series
        Relative humidity in %
    dates : pandas.Series
        Dates
    z : float, optional
        Elevation of station in m, by default 1.0
    lat : float, optional
        Latitude of station, by default 52.1
    G : float, optional
        Ground flux in MJ/m2, by default 0.0
    wh : float, optional
        Height of wind measurement in m, by default 10.0
    tdew : pandas.Series
        Dew point temperature in C, by default None

    Returns
    -------
    pandas.Series

    """
    K = K * 1e4 * 0.000012  # J/cm2 to W/m2/d
    P = 101.3 * ((293 - 0.0065 * z) / 293) ** 5.26  # kPa
    gamma = 0.665e-3 * P  # kPa/C
    tg = (tmax - tmin) / 2  # C
    s = 4098 * (0.6108 * np.exp(17.27 * tg / (tg + 237.3)) / (tg + 237.3) ** 2)  # kPa/C
    es0 = 0.6108 * np.exp(17.27 * tmean / (tmean + 237.3))  # kPa
    esmax = 0.6108 * np.exp(17.27 * tmax / (tmax + 237.3))  # kPa
    esmin = 0.6108 * np.exp(17.27 * tmin / (tmin + 237.3))  # kPa
    es = (esmax + esmin) / 2  # kpa
    if tdew:
        ea = 0.6108 * np.exp(17.27 * tdew / (tdew + 237.3))  # kPa
    else:
        ea = es0 * rh / 100  # kPa
    u2 = wind * 4.87 / np.log(67.8 * wh - 5.42)  # m/s
    J = pd.Series([int(date.strftime("%j")) for date in dates], index=dates)
    dr = 1 + 0.033 * np.cos(2 * np.pi * J / 365)  # -
    delt = 0.409 * np.sin(2 * np.pi * J / 365 - 1.39)  # rad
    phi = lat * np.pi / 180  # rad
    ws = np.arccos(-np.tan(phi) * np.tan(delt))  # rad
    Kext = (
        1366
        * dr
        / np.pi
        * (ws * np.sin(phi) * np.sin(delt) + np.cos(phi) * np.cos(delt) * np.sin(ws))
    )  # W/m2/d
    K0 = Kext * (0.75 + 2e-5 * z)  # W/m2/d
    Lout_in = (
        5.67e-8
        * (((tmax + 273) ** 4 + (tmin + 273) ** 4) / 2)
        * (0.34 - 0.14 * np.sqrt(ea))
        * (1.35 * K / K0 - 0.35)
    )  # W/m2/d
    Q = (1 - 0.23) * K + Lout_in  # W/m2/d
    Q = Q * 0.0864  # W/m2/d to MJ/m2
    straling = 0.408 * s * (Q - G)
    windterm = gamma * 900 / (tmean + 273) * u2 * (es - ea)
    denom = s + gamma * (1 + 0.34 * u2)
    et = (straling + windterm) / denom * 1e-3
    et[et < 0] = 0
    return et


[docs]def hargreaves(tmean, tmin, tmax, dates, lat=52.1, x=None):
    """Estimate of Hargraves potential evaporation
    according to Allen et al. 1990.

    Parameters
    ----------
    tmean : pandas.Series
        Daily mean temperature
    tmin : pandas.Series
        Daily minimum temperature
    tmax : pandas.Series
        Daily maximum temperature
    dates : pandas.Series.index
        Dates
    lat : float, optional
        Latitude of station, by default 52.1
    x : _type_, optional
        Optional parameter to scale evaporation estimate, by default None

    Returns
    -------
    pandas.Series

    """
    J = pd.Series([int(date.strftime("%j")) for date in dates], index=dates)
    dr = 1 + 0.033 * np.cos(2 * np.pi * J / 365)
    delt = 0.409 * np.sin(2 * np.pi * J / 365 - 1.39)
    phi = lat * np.pi / 180
    ws = np.arccos(-np.tan(phi) * np.tan(delt))
    Kext = (
        12
        * 60
        * 0.0820
        * dr
        / np.pi
        * (ws * np.sin(phi) * np.sin(delt) + np.cos(phi) * np.cos(delt) * np.sin(ws))
    )  # MJ/m2/d
    Kext = 0.408 * Kext  # MJ/m2/d to mm/d
    et = 0.0023 * (tmean + 273 + 17.8) / np.sqrt(tmax - tmin) * Kext * 1e-3
    if x:
        et = x[0] + x[1] * et
    return et