Skip to content

API reference

eo_tides.model

Functions:

Name Description
list_models

List all tide models available for tide modelling, and

model_tides

Model tide heights at multiple coordinates and/or timesteps

list_models

list_models(
    directory=None,
    show_available=True,
    show_supported=True,
    raise_error=False,
)

List all tide models available for tide modelling, and all models supported by eo-tides and pyTMD.

This function scans the specified tide model directory and returns a list of models that are available in the directory as well as the full list of all supported models.

For instructions on setting up tide models, see: https://geoscienceaustralia.github.io/eo-tides/setup/

Parameters:

Name Type Description Default

directory

str

The directory containing tide model data files. If no path is provided, this will default to the environment variable EO_TIDES_TIDE_MODELS if set, or raise an error if not. Tide modelling files should be stored in sub-folders for each model that match the structure required by pyTMD (https://geoscienceaustralia.github.io/eo-tides/setup/).

None

show_available

bool

Whether to print a list of locally available models.

True

show_supported

bool

Whether to print a list of all supported models, in addition to models available locally.

True

raise_error

bool

If True, raise an error if no available models are found. If False, raise a warning.

False

Returns:

Name Type Description
available_models list of str

A list of all tide models available within directory.

supported_models list of str

A list of all tide models supported by eo-tides.

Source code in eo_tides/model.py
def list_models(
    directory: str | os.PathLike | None = None,
    show_available: bool = True,
    show_supported: bool = True,
    raise_error: bool = False,
) -> tuple[list[str], list[str]]:
    """
    List all tide models available for tide modelling, and
    all models supported by `eo-tides` and `pyTMD`.

    This function scans the specified tide model directory
    and returns a list of models that are available in the
    directory as well as the full list of all supported models.

    For instructions on setting up tide models, see:
    <https://geoscienceaustralia.github.io/eo-tides/setup/>

    Parameters
    ----------
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    show_available : bool, optional
        Whether to print a list of locally available models.
    show_supported : bool, optional
        Whether to print a list of all supported models, in
        addition to models available locally.
    raise_error : bool, optional
        If True, raise an error if no available models are found.
        If False, raise a warning.

    Returns
    -------
    available_models : list of str
        A list of all tide models available within `directory`.
    supported_models : list of str
        A list of all tide models supported by `eo-tides`.
    """
    init()  # Initialize colorama

    # Set tide modelling files directory. If no custom path is
    # provided, try global environment variable.
    directory = _set_directory(directory)

    # Get full list of supported models from pyTMD database
    model_database = load_database()["elevation"]
    supported_models = list(model_database.keys())

    # Extract expected model paths
    expected_paths = {}
    for m in supported_models:
        model_file = model_database[m]["model_file"]
        model_file = model_file[0] if isinstance(model_file, list) else model_file
        expected_paths[m] = str(directory / pathlib.Path(model_file).expanduser().parent)

    # Define column widths
    status_width = 4  # Width for emoji
    name_width = max(len(name) for name in supported_models)
    path_width = max(len(path) for path in expected_paths.values())

    # Print list of supported models, marking available and
    # unavailable models and appending available to list
    if show_available or show_supported:
        total_width = min(status_width + name_width + path_width + 6, 80)
        print("─" * total_width)
        print(f"{'󠀠🌊':^{status_width}} | {'Model':<{name_width}} | {'Expected path':<{path_width}}")
        print("─" * total_width)

    available_models = []
    for m in supported_models:
        try:
            model_file = model(directory=directory).elevation(m=m)
            available_models.append(m)

            if show_available:
                # Mark available models with a green tick
                status = "✅"
                print(f"{status:^{status_width}}{m:<{name_width}}{expected_paths[m]:<{path_width}}")
        except FileNotFoundError:
            if show_supported:
                # Mark unavailable models with a red cross
                status = "❌"
                print(
                    f"{status:^{status_width}}{Style.DIM}{m:<{name_width}}{expected_paths[m]:<{path_width}}{Style.RESET_ALL}"
                )

    if show_available or show_supported:
        print("─" * total_width)

        # Print summary
        print(f"\n{Style.BRIGHT}Summary:{Style.RESET_ALL}")
        print(f"Available models: {len(available_models)}/{len(supported_models)}")

    # Raise error or warning if no models are available
    if not available_models:
        warning_msg = textwrap.dedent(
            f"""
            No valid tide models are available in `{directory}`.
            Are you sure you have provided the correct `directory` path, or set the
            `EO_TIDES_TIDE_MODELS` environment variable to point to the location of your
            tide model directory?
            """
        ).strip()

        if raise_error:
            raise Exception(warning_msg)
        else:
            warnings.warn(warning_msg, UserWarning)

    # Return list of available and supported models
    return available_models, supported_models

model_tides

model_tides(
    x,
    y,
    time,
    model="EOT20",
    directory=None,
    crs="EPSG:4326",
    crop=True,
    method="spline",
    extrapolate=True,
    cutoff=None,
    mode="one-to-many",
    parallel=True,
    parallel_splits=5,
    output_units="m",
    output_format="long",
    ensemble_models=None,
    **ensemble_kwargs
)

Model tide heights at multiple coordinates and/or timesteps using using one or more ocean tide models.

This function is parallelised to improve performance, and supports all tidal models supported by pyTMD, including:

  • Empirical Ocean Tide model (EOT20)
  • Finite Element Solution tide models (FES2022, FES2014, FES2012)
  • TOPEX/POSEIDON global tide models (TPXO10, TPXO9, TPXO8)
  • Global Ocean Tide models (GOT5.6, GOT5.5, GOT4.10, GOT4.8, GOT4.7)
  • Hamburg direct data Assimilation Methods for Tides models (HAMTIDE11)

This function requires access to tide model data files. These should be placed in a folder with subfolders matching the structure required by pyTMD. For more details: https://geoscienceaustralia.github.io/eo-tides/setup/ https://pytmd.readthedocs.io/en/latest/getting_started/Getting-Started.html#directories

This function is a modification of the pyTMD package's pyTMD.compute.tide_elevations function. For more info: https://pytmd.readthedocs.io/en/latest/api_reference/compute.html#pyTMD.compute.tide_elevations

Parameters:

Name Type Description Default

x

float or list of float

One or more x and y coordinates used to define the location at which to model tides. By default these coordinates should be lat/lon; use "crs" if they are in a custom coordinate reference system.

required

y

float or list of float

One or more x and y coordinates used to define the location at which to model tides. By default these coordinates should be lat/lon; use "crs" if they are in a custom coordinate reference system.

required

time

Numpy datetime array or pandas.DatetimeIndex

An array containing datetime64[ns] values or a pandas.DatetimeIndex providing the times at which to model tides in UTC time.

required

model

str or list of str

The tide model (or models) to use to model tides. Defaults to "EOT20"; for a full list of available/supported models, run eo_tides.model.list_models.

'EOT20'

directory

str

The directory containing tide model data files. If no path is provided, this will default to the environment variable EO_TIDES_TIDE_MODELS if set, or raise an error if not. Tide modelling files should be stored in sub-folders for each model that match the structure required by pyTMD (https://geoscienceaustralia.github.io/eo-tides/setup/).

None

crs

str

Input coordinate reference system for x and y coordinates. Defaults to "EPSG:4326" (WGS84; degrees latitude, longitude).

'EPSG:4326'

crop

bool

Whether to crop tide model constituent files on-the-fly to improve performance. Cropping will be performed based on a 1 degree buffer around all input points. Defaults to True.

True

method

str

Method used to interpolate tidal constituents from model files. Options include:

  • "spline": scipy bivariate spline interpolation (default)
  • "bilinear": quick bilinear interpolation
  • "linear", "nearest": scipy regular grid interpolations
'spline'

extrapolate

bool

Whether to extrapolate tides for x and y coordinates outside of the valid tide modelling domain using nearest-neighbor.

True

cutoff

float

Extrapolation cutoff in kilometers. The default is None, which will extrapolate for all points regardless of distance from the valid tide modelling domain.

None

mode

str

The analysis mode to use for tide modelling. Supports two options:

  • "one-to-many": Models tides for every timestep in "time" at every input x and y coordinate point. This is useful if you want to model tides for a specific list of timesteps across multiple spatial points (e.g. for the same set of satellite acquisition times at various locations across your study area).
  • "one-to-one": Model tides using a unique timestep for each set of x and y coordinates. In this mode, the number of x and y points must equal the number of timesteps provided in "time".
'one-to-many'

parallel

bool

Whether to parallelise tide modelling using concurrent.futures. If multiple tide models are requested, these will be run in parallel. Optionally, tide modelling can also be run in parallel across input x and y coordinates (see "parallel_splits" below). Default is True.

True

parallel_splits

int

Whether to split the input x and y coordinates into smaller, evenly-sized chunks that are processed in parallel. This can provide a large performance boost when processing large numbers of coordinates. The default is 5 chunks, which will split coordinates into 5 parallelised chunks.

5

output_units

str

Whether to return modelled tides in floating point metre units, or integer centimetre units (i.e. scaled by 100) or integer millimetre units (i.e. scaled by 1000. Returning outputs in integer units can be useful for reducing memory usage. Defaults to "m" for metres; set to "cm" for centimetres or "mm" for millimetres.

'm'

output_format

str

Whether to return the output dataframe in long format (with results stacked vertically along "tide_model" and "tide_height" columns), or wide format (with a column for each tide model). Defaults to "long".

'long'

ensemble_models

list of str

An optional list of models used to generate the ensemble tide model if "ensemble" tide modelling is requested. Defaults to ["FES2014", "TPXO9-atlas-v5", "EOT20", "HAMTIDE11", "GOT4.10", "FES2012", "TPXO8-atlas-v1"].

None

**ensemble_kwargs

Keyword arguments used to customise the generation of optional ensemble tide models if "ensemble" modelling are requested. These are passed to the underlying _ensemble_model function. Useful parameters include ranking_points (path to model rankings data), k (for controlling how model rankings are interpolated), and ensemble_top_n (how many top models to use in the ensemble calculation).

{}

Returns:

Type Description
DataFrame

A dataframe containing modelled tide heights.

Source code in eo_tides/model.py
def model_tides(
    x: float | list[float] | xr.DataArray,
    y: float | list[float] | xr.DataArray,
    time: np.ndarray | pd.DatetimeIndex,
    model: str | list[str] = "EOT20",
    directory: str | os.PathLike | None = None,
    crs: str = "EPSG:4326",
    crop: bool = True,
    method: str = "spline",
    extrapolate: bool = True,
    cutoff: float | None = None,
    mode: str = "one-to-many",
    parallel: bool = True,
    parallel_splits: int = 5,
    output_units: str = "m",
    output_format: str = "long",
    ensemble_models: list[str] | None = None,
    **ensemble_kwargs,
) -> pd.DataFrame:
    """
    Model tide heights at multiple coordinates and/or timesteps
    using using one or more ocean tide models.

    This function is parallelised to improve performance, and
    supports all tidal models supported by `pyTMD`, including:

    - Empirical Ocean Tide model (EOT20)
    - Finite Element Solution tide models (FES2022, FES2014, FES2012)
    - TOPEX/POSEIDON global tide models (TPXO10, TPXO9, TPXO8)
    - Global Ocean Tide models (GOT5.6, GOT5.5, GOT4.10, GOT4.8, GOT4.7)
    - Hamburg direct data Assimilation Methods for Tides models (HAMTIDE11)

    This function requires access to tide model data files.
    These should be placed in a folder with subfolders matching
    the structure required by `pyTMD`. For more details:
    <https://geoscienceaustralia.github.io/eo-tides/setup/>
    <https://pytmd.readthedocs.io/en/latest/getting_started/Getting-Started.html#directories>

    This function is a modification of the `pyTMD` package's
    `pyTMD.compute.tide_elevations` function. For more info:
    <https://pytmd.readthedocs.io/en/latest/api_reference/compute.html#pyTMD.compute.tide_elevations>

    Parameters
    ----------
    x, y : float or list of float
        One or more x and y coordinates used to define
        the location at which to model tides. By default these
        coordinates should be lat/lon; use "crs" if they
        are in a custom coordinate reference system.
    time : Numpy datetime array or pandas.DatetimeIndex
        An array containing `datetime64[ns]` values or a
        `pandas.DatetimeIndex` providing the times at which to
        model tides in UTC time.
    model : str or list of str, optional
        The tide model (or models) to use to model tides.
        Defaults to "EOT20"; for a full list of available/supported
        models, run `eo_tides.model.list_models`.
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    crs : str, optional
        Input coordinate reference system for x and y coordinates.
        Defaults to "EPSG:4326" (WGS84; degrees latitude, longitude).
    crop : bool, optional
        Whether to crop tide model constituent files on-the-fly to
        improve performance. Cropping will be performed based on a
        1 degree buffer around all input points. Defaults to True.
    method : str, optional
        Method used to interpolate tidal constituents
        from model files. Options include:

        - "spline": scipy bivariate spline interpolation (default)
        - "bilinear": quick bilinear interpolation
        - "linear", "nearest": scipy regular grid interpolations
    extrapolate : bool, optional
        Whether to extrapolate tides for x and y coordinates outside of
        the valid tide modelling domain using nearest-neighbor.
    cutoff : float, optional
        Extrapolation cutoff in kilometers. The default is None, which
        will extrapolate for all points regardless of distance from the
        valid tide modelling domain.
    mode : str, optional
        The analysis mode to use for tide modelling. Supports two options:

        - "one-to-many": Models tides for every timestep in "time" at
        every input x and y coordinate point. This is useful if you
        want to model tides for a specific list of timesteps across
        multiple spatial points (e.g. for the same set of satellite
        acquisition times at various locations across your study area).
        - "one-to-one": Model tides using a unique timestep for each
        set of x and y coordinates. In this mode, the number of x and
        y points must equal the number of timesteps provided in "time".

    parallel : bool, optional
        Whether to parallelise tide modelling using `concurrent.futures`.
        If multiple tide models are requested, these will be run in
        parallel. Optionally, tide modelling can also be run in parallel
        across input x and y coordinates (see "parallel_splits" below).
        Default is True.
    parallel_splits : int, optional
        Whether to split the input x and y coordinates into smaller,
        evenly-sized chunks that are processed in parallel. This can
        provide a large performance boost when processing large numbers
        of coordinates. The default is 5 chunks, which will split
        coordinates into 5 parallelised chunks.
    output_units : str, optional
        Whether to return modelled tides in floating point metre units,
        or integer centimetre units (i.e. scaled by 100) or integer
        millimetre units (i.e. scaled by 1000. Returning outputs in
        integer units can be useful for reducing memory usage.
        Defaults to "m" for metres; set to "cm" for centimetres or "mm"
        for millimetres.
    output_format : str, optional
        Whether to return the output dataframe in long format (with
        results stacked vertically along "tide_model" and "tide_height"
        columns), or wide format (with a column for each tide model).
        Defaults to "long".
    ensemble_models : list of str, optional
        An optional list of models used to generate the ensemble tide
        model if "ensemble" tide modelling is requested. Defaults to
        ["FES2014", "TPXO9-atlas-v5", "EOT20", "HAMTIDE11", "GOT4.10",
        "FES2012", "TPXO8-atlas-v1"].
    **ensemble_kwargs :
        Keyword arguments used to customise the generation of optional
        ensemble tide models if "ensemble" modelling are requested.
        These are passed to the underlying `_ensemble_model` function.
        Useful parameters include `ranking_points` (path to model
        rankings data), `k` (for controlling how model rankings are
        interpolated), and `ensemble_top_n` (how many top models to use
        in the ensemble calculation).

    Returns
    -------
    pandas.DataFrame
        A dataframe containing modelled tide heights.

    """
    # Turn inputs into arrays for consistent handling
    models_requested = list(np.atleast_1d(model))
    x = np.atleast_1d(x)
    y = np.atleast_1d(y)
    time = np.atleast_1d(time)

    # Validate input arguments
    assert method in ("bilinear", "spline", "linear", "nearest")
    assert output_units in (
        "m",
        "cm",
        "mm",
    ), "Output units must be either 'm', 'cm', or 'mm'."
    assert output_format in (
        "long",
        "wide",
    ), "Output format must be either 'long' or 'wide'."
    assert len(x) == len(y), "x and y must be the same length."
    if mode == "one-to-one":
        assert len(x) == len(time), (
            "The number of supplied x and y points and times must be "
            "identical in 'one-to-one' mode. Use 'one-to-many' mode if "
            "you intended to model multiple timesteps at each point."
        )

    # If time passed as a single Timestamp, convert to datetime64
    if isinstance(time, pd.Timestamp):
        time = time.to_datetime64()

    # Set tide modelling files directory. If no custom path is
    # provided, try global environment variable.
    directory = _set_directory(directory)

    # Get full list of supported models from pyTMD database;
    # add ensemble option to list of models
    available_models, valid_models = list_models(
        directory, show_available=False, show_supported=False, raise_error=True
    )
    # TODO: This is hacky, find a better way. Perhaps a kwarg that
    # turns ensemble functionality on, and checks that supplied
    # models match models expected for ensemble?
    available_models = available_models + ["ensemble"]
    valid_models = valid_models + ["ensemble"]

    # Error if any models are not supported
    if not all(m in valid_models for m in models_requested):
        error_text = (
            f"One or more of the requested models are not valid:\n"
            f"{models_requested}\n\n"
            "The following models are supported:\n"
            f"{valid_models}"
        )
        raise ValueError(error_text)

    # Error if any models are not available in `directory`
    if not all(m in available_models for m in models_requested):
        error_text = (
            f"One or more of the requested models are valid, but not available in `{directory}`:\n"
            f"{models_requested}\n\n"
            f"The following models are available in `{directory}`:\n"
            f"{available_models}"
        )
        raise ValueError(error_text)

    # If ensemble modelling is requested, use a custom list of models
    # for subsequent processing
    if "ensemble" in models_requested:
        print("Running ensemble tide modelling")
        models_to_process = (
            ensemble_models
            if ensemble_models is not None
            else [
                "FES2014",
                "TPXO9-atlas-v5",
                "EOT20",
                "HAMTIDE11",
                "GOT4.10",
                "FES2012",
                "TPXO8-atlas-v1",
            ]
        )

    # Otherwise, models to process are the same as those requested
    else:
        models_to_process = models_requested

    # Update tide modelling func to add default keyword arguments that
    # are used for every iteration during parallel processing
    iter_func = partial(
        _model_tides,
        directory=directory,
        crs=crs,
        crop=crop,
        method=method,
        extrapolate=extrapolate,
        cutoff=np.inf if cutoff is None else cutoff,
        output_units=output_units,
        mode=mode,
    )

    # Ensure requested parallel splits is not smaller than number of points
    parallel_splits = min(parallel_splits, len(x))

    # Parallelise if either multiple models or multiple splits requested
    if parallel & ((len(models_to_process) > 1) | (parallel_splits > 1)):
        with ProcessPoolExecutor() as executor:
            print(f"Modelling tides using {', '.join(models_to_process)} in parallel")

            # Optionally split lon/lat points into `splits_n` chunks
            # that will be applied in parallel
            x_split = np.array_split(x, parallel_splits)
            y_split = np.array_split(y, parallel_splits)

            # Get every combination of models and lat/lon points, and
            # extract as iterables that can be passed to `executor.map()`
            # In "one-to-many" mode, pass entire set of timesteps to each
            # parallel iteration by repeating timesteps by number of total
            # parallel iterations. In "one-to-one" mode, split up
            # timesteps into smaller parallel chunks too.
            if mode == "one-to-many":
                model_iters, x_iters, y_iters = zip(
                    *[(m, x_split[i], y_split[i]) for m in models_to_process for i in range(parallel_splits)],
                )
                time_iters = [time] * len(model_iters)
            elif mode == "one-to-one":
                time_split = np.array_split(time, parallel_splits)
                model_iters, x_iters, y_iters, time_iters = zip(
                    *[
                        (m, x_split[i], y_split[i], time_split[i])
                        for m in models_to_process
                        for i in range(parallel_splits)
                    ],
                )

            # Apply func in parallel, iterating through each input param
            try:
                model_outputs = list(
                    tqdm(
                        executor.map(iter_func, model_iters, x_iters, y_iters, time_iters),
                        total=len(model_iters),
                    ),
                )
            except BrokenProcessPool:
                error_msg = (
                    "Parallelised tide modelling failed, likely to to an out-of-memory error. "
                    "Try reducing the size of your analysis, or set `parallel=False`."
                )
                raise RuntimeError(error_msg)

    # Model tides in series if parallelisation is off
    else:
        model_outputs = []

        for model_i in models_to_process:
            print(f"Modelling tides using {model_i}")
            tide_df = iter_func(model_i, x, y, time)
            model_outputs.append(tide_df)

    # Combine outputs into a single dataframe
    tide_df = pd.concat(model_outputs, axis=0)

    # Optionally compute ensemble model and add to dataframe
    if "ensemble" in models_requested:
        ensemble_df = _ensemble_model(tide_df, crs, models_to_process, **ensemble_kwargs)

        # Update requested models with any custom ensemble models, then
        # filter the dataframe to keep only models originally requested
        models_requested = list(np.union1d(models_requested, ensemble_df.tide_model.unique()))
        tide_df = pd.concat([tide_df, ensemble_df]).query("tide_model in @models_requested")

    # Optionally convert to a wide format dataframe with a tide model in
    # each dataframe column
    if output_format == "wide":
        # Pivot into wide format with each time model as a column
        print("Converting to a wide format dataframe")
        tide_df = tide_df.pivot(columns="tide_model", values="tide_height")

        # If in 'one-to-one' mode, reindex using our input time/x/y
        # values to ensure the output is sorted the same as our inputs
        if mode == "one-to-one":
            output_indices = pd.MultiIndex.from_arrays([time, x, y], names=["time", "x", "y"])
            tide_df = tide_df.reindex(output_indices)

    return tide_df

eo_tides.eo

Functions:

Name Description
pixel_tides

Model tide heights for every pixel in a multi-dimensional

tag_tides

Model tide heights for every timestep in a multi-dimensional

pixel_tides

pixel_tides(
    ds,
    times=None,
    model="EOT20",
    directory=None,
    resample=True,
    calculate_quantiles=None,
    resolution=None,
    buffer=None,
    resample_method="bilinear",
    dask_chunks="auto",
    dask_compute=True,
    **model_tides_kwargs
)

Model tide heights for every pixel in a multi-dimensional dataset, using one or more ocean tide models.

This function models tides into a low-resolution tide modelling grid covering the spatial extent of the input data (buffered to reduce potential edge effects). These modelled tides are then (optionally) resampled back into the original higher resolution dataset's extent and resolution - resulting in a modelled tide height for every pixel through time.

This function uses the parallelised model_tides function under the hood. It supports all tidal models supported by pyTMD, including:

  • Empirical Ocean Tide model (EOT20)
  • Finite Element Solution tide models (FES2022, FES2014, FES2012)
  • TOPEX/POSEIDON global tide models (TPXO10, TPXO9, TPXO8)
  • Global Ocean Tide models (GOT5.6, GOT5.5, GOT4.10, GOT4.8, GOT4.7)
  • Hamburg direct data Assimilation Methods for Tides models (HAMTIDE11)

This function requires access to tide model data files. These should be placed in a folder with subfolders matching the structure required by pyTMD. For more details: https://geoscienceaustralia.github.io/eo-tides/setup/ https://pytmd.readthedocs.io/en/latest/getting_started/Getting-Started.html#directories

Parameters:

Name Type Description Default

ds

Dataset or DataArray

A multi-dimensional dataset (e.g. "x", "y", "time") that will be used to define the tide modelling grid.

required

times

pd.DatetimeIndex or list of pd.Timestamp

By default, the function will model tides using the times contained in the time dimension of ds. Alternatively, this param can be used to model tides for a custom set of times instead. For example: times=pd.date_range(start="2000", end="2001", freq="5h")

None

model

str or list of str

The tide model (or models) used to model tides. If a list is provided, a new "tide_model" dimension will be added to the xarray.DataArray outputs. Defaults to "EOT20"; for a full list of available/supported models, run eo_tides.model.list_models.

'EOT20'

directory

str

The directory containing tide model data files. If no path is provided, this will default to the environment variable EO_TIDES_TIDE_MODELS if set, or raise an error if not. Tide modelling files should be stored in sub-folders for each model that match the structure required by pyTMD (https://geoscienceaustralia.github.io/eo-tides/setup/).

None

resample

bool

Whether to resample low resolution tides back into ds's original higher resolution grid. Set this to False if you do not want low resolution tides to be re-projected back to higher resolution.

True

calculate_quantiles

tuple of float or numpy.ndarray

Rather than returning all individual tides, low-resolution tides can be first aggregated using a quantile calculation by passing in a tuple or array of quantiles to compute. For example, this could be used to calculate the min/max tide across all times: calculate_quantiles=(0.0, 1.0).

None

resolution

float

The desired resolution of the low-resolution grid used for tide modelling. The default None will create a 5000 m resolution grid if ds has a projected CRS (i.e. metre units), or a 0.05 degree resolution grid if ds has a geographic CRS (e.g. degree units). Note: higher resolutions do not necessarily provide better tide modelling performance, as results will be limited by the resolution of the underlying global tide model (e.g. 1/16th degree / ~5 km resolution grid for FES2014).

None

buffer

float

The amount by which to buffer the higher resolution grid extent when creating the new low resolution grid. This buffering is important as it ensures that ensure pixel-based tides are seamless across dataset boundaries. This buffer will eventually be clipped away when the low-resolution data is re-projected back to the resolution and extent of the higher resolution dataset. To ensure that at least two pixels occur outside of the dataset bounds, the default None applies a 12000 m buffer if ds has a projected CRS (i.e. metre units), or a 0.12 degree buffer if ds has a geographic CRS (e.g. degree units).

None

resample_method

str

If resampling is requested (see resample above), use this resampling method when converting from low resolution to high resolution pixels. Defaults to "bilinear"; valid options include "nearest", "cubic", "min", "max", "average" etc.

'bilinear'

dask_chunks

str or tuple of float

Can be used to configure custom Dask chunking for the final resampling step. The default of "auto" will automatically set x/y chunks to match those in ds if they exist, otherwise will set x/y chunks that cover the entire extent of the dataset. For custom chunks, provide a tuple in the form (y, x), e.g. (2048, 2048).

'auto'

dask_compute

bool

Whether to compute results of the resampling step using Dask. If False, this will return tides_highres as a Dask array.

True

**model_tides_kwargs

Optional parameters passed to the eo_tides.model.model_tides function. Important parameters include cutoff (used to extrapolate modelled tides away from the coast; defaults to np.inf), crop (whether to crop tide model constituent files on-the-fly to improve performance) etc.

{}

Returns:

Name Type Description
tides_da DataArray

If resample=True (default), a high-resolution array of tide heights matching the exact spatial resolution and extents of ds. This will contain either tide heights every timestep in ds (if times is None), tide heights at every time in times (if times is not None), or tide height quantiles for every quantile provided by calculate_quantiles. If resample=False, results for the intermediate low-resolution tide modelling grid will be returned instead.

Source code in eo_tides/eo.py
def pixel_tides(
    ds: xr.Dataset | xr.DataArray,
    times=None,
    model: str | list[str] = "EOT20",
    directory: str | os.PathLike | None = None,
    resample: bool = True,
    calculate_quantiles: np.ndarray | tuple[float, float] | None = None,
    resolution: float | None = None,
    buffer: float | None = None,
    resample_method: str = "bilinear",
    dask_chunks: str | tuple[float, float] = "auto",
    dask_compute: bool = True,
    **model_tides_kwargs,
) -> xr.DataArray:
    """
    Model tide heights for every pixel in a multi-dimensional
    dataset, using one or more ocean tide models.

    This function models tides into a low-resolution tide
    modelling grid covering the spatial extent of the input
    data (buffered to reduce potential edge effects). These
    modelled tides are then (optionally) resampled back into
    the original higher resolution dataset's extent and
    resolution - resulting in a modelled tide height for every
    pixel through time.

    This function uses the parallelised `model_tides` function
    under the hood. It supports all tidal models supported by
    `pyTMD`, including:

    - Empirical Ocean Tide model (EOT20)
    - Finite Element Solution tide models (FES2022, FES2014, FES2012)
    - TOPEX/POSEIDON global tide models (TPXO10, TPXO9, TPXO8)
    - Global Ocean Tide models (GOT5.6, GOT5.5, GOT4.10, GOT4.8, GOT4.7)
    - Hamburg direct data Assimilation Methods for Tides models (HAMTIDE11)

    This function requires access to tide model data files.
    These should be placed in a folder with subfolders matching
    the structure required by `pyTMD`. For more details:
    <https://geoscienceaustralia.github.io/eo-tides/setup/>
    <https://pytmd.readthedocs.io/en/latest/getting_started/Getting-Started.html#directories>

    Parameters
    ----------
    ds : xarray.Dataset or xarray.DataArray
        A multi-dimensional dataset (e.g. "x", "y", "time") that will
        be used to define the tide modelling grid.
    times : pd.DatetimeIndex or list of pd.Timestamp, optional
        By default, the function will model tides using the times
        contained in the `time` dimension of `ds`. Alternatively, this
        param can be used to model tides for a custom set of times
        instead. For example:
        `times=pd.date_range(start="2000", end="2001", freq="5h")`
    model : str or list of str, optional
        The tide model (or models) used to model tides. If a list is
        provided, a new "tide_model" dimension will be added to the
        `xarray.DataArray` outputs. Defaults to "EOT20"; for a full
        list of available/supported models, run `eo_tides.model.list_models`.
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    resample : bool, optional
        Whether to resample low resolution tides back into `ds`'s original
        higher resolution grid. Set this to `False` if you do not want
        low resolution tides to be re-projected back to higher resolution.
    calculate_quantiles : tuple of float or numpy.ndarray, optional
        Rather than returning all individual tides, low-resolution tides
        can be first aggregated using a quantile calculation by passing in
        a tuple or array of quantiles to compute. For example, this could
        be used to calculate the min/max tide across all times:
        `calculate_quantiles=(0.0, 1.0)`.
    resolution : float, optional
        The desired resolution of the low-resolution grid used for tide
        modelling. The default None will create a 5000 m resolution grid
        if `ds` has a projected CRS (i.e. metre units), or a 0.05 degree
        resolution grid if `ds` has a geographic CRS (e.g. degree units).
        Note: higher resolutions do not necessarily provide better
        tide modelling performance, as results will be limited by the
        resolution of the underlying global tide model (e.g. 1/16th
        degree / ~5 km resolution grid for FES2014).
    buffer : float, optional
        The amount by which to buffer the higher resolution grid extent
        when creating the new low resolution grid. This buffering is
        important as it ensures that ensure pixel-based tides are seamless
        across dataset boundaries. This buffer will eventually be clipped
        away when the low-resolution data is re-projected back to the
        resolution and extent of the higher resolution dataset. To
        ensure that at least two pixels occur outside of the dataset
        bounds, the default None applies a 12000 m buffer if `ds` has a
        projected CRS (i.e. metre units), or a 0.12 degree buffer if
        `ds` has a geographic CRS (e.g. degree units).
    resample_method : str, optional
        If resampling is requested (see `resample` above), use this
        resampling method when converting from low resolution to high
        resolution pixels. Defaults to "bilinear"; valid options include
        "nearest", "cubic", "min", "max", "average" etc.
    dask_chunks : str or tuple of float, optional
        Can be used to configure custom Dask chunking for the final
        resampling step. The default of "auto" will automatically set
        x/y chunks to match those in `ds` if they exist, otherwise will
        set x/y chunks that cover the entire extent of the dataset.
        For custom chunks, provide a tuple in the form `(y, x)`, e.g.
        `(2048, 2048)`.
    dask_compute : bool, optional
        Whether to compute results of the resampling step using Dask.
        If False, this will return `tides_highres` as a Dask array.
    **model_tides_kwargs :
        Optional parameters passed to the `eo_tides.model.model_tides`
        function. Important parameters include `cutoff` (used to
        extrapolate modelled tides away from the coast; defaults to
        `np.inf`), `crop` (whether to crop tide model constituent files
        on-the-fly to improve performance) etc.
    Returns
    -------
    tides_da : xr.DataArray
        If `resample=True` (default), a high-resolution array
        of tide heights matching the exact spatial resolution and
        extents of `ds`. This will contain either tide heights every
        timestep in `ds` (if `times` is None), tide heights at every
        time in `times` (if `times` is not None), or tide height
        quantiles for every quantile provided by `calculate_quantiles`.
        If `resample=False`, results for the intermediate low-resolution
        tide modelling grid will be returned instead.
    """
    # First test if no time dimension and nothing passed to `times`
    if ("time" not in ds.dims) & (times is None):
        raise ValueError(
            "`ds` does not contain a 'time' dimension. Times are required "
            "for modelling tides: please pass in a set of custom tides "
            "using the `times` parameter. For example: "
            "`times=pd.date_range(start='2000', end='2001', freq='5h')`",
        )

    # If custom times are provided, convert them to a consistent
    # pandas.DatatimeIndex format
    if times is not None:
        if isinstance(times, list):
            time_coords = pd.DatetimeIndex(times)
        elif isinstance(times, pd.Timestamp):
            time_coords = pd.DatetimeIndex([times])
        else:
            time_coords = times

    # Otherwise, use times from `ds` directly
    else:
        time_coords = ds.coords["time"]

    # Standardise model into a list for easy handling
    model = [model] if isinstance(model, str) else model

    # Determine spatial dimensions
    y_dim, x_dim = ds.odc.spatial_dims

    # Determine resolution and buffer, using different defaults for
    # geographic (i.e. degrees) and projected (i.e. metres) CRSs:
    crs_units = ds.odc.geobox.crs.units[0][0:6]
    if ds.odc.geobox.crs.geographic:
        if resolution is None:
            resolution = 0.05
        elif resolution > 360:
            raise ValueError(
                f"A resolution of greater than 360 was "
                f"provided, but `ds` has a geographic CRS "
                f"in {crs_units} units. Did you accidently "
                f"provide a resolution in projected "
                f"(i.e. metre) units?",
            )
        if buffer is None:
            buffer = 0.12
    else:
        if resolution is None:
            resolution = 5000
        elif resolution < 1:
            raise ValueError(
                f"A resolution of less than 1 was provided, "
                f"but `ds` has a projected CRS in "
                f"{crs_units} units. Did you accidently "
                f"provide a resolution in geographic "
                f"(degree) units?",
            )
        if buffer is None:
            buffer = 12000

    # Raise error if resolution is less than dataset resolution
    dataset_res = ds.odc.geobox.resolution.x
    if resolution < dataset_res:
        raise ValueError(
            f"The resolution of the low-resolution tide "
            f"modelling grid ({resolution:.2f}) is less "
            f"than `ds`'s pixel resolution ({dataset_res:.2f}). "
            f"This can cause extremely slow tide modelling "
            f"performance. Please select provide a resolution "
            f"greater than {dataset_res:.2f} using "
            f"`pixel_tides`'s 'resolution' parameter.",
        )

    # Create a new reduced resolution tide modelling grid after
    # first buffering the grid
    print(f"Creating reduced resolution {resolution} x {resolution} {crs_units} tide modelling array")
    buffered_geobox = ds.odc.geobox.buffered(buffer)
    rescaled_geobox = GeoBox.from_bbox(bbox=buffered_geobox.boundingbox, resolution=resolution)
    rescaled_ds = odc.geo.xr.xr_zeros(rescaled_geobox)

    # Flatten grid to 1D, then add time dimension
    flattened_ds = rescaled_ds.stack(z=(x_dim, y_dim))
    flattened_ds = flattened_ds.expand_dims(dim={"time": time_coords.values})

    # Model tides in parallel, returning a pandas.DataFrame
    tide_df = model_tides(
        x=flattened_ds[x_dim],
        y=flattened_ds[y_dim],
        time=flattened_ds.time,
        crs=f"EPSG:{ds.odc.geobox.crs.epsg}",
        model=model,
        directory=directory,
        **model_tides_kwargs,
    )

    # Convert our pandas.DataFrame tide modelling outputs to xarray
    tides_lowres = (
        # Rename x and y dataframe indexes to match x and y xarray dims
        tide_df.rename_axis(["time", x_dim, y_dim])
        # Add tide model column to dataframe indexes so we can convert
        # our dataframe to a multidimensional xarray
        .set_index("tide_model", append=True)
        # Convert to xarray and select our tide modelling xr.DataArray
        .to_xarray()
        .tide_height
        # Re-index and transpose into our input coordinates and dim order
        .reindex_like(rescaled_ds)
        .transpose("tide_model", "time", y_dim, x_dim)
    )

    # Optionally calculate and return quantiles rather than raw data.
    # Set dtype to dtype of the input data as quantile always returns
    # float64 (memory intensive)
    if calculate_quantiles is not None:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print("Computing tide quantiles")
            tides_lowres = tides_lowres.quantile(q=calculate_quantiles, dim="time").astype(tides_lowres.dtype)

    # If only one tidal model exists, squeeze out "tide_model" dim
    if len(tides_lowres.tide_model) == 1:
        tides_lowres = tides_lowres.squeeze("tide_model")

    # Ensure CRS is present before we apply any resampling
    tides_lowres = tides_lowres.odc.assign_crs(ds.odc.geobox.crs)

    # Reproject into original high resolution grid
    if resample:
        print("Reprojecting tides into original resolution")
        tides_highres = _pixel_tides_resample(
            tides_lowres,
            ds,
            resample_method,
            dask_chunks,
            dask_compute,
        )
        return tides_highres

    print("Returning low resolution tide array")
    return tides_lowres

tag_tides

tag_tides(
    ds,
    model="EOT20",
    directory=None,
    tidepost_lat=None,
    tidepost_lon=None,
    **model_tides_kwargs
)

Model tide heights for every timestep in a multi-dimensional dataset, and return a new tide_height array that can be used to "tag" each observation with tide data.

The function models tides at the centroid of the dataset by default, but a custom tidal modelling location can be specified using tidepost_lat and tidepost_lon.

This function uses the parallelised model_tides function under the hood. It supports all tidal models supported by pyTMD, including:

  • Empirical Ocean Tide model (EOT20)
  • Finite Element Solution tide models (FES2022, FES2014, FES2012)
  • TOPEX/POSEIDON global tide models (TPXO10, TPXO9, TPXO8)
  • Global Ocean Tide models (GOT5.6, GOT5.5, GOT4.10, GOT4.8, GOT4.7)
  • Hamburg direct data Assimilation Methods for Tides models (HAMTIDE11)

Parameters:

Name Type Description Default

ds

Dataset or DataArray

A multi-dimensional dataset (e.g. "x", "y", "time") to tag with tide heights. This dataset must contain a "time" dimension.

required

model

str or list of str

The tide model (or models) to use to model tides. If a list is provided, a new "tide_model" dimension will be added to ds. Defaults to "EOT20"; for a full list of available/supported models, run eo_tides.model.list_models.

'EOT20'

directory

str

The directory containing tide model data files. If no path is provided, this will default to the environment variable EO_TIDES_TIDE_MODELS if set, or raise an error if not. Tide modelling files should be stored in sub-folders for each model that match the structure required by pyTMD (https://geoscienceaustralia.github.io/eo-tides/setup/).

None

tidepost_lat

float

Optional coordinates used to model tides. The default is None, which uses the centroid of the dataset as the tide modelling location.

None

tidepost_lon

float

Optional coordinates used to model tides. The default is None, which uses the centroid of the dataset as the tide modelling location.

None

**model_tides_kwargs

Optional parameters passed to the eo_tides.model.model_tides function. Important parameters include cutoff (used to extrapolate modelled tides away from the coast; defaults to np.inf), crop (whether to crop tide model constituent files on-the-fly to improve performance) etc.

{}

Returns:

Name Type Description
ds Dataset

The original xarray.Dataset with a new tide_height variable giving the height of the tide (and optionally, its ebb-flow phase) for each timestep in the data.

Source code in eo_tides/eo.py
def tag_tides(
    ds: xr.Dataset | xr.DataArray,
    model: str | list[str] = "EOT20",
    directory: str | os.PathLike | None = None,
    tidepost_lat: float | None = None,
    tidepost_lon: float | None = None,
    **model_tides_kwargs,
) -> xr.DataArray:
    """
    Model tide heights for every timestep in a multi-dimensional
    dataset, and return a new `tide_height` array that can
    be used to "tag" each observation with tide data.

    The function models tides at the centroid of the dataset
    by default, but a custom tidal modelling location can
    be specified using `tidepost_lat` and `tidepost_lon`.

    This function uses the parallelised `model_tides` function
    under the hood. It supports all tidal models supported by
    `pyTMD`, including:

    - Empirical Ocean Tide model (EOT20)
    - Finite Element Solution tide models (FES2022, FES2014, FES2012)
    - TOPEX/POSEIDON global tide models (TPXO10, TPXO9, TPXO8)
    - Global Ocean Tide models (GOT5.6, GOT5.5, GOT4.10, GOT4.8, GOT4.7)
    - Hamburg direct data Assimilation Methods for Tides models (HAMTIDE11)

    Parameters
    ----------
    ds : xarray.Dataset or xarray.DataArray
        A multi-dimensional dataset (e.g. "x", "y", "time") to
        tag with tide heights. This dataset must contain a "time"
        dimension.
    model : str or list of str, optional
        The tide model (or models) to use to model tides. If a list is
        provided, a new "tide_model" dimension will be added to `ds`.
        Defaults to "EOT20"; for a full list of available/supported
        models, run `eo_tides.model.list_models`.
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    tidepost_lat, tidepost_lon : float, optional
        Optional coordinates used to model tides. The default is None,
        which uses the centroid of the dataset as the tide modelling
        location.
    **model_tides_kwargs :
        Optional parameters passed to the `eo_tides.model.model_tides`
        function. Important parameters include `cutoff` (used to
        extrapolate modelled tides away from the coast; defaults to
        `np.inf`), `crop` (whether to crop tide model constituent files
        on-the-fly to improve performance) etc.

    Returns
    -------
    ds : xr.Dataset
        The original `xarray.Dataset` with a new `tide_height` variable
        giving the height of the tide (and optionally, its ebb-flow phase)
        for each timestep in the data.

    """
    # Only datasets are supported
    if not isinstance(ds, xr.Dataset):
        raise TypeError("Input must be an xarray.Dataset, not an xarray.DataArray or other data type.")

    # Standardise model into a list for easy handling. and verify only one
    model = [model] if isinstance(model, str) else model

    # If custom tide modelling locations are not provided, use the
    # dataset centroid
    if tidepost_lat is None or tidepost_lon is None:
        lon, lat = ds.odc.geobox.geographic_extent.centroid.coords[0]
        print(f"Setting tide modelling location from dataset centroid: {lon:.2f}, {lat:.2f}")
    else:
        lon, lat = tidepost_lon, tidepost_lat
        print(f"Using tide modelling location: {lon:.2f}, {lat:.2f}")

    # Model tide heights for each observation:
    tide_df = model_tides(
        x=lon,  # type: ignore
        y=lat,  # type: ignore
        time=ds.time,
        model=model,
        directory=directory,
        crs="EPSG:4326",
        **model_tides_kwargs,
    )

    # If tides cannot be successfully modeled (e.g. if the centre of the
    # xarray dataset is located is over land), raise an exception
    if tide_df.tide_height.isnull().all():
        raise ValueError(
            f"Tides could not be modelled for dataset centroid located "
            f"at {tidepost_lon:.2f}, {tidepost_lat:.2f}. This can occur if "
            f"this coordinate occurs over land. Please manually specify "
            f"a tide modelling location located over water using the "
            f"`tidepost_lat` and `tidepost_lon` parameters."
        )

    # # Optionally calculate the tide phase for each observation
    # if ebb_flow:
    #     # Model tides for a time 15 minutes prior to each previously
    #     # modelled satellite acquisition time. This allows us to compare
    #     # tide heights to see if they are rising or falling.
    #     print("Modelling tidal phase (e.g. ebb or flow)")
    #     tide_pre_df = model_tides(
    #         x=lon,  # type: ignore
    #         y=lat,  # type: ignore
    #         time=(ds.time - pd.Timedelta("15 min")),
    #         model=model,
    #         directory=directory,
    #         crs="EPSG:4326",
    #         **model_tides_kwargs,
    #     )

    #     # Compare tides computed for each timestep. If the previous tide
    #     # was higher than the current tide, the tide is 'ebbing'. If the
    #     # previous tide was lower, the tide is 'flowing'
    #     tide_df["ebb_flow"] = (tide_df.tide_height < tide_pre_df.tide_height.values).replace({
    #         True: "Ebb",
    #         False: "Flow",
    #     })

    # Convert to xarray format
    tide_xr = tide_df.reset_index().set_index(["time", "tide_model"]).drop(["x", "y"], axis=1).tide_height.to_xarray()

    # If only one tidal model exists, squeeze out "tide_model" dim
    if len(tide_xr.tide_model) == 1:
        tide_xr = tide_xr.squeeze("tide_model")

    return tide_xr

eo_tides.stats

Functions:

Name Description
pixel_stats

Takes a multi-dimensional dataset and generate two-dimensional

tide_stats

Takes a multi-dimensional dataset and generate tide statistics

pixel_stats

pixel_stats(
    ds,
    model="EOT20",
    directory=None,
    resample=False,
    modelled_freq="3h",
    min_max_q=(0.0, 1.0),
    extrapolate=True,
    cutoff=10,
    **pixel_tides_kwargs
)

Takes a multi-dimensional dataset and generate two-dimensional tide statistics and satellite-observed tide bias metrics, calculated based on every timestep in the satellte data and modelled into the spatial extent of the imagery.

By comparing the subset of tides observed by satellites against the full astronomical tidal range, we can evaluate whether the tides observed by satellites are biased (e.g. fail to observe either the highest or lowest tides).

Compared to tide_stats, this function models tide metrics spatially to produce a two-dimensional output.

For more information about the tidal statistics computed by this function, refer to Figure 8 in Bishop-Taylor et al. 2018: https://www.sciencedirect.com/science/article/pii/S0272771418308783#fig8

Parameters:

Name Type Description Default

ds

Dataset or DataArray

A multi-dimensional dataset (e.g. "x", "y", "time") used to calculate 2D tide statistics. This dataset must contain a "time" dimension.

required

model

str or list of str

The tide model (or models) to use to model tides. If a list is provided, a new "tide_model" dimension will be added to ds. Defaults to "EOT20"; for a full list of available/supported models, run eo_tides.model.list_models.

'EOT20'

directory

str

The directory containing tide model data files. If no path is provided, this will default to the environment variable EO_TIDES_TIDE_MODELS if set, or raise an error if not. Tide modelling files should be stored in sub-folders for each model that match the structure required by pyTMD (https://geoscienceaustralia.github.io/eo-tides/setup/).

None

resample

bool

Whether to resample tide statistics back into ds's original higher resolution grid. Defaults to False, which will return lower-resolution statistics that are typically sufficient for most purposes.

False

modelled_freq

str

An optional string giving the frequency at which to model tides when computing the full modelled tidal range. Defaults to '3h', which computes a tide height for every three hours across the temporal extent of ds.

'3h'

min_max_q

tuple

Quantiles used to calculate max and min observed and modelled astronomical tides. By default (0.0, 1.0) which is equivalent to minimum and maximum; to use a softer threshold that is more robust to outliers, use e.g. (0.1, 0.9).

(0.0, 1.0)

extrapolate

bool

Whether to extrapolate tides for x and y coordinates outside of the valid tide modelling domain using nearest-neighbor. Defaults to True.

True

cutoff

float

Extrapolation cutoff in kilometers. To avoid producing tide statistics too far inland, the default is 10 km.

10

**pixel_tides_kwargs

Optional parameters passed to the eo_tides.eo.pixel_tides function.

{}

Returns:

Name Type Description
stats_ds Dataset

An xarray.Dataset containing the following statistics as two-dimensional data variables:

  • lot: minimum tide height observed by the satellite (metres)
  • lat: minimum tide height from modelled astronomical tidal range (metres)
  • hot: maximum tide height observed by the satellite (metres)
  • hat: maximum tide height from modelled astronomical tidal range (metres)
  • otr: tidal range observed by the satellite (metres)
  • tr: modelled astronomical tide range (metres)
  • spread: proportion of the full modelled tidal range observed by the satellite
  • offset_low: proportion of the lowest tides never observed by the satellite
  • offset_high: proportion of the highest tides never observed by the satellite
Source code in eo_tides/stats.py
def pixel_stats(
    ds: xr.Dataset | xr.DataArray,
    model: str | list[str] = "EOT20",
    directory: str | os.PathLike | None = None,
    resample: bool = False,
    modelled_freq="3h",
    min_max_q=(0.0, 1.0),
    extrapolate: bool = True,
    cutoff: float = 10,
    **pixel_tides_kwargs,
) -> xr.Dataset:
    """
    Takes a multi-dimensional dataset and generate two-dimensional
    tide statistics and satellite-observed tide bias metrics,
    calculated based on every timestep in the satellte data and
    modelled into the spatial extent of the imagery.

    By comparing the subset of tides observed by satellites
    against the full astronomical tidal range, we can evaluate
    whether the tides observed by satellites are biased
    (e.g. fail to observe either the highest or lowest tides).

    Compared to `tide_stats`, this function models tide metrics
    spatially to produce a two-dimensional output.

    For more information about the tidal statistics computed by this
    function, refer to Figure 8 in Bishop-Taylor et al. 2018:
    <https://www.sciencedirect.com/science/article/pii/S0272771418308783#fig8>

    Parameters
    ----------
    ds : xarray.Dataset or xarray.DataArray
        A multi-dimensional dataset (e.g. "x", "y", "time") used
        to calculate 2D tide statistics. This dataset must contain
        a "time" dimension.
    model : str or list of str, optional
        The tide model (or models) to use to model tides. If a list is
        provided, a new "tide_model" dimension will be added to `ds`.
        Defaults to "EOT20"; for a full list of available/supported
        models, run `eo_tides.model.list_models`.
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    resample : bool, optional
        Whether to resample tide statistics back into `ds`'s original
        higher resolution grid. Defaults to False, which will return
        lower-resolution statistics that are typically sufficient for
        most purposes.
    modelled_freq : str, optional
        An optional string giving the frequency at which to model tides
        when computing the full modelled tidal range. Defaults to '3h',
        which computes a tide height for every three hours across the
        temporal extent of `ds`.
    min_max_q : tuple, optional
        Quantiles used to calculate max and min observed and modelled
        astronomical tides. By default `(0.0, 1.0)` which is equivalent
        to minimum and maximum; to use a softer threshold that is more
        robust to outliers, use e.g. `(0.1, 0.9)`.
    extrapolate : bool, optional
        Whether to extrapolate tides for x and y coordinates outside of
        the valid tide modelling domain using nearest-neighbor. Defaults
        to True.
    cutoff : float, optional
        Extrapolation cutoff in kilometers. To avoid producing tide
        statistics too far inland, the default is 10 km.
    **pixel_tides_kwargs :
        Optional parameters passed to the `eo_tides.eo.pixel_tides`
        function.

    Returns
    -------
    stats_ds : xarray.Dataset
        An `xarray.Dataset` containing the following statistics as two-dimensional data variables:

        - `lot`: minimum tide height observed by the satellite (metres)
        - `lat`: minimum tide height from modelled astronomical tidal range (metres)
        - `hot`: maximum tide height observed by the satellite (metres)
        - `hat`: maximum tide height from modelled astronomical tidal range (metres)
        - `otr`: tidal range observed by the satellite (metres)
        - `tr`: modelled astronomical tide range (metres)
        - `spread`: proportion of the full modelled tidal range observed by the satellite
        - `offset_low`: proportion of the lowest tides never observed by the satellite
        - `offset_high`: proportion of the highest tides never observed by the satellite

    """
    # Model observed tides
    obs_tides = pixel_tides(
        ds,
        resample=False,
        model=model,
        directory=directory,
        calculate_quantiles=min_max_q,
        extrapolate=extrapolate,
        cutoff=cutoff,
        **pixel_tides_kwargs,
    )

    # Generate times covering entire period of satellite record
    all_timerange = pd.date_range(
        start=ds.time.min().item(),
        end=ds.time.max().item(),
        freq=modelled_freq,
    )

    # Model all tides
    all_tides = pixel_tides(
        ds,
        times=all_timerange,
        model=model,
        directory=directory,
        calculate_quantiles=min_max_q,
        resample=False,
        extrapolate=extrapolate,
        cutoff=cutoff,
        **pixel_tides_kwargs,
    )

    # Calculate min and max tides
    lot = obs_tides.isel(quantile=0)
    hot = obs_tides.isel(quantile=-1)
    lat = all_tides.isel(quantile=0)
    hat = all_tides.isel(quantile=-1)

    # Calculate tidal range
    otr = hot - lot
    tr = hat - lat

    # Calculate Bishop-Taylor et al. 2018 tidal metrics
    spread = otr / tr
    offset_low_m = abs(lat - lot)
    offset_high_m = abs(hat - hot)
    offset_low = offset_low_m / tr
    offset_high = offset_high_m / tr

    # Combine into a single dataset
    stats_ds = (
        xr.merge(
            [
                hat.rename("hat"),
                hot.rename("hot"),
                lat.rename("lat"),
                lot.rename("lot"),
                otr.rename("otr"),
                tr.rename("tr"),
                spread.rename("spread"),
                offset_low.rename("offset_low"),
                offset_high.rename("offset_high"),
            ],
            compat="override",
        )
        .drop_vars("quantile")
        .odc.assign_crs(crs=ds.odc.crs)
    )

    # Optionally resample into the original pixel grid of `ds`
    if resample:
        stats_ds = stats_ds.odc.reproject(how=ds.odc.geobox, resample_method="bilinear")

    return stats_ds

tide_stats

tide_stats(
    ds,
    model="EOT20",
    directory=None,
    tidepost_lat=None,
    tidepost_lon=None,
    plain_english=True,
    plot=True,
    plot_col=None,
    modelled_freq="3h",
    linear_reg=False,
    min_max_q=(0.0, 1.0),
    round_stats=3,
    **model_tides_kwargs
)

Takes a multi-dimensional dataset and generate tide statistics and satellite-observed tide bias metrics, calculated based on every timestep in the satellte data and the geographic centroid of the imagery.

By comparing the subset of tides observed by satellites against the full astronomical tidal range, we can evaluate whether the tides observed by satellites are biased (e.g. fail to observe either the highest or lowest tides).

For more information about the tidal statistics computed by this function, refer to Figure 8 in Bishop-Taylor et al. 2018: https://www.sciencedirect.com/science/article/pii/S0272771418308783#fig8

Parameters:

Name Type Description Default

ds

Dataset or DataArray

A multi-dimensional dataset (e.g. "x", "y", "time") used to calculate tide statistics. This dataset must contain a "time" dimension.

required

model

string

The tide model to use to model tides. Defaults to "EOT20"; for a full list of available/supported models, run eo_tides.model.list_models.

'EOT20'

directory

string

The directory containing tide model data files. If no path is provided, this will default to the environment variable EO_TIDES_TIDE_MODELS if set, or raise an error if not. Tide modelling files should be stored in sub-folders for each model that match the structure required by pyTMD (https://geoscienceaustralia.github.io/eo-tides/setup/).

None

tidepost_lat

float or int

Optional coordinates used to model tides. The default is None, which uses the centroid of the dataset as the tide modelling location.

None

tidepost_lon

float or int

Optional coordinates used to model tides. The default is None, which uses the centroid of the dataset as the tide modelling location.

None

plain_english

bool

An optional boolean indicating whether to print a plain english version of the tidal statistics to the screen. Defaults to True.

True

plot

bool

An optional boolean indicating whether to plot how satellite- observed tide heights compare against the full tidal range. Defaults to True.

True

plot_col

str

Optional name of a coordinate, dimension or variable in the array that will be used to plot observations with unique symbols. Defaults to None, which will plot all observations as circles.

None

modelled_freq

str

An optional string giving the frequency at which to model tides when computing the full modelled tidal range. Defaults to '3h', which computes a tide height for every three hours across the temporal extent of ds.

'3h'

linear_reg

bool

Whether to return linear regression statistics that assess whether satellite-observed tides show any decreasing or increasing trends over time. This may indicate whether your satellite data may produce misleading trends based on uneven sampling of the local tide regime.

False

min_max_q

tuple

Quantiles used to calculate max and min observed and modelled astronomical tides. By default (0.0, 1.0) which is equivalent to minimum and maximum; to use a softer threshold that is more robust to outliers, use e.g. (0.1, 0.9).

(0.0, 1.0)

round_stats

int

The number of decimal places used to round the output statistics. Defaults to 3.

3

**model_tides_kwargs

Optional parameters passed to the eo_tides.model.model_tides function. Important parameters include cutoff (used to extrapolate modelled tides away from the coast; defaults to np.inf), crop (whether to crop tide model constituent files on-the-fly to improve performance) etc.

{}

Returns:

Name Type Description
stats_df Series

A pandas.Series containing the following statistics:

  • y: latitude used for modelling tide heights
  • x: longitude used for modelling tide heights
  • mot: mean tide height observed by the satellite (metres)
  • mat: mean modelled astronomical tide height (metres)
  • lot: minimum tide height observed by the satellite (metres)
  • lat: minimum tide height from modelled astronomical tidal range (metres)
  • hot: maximum tide height observed by the satellite (metres)
  • hat: maximum tide height from modelled astronomical tidal range (metres)
  • otr: tidal range observed by the satellite (metres)
  • tr: modelled astronomical tide range (metres)
  • spread: proportion of the full modelled tidal range observed by the satellite
  • offset_low: proportion of the lowest tides never observed by the satellite
  • offset_high: proportion of the highest tides never observed by the satellite

If linear_reg = True, the output will also contain:

  • observed_slope: slope of any relationship between observed tide heights and time
  • observed_pval: significance/p-value of any relationship between observed tide heights and time
Source code in eo_tides/stats.py
def tide_stats(
    ds: xr.Dataset | xr.DataArray,
    model: str = "EOT20",
    directory: str | os.PathLike | None = None,
    tidepost_lat: float | None = None,
    tidepost_lon: float | None = None,
    plain_english: bool = True,
    plot: bool = True,
    plot_col: str | None = None,
    modelled_freq: str = "3h",
    linear_reg: bool = False,
    min_max_q: tuple = (0.0, 1.0),
    round_stats: int = 3,
    **model_tides_kwargs,
) -> pd.Series:
    """
    Takes a multi-dimensional dataset and generate tide statistics
    and satellite-observed tide bias metrics, calculated based on
    every timestep in the satellte data and the geographic centroid
    of the imagery.

    By comparing the subset of tides observed by satellites
    against the full astronomical tidal range, we can evaluate
    whether the tides observed by satellites are biased
    (e.g. fail to observe either the highest or lowest tides).

    For more information about the tidal statistics computed by this
    function, refer to Figure 8 in Bishop-Taylor et al. 2018:
    <https://www.sciencedirect.com/science/article/pii/S0272771418308783#fig8>

    Parameters
    ----------
    ds : xarray.Dataset or xarray.DataArray
        A multi-dimensional dataset (e.g. "x", "y", "time") used
        to calculate tide statistics. This dataset must contain
        a "time" dimension.
    model : string, optional
        The tide model to use to model tides. Defaults to "EOT20";
        for a full list of available/supported models, run
        `eo_tides.model.list_models`.
    directory : string, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    tidepost_lat, tidepost_lon : float or int, optional
        Optional coordinates used to model tides. The default is None,
        which uses the centroid of the dataset as the tide modelling
        location.
    plain_english : bool, optional
        An optional boolean indicating whether to print a plain english
        version of the tidal statistics to the screen. Defaults to True.
    plot : bool, optional
        An optional boolean indicating whether to plot how satellite-
        observed tide heights compare against the full tidal range.
        Defaults to True.
    plot_col : str, optional
        Optional name of a coordinate, dimension or variable in the array
        that will be used to plot observations with unique symbols.
        Defaults to None, which will plot all observations as circles.
    modelled_freq : str, optional
        An optional string giving the frequency at which to model tides
        when computing the full modelled tidal range. Defaults to '3h',
        which computes a tide height for every three hours across the
        temporal extent of `ds`.
    linear_reg: bool, optional
        Whether to return linear regression statistics that assess
        whether satellite-observed tides show any decreasing  or
        increasing trends over time. This may indicate whether your
        satellite data may produce misleading trends based on uneven
        sampling of the local tide regime.
    min_max_q : tuple, optional
        Quantiles used to calculate max and min observed and modelled
        astronomical tides. By default `(0.0, 1.0)` which is equivalent
        to minimum and maximum; to use a softer threshold that is more
        robust to outliers, use e.g. `(0.1, 0.9)`.
    round_stats : int, optional
        The number of decimal places used to round the output statistics.
        Defaults to 3.
    **model_tides_kwargs :
        Optional parameters passed to the `eo_tides.model.model_tides`
        function. Important parameters include `cutoff` (used to
        extrapolate modelled tides away from the coast; defaults to
        `np.inf`), `crop` (whether to crop tide model constituent files
        on-the-fly to improve performance) etc.

    Returns
    -------
    stats_df : pandas.Series
        A `pandas.Series` containing the following statistics:

        - `y`: latitude used for modelling tide heights
        - `x`: longitude used for modelling tide heights
        - `mot`: mean tide height observed by the satellite (metres)
        - `mat`: mean modelled astronomical tide height (metres)
        - `lot`: minimum tide height observed by the satellite (metres)
        - `lat`: minimum tide height from modelled astronomical tidal range (metres)
        - `hot`: maximum tide height observed by the satellite (metres)
        - `hat`: maximum tide height from modelled astronomical tidal range (metres)
        - `otr`: tidal range observed by the satellite (metres)
        - `tr`: modelled astronomical tide range (metres)
        - `spread`: proportion of the full modelled tidal range observed by the satellite
        - `offset_low`: proportion of the lowest tides never observed by the satellite
        - `offset_high`: proportion of the highest tides never observed by the satellite

        If `linear_reg = True`, the output will also contain:

        - `observed_slope`: slope of any relationship between observed tide heights and time
        - `observed_pval`: significance/p-value of any relationship between observed tide heights and time
    """
    # Verify that only one tide model is provided
    if isinstance(model, list):
        raise Exception("Only single tide models are supported by `tide_stats`.")

    # If custom tide modelling locations are not provided, use the
    # dataset centroid
    if not tidepost_lat or not tidepost_lon:
        tidepost_lon, tidepost_lat = ds.odc.geobox.geographic_extent.centroid.coords[0]

    # Model tides for each observation in the supplied xarray object
    obs_tides_da = tag_tides(
        ds,
        model=model,
        directory=directory,
        tidepost_lat=tidepost_lat,  # type: ignore
        tidepost_lon=tidepost_lon,  # type: ignore
        return_tideposts=True,
        **model_tides_kwargs,
    )
    obs_tides_da = obs_tides_da.reindex_like(ds)

    # Generate range of times covering entire period of satellite record
    all_timerange = pd.date_range(
        start=obs_tides_da.time.min().item(),
        end=obs_tides_da.time.max().item(),
        freq=modelled_freq,
    )

    # Model tides for each timestep
    all_tides_df = model_tides(
        x=tidepost_lon,  # type: ignore
        y=tidepost_lat,  # type: ignore
        time=all_timerange,
        model=model,
        directory=directory,
        crs="EPSG:4326",
        **model_tides_kwargs,
    )

    # Get coarse statistics on all and observed tidal ranges
    obs_mean = obs_tides_da.mean().item()
    all_mean = all_tides_df.tide_height.mean()
    obs_min, obs_max = obs_tides_da.quantile(min_max_q).values
    all_min, all_max = all_tides_df.tide_height.quantile(min_max_q).values

    # Calculate tidal range
    obs_range = obs_max - obs_min
    all_range = all_max - all_min

    # Calculate Bishop-Taylor et al. 2018 tidal metrics
    spread = obs_range / all_range
    low_tide_offset_m = abs(all_min - obs_min)
    high_tide_offset_m = abs(all_max - obs_max)
    low_tide_offset = low_tide_offset_m / all_range
    high_tide_offset = high_tide_offset_m / all_range

    # Plain text descriptors
    mean_diff = "higher" if obs_mean > all_mean else "lower"
    mean_diff_icon = "⬆️" if obs_mean > all_mean else "⬇️"
    spread_icon = "🟢" if spread >= 0.9 else "🟡" if 0.7 < spread <= 0.9 else "🔴"
    low_tide_icon = "🟢" if low_tide_offset <= 0.1 else "🟡" if 0.1 <= low_tide_offset < 0.2 else "🔴"
    high_tide_icon = "🟢" if high_tide_offset <= 0.1 else "🟡" if 0.1 <= high_tide_offset < 0.2 else "🔴"

    # Extract x (time in decimal years) and y (distance) values
    obs_x = (
        obs_tides_da.time.dt.year + ((obs_tides_da.time.dt.dayofyear - 1) / 365) + ((obs_tides_da.time.dt.hour) / 24)
    )
    obs_y = obs_tides_da.values.astype(np.float32)

    # Compute linear regression
    obs_linreg = stats.linregress(x=obs_x, y=obs_y)

    if plain_english:
        print(f"\n\n🌊 Modelled astronomical tide range: {all_range:.2f} metres.")
        print(f"🛰️ Observed tide range: {obs_range:.2f} metres.\n")
        print(f"{spread_icon} {spread:.0%} of the modelled astronomical tide range was observed at this location.")
        print(
            f"{high_tide_icon} The highest {high_tide_offset:.0%} ({high_tide_offset_m:.2f} metres) of the tide range was never observed."
        )
        print(
            f"{low_tide_icon} The lowest {low_tide_offset:.0%} ({low_tide_offset_m:.2f} metres) of the tide range was never observed.\n"
        )
        print(f"🌊 Mean modelled astronomical tide height: {all_mean:.2f} metres.")
        print(f"🛰️ Mean observed tide height: {obs_mean:.2f} metres.\n")
        print(
            f"{mean_diff_icon} The mean observed tide height was {obs_mean - all_mean:.2f} metres {mean_diff} than the mean modelled astronomical tide height."
        )

        if linear_reg:
            if obs_linreg.pvalue > 0.01:
                print("➖ Observed tides showed no significant trends over time.")
            else:
                obs_slope_desc = "decreasing" if obs_linreg.slope < 0 else "increasing"
                print(
                    f"⚠️ Observed tides showed a significant {obs_slope_desc} trend over time (p={obs_linreg.pvalue:.3f}, {obs_linreg.slope:.2f} metres per year)"
                )

    if plot:
        _plot_biases(
            all_tides_df=all_tides_df,
            obs_tides_da=obs_tides_da,
            lat=all_min,
            lot=obs_min,
            hat=all_max,
            hot=obs_max,
            offset_low=low_tide_offset,
            offset_high=high_tide_offset,
            spread=spread,
            plot_col=ds[plot_col] if plot_col else None,
            obs_linreg=obs_linreg if linear_reg else None,
            obs_x=obs_x,
            all_timerange=all_timerange,
        )

    # Export pandas.Series containing tidal stats
    output_stats = {
        "y": tidepost_lat,
        "x": tidepost_lon,
        "mot": obs_mean,
        "mat": all_mean,
        "lot": obs_min,
        "lat": all_min,
        "hot": obs_max,
        "hat": all_max,
        "otr": obs_range,
        "tr": all_range,
        "spread": spread,
        "offset_low": low_tide_offset,
        "offset_high": high_tide_offset,
    }

    if linear_reg:
        output_stats.update({
            "observed_slope": obs_linreg.slope,
            "observed_pval": obs_linreg.pvalue,
        })

    # Return pandas data
    stats_df = pd.Series(output_stats).round(round_stats)
    return stats_df

eo_tides.validation

Functions:

Name Description
eval_metrics

Calculate a set of common statistical metrics

load_gauge_gesla

Load and process all available Global Extreme Sea Level Analysis

eval_metrics

eval_metrics(x, y, round=3, all_regress=False)

Calculate a set of common statistical metrics based on two input actual and predicted vectors.

These include:

  • Pearson correlation
  • Root Mean Squared Error
  • Mean Absolute Error
  • R-squared
  • Bias
  • Linear regression parameters (slope, p-value, intercept, standard error)

Parameters:

Name Type Description Default

x

array

An array providing "actual" variable values.

required

y

array

An array providing "predicted" variable values.

required

round

int

Number of decimal places to round each metric to. Defaults to 3.

3

all_regress

bool

Whether to return linear regression p-value, intercept and standard error (in addition to only regression slope). Defaults to False.

False

Returns:

Type Description
Series

A pd.Series containing all calculated metrics.

Source code in eo_tides/validation.py
def eval_metrics(x, y, round=3, all_regress=False):
    """
    Calculate a set of common statistical metrics
    based on two input actual and predicted vectors.

    These include:

    * Pearson correlation
    * Root Mean Squared Error
    * Mean Absolute Error
    * R-squared
    * Bias
    * Linear regression parameters (slope, p-value, intercept, standard error)

    Parameters
    ----------
    x : numpy.array
        An array providing "actual" variable values.
    y : numpy.array
        An array providing "predicted" variable values.
    round : int
        Number of decimal places to round each metric
        to. Defaults to 3.
    all_regress : bool
        Whether to return linear regression p-value,
        intercept and standard error (in addition to
        only regression slope). Defaults to False.

    Returns
    -------
    pandas.Series
        A `pd.Series` containing all calculated metrics.
    """

    # Create dataframe to drop na
    xy_df = pd.DataFrame({"x": x, "y": y}).dropna()

    # Compute linear regression
    lin_reg = stats.linregress(x=xy_df.x, y=xy_df.y)

    # Calculate statistics
    stats_dict = {
        "Correlation": xy_df.corr().iloc[0, 1],
        "RMSE": sqrt(mean_squared_error(xy_df.x, xy_df.y)),
        "MAE": mean_absolute_error(xy_df.x, xy_df.y),
        "R-squared": lin_reg.rvalue**2,
        "Bias": (xy_df.y - xy_df.x).mean(),
        "Regression slope": lin_reg.slope,
    }

    # Additional regression params
    if all_regress:
        stats_dict.update({
            "Regression p-value": lin_reg.pvalue,
            "Regression intercept": lin_reg.intercept,
            "Regression standard error": lin_reg.stderr,
        })

    # Return as
    return pd.Series(stats_dict).round(round)

load_gauge_gesla

load_gauge_gesla(
    x=None,
    y=None,
    site_code=None,
    time=("2018", "2020"),
    max_distance=None,
    correct_mean=False,
    filter_use_flag=True,
    site_metadata=True,
    data_path="/gdata1/data/sea_level/gesla/",
    metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv",
)

Load and process all available Global Extreme Sea Level Analysis (GESLA) tide gauge data with an x, y, time spatiotemporal query, or from a list of specific tide gauges.

Can optionally filter by gauge quality and append detailed gauge metadata.

Modified from original code in https://github.com/philiprt/GeslaDataset.

Parameters:

Name Type Description Default

x

numeric or list / tuple

Coordinates (in degrees longitude, latitude) used to load GESLA tide gauge observations. If provided as singular values (e.g. x=150, y=-32), then the nearest tide gauge will be returned. If provided as a list or tuple (e.g. x=(150, 152), y=(-32, -30)), then all gauges within the provided bounding box will be loaded. Leave as None to return all available gauges, or if providing a list of site codes using site_code.

None

y

numeric or list / tuple

Coordinates (in degrees longitude, latitude) used to load GESLA tide gauge observations. If provided as singular values (e.g. x=150, y=-32), then the nearest tide gauge will be returned. If provided as a list or tuple (e.g. x=(150, 152), y=(-32, -30)), then all gauges within the provided bounding box will be loaded. Leave as None to return all available gauges, or if providing a list of site codes using site_code.

None

site_code

str or list of str

GESLA site code(s) for which to load data (e.g. site_code="62650"). If site_code is provided, x and y will be ignored.

None

time

tuple or list of str

Time range to consider, given as a tuple of start and end dates, e.g. time=("2020", "2021"). The default of None will return all tide observations from the year 1800 onward.

('2018', '2020')

max_distance

numeric

Optional max distance within which to return the nearest tide gauge when x and y are provided as singular coordinates. Defaults to None, which will always return a tide gauge no matter how far away it is located from x and y.

None

correct_mean

bool

Whether to correct sea level measurements to a standardised mean sea level by subtracting the mean of all observed sea level observations. This can be useful when GESLA tide heights come from different or unknown tide datums. Note: the observed mean sea level calculated here may differ from true long-term/ astronomical Mean Sea Level (MSL) datum.

False

filter_use_flag

bool

Whether to filter out low quality observations with a "use_flag" value of 0 (do not use). Defaults to True.

True

site_metadata

bool

Whether to add tide gauge station metadata as additional columns in the output DataFrame. Defaults to True.

True

data_path

str

Path to the raw GESLA data files. Default is /gdata1/data/sea_level/gesla/.

'/gdata1/data/sea_level/gesla/'

metadata_path

str

Path to the GESLA station metadata file. Default is /gdata1/data/sea_level/GESLA3_ALL 2.csv.

'/gdata1/data/sea_level/GESLA3_ALL 2.csv'

Returns:

Type Description
DataFrame

Processed GESLA data as a DataFrame with columns including:

  • "time": Timestamps,
  • "sea_level": Observed sea level (m),
  • "qc_flag": Observed sea level QC flag,
  • "use_flag": Use-in-analysis flag (1 = use, 0 = do not use),

...and additional columns from station metadata.

Source code in eo_tides/validation.py
def load_gauge_gesla(
    x=None,
    y=None,
    site_code=None,
    time=("2018", "2020"),
    max_distance=None,
    correct_mean=False,
    filter_use_flag=True,
    site_metadata=True,
    data_path="/gdata1/data/sea_level/gesla/",
    metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv",
):
    """
    Load and process all available Global Extreme Sea Level Analysis
    (GESLA) tide gauge data with an `x, y, time` spatiotemporal query,
    or from a list of specific tide gauges.

    Can optionally filter by gauge quality and append detailed gauge metadata.

    Modified from original code in <https://github.com/philiprt/GeslaDataset>.

    Parameters
    ----------
    x, y : numeric or list/tuple, optional
        Coordinates (in degrees longitude, latitude) used to load GESLA
        tide gauge observations. If provided as singular values
        (e.g. `x=150, y=-32`), then the nearest tide gauge will be returned.
        If provided as a list or tuple (e.g. `x=(150, 152), y=(-32, -30)`),
        then all gauges within the provided bounding box will be loaded.
        Leave as `None` to return all available gauges, or if providing a
        list of site codes using `site_code`.
    site_code : str or list of str, optional
        GESLA site code(s) for which to load data (e.g. `site_code="62650"`).
        If `site_code` is provided, `x` and `y` will be ignored.
    time : tuple or list of str, optional
        Time range to consider, given as a tuple of start and end dates,
        e.g. `time=("2020", "2021")`. The default of None will return all
        tide observations from the year 1800 onward.
    max_distance : numeric, optional
        Optional max distance within which to return the nearest tide gauge
        when `x` and `y` are provided as singular coordinates. Defaults to
        None, which will always return a tide gauge no matter how far away
        it is located from `x` and `y`.
    correct_mean : bool, optional
        Whether to correct sea level measurements to a standardised mean
        sea level by subtracting the mean of all observed sea level
        observations. This can be useful when GESLA tide heights come
        from different or unknown tide datums. Note: the observed mean
        sea level calculated here may differ from true long-term/
        astronomical Mean Sea Level (MSL) datum.
    filter_use_flag : bool, optional
        Whether to filter out low quality observations with a "use_flag"
        value of 0 (do not use). Defaults to True.
    site_metadata : bool, optional
        Whether to add tide gauge station metadata as additional columns
        in the output DataFrame. Defaults to True.
    data_path : str, optional
        Path to the raw GESLA data files. Default is
        `/gdata1/data/sea_level/gesla/`.
    metadata_path : str, optional
        Path to the GESLA station metadata file.
        Default is `/gdata1/data/sea_level/GESLA3_ALL 2.csv`.

    Returns
    -------
    pd.DataFrame
        Processed GESLA data as a DataFrame with columns including:

        - "time": Timestamps,
        - "sea_level": Observed sea level (m),
        - "qc_flag": Observed sea level QC flag,
        - "use_flag": Use-in-analysis flag (1 = use, 0 = do not use),

        ...and additional columns from station metadata.
    """
    # Load tide gauge metadata
    metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path)

    # Use supplied site codes if available
    if site_code is not None:
        site_code = [site_code] if not isinstance(site_code, list) else site_code

    # If x and y are tuples, use xy bounds to identify sites
    elif isinstance(x, (tuple, list)) & isinstance(y, (tuple, list)):
        bbox = BoundingBox.from_xy(x, y)
        site_code = metadata_gdf.cx[bbox.left : bbox.right, bbox.top : bbox.bottom].index

    # If x and y are single numbers, select nearest row
    elif isinstance(x, Number) & isinstance(y, Number):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            site_code = (
                _nearest_row(metadata_gdf, x, y, max_distance).rename({"index_right": "site_code"}, axis=1).site_code
            )
            # site_code = _nearest_row(metadata_gdf, x, y, max_distance).site_code

        # Raise exception if no valid tide gauges are found
        if site_code.isnull().all():
            raise Exception(f"No tide gauge found within {max_distance} degrees of {x}, {y}.")

    # Otherwise if all are None, return all available site codes
    elif (site_code is None) & (x is None) & (y is None):
        site_code = metadata_df.index.to_list()

    else:
        raise TypeError(
            "`x` and `y` must be provided as either singular coordinates (e.g. `x=150`), or as a tuple bounding box (e.g. `x=(150, 152)`)."
        )

    # Prepare times
    if time is None:
        time = ["1800", str(datetime.datetime.now().year)]
    time = [time] if not isinstance(time, (list, tuple)) else time
    start_time = _round_date_strings(time[0], round_type="start")
    end_time = _round_date_strings(time[-1], round_type="end")

    # Identify paths to load and nodata values for each site
    metadata_df["file_name"] = data_path + metadata_df["file_name"]
    paths_na = metadata_df.loc[site_code, ["file_name", "null_value"]]

    # Load and combine into a single dataframe
    data_df = (
        pd.concat([_load_gesla_dataset(s, p, na_value=na) for s, p, na in paths_na.itertuples()])
        .sort_index()
        .loc[slice(start_time, end_time)]
        .reset_index()
        .set_index("site_code")
    )

    # Optionally filter by use flag column
    if filter_use_flag:
        data_df = data_df.loc[data_df.use_flag == 1]

    # Optionally insert metadata into dataframe
    if site_metadata:
        data_df[metadata_df.columns] = metadata_df.loc[site_code]

    # Add time to index and remove duplicates
    data_df = data_df.set_index("time", append=True)
    duplicates = data_df.index.duplicated()
    if duplicates.sum() > 0:
        warnings.warn("Duplicate timestamps were removed.")
        data_df = data_df.loc[~duplicates]

    # Remove observed mean sea level if requested
    if correct_mean:
        data_df["sea_level"] = data_df["sea_level"].sub(data_df.groupby("site_code")["sea_level"].transform("mean"))

    # Return data
    return data_df

eo_tides.utils

Functions:

Name Description
idw

Perform Inverse Distance Weighting (IDW) interpolation.

idw

idw(
    input_z,
    input_x,
    input_y,
    output_x,
    output_y,
    p=1,
    k=10,
    max_dist=None,
    k_min=1,
    epsilon=1e-12,
)

Perform Inverse Distance Weighting (IDW) interpolation.

This function performs fast IDW interpolation by creating a KDTree from the input coordinates then uses it to find the k nearest neighbors for each output point. Weights are calculated based on the inverse distance to each neighbor, with weights descreasing with increasing distance.

Code inspired by: https://github.com/DahnJ/REM-xarray

Parameters:

Name Type Description Default

input_z

array - like

Array of values at the input points. This can be either a 1-dimensional array, or a 2-dimensional array where each column (axis=1) represents a different set of values to be interpolated.

required

input_x

array - like

Array of x-coordinates of the input points.

required

input_y

array - like

Array of y-coordinates of the input points.

required

output_x

array - like

Array of x-coordinates where the interpolation is to be computed.

required

output_y

array - like

Array of y-coordinates where the interpolation is to be computed.

required

p

int or float

Power function parameter defining how rapidly weightings should decrease as distance increases. Higher values of p will cause weights for distant points to decrease rapidly, resulting in nearby points having more influence on predictions. Defaults to 1.

1

k

int

Number of nearest neighbors to use for interpolation. k=1 is equivalent to "nearest" neighbour interpolation. Defaults to 10.

10

max_dist

int or float

Restrict neighbouring points to less than this distance. By default, no distance limit is applied.

None

k_min

int

If max_dist is provided, some points may end up with less than k nearest neighbours, potentially producing less reliable interpolations. Set k_min to set any points with less than k_min neighbours to NaN. Defaults to 1.

1

epsilon

float

Small value added to distances to prevent division by zero errors in the case that output coordinates are identical to input coordinates. Defaults to 1e-12.

1e-12

Returns:

Name Type Description
interp_values ndarray

Interpolated values at the output coordinates. If input_z is 1-dimensional, interp_values will also be 1-dimensional. If input_z is 2-dimensional, interp_values will have the same number of rows as input_z, with each column (axis=1) representing interpolated values for one set of input data.

Examples:

>>> input_z = [1, 2, 3, 4, 5]
>>> input_x = [0, 1, 2, 3, 4]
>>> input_y = [0, 1, 2, 3, 4]
>>> output_x = [0.5, 1.5, 2.5]
>>> output_y = [0.5, 1.5, 2.5]
>>> idw(input_z, input_x, input_y, output_x, output_y, k=2)
array([1.5, 2.5, 3.5])
Source code in eo_tides/utils.py
def idw(
    input_z,
    input_x,
    input_y,
    output_x,
    output_y,
    p=1,
    k=10,
    max_dist=None,
    k_min=1,
    epsilon=1e-12,
):
    """Perform Inverse Distance Weighting (IDW) interpolation.

    This function performs fast IDW interpolation by creating a KDTree
    from the input coordinates then uses it to find the `k` nearest
    neighbors for each output point. Weights are calculated based on the
    inverse distance to each neighbor, with weights descreasing with
    increasing distance.

    Code inspired by: https://github.com/DahnJ/REM-xarray

    Parameters
    ----------
    input_z : array-like
        Array of values at the input points. This can be either a
        1-dimensional array, or a 2-dimensional array where each column
        (axis=1) represents a different set of values to be interpolated.
    input_x : array-like
        Array of x-coordinates of the input points.
    input_y : array-like
        Array of y-coordinates of the input points.
    output_x : array-like
        Array of x-coordinates where the interpolation is to be computed.
    output_y : array-like
        Array of y-coordinates where the interpolation is to be computed.
    p : int or float, optional
        Power function parameter defining how rapidly weightings should
        decrease as distance increases. Higher values of `p` will cause
        weights for distant points to decrease rapidly, resulting in
        nearby points having more influence on predictions. Defaults to 1.
    k : int, optional
        Number of nearest neighbors to use for interpolation. `k=1` is
        equivalent to "nearest" neighbour interpolation. Defaults to 10.
    max_dist : int or float, optional
        Restrict neighbouring points to less than this distance.
        By default, no distance limit is applied.
    k_min : int, optional
        If `max_dist` is provided, some points may end up with less than
        `k` nearest neighbours, potentially producing less reliable
        interpolations. Set `k_min` to set any points with less than
        `k_min` neighbours to NaN. Defaults to 1.
    epsilon : float, optional
        Small value added to distances to prevent division by zero
        errors in the case that output coordinates are identical to
        input coordinates. Defaults to 1e-12.

    Returns
    -------
    interp_values : numpy.ndarray
        Interpolated values at the output coordinates. If `input_z` is
        1-dimensional, `interp_values` will also be 1-dimensional. If
        `input_z` is 2-dimensional, `interp_values` will have the same
        number of rows as `input_z`, with each column (axis=1)
        representing interpolated values for one set of input data.

    Examples
    --------
    >>> input_z = [1, 2, 3, 4, 5]
    >>> input_x = [0, 1, 2, 3, 4]
    >>> input_y = [0, 1, 2, 3, 4]
    >>> output_x = [0.5, 1.5, 2.5]
    >>> output_y = [0.5, 1.5, 2.5]
    >>> idw(input_z, input_x, input_y, output_x, output_y, k=2)
    array([1.5, 2.5, 3.5])

    """
    # Convert to numpy arrays
    input_x = np.atleast_1d(input_x)
    input_y = np.atleast_1d(input_y)
    input_z = np.atleast_1d(input_z)
    output_x = np.atleast_1d(output_x)
    output_y = np.atleast_1d(output_y)

    # Verify input and outputs have matching lengths
    if not (input_z.shape[0] == len(input_x) == len(input_y)):
        raise ValueError("All of `input_z`, `input_x` and `input_y` must be the same length.")
    if not (len(output_x) == len(output_y)):
        raise ValueError("Both `output_x` and `output_y` must be the same length.")

    # Verify k is smaller than total number of points, and non-zero
    if k > input_z.shape[0]:
        raise ValueError(
            f"The requested number of nearest neighbours (`k={k}`) "
            f"is smaller than the total number of points ({input_z.shape[0]}).",
        )
    if k == 0:
        raise ValueError("Interpolation based on `k=0` nearest neighbours is not valid.")

    # Create KDTree to efficiently find nearest neighbours
    points_xy = np.column_stack((input_y, input_x))
    tree = KDTree(points_xy)

    # Determine nearest neighbours and distances to each
    grid_stacked = np.column_stack((output_y, output_x))
    distances, indices = tree.query(grid_stacked, k=k, workers=-1)

    # If k == 1, add an additional axis for consistency
    if k == 1:
        distances = distances[..., np.newaxis]
        indices = indices[..., np.newaxis]

    # Add small epsilon to distances to prevent division by zero errors
    # if output coordinates are the same as input coordinates
    distances = np.maximum(distances, epsilon)

    # Set distances above max to NaN if specified
    if max_dist is not None:
        distances[distances > max_dist] = np.nan

    # Calculate weights based on distance to k nearest neighbours.
    weights = 1 / np.power(distances, p)
    weights = weights / np.nansum(weights, axis=1).reshape(-1, 1)

    # 1D case: Compute weighted sum of input_z values for each output point
    if input_z.ndim == 1:
        interp_values = np.nansum(weights * input_z[indices], axis=1)

    # 2D case: Compute weighted sum for each set of input_z values
    # weights[..., np.newaxis] adds a dimension for broadcasting
    else:
        interp_values = np.nansum(
            weights[..., np.newaxis] * input_z[indices],
            axis=1,
        )

    # Set any points with less than `k_min` valid weights to NaN
    interp_values[np.isfinite(weights).sum(axis=1) < k_min] = np.nan

    return interp_values