Source code for huracanpy.assess._match

"""Matching functions"""

import pandas as pd
import numpy as np
from haversine import haversine_vector, Unit
from itertools import combinations



[docs]
def match_pair(
    tracks1,
    tracks2,
    name1="1",
    name2="2",
    max_dist=300,
    min_overlap=0,
):
    """
    Match the tracks of two track sets that are close in space and time.

    Points of both sets are paired when they carry exactly the same ``time``
    value and lie at most ``max_dist`` km apart (haversine distance). Two
    tracks are reported as a match when they have at least ``min_overlap``
    such point pairs.

    Parameters
    ----------
    tracks1 : dataset-like
        The first track set to match. It must support ``tracks1[[...]]``
        selection followed by ``.to_dataframe()`` (e.g. an ``xarray.Dataset``)
        and provide ``track_id``, ``lon``, ``lat`` and ``time``.
    tracks2 : dataset-like
        The second track set to match, with the same requirements.
    name1 : str
        Suffix used to label the id column of the first set (``id_<name1>``).
    name2 : str
        Suffix used to label the id column of the second set (``id_<name2>``).
    max_dist : float
        Maximum distance, in km, between two simultaneous points for them to
        count as corresponding.
    min_overlap : int
        Minimum number of corresponding time steps for two tracks to match.

    Returns
    -------
    pd.DataFrame
        One row per pair of matching tracks, with the columns

        * ``id_<name1>``, ``id_<name2>``: the track ids in both sets,
        * ``temp``: the number of corresponding time steps,
        * ``dist``: the mean distance (km) over those time steps.

        The table is empty when no pair of tracks matches.
    """

    # Prepare data: keep only the needed variables, as plain dataframes
    tracks1, tracks2 = (
        tracks1[["track_id", "lon", "lat", "time"]].to_dataframe(),
        tracks2[["track_id", "lon", "lat", "time"]].to_dataframe(),
    )
    # Shift longitudes above 180 by -360 so both sets use the same convention
    tracks1["lon"] = np.where(tracks1.lon > 180, tracks1.lon - 360, tracks1.lon)
    tracks2["lon"] = np.where(tracks2.lon > 180, tracks2.lon - 360, tracks2.lon)

    pair_keys = ["track_id_x", "track_id_y"]
    output_names = {"track_id_x": "id_" + name1, "track_id_y": "id_" + name2}

    # Find corresponding points (same time step, less than max_dist km)
    merged = pd.merge(tracks1, tracks2, on="time")
    if merged.empty:
        # No shared time step at all: return an empty table with the usual
        # layout instead of passing zero-length arrays to haversine_vector.
        # NOTE(review): haversine_vector's coordinate range check is believed
        # to fail on empty input in recent versions -- confirm.
        return pd.DataFrame(
            columns=[
                output_names["track_id_x"],
                output_names["track_id_y"],
                "temp",
                "dist",
            ]
        )
    X = np.concatenate([[merged.lat_x], [merged.lon_x]]).T
    Y = np.concatenate([[merged.lat_y], [merged.lon_y]]).T
    merged["dist"] = haversine_vector(X, Y, unit=Unit.KILOMETERS)
    merged = merged[merged.dist <= max_dist]

    # Compute temporal overlap: number of corresponding points per track pair
    temp = (
        merged.groupby(pair_keys)[["dist"]]
        .count()
        .rename(columns={"dist": "temp"})
    )
    # Build a table of all pairs of tracks sharing at least one point
    matches = merged[pair_keys].drop_duplicates().join(temp, on=pair_keys)
    matches = matches[matches.temp >= min_overlap]
    # Mean distance is taken over the corresponding points only, since
    # ``merged`` was already restricted to points within max_dist.
    dist = merged.groupby(pair_keys)[["dist"]].mean()
    matches = matches.merge(dist, on=pair_keys)
    # Rename columns before output
    return matches.rename(columns=output_names)




[docs]
def match_multiple(
    datasets,
    names,
    max_dist=300,
    min_overlap=0,
):
    """
    Function to match any number of tracks sets

    Every pair of datasets is matched with ``match_pair`` and the pairwise
    tables of ids are combined, one after the other, through outer merges on
    the ``id_<name>`` columns they share.

    Parameters
    ----------
    datasets : list of xr.Dataset
        list of the sets to be matched.
    names : list of str
        labels for the datasets. names must have the same length as datasets
    max_dist : float
        Threshold for maximum distance between two tracks
    min_overlap : int
        Minimum number of overlapping time steps for matching

    Raises
    ------
    AssertionError
        If datasets and names do not have the same length.
    NotImplementedError
        If two datasets have no match.

    Returns
    -------
    M : pd.DataFrame
        table of matching tracks among all the datasets, with one
        ``id_<name>`` column per dataset that took part in a pair

    """

    # Raised explicitly rather than through an ``assert`` statement so the
    # check is still performed when Python runs with optimizations (-O).
    if len(datasets) != len(names):
        raise AssertionError("datasets and names must have the same length.")

    # Start from an empty table so the first outer merge simply adopts the
    # rows of the first pair.
    M = pd.DataFrame(columns=["id_" + n for n in names[:2]])
    # Both ``combinations`` calls enumerate pairs in the same order, so each
    # pair of labels lines up with its pair of datasets.
    for names_pair, dataset_pair in zip(
        combinations(names, 2), combinations(datasets, 2)
    ):
        m = match_pair(*dataset_pair, *names_pair, max_dist, min_overlap)
        if m.empty:
            raise NotImplementedError(
                "For the moment, the case where two datasets have no match is not handled. Problem raised by datasets "  # TODO
                + str(names_pair)
            )
        id_columns = ["id_" + names_pair[0], "id_" + names_pair[1]]
        M = M.merge(m[id_columns], how="outer")
    return M



# TODO: Decide how to deal with duplicate matches (merge them, keep the maximum, ...?)