Source code for huracanpy.assess._match

"""Matching functions"""

import pandas as pd
import numpy as np
from haversine import haversine_vector, Unit
from itertools import combinations


def match_pair(
    tracks1,
    tracks2,
    name1="1",
    name2="2",
    max_dist=300,
    min_overlap=0,
):
    """
    Match the tracks of two track sets point by point.

    Points of both sets are paired when they share the same ``time`` value
    and lie at most ``max_dist`` kilometres apart (haversine distance).
    Every pair of tracks having at least ``min_overlap`` such paired points
    is reported.

    Parameters
    ----------
    tracks1
        The first track set to match. It is indexed with the list
        ``["track_id", "lon", "lat", "time"]`` and the selection is converted
        with ``.to_dataframe()``, so it must support both operations.
    tracks2
        The second track set to match, with the same requirements as
        ``tracks1``.
    name1 : str
        Suffix of the id column for the first set (``"id_" + name1``).
    name2 : str
        Suffix of the id column for the second set (``"id_" + name2``).
    max_dist : float
        Maximum distance, in kilometres, for two simultaneous points to be
        considered as corresponding.
    min_overlap : int
        Minimum number of corresponding points a pair of tracks needs in
        order to be kept.

    Returns
    -------
    pd.DataFrame
        One row per matching pair of tracks, with the columns
        ``id_<name1>``, ``id_<name2>``, ``temp`` (number of corresponding
        points) and ``dist`` (mean distance in kilometres over those points).
    """
    variables = ["track_id", "lon", "lat", "time"]
    pair_keys = ["track_id_x", "track_id_y"]

    # Reduce both inputs to flat tables holding only the needed variables
    left = tracks1[variables].to_dataframe()
    right = tracks2[variables].to_dataframe()

    # Longitudes above 180 are shifted down by 360 so both sets are comparable
    for table in (left, right):
        table["lon"] = np.where(table.lon > 180, table.lon - 360, table.lon)

    # Pair up every point of one set with the simultaneous points of the other
    paired = pd.merge(left, right, on="time")
    coords_left = np.vstack((paired.lat_x, paired.lon_x)).T
    coords_right = np.vstack((paired.lat_y, paired.lon_y)).T
    paired["dist"] = haversine_vector(
        coords_left, coords_right, unit=Unit.KILOMETERS
    )

    # Only points closer than the threshold count as corresponding
    close = paired[paired.dist <= max_dist]

    # Per pair of tracks: number of corresponding points and their mean distance
    by_pair = close.groupby(pair_keys)
    overlap = by_pair[["dist"]].count().rename(columns={"dist": "temp"})
    mean_dist = by_pair[["dist"]].mean()

    # One row per pair of tracks sharing at least one corresponding point
    matches = close[pair_keys].drop_duplicates().join(overlap, on=pair_keys)
    matches = matches[matches.temp >= min_overlap]
    matches = matches.merge(mean_dist, on=pair_keys)

    # Label the id columns after the two sets before returning
    return matches.rename(
        columns={"track_id_x": "id_" + name1, "track_id_y": "id_" + name2}
    )
def match_multiple(
    datasets,
    names,
    max_dist=300,
    min_overlap=0,
):
    """
    Match the tracks of any number of track sets.

    Every combination of two sets is matched with ``match_pair`` and the
    resulting id pairs are accumulated into a single table through
    successive outer merges.

    Parameters
    ----------
    datasets : list of xr.Dataset
        The track sets to be matched.
    names : list of str
        Labels of the track sets, used to build the ``id_<name>`` columns.
        Must have the same length as ``datasets``.
    max_dist : float
        Threshold for the maximum distance between two tracks, forwarded to
        ``match_pair``.
    min_overlap : int
        Minimum number of overlapping time steps for matching, forwarded to
        ``match_pair``.

    Raises
    ------
    AssertionError
        If ``datasets`` and ``names`` do not have the same length.
    NotImplementedError
        If two of the datasets have no match at all.

    Returns
    -------
    pd.DataFrame
        Table of the matching tracks among all the datasets, one
        ``id_<name>`` column per set involved.
    """
    assert len(datasets) == len(names), "datasets and names must have the same length."

    # Start from an empty table holding the id columns of the first pair
    first_columns = ["id_" + label for label in names[:2]]
    combined = pd.DataFrame(columns=first_columns)

    # combinations() yields pairs in the same order for names and datasets
    name_pairs = combinations(names, 2)
    data_pairs = combinations(datasets, 2)
    for names_pair, dataset_pair in zip(name_pairs, data_pairs):
        first_set, second_set = dataset_pair
        first_name, second_name = names_pair
        pair_matches = match_pair(
            first_set,
            second_set,
            name1=first_name,
            name2=second_name,
            max_dist=max_dist,
            min_overlap=min_overlap,
        )

        if not len(pair_matches):
            raise NotImplementedError(
                "For the moment, the case where two datasets have no match is not handled. Problem raised by datasets "  # TODO
                + str(names_pair)
            )

        # Only the id columns take part in the accumulated table
        id_columns = ["id_" + first_name, "id_" + second_name]
        combined = combined.merge(pair_matches[id_columns], how="outer")

    return combined
# TODO: Deal with duplicates: merge, max...?