From 28cad45b9e08c76cfffa226ddd98257dd4844553 Mon Sep 17 00:00:00 2001
From: Nuc94
Date: Sun, 5 Feb 2023 19:16:50 +0100
Subject: [PATCH 1/5] cython code for frechet dist - simple build commands

---
 .../cy_similaritymeasures/benchmark.py | 24 ++++++
 .../build_cython_extension.sh          |  1 +
 .../cy_similaritymeasures.pyx          | 79 +++++++++++++++++++
 .../cy_similaritymeasures/readme.md    |  3 +
 .../cy_similaritymeasures/setup.py     | 17 ++++
 5 files changed, 124 insertions(+)
 create mode 100644 similaritymeasures/cy_similaritymeasures/benchmark.py
 create mode 100755 similaritymeasures/cy_similaritymeasures/build_cython_extension.sh
 create mode 100644 similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
 create mode 100644 similaritymeasures/cy_similaritymeasures/readme.md
 create mode 100644 similaritymeasures/cy_similaritymeasures/setup.py

diff --git a/similaritymeasures/cy_similaritymeasures/benchmark.py b/similaritymeasures/cy_similaritymeasures/benchmark.py
new file mode 100644
index 0000000..ce4772d
--- /dev/null
+++ b/similaritymeasures/cy_similaritymeasures/benchmark.py
@@ -0,0 +1,24 @@
+import numpy as np
+from similaritymeasures import frechet_dist
+from cy_similaritymeasures import frechet_dist as cy_frechet_dist
+
+#generating data in a way similar to the tests
+x1 = np.linspace(0.0, 1.0, 100)
+y1 = np.ones(100)*2
+x2 = np.linspace(0.0, 1.0, 50)
+y2 = np.ones(50)
+
+np.random.seed(1212121)
+curve_a_rand = np.random.random((100, 2))
+curve_b_rand = np.random.random((90, 2))
+
+curve1 = np.array((x1, y1)).T
+curve2 = np.array((x2, y2)).T
+
+res = frechet_dist(curve1, curve2)
+
+cy_res = cy_frechet_dist(curve1, curve2)
+
+print(res)
+print(cy_res)
+print(res == cy_res)
\ No newline at end of file
diff --git a/similaritymeasures/cy_similaritymeasures/build_cython_extension.sh b/similaritymeasures/cy_similaritymeasures/build_cython_extension.sh
new file mode 100755
index 0000000..97c5182
--- /dev/null
+++ b/similaritymeasures/cy_similaritymeasures/build_cython_extension.sh
@@ -0,0 +1 @@
+python3 setup.py build_ext --inplace
\ No newline at end of file
diff --git a/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx b/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
new file mode 100644
index 0000000..08ab7af
--- /dev/null
+++ b/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
@@ -0,0 +1,79 @@
+import numpy as np
+from scipy.spatial import distance
+
+def frechet_dist(exp_data, num_data, p=2):
+    r"""
+    Compute the discrete Frechet distance
+
+    Compute the Discrete Frechet Distance between two N-D curves according to
+    [1]_. The Frechet distance is often described as the walking dog problem.
+    From Wikipedia: "In mathematics, the Frechet distance is a measure of
+    similarity between curves that takes into account the location and
+    ordering of the points along the curves. It is named after Maurice Frechet."
+    https://en.wikipedia.org/wiki/Fr%C3%A9chet_distance
+
+    Parameters
+    ----------
+    exp_data : array_like
+        Curve from your experimental data. exp_data is of (M, N) shape, where
+        M is the number of data points, and N is the number of dimensions
+    num_data : array_like
+        Curve from your numerical data. num_data is of (P, N) shape, where P
+        is the number of data points, and N is the number of dimensions
+    p : float, 1 <= p <= infinity
+        Which Minkowski p-norm to use. Default is p=2 (Euclidean).
+        The Manhattan distance is p=1.
+
+    Returns
+    -------
+    df : float
+        discrete Frechet distance
+
+    References
+    ----------
+    .. [1] Thomas Eiter and Heikki Mannila. Computing discrete Frechet
+       distance. Technical report, 1994.
+       http://www.kr.tuwien.ac.at/staff/eiter/et-archive/cdtr9464.pdf
+       http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.937&rep=rep1&type=pdf
+
+    Notes
+    -----
+    Your x locations of data points should be exp_data[:, 0], and the y
+    locations of the data points should be exp_data[:, 1]. Same for num_data.
+
+    Thanks to Arbel Amir for the issue, and Sen ZHANG for the iterative code
+    https://github.com/cjekel/similarity_measures/issues/6
+
+    Examples
+    --------
+    >>> # Generate random experimental data
+    >>> x = np.random.random(100)
+    >>> y = np.random.random(100)
+    >>> exp_data = np.zeros((100, 2))
+    >>> exp_data[:, 0] = x
+    >>> exp_data[:, 1] = y
+    >>> # Generate random numerical data
+    >>> x = np.random.random(100)
+    >>> y = np.random.random(100)
+    >>> num_data = np.zeros((100, 2))
+    >>> num_data[:, 0] = x
+    >>> num_data[:, 1] = y
+    >>> df = frechet_dist(exp_data, num_data)
+
+    """
+    cdef int i, j
+    cdef int n = len(exp_data)
+    cdef int m = len(num_data)
+    c = distance.cdist(exp_data, num_data, metric='minkowski', p=p)
+    ca = np.ones((n, m))
+    ca = np.multiply(ca, -1)
+    ca[0, 0] = c[0, 0]
+    for i in range(1, n):
+        ca[i, 0] = max(ca[i-1, 0], c[i, 0])
+    for j in range(1, m):
+        ca[0, j] = max(ca[0, j-1], c[0, j])
+    for i in range(1, n):
+        for j in range(1, m):
+            ca[i, j] = max(min(ca[i-1, j], ca[i, j-1], ca[i-1, j-1]),
+                          c[i, j])
+    return ca[n-1, m-1]
\ No newline at end of file
diff --git a/similaritymeasures/cy_similaritymeasures/readme.md b/similaritymeasures/cy_similaritymeasures/readme.md
new file mode 100644
index 0000000..4e89972
--- /dev/null
+++ b/similaritymeasures/cy_similaritymeasures/readme.md
@@ -0,0 +1,3 @@
+# cy_similaritymeasures
+
+this folder shall hold stuff to cythonize the similaritymeasures functions
\ No newline at end of file
diff --git a/similaritymeasures/cy_similaritymeasures/setup.py b/similaritymeasures/cy_similaritymeasures/setup.py
new file mode 100644
index 0000000..5a4edc1
--- /dev/null
+++ b/similaritymeasures/cy_similaritymeasures/setup.py
@@ -0,0 +1,17 @@
+'''
+knowing this may be a bad thing, but i will place here a
+quick setup.py just to build the cython extension in place
+'''
+
+from setuptools import setup, Extension
+from Cython.Build import cythonize
+
+setup(
+    name = "cy_frechet",
+    ext_modules = cythonize(Extension(
+        "cy_similaritymeasures",
+        sources=["cy_similaritymeasures.pyx"],
+        language="c++"
+    )
+))
+
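A small worked example may help to see what the recurrence in frechet_dist computes. This is a sketch using the plain similaritymeasures implementation that the cython code above mirrors; the tiny curves and the expected values in the comments are illustrative and not part of the patch:

    import numpy as np
    from similaritymeasures import frechet_dist

    # two short 1-D curves, stored as (M, 1) and (P, 1) arrays
    exp_data = np.array([[0.0], [1.0], [2.0]])
    num_data = np.array([[0.0], [2.0]])

    # pairwise distances c = [[0, 2], [1, 1], [2, 0]]
    # ca[0, 0] = 0
    # first column running max: ca[1, 0] = 1, ca[2, 0] = 2
    # first row running max:    ca[0, 1] = 2
    # ca[1, 1] = max(min(2, 1, 0), 1) = 1
    # ca[2, 1] = max(min(1, 2, 1), 0) = 1
    print(frechet_dist(exp_data, num_data))  # expected: 1.0
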
From 220b6081afe8e0eff2e1410ccdc572c2924f97d3 Mon Sep 17 00:00:00 2001
From: Nuc94
Date: Mon, 6 Feb 2023 19:59:04 +0100
Subject: [PATCH 2/5] benchmark now using timeit

---
 .../cy_similaritymeasures/benchmark.py | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/similaritymeasures/cy_similaritymeasures/benchmark.py b/similaritymeasures/cy_similaritymeasures/benchmark.py
index ce4772d..506cabe 100644
--- a/similaritymeasures/cy_similaritymeasures/benchmark.py
+++ b/similaritymeasures/cy_similaritymeasures/benchmark.py
@@ -1,3 +1,9 @@
+'''
+just a quick benchmarking script to showcase cythonized vs
+non cythonized performance
+'''
+
+import timeit
 import numpy as np
 from similaritymeasures import frechet_dist
 from cy_similaritymeasures import frechet_dist as cy_frechet_dist
@@ -21,4 +27,22 @@
 
 print(res)
 print(cy_res)
-print(res == cy_res)
\ No newline at end of file
+print(res == cy_res)
+
+def run_frechet():
+    return frechet_dist(curve1, curve2)
+
+def run_cy_frechet():
+    return cy_frechet_dist(curve1, curve2)
+
+n_repeats = 5
+n_runs = 200
+
+t1 = timeit.repeat(run_frechet, repeat = n_repeats, number = n_runs)
+
+t2 = timeit.repeat(run_cy_frechet, repeat = n_repeats, number = n_runs)
+print(t1)
+print(t2)
+
+print('average execution time for non cythonized version: %f' % (sum(t1) / len(t1)))
+print('average execution time for cythonized version: %f' % (sum(t2) / len(t2)))
\ No newline at end of file

From 7ac2b60b15766a06420c4297fd07973561a7fda9 Mon Sep 17 00:00:00 2001
From: Nuc94
Date: Mon, 6 Feb 2023 20:15:54 +0100
Subject: [PATCH 3/5] wrote better readme

---
 .../cy_similaritymeasures/benchmark.py |  8 ++++--
 .../cy_similaritymeasures/readme.md    | 25 ++++++++++++++++++-
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/similaritymeasures/cy_similaritymeasures/benchmark.py b/similaritymeasures/cy_similaritymeasures/benchmark.py
index 506cabe..ff6072f 100644
--- a/similaritymeasures/cy_similaritymeasures/benchmark.py
+++ b/similaritymeasures/cy_similaritymeasures/benchmark.py
@@ -44,5 +44,9 @@ def run_cy_frechet():
 print(t1)
 print(t2)
 
-print('average execution time for non cythonized version: %f' % (sum(t1) / len(t1)))
-print('average execution time for cythonized version: %f' % (sum(t2) / len(t2)))
\ No newline at end of file
+avg = sum(t1) / len(t1)
+cy_avg = sum(t2) / len(t2)
+improv_pct = (avg - cy_avg) / avg * 100
+print('average execution time for non cythonized version: %f' % avg)
+print('average execution time for cythonized version: %f' % cy_avg)
+print('improvement: %d %%' % improv_pct)
\ No newline at end of file
diff --git a/similaritymeasures/cy_similaritymeasures/readme.md b/similaritymeasures/cy_similaritymeasures/readme.md
index 4e89972..af2cb0d 100644
--- a/similaritymeasures/cy_similaritymeasures/readme.md
+++ b/similaritymeasures/cy_similaritymeasures/readme.md
@@ -1,3 +1,26 @@
 # cy_similaritymeasures
 
-this folder shall hold stuff to cythonize the similaritymeasures functions
\ No newline at end of file
+this folder contains the material needed to cythonize the similaritymeasures functions
+
+```
+benchmark.py              # a quick performance check, to be run last, once the cythonized version has been built
+build_cython_extension.sh # the command needed to invoke the build through setup.py on linux
+cy_similaritymeasures.pyx # the cython source with the cythonized version of the frechet function
+setup.py                  # the setup script used to build the cython code into an extension
+```
+
+## extension building
+
+the cython code in the `*.pyx` file cannot be imported from python directly; it first has to be built into an extension. The script `build_cython_extension.sh` does this by invoking `setup.py`, which generates a `.cpp` file and compiles it into a `.so` extension module that python can import.
+
+You need cython installed on your machine in order to build everything.
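+
+Once the extension has been built in this folder, it can be imported from python like a regular module. A minimal usage sketch (assuming the build above succeeded and python is started from this directory):
+
+```
+import numpy as np
+from cy_similaritymeasures import frechet_dist
+
+curve1 = np.random.random((100, 2))
+curve2 = np.random.random((90, 2))
+print(frechet_dist(curve1, curve2))
+```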
+
+## results
+
+to run the benchmark, just run `benchmark.py`. On my machine the results were:
+
+```
+average execution time for non cythonized version: 0.744819
+average execution time for cythonized version: 0.433768
+improvement: 41 %
+```
\ No newline at end of file

From d5f5bea3836bc9f721df540c5200a83fb20aff60 Mon Sep 17 00:00:00 2001
From: Nuc94
Date: Wed, 8 Feb 2023 09:32:32 +0100
Subject: [PATCH 4/5] removed intermediate ca matrix from frechet distance

---
 .../cy_similaritymeasures.pyx | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx b/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
index 08ab7af..d99be6c 100644
--- a/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
+++ b/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
@@ -65,15 +65,15 @@ def frechet_dist(exp_data, num_data, p=2):
     cdef int n = len(exp_data)
     cdef int m = len(num_data)
     c = distance.cdist(exp_data, num_data, metric='minkowski', p=p)
-    ca = np.ones((n, m))
-    ca = np.multiply(ca, -1)
-    ca[0, 0] = c[0, 0]
-    for i in range(1, n):
-        ca[i, 0] = max(ca[i-1, 0], c[i, 0])
-    for j in range(1, m):
-        ca[0, j] = max(ca[0, j-1], c[0, j])
+    # the recurrence is now applied in place on the c matrix, so the
+    # intermediate ca matrix is no longer needed; the first column and
+    # first row still accumulate the running maximum of the definition
+    for i in range(1, n):
+        c[i, 0] = max(c[i-1, 0], c[i, 0])
+    for j in range(1, m):
+        c[0, j] = max(c[0, j-1], c[0, j])
     for i in range(1, n):
         for j in range(1, m):
-            ca[i, j] = max(min(ca[i-1, j], ca[i, j-1], ca[i-1, j-1]),
+            c[i, j] = max(min(c[i-1, j], c[i, j-1], c[i-1, j-1]),
                           c[i, j])
-    return ca[n-1, m-1]
\ No newline at end of file
+    return c[n-1, m-1]
\ No newline at end of file
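The in-place variant above can be cross-checked against the original formulation with a separate ca matrix from the first patch. A quick sketch of such a check in pure python/numpy (the function names below are chosen here only for illustration):

    import numpy as np
    from scipy.spatial import distance

    def frechet_with_ca(exp_data, num_data, p=2):
        # original formulation, accumulating into a separate ca matrix
        n, m = len(exp_data), len(num_data)
        c = distance.cdist(exp_data, num_data, metric='minkowski', p=p)
        ca = np.multiply(np.ones((n, m)), -1)
        ca[0, 0] = c[0, 0]
        for i in range(1, n):
            ca[i, 0] = max(ca[i-1, 0], c[i, 0])
        for j in range(1, m):
            ca[0, j] = max(ca[0, j-1], c[0, j])
        for i in range(1, n):
            for j in range(1, m):
                ca[i, j] = max(min(ca[i-1, j], ca[i, j-1], ca[i-1, j-1]), c[i, j])
        return ca[n-1, m-1]

    def frechet_in_place(exp_data, num_data, p=2):
        # in-place formulation, accumulating directly on the c matrix
        n, m = len(exp_data), len(num_data)
        c = distance.cdist(exp_data, num_data, metric='minkowski', p=p)
        for i in range(1, n):
            c[i, 0] = max(c[i-1, 0], c[i, 0])
        for j in range(1, m):
            c[0, j] = max(c[0, j-1], c[0, j])
        for i in range(1, n):
            for j in range(1, m):
                c[i, j] = max(min(c[i-1, j], c[i, j-1], c[i-1, j-1]), c[i, j])
        return c[n-1, m-1]

    np.random.seed(1212121)
    a = np.random.random((100, 2))
    b = np.random.random((90, 2))
    print(frechet_with_ca(a, b) == frechet_in_place(a, b))  # expected: True
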
From a6e1edcb4ddb1caa11ada19ec2916d4f695067f8 Mon Sep 17 00:00:00 2001
From: Nuc94
Date: Wed, 22 Feb 2023 17:11:20 +0100
Subject: [PATCH 5/5] added dtw cythonized

---
 .../cy_similaritymeasures/benchmark.py |  32 +++-
 .../cy_similaritymeasures.pyx          | 139 +++++++++++++++++-
 2 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/similaritymeasures/cy_similaritymeasures/benchmark.py b/similaritymeasures/cy_similaritymeasures/benchmark.py
index ff6072f..5a3683b 100644
--- a/similaritymeasures/cy_similaritymeasures/benchmark.py
+++ b/similaritymeasures/cy_similaritymeasures/benchmark.py
@@ -5,8 +5,9 @@
 
 import timeit
 import numpy as np
-from similaritymeasures import frechet_dist
+from similaritymeasures import frechet_dist, dtw
 from cy_similaritymeasures import frechet_dist as cy_frechet_dist
+from cy_similaritymeasures import dtw as cy_dtw
 
 #generating data in a way similar to the tests
 x1 = np.linspace(0.0, 1.0, 100)
@@ -44,6 +45,35 @@ def run_cy_frechet():
 print(t1)
 print(t2)
 
+avg = sum(t1) / len(t1)
+cy_avg = sum(t2) / len(t2)
+improv_pct = (avg - cy_avg) / avg * 100
+print('average execution time for non cythonized version: %f' % avg)
+print('average execution time for cythonized version: %f' % cy_avg)
+print('improvement: %d %%' % improv_pct)
+
+#then benchmark dtw in the same way
+
+res, _ = dtw(curve1, curve2)
+
+cy_res, _ = cy_dtw(curve1, curve2)
+
+print(res)
+print(cy_res)
+print(res == cy_res)
+
+def run_dtw():
+    return dtw(curve1, curve2)
+
+def run_cy_dtw():
+    return cy_dtw(curve1, curve2)
+
+t1 = timeit.repeat(run_dtw, repeat = n_repeats, number = n_runs)
+
+t2 = timeit.repeat(run_cy_dtw, repeat = n_repeats, number = n_runs)
+print(t1)
+print(t2)
+
 avg = sum(t1) / len(t1)
 cy_avg = sum(t2) / len(t2)
 improv_pct = (avg - cy_avg) / avg * 100
diff --git a/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx b/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
index d99be6c..1d850de 100644
--- a/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
+++ b/similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
@@ -1,6 +1,16 @@
 import numpy as np
 from scipy.spatial import distance
 
+cdef double c_max(double a, double b):
+    if a > b:
+        return a
+    return b
+
+cdef double c_min(double a, double b):
+    if a < b:
+        return a
+    return b
+
 def frechet_dist(exp_data, num_data, p=2):
     r"""
     Compute the discrete Frechet distance
@@ -74,6 +84,129 @@ def frechet_dist(exp_data, num_data, p=2):
         c[0, j] = max(c[0, j-1], c[0, j])
     for i in range(1, n):
         for j in range(1, m):
-            c[i, j] = max(min(c[i-1, j], c[i, j-1], c[i-1, j-1]),
-                          c[i, j])
-    return c[n-1, m-1]
\ No newline at end of file
+            c[i, j] = c_max(
+                c_min(c_min(c[i-1, j], c[i, j-1]), c[i-1, j-1]),
+                c[i, j])
+    return c[n-1, m-1]
+
+def dtw(exp_data, num_data, metric='euclidean', **kwargs):
+    r"""
+    Compute the Dynamic Time Warping distance.
+
+    This computes a generic Dynamic Time Warping (DTW) distance and follows
+    the algorithm from [1]_. This can use all distance metrics that are
+    available in scipy.spatial.distance.cdist.
+
+    Parameters
+    ----------
+    exp_data : array_like
+        Curve from your experimental data. exp_data is of (M, N) shape, where
+        M is the number of data points, and N is the number of dimensions
+    num_data : array_like
+        Curve from your numerical data. num_data is of (P, N) shape, where P
+        is the number of data points, and N is the number of dimensions
+    metric : str or callable, optional
+        The distance metric to use. Default='euclidean'. Refer to the
+        documentation for scipy.spatial.distance.cdist. Some examples:
+        'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
+        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski',
+        'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
+        'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
+        'wminkowski', 'yule'.
+    **kwargs : dict, optional
+        Extra arguments to `metric`: refer to each metric documentation in
+        scipy.spatial.distance.
+        Some examples:
+
+        p : scalar
+            The p-norm to apply for Minkowski, weighted and unweighted.
+            Default: 2.
+
+        w : ndarray
+            The weight vector for metrics that support weights (e.g.,
+            Minkowski).
+
+        V : ndarray
+            The variance vector for standardized Euclidean.
+            Default: var(vstack([XA, XB]), axis=0, ddof=1)
+
+        VI : ndarray
+            The inverse of the covariance matrix for Mahalanobis.
+            Default: inv(cov(vstack([XA, XB].T))).T
+
+        out : ndarray
+            The output array
+            If not None, the distance matrix Y is stored in this array.
+
+    Returns
+    -------
+    r : float
+        DTW distance.
+    d : ndarray (2-D)
+        Cumulative distance matrix
+
+    Notes
+    -----
+    The DTW distance is d[-1, -1].
+
+    This has O(M*P) computational cost.
+
+    The latest scipy.spatial.distance.cdist information can be found at
+    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
+
+    Your x locations of data points should be exp_data[:, 0], and the y
+    locations of the data points should be exp_data[:, 1]. Same for num_data.
+
+    The euclidean distance is used by default, but any metric supported by
+    scipy.spatial.distance.cdist can be selected through the metric argument.
+
+    DTW is a non-metric distance, which means DTW doesn't hold the triangle
+    inequality.
+    https://en.wikipedia.org/wiki/Triangle_inequality
+
+    References
+    ----------
+    .. [1] Senin, P., 2008. Dynamic time warping algorithm review. Information
+       and Computer Science Department University of Hawaii at Manoa Honolulu,
+       USA, 855, pp.1-23.
+       http://seninp.github.io/assets/pubs/senin_dtw_litreview_2008.pdf
+
+    Examples
+    --------
+    >>> # Generate random experimental data
+    >>> x = np.random.random(100)
+    >>> y = np.random.random(100)
+    >>> exp_data = np.zeros((100, 2))
+    >>> exp_data[:, 0] = x
+    >>> exp_data[:, 1] = y
+    >>> # Generate random numerical data
+    >>> x = np.random.random(100)
+    >>> y = np.random.random(100)
+    >>> num_data = np.zeros((100, 2))
+    >>> num_data[:, 0] = x
+    >>> num_data[:, 1] = y
+    >>> r, d = dtw(exp_data, num_data)
+
+    The euclidean distance is used by default. You can use metric and **kwargs
+    to specify different types of distance metrics. The following example uses
+    the city block or Manhattan distance between points.
+
+    >>> r, d = dtw(exp_data, num_data, metric='cityblock')
+
+    """
+    cdef int i, j
+    cdef int n = len(exp_data)
+    cdef int m = len(num_data)
+    c = distance.cdist(exp_data, num_data, metric=metric, **kwargs)
+
+    d = np.zeros(c.shape)
+    d[0, 0] = c[0, 0]
+    #n, m = c.shape
+    for i in range(1, n):
+        d[i, 0] = d[i-1, 0] + c[i, 0]
+    for j in range(1, m):
+        d[0, j] = d[0, j-1] + c[0, j]
+    for i in range(1, n):
+        for j in range(1, m):
+            d[i, j] = c[i, j] + c_min(c_min(d[i-1, j], d[i, j-1]), d[i-1, j-1])
+    return d[-1, -1], d
\ No newline at end of file
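A tiny worked example of the dtw recurrence, using the plain similaritymeasures implementation that the cython version above mirrors (the short curves and the expected values in the comments are illustrative only):

    import numpy as np
    from similaritymeasures import dtw

    # two short 1-D curves, stored as (M, 1) and (P, 1) arrays
    exp_data = np.array([[0.0], [1.0], [2.0]])
    num_data = np.array([[0.0], [2.0]])

    # pairwise distances c = [[0, 2], [1, 1], [2, 0]]
    # cumulative matrix:
    #   d[0, 0] = 0          d[0, 1] = 0 + 2 = 2
    #   d[1, 0] = 0 + 1 = 1  d[1, 1] = 1 + min(2, 1, 0) = 1
    #   d[2, 0] = 1 + 2 = 3  d[2, 1] = 0 + min(1, 3, 1) = 1
    r, d = dtw(exp_data, num_data)
    print(r)  # expected: 1.0
    print(d)  # expected: [[0., 2.], [1., 1.], [3., 1.]]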