diff --git a/doc/overview/roadmap.qbk b/doc/overview/roadmap.qbk index 4f356ea5e4..920bc39904 100644 --- a/doc/overview/roadmap.qbk +++ b/doc/overview/roadmap.qbk @@ -17,6 +17,7 @@ Recent issues on GitHub [@https://github.com/boostorg/math/issues?utf8=%E2%9C%93 * Add improvement to Heuman Lambda precision. * Improve Skew Normal root finding, see [@https://github.com/boostorg/math/issues/1120 1120]. * Lots of minor fixes and improved code coverage. +* Add support for the M nearest-neighbour Chatterjee correlation of Lin and Han (2021), see [@https://github.com/boostorg/math/issues/990 990]. [h4 Math-4.2.0 (Boost-1.85)] diff --git a/doc/statistics/chatterjee_correlation.qbk b/doc/statistics/chatterjee_correlation.qbk index 520ab0cc6b..d506407f09 100644 --- a/doc/statistics/chatterjee_correlation.qbk +++ b/doc/statistics/chatterjee_correlation.qbk @@ -72,3 +72,83 @@ Of course, random numbers are not used internally, but the result is not guarant [endsect] [/section:chatterjee_correlation Chatterjee Correlation] + +[section:chatterjee_correlation_mnn Chatterjee Correlation (M nearest neighbours)] + +[heading Synopsis] + +`` +#include + +namespace boost::math::statistics { + + C++17: + template + auto chatterjee_correlation_mnn(ExecutionPolicy&& exec, const Container& u, const Container& v, std::size_t M); + + C++11: + template + auto chatterjee_correlation_mnn(const Container& u, const Container& v, std::size_t M); +} +`` + +[heading Description] + +`chatterjee_correlation_mnn` computes the revised Chatterjee rank correlation of Lin and Han (2021), which +generalises Chatterjee's coefficient by incorporating the `M` right nearest neighbours of each point rather +than only the single right neighbour. +The statistic still consistently estimates the same measure of dependence (between 0 and 1; zero if and only +if X and Y are independent, unity if and only if Y is a measurable function of X), but its use of additional +neighbours boosts the power of the associated independence test. + +The original coefficient `chatterjee_correlation` has a statistical detection boundary of n[super -1/4] for +testing independence, which is substantially weaker than the parametric n[super -1/2] rate. +By letting `M` grow with the sample size (with M/n -> 0), the revised statistic can approach near-parametric +efficiency. + +Let X and Y be random variables, where Y is not constant, and let (X_i, Y_i) be samples sorted so that +X_(0) < X_(1) < ... < X_(n-1). Writing R_i for the rank of Y_i and j_m(i) for the index of the m-th right +nearest neighbour of X_i, the statistic is + +`` + xi_{n,M} = -2 + 6 * sum_i sum_{m=1}^{M} min(R_i, R_{j_m(i)}) / ((n + 1) * (n*M + M*(M + 1) / 4)) +`` + +The complexity is O(n log n + n M). For `M` of order O(1) or O(poly-log n) this is nearly linear; as `M` +approaches `n` it tends to O(n[super 2]). + +An example is given below: + + std::vector X{1,2,3,4,5}; + std::vector Y{1,2,3,4,5}; + using boost::math::statistics::chatterjee_correlation_mnn; + std::size_t M = 2; + double coeff = chatterjee_correlation_mnn(X, Y, M); + +/Nota bene:/ If the input is an integer type the output will be a double precision type. + +[heading Choice of M] + +The asymptotic null variance of the statistic is minimised when `M` is of order sqrt(n), which is a reasonable +default for users who want improved power without the quadratic cost of large `M`. Pushing `M` closer to `n` +increases the power of the independence test against smooth alternatives at the cost of additional computation. +The choice is left to the caller; `M` must satisfy 1 <= M <= n. + +/Nota bene:/ Even at `M` = 1 this statistic is not identical to `chatterjee_correlation`: it uses +min(R_i, R_j) in place of |R_i - R_j| and a different normalisation, so the two agree only up to a term of +order 1/n. Use `chatterjee_correlation` when the original coefficient is required. + +[heading Invariants] + +The function expects at least two samples, a non-constant vector Y, the same number of X's as Y's, and +1 <= M <= n. +If Y is constant, the result is a quiet NaN. +The data set must be sorted by X values. +If there are ties in the values of X, then the statistic is random due to the random breaking of ties. + +[heading References] + +* Lin, Zhexiao, and Fang Han. "On boosting the power of Chatterjee's rank correlation." Biometrika 110.2 (2023): 283-299. [@https://arxiv.org/abs/2108.06828 arXiv:2108.06828]. + +[endsect] +[/section:chatterjee_correlation_mnn Chatterjee Correlation (M nearest neighbours)] \ No newline at end of file diff --git a/include/boost/math/statistics/chatterjee_correlation.hpp b/include/boost/math/statistics/chatterjee_correlation.hpp index ad0b33a429..5f4b3bcf59 100644 --- a/include/boost/math/statistics/chatterjee_correlation.hpp +++ b/include/boost/math/statistics/chatterjee_correlation.hpp @@ -1,4 +1,5 @@ // (C) Copyright Matt Borland 2022. +// (C) Copyright Oleksandr Kornijcuk 2026. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -52,7 +53,7 @@ template ReturnType chatterjee_correlation_seq_impl(ForwardIterator u_begin, ForwardIterator u_end, ForwardIterator v_begin, ForwardIterator v_end) { using std::abs; - + BOOST_MATH_ASSERT_MSG(std::is_sorted(u_begin, u_end), "The x values must be sorted in order to use this functionality"); const std::vector rank_vector = rank(v_begin, v_end); @@ -70,15 +71,107 @@ ReturnType chatterjee_correlation_seq_impl(ForwardIterator u_begin, ForwardItera return result; } +// Generalised (M right-nearest-neighbour) Chatterjee correlation of Lin and Han (2021), +// "On boosting the power of Chatterjee's rank correlation", https://arxiv.org/abs/2108.06828. +// +// For a sample sorted by X, the inner sum of equation (2.4) is +// +// S = sum_{i} sum_{m=1}^{M} min(R_i, R_{j_m(i)}), +// +// where R_i is the (1-based) rank of Y_i and j_m(i) is the m-th right nearest neighbour of X_i. +// After sorting by X the m-th right neighbour of element i is element i + m when it exists +// (i + m < n); otherwise j_m(i) = i (the sentinel of the paper), contributing min(R_i, R_i) = R_i. +// +// Note: rank() returns 0-based ranks, whereas the paper's formula uses 1-based ranks. The +1 offset +// cancels in the M = 1 statistic above (which uses |R_{i+1} - R_i|) but does NOT cancel under +// min(.,.), so it is applied explicitly here. +inline std::uint64_t chatterjee_mnn_min_sum(const std::vector& rank_vector, + std::size_t lo, std::size_t hi, std::size_t M) +{ + const std::size_t n = rank_vector.size(); + std::uint64_t sum = 0; + + for (std::size_t i = lo; i < hi; ++i) + { + const std::size_t r_i = rank_vector[i] + 1; + + // Number of right neighbours of i that actually exist within [0, n). + const std::size_t existing = (i + M < n) ? M : (n - 1 - i); + + for (std::size_t m = 1; m <= existing; ++m) + { + const std::size_t r_j = rank_vector[i + m] + 1; + sum += static_cast((std::min)(r_i, r_j)); + } + + // The remaining (M - existing) neighbours are sentinels j_m(i) = i, each contributing r_i. + sum += static_cast(M - existing) * static_cast(r_i); + } + + return sum; +} + +template +ReturnType chatterjee_mnn_result(std::uint64_t sum, std::size_t n, std::size_t M) +{ + const ReturnType n_r = static_cast(n); + const ReturnType m_r = static_cast(M); + + // Denominator (n + 1) * [nM + M(M + 1) / 4] of equation (2.4); chosen so that E[xi_{n,M}] = 0 under H0. + const ReturnType denominator = (n_r + static_cast(1)) * + (n_r * m_r + m_r * (m_r + static_cast(1)) / static_cast(4)); + + return static_cast(-2) + static_cast(6) * static_cast(sum) / denominator; +} + +template +ReturnType chatterjee_correlation_mnn_seq_impl(ForwardIterator u_begin, ForwardIterator u_end, + ForwardIterator v_begin, ForwardIterator v_end, std::size_t M) +{ + BOOST_MATH_ASSERT_MSG(std::is_sorted(u_begin, u_end), "The x values must be sorted in order to use this functionality"); + + const std::size_t n = static_cast(std::distance(v_begin, v_end)); + + BOOST_MATH_ASSERT_MSG(M >= 1, "The number of right nearest neighbours M must be at least 1"); + BOOST_MATH_ASSERT_MSG(M <= n, "The number of right nearest neighbours M must not exceed the number of samples"); + + // If Y is constant the statistic is undefined (Y must be non-constant); return a quiet NaN. + // This is checked on the input directly: rank() collapses tied values, so a constant Y would + // otherwise reduce the effective sample size to one. + if (v_begin != v_end) + { + using value_type = typename std::iterator_traits::value_type; + const value_type first_value = *v_begin; + if (std::all_of(std::next(v_begin), v_end, + [&first_value](const value_type& value) { return value == first_value; })) + { + return std::numeric_limits::quiet_NaN(); + } + } + + const std::vector rank_vector = rank(v_begin, v_end); + + const std::uint64_t sum = chatterjee_mnn_min_sum(rank_vector, 0, n, M); + + return chatterjee_mnn_result(sum, n, M); +} + } // Namespace detail -template ::value, double, Real>::type> inline ReturnType chatterjee_correlation(const Container& u, const Container& v) { return detail::chatterjee_correlation_seq_impl(std::begin(u), std::end(u), std::begin(v), std::end(v)); } +template ::value, double, Real>::type> +inline ReturnType chatterjee_correlation_mnn(const Container& u, const Container& v, std::size_t M) +{ + return detail::chatterjee_correlation_mnn_seq_impl(std::begin(u), std::end(u), std::begin(v), std::end(v), M); +} + }}} // Namespace boost::math::statistics #ifdef BOOST_MATH_EXEC_COMPATIBLE @@ -121,7 +214,7 @@ ReturnType chatterjee_correlation_par_impl(ExecutionPolicy&& exec, ForwardIterat { sum += future_manager[i].get(); } - + ReturnType result = static_cast(1) - (static_cast(3 * sum) / static_cast(rank_vector.size() * rank_vector.size() - 1)); // If the result is 1 then Y is constant and all the elements must be ties @@ -133,6 +226,66 @@ ReturnType chatterjee_correlation_par_impl(ExecutionPolicy&& exec, ForwardIterat return result; } +template +ReturnType chatterjee_correlation_mnn_par_impl(ExecutionPolicy&& exec, ForwardIterator u_begin, ForwardIterator u_end, + ForwardIterator v_begin, ForwardIterator v_end, std::size_t M) +{ + BOOST_MATH_ASSERT_MSG(std::is_sorted(std::forward(exec), u_begin, u_end), "The x values must be sorted in order to use this functionality"); + + const std::size_t n = static_cast(std::distance(v_begin, v_end)); + + BOOST_MATH_ASSERT_MSG(M >= 1, "The number of right nearest neighbours M must be at least 1"); + BOOST_MATH_ASSERT_MSG(M <= n, "The number of right nearest neighbours M must not exceed the number of samples"); + + // If Y is constant the statistic is undefined (Y must be non-constant); return a quiet NaN. + // This is checked on the input directly: rank() collapses tied values, so a constant Y would + // otherwise reduce the effective sample size to one. + if (v_begin != v_end) + { + using value_type = typename std::iterator_traits::value_type; + const value_type first_value = *v_begin; + if (std::all_of(std::next(v_begin), v_end, + [&first_value](const value_type& value) { return value == first_value; })) + { + return std::numeric_limits::quiet_NaN(); + } + } + + const auto rank_vector = rank(std::forward(exec), v_begin, v_end); + + const auto num_threads = std::thread::hardware_concurrency() == 0 ? 2u : std::thread::hardware_concurrency(); + + // The i-loop is embarrassingly parallel: each task sums over a disjoint contiguous range of + // indices [lo, hi) and reads (read-only) rank_vector entries up to i + M, which may fall in a + // neighbouring task's range. Because there are no writes, the partition needs no overlap; this + // differs from the M = 1 parallel path above, which splits the data array (and overlaps by one + // element) for the difference-based transform. + std::vector> future_manager {}; + future_manager.reserve(num_threads); + + const std::size_t chunk = static_cast(std::ceil(static_cast(n) / num_threads)); + + std::size_t lo = 0; + while (lo < n) + { + const std::size_t hi = (std::min)(lo + chunk, n); + future_manager.emplace_back(std::async(std::launch::async | std::launch::deferred, + [&rank_vector, lo, hi, M]() -> std::uint64_t + { + return chatterjee_mnn_min_sum(rank_vector, lo, hi, M); + })); + lo = hi; + } + + std::uint64_t sum = 0; + for (std::size_t i {}; i < future_manager.size(); ++i) + { + sum += future_manager[i].get(); + } + + return chatterjee_mnn_result(sum, n, M); +} + } // Namespace detail template , double, Real>> +inline ReturnType chatterjee_correlation_mnn(ExecutionPolicy&& exec, const Container& u, const Container& v, std::size_t M) +{ + if constexpr (std::is_same_v, decltype(std::execution::seq)>) + { + return detail::chatterjee_correlation_mnn_seq_impl(std::cbegin(u), std::cend(u), + std::cbegin(v), std::cend(v), M); + } + else + { + return detail::chatterjee_correlation_mnn_par_impl(std::forward(exec), + std::cbegin(u), std::cend(u), + std::cbegin(v), std::cend(v), M); + } +} + } // Namespace boost::math::statistics #endif diff --git a/test/test_chatterjee_correlation.cpp b/test/test_chatterjee_correlation.cpp index 3443980eb9..ac2655150a 100644 --- a/test/test_chatterjee_correlation.cpp +++ b/test/test_chatterjee_correlation.cpp @@ -1,4 +1,5 @@ // (C) Copyright Matt Borland 2022. +// (C) Copyright Oleksandr Kornijcuk 2026. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -20,6 +21,7 @@ // using boost::math::statistics::chatterjee_correlation; +using boost::math::statistics::chatterjee_correlation_mnn; template void properties() @@ -30,7 +32,7 @@ void properties() std::vector X(vector_size); std::vector Y(vector_size); - for (std::size_t i = 0; i < vector_size; ++i) + for (std::size_t i = 0; i < vector_size; ++i) { X[i] = unif(mt); Y[i] = unif(mt); @@ -42,7 +44,7 @@ void properties() CHECK_LE(coeff1, Real(1)); // Now apply a monotone function to the data - for (std::size_t i = 0; i < vector_size; ++i) + for (std::size_t i = 0; i < vector_size; ++i) { X[i] = Real(2.3)*X[i] - Real(7.3); Y[i] = Real(7.6)*Y[i] - Real(8.6); @@ -62,7 +64,7 @@ void properties() template void test_spots() -{ +{ // Rank Order: Result will be 1 - 3*3 / (4^2 - 1) = 1 - 9/15 = 0.6 std::vector x = {1, 2, 3, 4}; std::vector y = {1, 2, 3, 4}; @@ -81,6 +83,123 @@ void test_spots() CHECK_NAN(chatterjee_correlation(x, y)); } +// Closed forms for the M-NN statistic xi_{n,M} of Lin and Han (2021), Remark 2.5. +// These are exact (no external reference needed) and are used to validate the implementation. + +// Y = f(X) with f strictly increasing: xi_{n,M} = 1 - (3(M+1)/4) / (n + (M+1)/4). +template +Real xi_mnn_increasing(std::size_t n, std::size_t M) +{ + const Real nr = static_cast(n); + const Real Mr = static_cast(M); + return Real(1) - (Real(3)*(Mr + 1)/4) / (nr + (Mr + 1)/4); +} + +// Y = f(X) with f strictly decreasing: +// xi_{n,M} = 1 - 3(M+1)[(5n+1)/4 - (2M+1)/3] / ((n+1)(n + (M+1)/4)). +template +Real xi_mnn_decreasing(std::size_t n, std::size_t M) +{ + const Real nr = static_cast(n); + const Real Mr = static_cast(M); + return Real(1) - (Real(3)*(Mr + 1)*((Real(5)*nr + 1)/4 - (Real(2)*Mr + 1)/3)) + / ((nr + 1)*(nr + (Mr + 1)/4)); +} + +template +void test_mnn_spots() +{ + // Exact small spot values, computed independently as rationals. + std::vector x = {1, 2, 3, 4}; + std::vector y = {1, 2, 3, 4}; + + // M = 2: -2 + 6*20 / (5 * 9.5) = 10/19 + CHECK_ULP_CLOSE(chatterjee_correlation_mnn(x, y, 2), Real(10)/19, 4); + // M = 3: 2/5 + CHECK_ULP_CLOSE(chatterjee_correlation_mnn(x, y, 3), Real(2)/5, 4); + + // Reverse order, M = 2: -34/95. Note: unlike the M = 1 statistic, the M-NN statistic is NOT + // invariant under reversal of Y for M > 1 (min{.,.} is not reversal-symmetric). + y = {4, 3, 2, 1}; + CHECK_ULP_CLOSE(chatterjee_correlation_mnn(x, y, 2), Real(-34)/95, 4); + + // Alternating order, M = 2: 2/5 + y = {1, 3, 2, 4}; + CHECK_ULP_CLOSE(chatterjee_correlation_mnn(x, y, 2), Real(2)/5, 4); + + // n = 5 increasing, M = 2: 14/23 + std::vector x5 = {1, 2, 3, 4, 5}; + std::vector y5 = {1, 2, 3, 4, 5}; + CHECK_ULP_CLOSE(chatterjee_correlation_mnn(x5, y5, 2), Real(14)/23, 4); + + // Constant Y yields quiet NaN. + y = {1, 1, 1, 1}; + CHECK_NAN(chatterjee_correlation_mnn(x, y, 2)); +} + +template +void test_mnn_extremal() +{ + // Build an X sorted ascending with no ties. + const std::size_t n = 200; + std::vector x(n); + for (std::size_t i = 0; i < n; ++i) + { + x[i] = static_cast(i); + } + + // Strictly increasing dependence Y = X. + for (std::size_t M : {std::size_t(1), std::size_t(2), std::size_t(5), std::size_t(20), std::size_t(50)}) + { + const Real got = chatterjee_correlation_mnn(x, x, M); + CHECK_ULP_CLOSE(got, xi_mnn_increasing(n, M), 4); + } + + // Strictly decreasing dependence Y = -X. + std::vector y(n); + for (std::size_t i = 0; i < n; ++i) + { + y[i] = -x[i]; + } + for (std::size_t M : {std::size_t(1), std::size_t(2), std::size_t(5), std::size_t(20), std::size_t(50)}) + { + const Real got = chatterjee_correlation_mnn(x, y, M); + CHECK_ULP_CLOSE(got, xi_mnn_decreasing(n, M), 4); + } +} + +template +void test_mnn_properties() +{ + const std::size_t n = 256; + std::mt19937_64 mt(987654); + std::uniform_real_distribution unif(-1, 1); + std::vector X(n); + std::vector Y(n); + for (std::size_t i = 0; i < n; ++i) + { + X[i] = unif(mt); + Y[i] = unif(mt); + } + std::sort(X.begin(), X.end()); + + const std::size_t M = 16; + const Real coeff1 = chatterjee_correlation_mnn(X, Y, M); + + // The finite-sample range is [-1/2, 1] up to a bias of order M/n; use generous bounds. + CHECK_GE(coeff1, Real(-1)); + CHECK_LE(coeff1, Real(1)); + + // Invariance under strictly increasing transforms of X and Y. + for (std::size_t i = 0; i < n; ++i) + { + X[i] = Real(2.3)*X[i] - Real(7.3); + Y[i] = Real(7.6)*Y[i] - Real(8.6); + } + const Real coeff2 = chatterjee_correlation_mnn(X, Y, M); + CHECK_EQUAL(coeff1, coeff2); +} + #ifdef BOOST_MATH_EXEC_COMPATIBLE template @@ -97,13 +216,29 @@ void test_threaded(ExecutionPolicy&& exec) CHECK_ULP_CLOSE(seq_ans, par_ans, 1); }; +template +void test_mnn_threaded(ExecutionPolicy&& exec) +{ + std::vector x = boost::math::generate_random_vector(1024, 2); + std::vector y = boost::math::generate_random_vector(1024, 1); + + std::sort(std::forward(exec), x.begin(), x.end()); + + for (std::size_t M : {std::size_t(1), std::size_t(8), std::size_t(32)}) + { + auto seq_ans = chatterjee_correlation_mnn(x, y, M); + auto par_ans = chatterjee_correlation_mnn(exec, x, y, M); + CHECK_ULP_CLOSE(seq_ans, par_ans, 1); + } +}; + #endif // BOOST_MATH_EXEC_COMPATIBLE template void test_paper() { constexpr Real two_pi = boost::math::constants::two_pi(); - + // Page 9 figure (a) y = x size_t seed = 3; std::vector x = boost::math::generate_random_uniform_vector(100, seed, -two_pi, two_pi); @@ -141,6 +276,18 @@ int main(void) test_spots(); test_spots(); + test_mnn_spots(); + test_mnn_spots(); + test_mnn_spots(); + + test_mnn_extremal(); + test_mnn_extremal(); + test_mnn_extremal(); + + test_mnn_properties(); + test_mnn_properties(); + test_mnn_properties(); + #ifdef BOOST_MATH_EXEC_COMPATIBLE test_threaded(std::execution::par); @@ -150,6 +297,13 @@ int main(void) test_threaded(std::execution::par_unseq); test_threaded(std::execution::par_unseq); + test_mnn_threaded(std::execution::par); + test_mnn_threaded(std::execution::par); + test_mnn_threaded(std::execution::par); + test_mnn_threaded(std::execution::par_unseq); + test_mnn_threaded(std::execution::par_unseq); + test_mnn_threaded(std::execution::par_unseq); + #endif // BOOST_MATH_EXEC_COMPATIBLE test_paper();