boostorg · mborland · Jul 2, 2026 · Jul 1, 2026
diff --git a/doc/overview/roadmap.qbk b/doc/overview/roadmap.qbk
@@ -17,6 +17,7 @@ Recent issues on GitHub [@https://github.com/boostorg/math/issues?utf8=%E2%9C%93
 * Add improvement to Heuman Lambda precision.
 * Improve Skew Normal root finding, see [@https://github.com/boostorg/math/issues/1120 1120].
 * Lots of minor fixes and improved code coverage.
+* Add support for the M nearest-neighbour Chatterjee correlation of Lin and Han (2021), see [@https://github.com/boostorg/math/issues/990 990].
 
 [h4 Math-4.2.0 (Boost-1.85)]
 

diff --git a/doc/statistics/chatterjee_correlation.qbk b/doc/statistics/chatterjee_correlation.qbk
@@ -72,3 +72,83 @@ Of course, random numbers are not used internally, but the result is not guarant
 
 [endsect]
 [/section:chatterjee_correlation Chatterjee Correlation]
+
+[section:chatterjee_correlation_mnn Chatterjee Correlation (M nearest neighbours)]
+
+[heading Synopsis]
+
+``
+#include <boost/math/statistics/chatterjee_correlation.hpp>
+
+namespace boost::math::statistics {
+
+    C++17:
+    template <typename ExecutionPolicy, typename Container>
+    auto chatterjee_correlation_mnn(ExecutionPolicy&& exec, const Container& u, const Container& v, std::size_t M);
+
+    C++11:
+    template <typename Container>
+    auto chatterjee_correlation_mnn(const Container& u, const Container& v, std::size_t M);
+}
+``
+
+[heading Description]
+
+`chatterjee_correlation_mnn` computes the revised Chatterjee rank correlation of Lin and Han (2021), which
+generalises Chatterjee's coefficient by incorporating the `M` right nearest neighbours of each point rather
+than only the single right neighbour.
+The statistic still consistently estimates the same measure of dependence (between 0 and 1; zero if and only
+if X and Y are independent, unity if and only if Y is a measurable function of X), but its use of additional
+neighbours boosts the power of the associated independence test.
+
+The original coefficient `chatterjee_correlation` has a statistical detection boundary of n[super -1/4] for
+testing independence, which is substantially weaker than the parametric n[super -1/2] rate.
+By letting `M` grow with the sample size (with M/n -> 0), the revised statistic can approach near-parametric
+efficiency.
+
+Let X and Y be random variables, where Y is not constant, and let (X_i, Y_i) be samples sorted so that
+X_(0) < X_(1) < ... < X_(n-1). Writing R_i for the rank of Y_i and j_m(i) for the index of the m-th right
+nearest neighbour of X_i, the statistic is
+
+``
+    xi_{n,M} = -2 + 6 * sum_i sum_{m=1}^{M} min(R_i, R_{j_m(i)}) / ((n + 1) * (n*M + M*(M + 1) / 4))
+``
+
+The complexity is O(n log n + n M). For `M` of order O(1) or O(poly-log n) this is nearly linear; as `M`
+approaches `n` it tends to O(n[super 2]).
+
+An example is given below:
+
+    std::vector<double> X{1,2,3,4,5};
+    std::vector<double> Y{1,2,3,4,5};
+    using boost::math::statistics::chatterjee_correlation_mnn;
+    std::size_t M = 2;
+    double coeff = chatterjee_correlation_mnn(X, Y, M);
+
+/Nota bene:/ If the input is an integer type the output will be a double precision type.
+
+[heading Choice of M]
+
+The asymptotic null variance of the statistic is minimised when `M` is of order sqrt(n), which is a reasonable
+default for users who want improved power without the quadratic cost of large `M`. Pushing `M` closer to `n`
+increases the power of the independence test against smooth alternatives at the cost of additional computation.
+The choice is left to the caller; `M` must satisfy 1 <= M <= n.
+
+/Nota bene:/ Even at `M` = 1 this statistic is not identical to `chatterjee_correlation`: it uses
+min(R_i, R_j) in place of |R_i - R_j| and a different normalisation, so the two agree only up to a term of
+order 1/n. Use `chatterjee_correlation` when the original coefficient is required.
+
+[heading Invariants]
+
+The function expects at least two samples, a non-constant vector Y, the same number of X's as Y's, and
+1 <= M <= n.
+If Y is constant, the result is a quiet NaN.
+The data set must be sorted by X values.
+If there are ties in the values of X, then the statistic is random due to the random breaking of ties.
+
+[heading References]
+
+* Lin, Zhexiao, and Fang Han. "On boosting the power of Chatterjee's rank correlation." Biometrika 110.2 (2023): 283-299. [@https://arxiv.org/abs/2108.06828 arXiv:2108.06828].
+
+[endsect]
+[/section:chatterjee_correlation_mnn Chatterjee Correlation (M nearest neighbours)]
diff --git a/include/boost/math/statistics/chatterjee_correlation.hpp b/include/boost/math/statistics/chatterjee_correlation.hpp
@@ -1,4 +1,5 @@
 //  (C) Copyright Matt Borland 2022.
+//  (C) Copyright Oleksandr Kornijcuk 2026.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -52,7 +53,7 @@ template <typename ReturnType, typename ForwardIterator>
 ReturnType chatterjee_correlation_seq_impl(ForwardIterator u_begin, ForwardIterator u_end, ForwardIterator v_begin, ForwardIterator v_end)
 {
     using std::abs;
-    
+
     BOOST_MATH_ASSERT_MSG(std::is_sorted(u_begin, u_end), "The x values must be sorted in order to use this functionality");
 
     const std::vector<std::size_t> rank_vector = rank(v_begin, v_end);
@@ -70,15 +71,107 @@ ReturnType chatterjee_correlation_seq_impl(ForwardIterator u_begin, ForwardItera
     return result;
 }
 
+// Generalised (M right-nearest-neighbour) Chatterjee correlation of Lin and Han (2021),
+// "On boosting the power of Chatterjee's rank correlation", https://arxiv.org/abs/2108.06828.
+//
+// For a sample sorted by X, the inner sum of equation (2.4) is
+//
+//     S = sum_{i} sum_{m=1}^{M} min(R_i, R_{j_m(i)}),
+//
+// where R_i is the (1-based) rank of Y_i and j_m(i) is the m-th right nearest neighbour of X_i.
+// After sorting by X the m-th right neighbour of element i is element i + m when it exists
+// (i + m < n); otherwise j_m(i) = i (the sentinel of the paper), contributing min(R_i, R_i) = R_i.
+//
+// Note: rank() returns 0-based ranks, whereas the paper's formula uses 1-based ranks. The +1 offset
+// cancels in the M = 1 statistic above (which uses |R_{i+1} - R_i|) but does NOT cancel under
+// min(.,.), so it is applied explicitly here.
+inline std::uint64_t chatterjee_mnn_min_sum(const std::vector<std::size_t>& rank_vector,
+                                            std::size_t lo, std::size_t hi, std::size_t M)
+{
+    const std::size_t n = rank_vector.size();
+    std::uint64_t sum = 0;
+
+    for (std::size_t i = lo; i < hi; ++i)
+    {
+        const std::size_t r_i = rank_vector[i] + 1;
+
+        // Number of right neighbours of i that actually exist within [0, n).
+        const std::size_t existing = (i + M < n) ? M : (n - 1 - i);
+
+        for (std::size_t m = 1; m <= existing; ++m)
+        {
+            const std::size_t r_j = rank_vector[i + m] + 1;
+            sum += static_cast<std::uint64_t>((std::min)(r_i, r_j));
+        }
+
+        // The remaining (M - existing) neighbours are sentinels j_m(i) = i, each contributing r_i.
+        sum += static_cast<std::uint64_t>(M - existing) * static_cast<std::uint64_t>(r_i);
+    }
+
+    return sum;
+}
+
+template <typename ReturnType>
+ReturnType chatterjee_mnn_result(std::uint64_t sum, std::size_t n, std::size_t M)
+{
+    const ReturnType n_r = static_cast<ReturnType>(n);
+    const ReturnType m_r = static_cast<ReturnType>(M);
+
+    // Denominator (n + 1) * [nM + M(M + 1) / 4] of equation (2.4); chosen so that E[xi_{n,M}] = 0 under H0.
+    const ReturnType denominator = (n_r + static_cast<ReturnType>(1)) *
+        (n_r * m_r + m_r * (m_r + static_cast<ReturnType>(1)) / static_cast<ReturnType>(4));
+
+    return static_cast<ReturnType>(-2) + static_cast<ReturnType>(6) * static_cast<ReturnType>(sum) / denominator;
+}
+
+template <typename ReturnType, typename ForwardIterator>
+ReturnType chatterjee_correlation_mnn_seq_impl(ForwardIterator u_begin, ForwardIterator u_end,
+                                               ForwardIterator v_begin, ForwardIterator v_end, std::size_t M)
+{
+    BOOST_MATH_ASSERT_MSG(std::is_sorted(u_begin, u_end), "The x values must be sorted in order to use this functionality");
+
+    const std::size_t n = static_cast<std::size_t>(std::distance(v_begin, v_end));
+
+    BOOST_MATH_ASSERT_MSG(M >= 1, "The number of right nearest neighbours M must be at least 1");
+    BOOST_MATH_ASSERT_MSG(M <= n, "The number of right nearest neighbours M must not exceed the number of samples");
+
+    // If Y is constant the statistic is undefined (Y must be non-constant); return a quiet NaN.
+    // This is checked on the input directly: rank() collapses tied values, so a constant Y would
+    // otherwise reduce the effective sample size to one.
+    if (v_begin != v_end)
+    {
+        using value_type = typename std::iterator_traits<ForwardIterator>::value_type;
+        const value_type first_value = *v_begin;
+        if (std::all_of(std::next(v_begin), v_end,
+                        [&first_value](const value_type& value) { return value == first_value; }))
+        {
+            return std::numeric_limits<ReturnType>::quiet_NaN();
+        }
+    }
+
+    const std::vector<std::size_t> rank_vector = rank(v_begin, v_end);
+
+    const std::uint64_t sum = chatterjee_mnn_min_sum(rank_vector, 0, n, M);
+
+    return chatterjee_mnn_result<ReturnType>(sum, n, M);
+}
+
 } // Namespace detail
 
-template <typename Container, typename Real = typename Container::value_type, 
+template <typename Container, typename Real = typename Container::value_type,
           typename ReturnType = typename std::conditional<std::is_integral<Real>::value, double, Real>::type>
 inline ReturnType chatterjee_correlation(const Container& u, const Container& v)
 {
     return detail::chatterjee_correlation_seq_impl<ReturnType>(std::begin(u), std::end(u), std::begin(v), std::end(v));
 }
 
+template <typename Container, typename Real = typename Container::value_type,
+          typename ReturnType = typename std::conditional<std::is_integral<Real>::value, double, Real>::type>
+inline ReturnType chatterjee_correlation_mnn(const Container& u, const Container& v, std::size_t M)
+{
+    return detail::chatterjee_correlation_mnn_seq_impl<ReturnType>(std::begin(u), std::end(u), std::begin(v), std::end(v), M);
+}
+
 }}} // Namespace boost::math::statistics
 
 #ifdef BOOST_MATH_EXEC_COMPATIBLE
@@ -121,7 +214,7 @@ ReturnType chatterjee_correlation_par_impl(ExecutionPolicy&& exec, ForwardIterat
     {
         sum += future_manager[i].get();
     }
-    
+
     ReturnType result = static_cast<ReturnType>(1) - (static_cast<ReturnType>(3 * sum) / static_cast<ReturnType>(rank_vector.size() * rank_vector.size() - 1));
 
     // If the result is 1 then Y is constant and all the elements must be ties
@@ -133,6 +226,66 @@ ReturnType chatterjee_correlation_par_impl(ExecutionPolicy&& exec, ForwardIterat
     return result;
 }
 
+template <typename ReturnType, typename ExecutionPolicy, typename ForwardIterator>
+ReturnType chatterjee_correlation_mnn_par_impl(ExecutionPolicy&& exec, ForwardIterator u_begin, ForwardIterator u_end,
+                                                                       ForwardIterator v_begin, ForwardIterator v_end, std::size_t M)
+{
+    BOOST_MATH_ASSERT_MSG(std::is_sorted(std::forward<ExecutionPolicy>(exec), u_begin, u_end), "The x values must be sorted in order to use this functionality");
+
+    const std::size_t n = static_cast<std::size_t>(std::distance(v_begin, v_end));
+
+    BOOST_MATH_ASSERT_MSG(M >= 1, "The number of right nearest neighbours M must be at least 1");
+    BOOST_MATH_ASSERT_MSG(M <= n, "The number of right nearest neighbours M must not exceed the number of samples");
+
+    // If Y is constant the statistic is undefined (Y must be non-constant); return a quiet NaN.
+    // This is checked on the input directly: rank() collapses tied values, so a constant Y would
+    // otherwise reduce the effective sample size to one.
+    if (v_begin != v_end)
+    {
+        using value_type = typename std::iterator_traits<ForwardIterator>::value_type;
+        const value_type first_value = *v_begin;
+        if (std::all_of(std::next(v_begin), v_end,
+                        [&first_value](const value_type& value) { return value == first_value; }))
+        {
+            return std::numeric_limits<ReturnType>::quiet_NaN();
+        }
+    }
+
+    const auto rank_vector = rank(std::forward<ExecutionPolicy>(exec), v_begin, v_end);
+
+    const auto num_threads = std::thread::hardware_concurrency() == 0 ? 2u : std::thread::hardware_concurrency();
+
+    // The i-loop is embarrassingly parallel: each task sums over a disjoint contiguous range of
+    // indices [lo, hi) and reads (read-only) rank_vector entries up to i + M, which may fall in a
+    // neighbouring task's range. Because there are no writes, the partition needs no overlap; this
+    // differs from the M = 1 parallel path above, which splits the data array (and overlaps by one
+    // element) for the difference-based transform.
+    std::vector<std::future<std::uint64_t>> future_manager {};
+    future_manager.reserve(num_threads);
+
+    const std::size_t chunk = static_cast<std::size_t>(std::ceil(static_cast<double>(n) / num_threads));
+
+    std::size_t lo = 0;
+    while (lo < n)
+    {
+        const std::size_t hi = (std::min)(lo + chunk, n);
+        future_manager.emplace_back(std::async(std::launch::async | std::launch::deferred,
+            [&rank_vector, lo, hi, M]() -> std::uint64_t
+            {
+                return chatterjee_mnn_min_sum(rank_vector, lo, hi, M);
+            }));
+        lo = hi;
+    }
+
+    std::uint64_t sum = 0;
+    for (std::size_t i {}; i < future_manager.size(); ++i)
+    {
+        sum += future_manager[i].get();
+    }
+
+    return chatterjee_mnn_result<ReturnType>(sum, n, M);
+}
+
 } // Namespace detail
 
 template <typename ExecutionPolicy, typename Container, typename Real = typename Container::value_type,
@@ -152,6 +305,23 @@ inline ReturnType chatterjee_correlation(ExecutionPolicy&& exec, const Container
     }
 }
 
+template <typename ExecutionPolicy, typename Container, typename Real = typename Container::value_type,
+          typename ReturnType = std::conditional_t<std::is_integral_v<Real>, double, Real>>
+inline ReturnType chatterjee_correlation_mnn(ExecutionPolicy&& exec, const Container& u, const Container& v, std::size_t M)
+{
+    if constexpr (std::is_same_v<std::remove_reference_t<decltype(exec)>, decltype(std::execution::seq)>)
+    {
+        return detail::chatterjee_correlation_mnn_seq_impl<ReturnType>(std::cbegin(u), std::cend(u),
+                                                                       std::cbegin(v), std::cend(v), M);
+    }
+    else
+    {
+        return detail::chatterjee_correlation_mnn_par_impl<ReturnType>(std::forward<ExecutionPolicy>(exec),
+                                                                       std::cbegin(u), std::cend(u),
+                                                                       std::cbegin(v), std::cend(v), M);
+    }
+}
+
 } // Namespace boost::math::statistics
 
 #endif