From df05b9d06090312b432489790d8e7d12c47b7e22 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 15 Dec 2025 12:26:08 +0100 Subject: [PATCH 1/5] add floyd-rivest and parallelization to gmedian --- src/data.table.h | 3 ++ src/gsumm.c | 69 ++++++++++++++++++++++++++++++--------------- src/quickselect.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 22 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index ebfa1232c6..48f1d346dd 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -229,6 +229,9 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, double dquickselect(double *x, int n); double iquickselect(int *x, int n); double i64quickselect(int64_t *x, int n); +double dfloyd_rivest(double *x, int n); +double ifloyd_rivest(int *x, int n); +double i64floyd_rivest(int64_t *x, int n); // fread.c double wallclock(void); diff --git a/src/gsumm.c b/src/gsumm.c index 5970f59194..3cf8d68a34 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -880,35 +880,60 @@ SEXP gmedian(SEXP x, SEXP narmArg) { const bool nosubset = irowslen==-1; switch(TYPEOF(x)) { case REALSXP: { - double *subd = REAL(PROTECT(allocVector(REALSXP, maxgrpn))); // allocate once upfront and reuse int64_t *xi64 = (int64_t *)REAL(x); double *xd = REAL(x); - for (int i=0; i 100 ? i64floyd_rivest((int64_t *)thread_subd, thisgrpsize) : i64quickselect((int64_t *)thread_subd, thisgrpsize)) : + (thisgrpsize > 100 ? dfloyd_rivest(thread_subd, thisgrpsize) : dquickselect(thread_subd, thisgrpsize)); + } } - thisgrpsize -= nacount; // all-NA is returned as NA_REAL via n==0 case inside *quickselect - ansd[i] = (nacount && !narm) ? NA_REAL : (isInt64 ? i64quickselect((void *)subd, thisgrpsize) : dquickselect(subd, thisgrpsize)); + free(thread_subd); }} break; case LGLSXP: case INTSXP: { - int *subi = INTEGER(PROTECT(allocVector(INTSXP, maxgrpn))); int *xi = INTEGER(x); - for (int i=0; i 100 ? ifloyd_rivest(thread_subi, validsize) : iquickselect(thread_subi, validsize); + } } - ansd[i] = (nacount && !narm) ? NA_REAL : iquickselect(subi, thisgrpsize-nacount); + free(thread_subi); }} break; default: @@ -916,7 +941,7 @@ SEXP gmedian(SEXP x, SEXP narmArg) { } if (!isInt64) copyMostAttrib(x, ans); // else the integer64 class needs to be dropped since double is always returned by gmedian - UNPROTECT(2); // ans, subd|subi + UNPROTECT(1); // ans return ans; } diff --git a/src/quickselect.c b/src/quickselect.c index 5277c0d65a..4ca909b140 100644 --- a/src/quickselect.c +++ b/src/quickselect.c @@ -71,3 +71,74 @@ double i64quickselect(int64_t *x, int n) int64_t a, b; BODY(i64swap); } + +// Floyd-Rivest selection algorithm + +#undef FLOYD_RIVEST_BODY +#define FLOYD_RIVEST_BODY(SWAP, TYPE) \ + if (n == 0) return NA_REAL; \ + unsigned long med = n / 2 - (n % 2 == 0); \ + unsigned long l = 0, ir = n - 1; \ + while (ir > l) { \ + if (ir - l > 600) { \ + unsigned long nn = ir - l + 1; \ + unsigned long i = med - l + 1; \ + double z = log((double)nn); \ + double s = 0.5 * exp(2.0*z/3.0); \ + double sd = 0.5 * sqrt(z*s*(nn-s)/nn); \ + if (i < nn/2) sd = -sd; \ + unsigned long newL = (unsigned long)(MAX((long)l, (long)(med - i*s/nn + sd))); \ + unsigned long newR = (unsigned long)(MIN((long)ir, (long)(med + (nn-i)*s/nn + sd))); \ + l = newL; \ + ir = newR; \ + } \ + TYPE pivot = x[med]; \ + unsigned long i = l, j = ir; \ + SWAP(x + l, x + med); \ + if (x[ir] > pivot) { \ + SWAP(x + ir, x + l); \ + pivot = x[l]; \ + } \ + while (i < j) { \ + SWAP(x + i, x + j); \ + i++; j--; \ + while (x[i] < pivot) i++; \ + while (x[j] > pivot) j--; \ + } \ + if (x[l] == pivot) { \ + SWAP(x + l, x + j); \ + } else { \ + j++; \ + SWAP(x + j, x + ir); \ + } \ + if (j <= med) l = j + 1; \ + if (med <= j) ir = j - 1; \ + } \ + a = x[med]; \ + if (n % 2 == 1) { \ + return (double)a; \ + } else { \ + b = x[med + 1]; \ + for (unsigned long i = med + 2; i < n; i++) { \ + if (x[i] < b) b = x[i]; \ + } \ + return ((double)a + (double)b) / 2.0; \ + } + +double dfloyd_rivest(double *x, int n) +{ + double a, b; + FLOYD_RIVEST_BODY(dswap, double); +} + +double ifloyd_rivest(int *x, int n) +{ + int a, b; + FLOYD_RIVEST_BODY(iswap, int); +} + +double i64floyd_rivest(int64_t *x, int n) +{ + int64_t a, b; + FLOYD_RIVEST_BODY(i64swap, int64_t); +} From 36742ea61274eeb31e967279a2229cee2762b578 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 16 Dec 2025 21:14:13 +0100 Subject: [PATCH 2/5] Update src/quickselect.c Co-authored-by: Michael Chirico --- src/quickselect.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/quickselect.c b/src/quickselect.c index 4ca909b140..67dfe34e3e 100644 --- a/src/quickselect.c +++ b/src/quickselect.c @@ -73,6 +73,7 @@ double i64quickselect(int64_t *x, int n) } // Floyd-Rivest selection algorithm +// https://en.wikipedia.org/wiki/Floyd%E2%80%93Rivest_algorithm #undef FLOYD_RIVEST_BODY #define FLOYD_RIVEST_BODY(SWAP, TYPE) \ From 6d94f8cd3a2efdea056d6798952743cc7955888b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 17 Jan 2026 17:14:17 +0100 Subject: [PATCH 3/5] add missed recursive call --- inst/tests/tests.Rraw | 10 +++++ src/quickselect.c | 93 ++++++++++++++++++++++++------------------- 2 files changed, 62 insertions(+), 41 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 10fd2fc7f4..9dabf87e66 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21484,3 +21484,13 @@ dt = data.table(a=1:4, b=1:2) test(2362.51, optimize=0:2, dt[, c(list()), b, verbose=TRUE], data.table(b=integer(0L)), output="GForce FALSE") test(2362.52, optimize=0:2, dt[, c(lapply(.SD, sum), list()), b, verbose=TRUE], output=out) test(2362.53, optimize=0:2, dt[, list(lapply(.SD, sum), list()), b, verbose=TRUE], output="GForce FALSE") + +set.seed(1) +dt = data.table(g=sample(1:2, 1e4, TRUE), i=sample(1e4), d=rnorm(1e4)) +test(2363.1, optimize=0:2, dt[, median(i), by=g][, .(g, V1=as.numeric(V1))]) +test(2363.2, optimize=0:2, dt[, median(d), by=g][, .(g, V1=as.numeric(V1))]) +if (test_bit64) { + dt[, dd := as.integer64(sample(i))] + # test omitted to clarify intended bit64 behavior + # test(2363.3, dt[, median(dd), by=g]) +} diff --git a/src/quickselect.c b/src/quickselect.c index 67dfe34e3e..40a242a8e6 100644 --- a/src/quickselect.c +++ b/src/quickselect.c @@ -75,52 +75,63 @@ double i64quickselect(int64_t *x, int n) // Floyd-Rivest selection algorithm // https://en.wikipedia.org/wiki/Floyd%E2%80%93Rivest_algorithm +#undef DEFINE_FLOYD_RIVEST_SELECT_FN +#define DEFINE_FLOYD_RIVEST_SELECT_FN(SELECT_FN, TYPE, SWAP) \ + static void SELECT_FN(TYPE *x, unsigned long left, unsigned long right, unsigned long k) { \ + while (right > left) { \ + if (right - left > 600) { \ + unsigned long n = right - left + 1; \ + unsigned long i = k - left + 1; \ + double z = log((double)n); \ + double s = 0.5 * exp(2.0 * z / 3.0); \ + double sd = 0.5 * sqrt(z * s * (n - s) / n); \ + long delta = (long)i - (long)(n / 2); \ + if (delta < 0) sd = -sd; \ + else if (delta == 0) sd = 0.0; \ + double new_left = (double)k - (double)i * s / n + sd; \ + double new_right = (double)k + (double)(n - i) * s / n + sd; \ + unsigned long newL = (unsigned long)MAX((long)left, (long)new_left); \ + unsigned long newR = (unsigned long)MIN((long)right, (long)new_right); \ + SELECT_FN(x, newL, newR, k); \ + } \ + TYPE pivot = x[k]; \ + unsigned long i = left, j = right; \ + SWAP(x + left, x + k); \ + if (x[right] > pivot) { \ + SWAP(x + right, x + left); \ + } \ + while (i < j) { \ + SWAP(x + i, x + j); \ + i++; j--; \ + while (x[i] < pivot) i++; \ + while (x[j] > pivot) j--; \ + } \ + if (x[left] == pivot) { \ + SWAP(x + left, x + j); \ + } else { \ + j++; \ + SWAP(x + j, x + right); \ + } \ + if (j <= k) left = j + 1; \ + if (k <= j) right = j - 1; \ + } \ + } + +DEFINE_FLOYD_RIVEST_SELECT_FN(dfloyd_rivest_select, double, dswap) +DEFINE_FLOYD_RIVEST_SELECT_FN(ifloyd_rivest_select, int, iswap) +DEFINE_FLOYD_RIVEST_SELECT_FN(i64floyd_rivest_select, int64_t, i64swap) + #undef FLOYD_RIVEST_BODY -#define FLOYD_RIVEST_BODY(SWAP, TYPE) \ +#define FLOYD_RIVEST_BODY(TYPE, SELECTFN) \ if (n == 0) return NA_REAL; \ unsigned long med = n / 2 - (n % 2 == 0); \ - unsigned long l = 0, ir = n - 1; \ - while (ir > l) { \ - if (ir - l > 600) { \ - unsigned long nn = ir - l + 1; \ - unsigned long i = med - l + 1; \ - double z = log((double)nn); \ - double s = 0.5 * exp(2.0*z/3.0); \ - double sd = 0.5 * sqrt(z*s*(nn-s)/nn); \ - if (i < nn/2) sd = -sd; \ - unsigned long newL = (unsigned long)(MAX((long)l, (long)(med - i*s/nn + sd))); \ - unsigned long newR = (unsigned long)(MIN((long)ir, (long)(med + (nn-i)*s/nn + sd))); \ - l = newL; \ - ir = newR; \ - } \ - TYPE pivot = x[med]; \ - unsigned long i = l, j = ir; \ - SWAP(x + l, x + med); \ - if (x[ir] > pivot) { \ - SWAP(x + ir, x + l); \ - pivot = x[l]; \ - } \ - while (i < j) { \ - SWAP(x + i, x + j); \ - i++; j--; \ - while (x[i] < pivot) i++; \ - while (x[j] > pivot) j--; \ - } \ - if (x[l] == pivot) { \ - SWAP(x + l, x + j); \ - } else { \ - j++; \ - SWAP(x + j, x + ir); \ - } \ - if (j <= med) l = j + 1; \ - if (med <= j) ir = j - 1; \ - } \ + SELECTFN(x, 0, n - 1, med); \ a = x[med]; \ if (n % 2 == 1) { \ return (double)a; \ } else { \ b = x[med + 1]; \ - for (unsigned long i = med + 2; i < n; i++) { \ + for (unsigned long i = med + 2; i < (unsigned long)n; i++) { \ if (x[i] < b) b = x[i]; \ } \ return ((double)a + (double)b) / 2.0; \ @@ -129,17 +140,17 @@ double i64quickselect(int64_t *x, int n) double dfloyd_rivest(double *x, int n) { double a, b; - FLOYD_RIVEST_BODY(dswap, double); + FLOYD_RIVEST_BODY(double, dfloyd_rivest_select); } double ifloyd_rivest(int *x, int n) { int a, b; - FLOYD_RIVEST_BODY(iswap, int); + FLOYD_RIVEST_BODY(int, ifloyd_rivest_select); } double i64floyd_rivest(int64_t *x, int n) { int64_t a, b; - FLOYD_RIVEST_BODY(i64swap, int64_t); + FLOYD_RIVEST_BODY(int64_t, i64floyd_rivest_select); } From 9c0b64db5af5660789a08852bcc9848aadda5113 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 17 Jan 2026 17:15:38 +0100 Subject: [PATCH 4/5] add NEWS --- NEWS.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/NEWS.md b/NEWS.md index f218b93fbc..47d8bc2444 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,20 @@ - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))` will use GForce, saving the need to coerce `x` in a setup step) [#2934](https://github.com/Rdatatable/data.table/issues/2934) - Arithmetic operation support in GForce (e.g., `max(x) - min(x)` will use GForce on both `max(x)` and `min(x)`, saving the need to do the subtraction in a follow-up step) [#3815](https://github.com/Rdatatable/data.table/issues/3815) +4. `median()` is now faster when used in `by=` grouping operations. Thanks @ben-schwen for the suggestion and implementation. + ```r + set.seed(1) + DT = data.table(g = sample(1e4, 1e6, TRUE), v = rnorm(1e6)) + microbenchmark::microbenchmark( + "master" = `[.data.table`(DT, j=median(v), by=g), + "1.18.0" = data.table:::`[.data.table`(DT, j=median(v), by=g) + ) + # Unit: milliseconds + # expr min lq mean median uq max neval + # master 6.761768 8.392194 22.36680 9.221132 10.66201 1102.11168 100 + # 1.18.0 18.928767 22.003751 26.64251 24.273920 28.06920 88.66257 100 + ``` + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. From 19f0b8d150a2c7f6e1a2a695dd0102cabb2a6d05 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 17 Jan 2026 17:18:33 +0100 Subject: [PATCH 5/5] update NEWS example --- NEWS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 47d8bc2444..79a4afd6bd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -29,15 +29,15 @@ 4. `median()` is now faster when used in `by=` grouping operations. Thanks @ben-schwen for the suggestion and implementation. ```r set.seed(1) - DT = data.table(g = sample(1e4, 1e6, TRUE), v = rnorm(1e6)) + DT = data.table(g = sample(1e4, 1e7, TRUE), v = rnorm(1e7)) microbenchmark::microbenchmark( "master" = `[.data.table`(DT, j=median(v), by=g), "1.18.0" = data.table:::`[.data.table`(DT, j=median(v), by=g) ) # Unit: milliseconds - # expr min lq mean median uq max neval - # master 6.761768 8.392194 22.36680 9.221132 10.66201 1102.11168 100 - # 1.18.0 18.928767 22.003751 26.64251 24.273920 28.06920 88.66257 100 + # expr min lq mean median uq max neval + # master 85.11445 92.76479 101.6840 99.22233 105.7872 146.8693 100 + # 1.18.0 217.07793 232.08915 250.6945 244.30433 264.9017 321.9682 100 ``` ### BUG FIXES