diff --git a/NEWS.md b/NEWS.md index f218b93fb..79a4afd6b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,20 @@ - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))` will use GForce, saving the need to coerce `x` in a setup step) [#2934](https://github.com/Rdatatable/data.table/issues/2934) - Arithmetic operation support in GForce (e.g., `max(x) - min(x)` will use GForce on both `max(x)` and `min(x)`, saving the need to do the subtraction in a follow-up step) [#3815](https://github.com/Rdatatable/data.table/issues/3815) +4. `median()` is now faster when used in `by=` grouping operations. Thanks @ben-schwen for the suggestion and implementation. + ```r + set.seed(1) + DT = data.table(g = sample(1e4, 1e7, TRUE), v = rnorm(1e7)) + microbenchmark::microbenchmark( + "master" = `[.data.table`(DT, j=median(v), by=g), + "1.18.0" = data.table:::`[.data.table`(DT, j=median(v), by=g) + ) + # Unit: milliseconds + # expr min lq mean median uq max neval + # master 85.11445 92.76479 101.6840 99.22233 105.7872 146.8693 100 + # 1.18.0 217.07793 232.08915 250.6945 244.30433 264.9017 321.9682 100 + ``` + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 10fd2fc7f..9dabf87e6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21484,3 +21484,13 @@ dt = data.table(a=1:4, b=1:2) test(2362.51, optimize=0:2, dt[, c(list()), b, verbose=TRUE], data.table(b=integer(0L)), output="GForce FALSE") test(2362.52, optimize=0:2, dt[, c(lapply(.SD, sum), list()), b, verbose=TRUE], output=out) test(2362.53, optimize=0:2, dt[, list(lapply(.SD, sum), list()), b, verbose=TRUE], output="GForce FALSE") + +set.seed(1) +dt = data.table(g=sample(1:2, 1e4, TRUE), i=sample(1e4), d=rnorm(1e4)) +test(2363.1, optimize=0:2, dt[, median(i), by=g][, .(g, V1=as.numeric(V1))]) +test(2363.2, optimize=0:2, dt[, median(d), by=g][, .(g, V1=as.numeric(V1))]) +if (test_bit64) { + dt[, dd := as.integer64(sample(i))] + # test omitted to clarify intended bit64 behavior + # test(2363.3, dt[, median(dd), by=g]) +} diff --git a/src/data.table.h b/src/data.table.h index d6c67c752..6f7e711b2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -236,6 +236,9 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, double dquickselect(double *x, int n); double iquickselect(int *x, int n); double i64quickselect(int64_t *x, int n); +double dfloyd_rivest(double *x, int n); +double ifloyd_rivest(int *x, int n); +double i64floyd_rivest(int64_t *x, int n); // fread.c double wallclock(void); diff --git a/src/gsumm.c b/src/gsumm.c index 890d010fc..575924c46 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -880,35 +880,60 @@ SEXP gmedian(SEXP x, SEXP narmArg) { const bool nosubset = irowslen==-1; switch(TYPEOF(x)) { case REALSXP: { - double *subd = REAL(PROTECT(allocVector(REALSXP, maxgrpn))); // allocate once upfront and reuse int64_t *xi64 = (int64_t *)REAL(x); double *xd = REAL(x); - for (int i=0; i 100 ? i64floyd_rivest((int64_t *)thread_subd, thisgrpsize) : i64quickselect((int64_t *)thread_subd, thisgrpsize)) : + (thisgrpsize > 100 ? dfloyd_rivest(thread_subd, thisgrpsize) : dquickselect(thread_subd, thisgrpsize)); + } } - thisgrpsize -= nacount; // all-NA is returned as NA_REAL via n==0 case inside *quickselect - ansd[i] = (nacount && !narm) ? NA_REAL : (isInt64 ? i64quickselect((void *)subd, thisgrpsize) : dquickselect(subd, thisgrpsize)); + free(thread_subd); }} break; case LGLSXP: case INTSXP: { - int *subi = INTEGER(PROTECT(allocVector(INTSXP, maxgrpn))); int *xi = INTEGER(x); - for (int i=0; i 100 ? ifloyd_rivest(thread_subi, validsize) : iquickselect(thread_subi, validsize); + } } - ansd[i] = (nacount && !narm) ? NA_REAL : iquickselect(subi, thisgrpsize-nacount); + free(thread_subi); }} break; default: @@ -916,7 +941,7 @@ SEXP gmedian(SEXP x, SEXP narmArg) { } if (!isInt64) copyMostAttrib(x, ans); // else the integer64 class needs to be dropped since double is always returned by gmedian - UNPROTECT(2); // ans, subd|subi + UNPROTECT(1); // ans return ans; } diff --git a/src/quickselect.c b/src/quickselect.c index 5277c0d65..40a242a8e 100644 --- a/src/quickselect.c +++ b/src/quickselect.c @@ -71,3 +71,86 @@ double i64quickselect(int64_t *x, int n) int64_t a, b; BODY(i64swap); } + +// Floyd-Rivest selection algorithm +// https://en.wikipedia.org/wiki/Floyd%E2%80%93Rivest_algorithm + +#undef DEFINE_FLOYD_RIVEST_SELECT_FN +#define DEFINE_FLOYD_RIVEST_SELECT_FN(SELECT_FN, TYPE, SWAP) \ + static void SELECT_FN(TYPE *x, unsigned long left, unsigned long right, unsigned long k) { \ + while (right > left) { \ + if (right - left > 600) { \ + unsigned long n = right - left + 1; \ + unsigned long i = k - left + 1; \ + double z = log((double)n); \ + double s = 0.5 * exp(2.0 * z / 3.0); \ + double sd = 0.5 * sqrt(z * s * (n - s) / n); \ + long delta = (long)i - (long)(n / 2); \ + if (delta < 0) sd = -sd; \ + else if (delta == 0) sd = 0.0; \ + double new_left = (double)k - (double)i * s / n + sd; \ + double new_right = (double)k + (double)(n - i) * s / n + sd; \ + unsigned long newL = (unsigned long)MAX((long)left, (long)new_left); \ + unsigned long newR = (unsigned long)MIN((long)right, (long)new_right); \ + SELECT_FN(x, newL, newR, k); \ + } \ + TYPE pivot = x[k]; \ + unsigned long i = left, j = right; \ + SWAP(x + left, x + k); \ + if (x[right] > pivot) { \ + SWAP(x + right, x + left); \ + } \ + while (i < j) { \ + SWAP(x + i, x + j); \ + i++; j--; \ + while (x[i] < pivot) i++; \ + while (x[j] > pivot) j--; \ + } \ + if (x[left] == pivot) { \ + SWAP(x + left, x + j); \ + } else { \ + j++; \ + SWAP(x + j, x + right); \ + } \ + if (j <= k) left = j + 1; \ + if (k <= j) right = j - 1; \ + } \ + } + +DEFINE_FLOYD_RIVEST_SELECT_FN(dfloyd_rivest_select, double, dswap) +DEFINE_FLOYD_RIVEST_SELECT_FN(ifloyd_rivest_select, int, iswap) +DEFINE_FLOYD_RIVEST_SELECT_FN(i64floyd_rivest_select, int64_t, i64swap) + +#undef FLOYD_RIVEST_BODY +#define FLOYD_RIVEST_BODY(TYPE, SELECTFN) \ + if (n == 0) return NA_REAL; \ + unsigned long med = n / 2 - (n % 2 == 0); \ + SELECTFN(x, 0, n - 1, med); \ + a = x[med]; \ + if (n % 2 == 1) { \ + return (double)a; \ + } else { \ + b = x[med + 1]; \ + for (unsigned long i = med + 2; i < (unsigned long)n; i++) { \ + if (x[i] < b) b = x[i]; \ + } \ + return ((double)a + (double)b) / 2.0; \ + } + +double dfloyd_rivest(double *x, int n) +{ + double a, b; + FLOYD_RIVEST_BODY(double, dfloyd_rivest_select); +} + +double ifloyd_rivest(int *x, int n) +{ + int a, b; + FLOYD_RIVEST_BODY(int, ifloyd_rivest_select); +} + +double i64floyd_rivest(int64_t *x, int n) +{ + int64_t a, b; + FLOYD_RIVEST_BODY(int64_t, i64floyd_rivest_select); +}