From e244973729890eadb24a0d65f11befcfa1e29a05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Wed, 10 Jun 2026 11:12:53 +0200 Subject: [PATCH 1/4] Thread-safe C function to unzip without calling R API --- R/threaded.R | 1 + src/Makevars | 3 +- src/Makevars.win | 2 +- src/cmdunzip_lib.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++ src/zip.c | 4 +-- src/zip.h | 8 +++++ 6 files changed, 90 insertions(+), 4 deletions(-) create mode 100644 R/threaded.R create mode 100644 src/cmdunzip_lib.c diff --git a/R/threaded.R b/R/threaded.R new file mode 100644 index 0000000..1bc2aa0 --- /dev/null +++ b/R/threaded.R @@ -0,0 +1 @@ +threaded_unzip <- function(zipfiles, exdirs = ".") {} diff --git a/src/Makevars b/src/Makevars index 0697e9c..3d9f3b1 100644 --- a/src/Makevars +++ b/src/Makevars @@ -11,7 +11,8 @@ MBEDTLS_OBJ = $(MBEDTLS_SRC:.c=.o) PKG_CPPFLAGS = $(MBEDTLS_CPPFLAGS) PKG_CFLAGS = $(C_VISIBILITY) -OBJECTS = init.o miniz.o rzip.o zip.o crypto.o unixutils.o errors.o cleancall.o $(MBEDTLS_OBJ) +OBJECTS = init.o miniz.o rzip.o zip.o crypto.o unixutils.o errors.o \ + cleancall.o cmdunzip_lib.o $(MBEDTLS_OBJ) .PHONY: all clean diff --git a/src/Makevars.win b/src/Makevars.win index e45a23f..fa108fb 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -12,7 +12,7 @@ PKG_CPPFLAGS = $(MBEDTLS_CPPFLAGS) # -lbcrypt: BCryptGenRandom(), used for WinZip AES salts (see crypto.c). PKG_LIBS = -lbcrypt -OBJECTS = init.o miniz.o rzip.o zip.o crypto.o winutils.o errors.o cleancall.o $(MBEDTLS_OBJ) +OBJECTS = init.o miniz.o rzip.o zip.o crypto.o winutils.o errors.o cleancall.o cmdunzip_lib.o $(MBEDTLS_OBJ) .PHONY: all clean diff --git a/src/cmdunzip_lib.c b/src/cmdunzip_lib.c new file mode 100644 index 0000000..cdbfa43 --- /dev/null +++ b/src/cmdunzip_lib.c @@ -0,0 +1,76 @@ + +#include +#include +#include + +#include "zip.h" + +static ZIP_THREAD_LOCAL char tl_error_msg[1024]; + +static void dll_error_handler(const char *reason, const char *file, + int line, int zip_errno, int eno) { + (void) file; (void) line; (void) zip_errno; (void) eno; + snprintf(tl_error_msg, sizeof(tl_error_msg), "%s", reason); + /* return normally; ZIP_ERROR macro executes `return 1` next */ +} + +static int hex_nibble(unsigned int c) { + if (c >= '0' && c <= '9') return (int)(c - '0'); + if (c >= 'a' && c <= 'f') return (int)(c - 'a') + 10; + if (c >= 'A' && c <= 'F') return (int)(c - 'A') + 10; + return -1; +} + +static int decode_hex_password(const char *hex, unsigned char **out) { + size_t hlen = strlen(hex); + if (hlen % 2 != 0) return -1; + size_t outlen = hlen / 2; + *out = (unsigned char *) malloc(outlen + 1); + if (!*out) return -1; + for (size_t i = 0; i < outlen; i++) { + int hi = hex_nibble((unsigned int) hex[2 * i]); + int lo = hex_nibble((unsigned int) hex[2 * i + 1]); + if (hi < 0 || lo < 0) { free(*out); return -1; } + (*out)[i] = (unsigned char)((hi << 4) | lo); + } + return (int) outlen; +} + +/* + * Unzip zipfile into exdir. hex_password may be NULL or empty string. + * All paths are UTF-8. Returns 0 on success, non-zero on failure. + * On failure, error_buf is filled with a null-terminated message. + */ +#if defined(__GNUC__) || defined(__clang__) +__attribute__((visibility("default"))) +#endif +int do_cmdunzip(const char *zipfile, const char *exdir, + const char *hex_password, + char *error_buf, size_t error_buf_len) { + zip_set_error_handler(dll_error_handler); + tl_error_msg[0] = '\0'; + + unsigned char *password = NULL; + int password_len = 0; + if (hex_password && hex_password[0]) { + password_len = decode_hex_password(hex_password, &password); + if (password_len < 0) { + snprintf(error_buf, error_buf_len, "Invalid hex password"); + return 1; + } + } + + int ret = zip_unzip(zipfile, /* cfiles= */ NULL, /* num_files= */ 0, + /* coverwrite= */ 1, /* cjunkpaths= */ 0, + /* exdir= */ exdir, + /* decode_fn= */ NULL, /* decode_data= */ NULL, + /* entry_fn= */ NULL, /* entry_data= */ NULL, + password, (size_t) password_len); + free(password); + + if (ret != 0 && error_buf && error_buf_len > 0) { + snprintf(error_buf, error_buf_len, "%s", + tl_error_msg[0] ? tl_error_msg : "unknown error"); + } + return ret; +} diff --git a/src/zip.c b/src/zip.c index ea5aa1c..52075f7 100644 --- a/src/zip.c +++ b/src/zip.c @@ -72,7 +72,7 @@ char *zip_cp437_to_utf8(const char *src) { #define ZIP_ERROR_BUFFER_SIZE 1000 -static char zip_error_buffer[ZIP_ERROR_BUFFER_SIZE]; +static ZIP_THREAD_LOCAL char zip_error_buffer[ZIP_ERROR_BUFFER_SIZE]; static const char *zip_error_strings[] = { /* 0 R_ZIP_ESUCCESS */ "Success", @@ -105,7 +105,7 @@ static const char *zip_error_strings[] = { "`%s` in `%s` is encrypted but no password was provided" }; -static zip_error_handler_t *zip_error_handler = 0; +static ZIP_THREAD_LOCAL zip_error_handler_t *zip_error_handler = 0; void zip_set_error_handler(zip_error_handler_t *handler) { zip_error_handler = handler; diff --git a/src/zip.h b/src/zip.h index a93d62c..6868126 100644 --- a/src/zip.h +++ b/src/zip.h @@ -2,6 +2,14 @@ #ifndef R_ZIP_H #define R_ZIP_H +#if defined(_MSC_VER) +# define ZIP_THREAD_LOCAL __declspec(thread) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +# define ZIP_THREAD_LOCAL _Thread_local +#else +# define ZIP_THREAD_LOCAL __thread +#endif + #include #include #include From bc8bde5a807d62b00b4190ef78b8ff2e4e3c84cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Wed, 10 Jun 2026 11:56:20 +0200 Subject: [PATCH 2/4] Concurrent unzipping of several zip files in threads Closes #147. --- R/threaded.R | 72 +++++++++++- README.Rmd | 4 + README.md | 5 + src/Makevars | 1 + src/cmdunzip_lib.c | 131 ++++++++++++++++++++++ src/init.c | 2 + src/install.libs.R | 2 +- tests/testthat/_snaps/threaded-unzip.md | 67 +++++++++++ tests/testthat/test-threaded-unzip.R | 142 ++++++++++++++++++++++++ 9 files changed, 424 insertions(+), 2 deletions(-) create mode 100644 tests/testthat/_snaps/threaded-unzip.md create mode 100644 tests/testthat/test-threaded-unzip.R diff --git a/R/threaded.R b/R/threaded.R index 1bc2aa0..cbfab5f 100644 --- a/R/threaded.R +++ b/R/threaded.R @@ -1 +1,71 @@ -threaded_unzip <- function(zipfiles, exdirs = ".") {} +threaded_unzip <- function( + zipfiles, + exdirs = ".", + num_threads = NULL, + passwords = NULL +) { + passwords <- passwords %||% "" + stopifnot( + length(exdirs) == 1L || length(exdirs) == length(zipfiles), + length(passwords) == 1L || length(passwords) == length(zipfiles) + ) + if (length(exdirs) == 1L) { + exdirs <- rep_len(exdirs, length(zipfiles)) + } + if (length(passwords) == 1L) { + passwords <- rep_len(passwords, length(zipfiles)) + } + num_threads <- num_threads %||% get_num_threads() + ret <- .Call( + c_R_threaded_unzip, + as.character(zipfiles), + as.character(exdirs), + as.integer(num_threads), + as.character(passwords) + ) + failed <- ret[[1]] != 0L + if (any(failed)) { + msgs <- paste0( + " ", + zipfiles[failed], + ": ", + ret[[2]][failed], + collapse = "\n" + ) + stop( + "Failed to unzip ", + sum(failed), + " file", + if (sum(failed) > 1) "s" else "", + ":\n", + msgs, + call. = FALSE + ) + } + invisible(NULL) +} + +get_num_threads <- function() { + opt <- getOption("zip_threads") + if (!is.null(opt)) { + if (!is.numeric(opt) || length(opt) != 1L || opt < 1L) { + stop( + "Invalid value for 'zip_threads' option, must be a positive integer." + ) + } + return(as.integer(opt)) + } + ev <- Sys.getenv("ZIP_THREADS", NA_character_) + if (!is.na(ev)) { + evval <- suppressWarnings(as.integer(ev)) + if (is.na(evval) || evval < 1L) { + stop( + "Invalid value for ZIP_THREADS environment variable, ", + "must be a positive integer." + ) + } + return(evval) + } + + 2L +} diff --git a/README.Rmd b/README.Rmd index 8bc225a..44bf2ce 100644 --- a/README.Rmd +++ b/README.Rmd @@ -130,6 +130,8 @@ variable to `TRUE`. Can be set to a string, a raw vector, or a function returning one. * `zip_progress`: If set to `TRUE`, progress bars are enabled. Takes precedence over the `ZIP_PROGRESS` environment variable. +* `zip_threads`: The number of threads to use for threaded operations. + Takes precedence over the `ZIP_THREADS` environment variable. #### Environment variables @@ -139,6 +141,8 @@ variable to `TRUE`. `cmdunzip.exe` is blocked by system policies on Windows. * `ZIP_PROGRESS`: If set to `TRUE`, progress bars are enabled. The `zip_progress` option takes precedence over this environment variable. +* `ZIP_THREADS`: The number of threads to use for threaded operations. The + `zip_threads` option takes precedence over this environment variable. ## License diff --git a/README.md b/README.md index b28e832..07932d2 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,8 @@ environment variable to `TRUE`. returning one. - `zip_progress`: If set to `TRUE`, progress bars are enabled. Takes precedence over the `ZIP_PROGRESS` environment variable. +- `zip_threads`: The number of threads to use for threaded operations. + Takes precedence over the `ZIP_THREADS` environment variable. #### Environment variables @@ -157,6 +159,9 @@ environment variable to `TRUE`. when `cmdunzip.exe` is blocked by system policies on Windows. - `ZIP_PROGRESS`: If set to `TRUE`, progress bars are enabled. The `zip_progress` option takes precedence over this environment variable. +- `ZIP_THREADS`: The number of threads to use for threaded operations. + The `zip_threads` option takes precedence over this environment + variable. ## License diff --git a/src/Makevars b/src/Makevars index 3d9f3b1..bf278ec 100644 --- a/src/Makevars +++ b/src/Makevars @@ -10,6 +10,7 @@ MBEDTLS_OBJ = $(MBEDTLS_SRC:.c=.o) PKG_CPPFLAGS = $(MBEDTLS_CPPFLAGS) PKG_CFLAGS = $(C_VISIBILITY) +PKG_LIBS = -lpthread OBJECTS = init.o miniz.o rzip.o zip.o crypto.o unixutils.o errors.o \ cleancall.o cmdunzip_lib.o $(MBEDTLS_OBJ) diff --git a/src/cmdunzip_lib.c b/src/cmdunzip_lib.c index cdbfa43..a7015b8 100644 --- a/src/cmdunzip_lib.c +++ b/src/cmdunzip_lib.c @@ -3,6 +3,12 @@ #include #include +#include + +#ifndef _WIN32 +#include +#endif + #include "zip.h" static ZIP_THREAD_LOCAL char tl_error_msg[1024]; @@ -74,3 +80,128 @@ int do_cmdunzip(const char *zipfile, const char *exdir, } return ret; } + +/* ---- threaded batch unzip ------------------------------------------- */ + +typedef struct { + const char **zipfiles; + const char **exdirs; + const char **passwords; /* NULL when unused */ + int n; + int *results; + char **error_bufs; +#ifdef _WIN32 + CRITICAL_SECTION cs; +#else + pthread_mutex_t mutex; +#endif + int next_task; +} unzip_queue_t; + +#ifdef _WIN32 +static DWORD WINAPI unzip_worker(LPVOID arg) +#else +static void *unzip_worker(void *arg) +#endif +{ + unzip_queue_t *q = (unzip_queue_t *) arg; + for (;;) { +#ifdef _WIN32 + EnterCriticalSection(&q->cs); +#else + pthread_mutex_lock(&q->mutex); +#endif + int i = q->next_task++; +#ifdef _WIN32 + LeaveCriticalSection(&q->cs); +#else + pthread_mutex_unlock(&q->mutex); +#endif + if (i >= q->n) break; + const char *pw = (q->passwords && q->passwords[i] && q->passwords[i][0]) + ? q->passwords[i] : NULL; + q->results[i] = do_cmdunzip(q->zipfiles[i], q->exdirs[i], + pw, q->error_bufs[i], 1024); + } +#ifdef _WIN32 + return 0; +#else + return NULL; +#endif +} + +SEXP R_threaded_unzip(SEXP zipfiles, SEXP exdirs, SEXP num_threads, + SEXP passwords) { + int n = LENGTH(zipfiles); + int nthreads = asInteger(num_threads); + if (nthreads <= 0) nthreads = 1; + if (nthreads > n) nthreads = n; + + /* Use malloc, not R_alloc: worker threads access these buffers. */ + int *res = (int *) malloc(n * sizeof(int)); + char **errs = (char **) malloc(n * sizeof(char *)); + for (int i = 0; i < n; i++) { + errs[i] = (char *) malloc(1024); + errs[i][0] = '\0'; + res[i] = 0; + } + + unzip_queue_t q; + memset(&q, 0, sizeof(q)); + q.n = n; + q.zipfiles = (const char **) malloc(n * sizeof(char *)); + q.exdirs = (const char **) malloc(n * sizeof(char *)); + q.passwords = isNull(passwords) ? NULL + : (const char **) malloc(n * sizeof(char *)); + q.results = res; + q.error_bufs = errs; + + /* Capture CHAR pointers before threads start. R GC cannot run while we + are in C (workers never call the R API), so these remain stable. */ + for (int i = 0; i < n; i++) { + q.zipfiles[i] = CHAR(STRING_ELT(zipfiles, i)); + q.exdirs[i] = CHAR(STRING_ELT(exdirs, i)); + if (q.passwords) q.passwords[i] = CHAR(STRING_ELT(passwords, i)); + } + +#ifdef _WIN32 + InitializeCriticalSection(&q.cs); + HANDLE *threads = (HANDLE *) malloc(nthreads * sizeof(HANDLE)); + for (int t = 0; t < nthreads; t++) + threads[t] = CreateThread(NULL, 0, unzip_worker, &q, 0, NULL); + WaitForMultipleObjects((DWORD) nthreads, threads, TRUE, INFINITE); + for (int t = 0; t < nthreads; t++) CloseHandle(threads[t]); + free(threads); + DeleteCriticalSection(&q.cs); +#else + pthread_mutex_init(&q.mutex, NULL); + pthread_t *threads = (pthread_t *) malloc(nthreads * sizeof(pthread_t)); + for (int t = 0; t < nthreads; t++) + pthread_create(&threads[t], NULL, unzip_worker, &q); + for (int t = 0; t < nthreads; t++) + pthread_join(threads[t], NULL); + free(threads); + pthread_mutex_destroy(&q.mutex); +#endif + + free(q.zipfiles); + free(q.exdirs); + if (q.passwords) free(q.passwords); + + /* Build R result — back on the main thread, R API safe again. */ + SEXP r_res = PROTECT(allocVector(INTSXP, n)); + SEXP r_errs = PROTECT(allocVector(STRSXP, n)); + for (int i = 0; i < n; i++) { + INTEGER(r_res)[i] = res[i]; + SET_STRING_ELT(r_errs, i, mkChar(errs[i])); + free(errs[i]); + } + free(res); + free(errs); + + SEXP ret = PROTECT(allocVector(VECSXP, 2)); + SET_VECTOR_ELT(ret, 0, r_res); + SET_VECTOR_ELT(ret, 1, r_errs); + UNPROTECT(3); + return ret; +} diff --git a/src/init.c b/src/init.c index f537972..3a985cf 100644 --- a/src/init.c +++ b/src/init.c @@ -17,6 +17,7 @@ extern SEXP R_make_big_file(SEXP, SEXP); extern SEXP R_inflate(SEXP, SEXP, SEXP, SEXP); extern SEXP R_deflate(SEXP, SEXP, SEXP, SEXP); extern SEXP R_zip_cp437_to_utf8(SEXP); +extern SEXP R_threaded_unzip(SEXP, SEXP, SEXP, SEXP); extern SEXP R_crypto_pbkdf2_sha1(SEXP, SEXP, SEXP, SEXP); extern SEXP R_crypto_hmac_sha1(SEXP, SEXP); extern SEXP R_crypto_aes_ctr(SEXP, SEXP); @@ -31,6 +32,7 @@ static const R_CallMethodDef CallEntries[] = { { "R_inflate", (DL_FUNC) &R_inflate, 4 }, { "R_deflate", (DL_FUNC) &R_deflate, 4 }, { "R_zip_cp437_to_utf8", (DL_FUNC) &R_zip_cp437_to_utf8, 1 }, + { "R_threaded_unzip", (DL_FUNC) &R_threaded_unzip, 4 }, { "R_crypto_pbkdf2_sha1", (DL_FUNC) &R_crypto_pbkdf2_sha1, 4 }, { "R_crypto_hmac_sha1", (DL_FUNC) &R_crypto_hmac_sha1, 2 }, { "R_crypto_aes_ctr", (DL_FUNC) &R_crypto_aes_ctr, 2 }, diff --git a/src/install.libs.R b/src/install.libs.R index feb9373..4c5941b 100644 --- a/src/install.libs.R +++ b/src/install.libs.R @@ -9,7 +9,7 @@ dir.create(dest, recursive = TRUE, showWarnings = FALSE) file.copy(progs, dest, overwrite = TRUE) files <- Sys.glob(paste0("*", SHLIB_EXT)) -dest <- file.path(R_PACKAGE_DIR, paste0('libs', R_ARCH)) +dest <- file.path(R_PACKAGE_DIR, paste0("libs", R_ARCH)) dir.create(dest, recursive = TRUE, showWarnings = FALSE) file.copy(files, dest, overwrite = TRUE) if (file.exists("symbols.rds")) { diff --git a/tests/testthat/_snaps/threaded-unzip.md b/tests/testthat/_snaps/threaded-unzip.md new file mode 100644 index 0000000..1439ebc --- /dev/null +++ b/tests/testthat/_snaps/threaded-unzip.md @@ -0,0 +1,67 @@ +# threaded_unzip errors on a missing zip + + Code + threaded_unzip("/nonexistent/path/file.zip", exdir) + Condition + Error: + ! Failed to unzip 1 file: + /nonexistent/path/file.zip: Cannot open zip file `/nonexistent/path/file.zip` for reading + +# threaded_unzip reports all failures when multiple zips fail + + Code + threaded_unzip(c(z$zip, "/bad1.zip", "/bad2.zip"), exdir) + Condition + Error: + ! Failed to unzip 2 files: + /bad1.zip: Cannot open zip file `/bad1.zip` for reading + /bad2.zip: Cannot open zip file `/bad2.zip` for reading + +# threaded_unzip stops when exdirs length is mismatched + + Code + threaded_unzip(c(z1$zip, z2$zip), c(d1, d1, d1)) + Condition + Error in `threaded_unzip()`: + ! length(exdirs) == 1L || length(exdirs) == length(zipfiles) is not TRUE + +# threaded_unzip stops when passwords length is mismatched + + Code + threaded_unzip(c(z1$zip, z2$zip), exdir, passwords = c("a", "b", "c")) + Condition + Error in `threaded_unzip()`: + ! length(passwords) == 1L || length(passwords) == length(zipfiles) is not TRUE + +# get_num_threads errors on invalid zip_threads option + + Code + get_num_threads() + Condition + Error in `get_num_threads()`: + ! Invalid value for 'zip_threads' option, must be a positive integer. + +--- + + Code + get_num_threads() + Condition + Error in `get_num_threads()`: + ! Invalid value for 'zip_threads' option, must be a positive integer. + +# get_num_threads errors on invalid ZIP_THREADS env var + + Code + get_num_threads() + Condition + Error in `get_num_threads()`: + ! Invalid value for ZIP_THREADS environment variable, must be a positive integer. + +--- + + Code + get_num_threads() + Condition + Error in `get_num_threads()`: + ! Invalid value for ZIP_THREADS environment variable, must be a positive integer. + diff --git a/tests/testthat/test-threaded-unzip.R b/tests/testthat/test-threaded-unzip.R new file mode 100644 index 0000000..ac1f444 --- /dev/null +++ b/tests/testthat/test-threaded-unzip.R @@ -0,0 +1,142 @@ +test_that("threaded_unzip extracts a single zip", { + z <- make_a_zip() + exdir <- test_temp_dir() + + threaded_unzip(z$zip, exdir) + + expect_true(file.exists(file.path(exdir, basename(z$ex), "file1"))) + expect_true(file.exists(file.path(exdir, basename(z$ex), "dir", "file2"))) + expect_equal(readLines(file.path(exdir, basename(z$ex), "file1")), "file1") +}) + +test_that("threaded_unzip extracts multiple zips into a single exdir", { + z1 <- make_a_zip() + z2 <- make_a_zip() + exdir <- test_temp_dir() + + threaded_unzip(c(z1$zip, z2$zip), exdir) + + expect_true(file.exists(file.path(exdir, basename(z1$ex), "file1"))) + expect_true(file.exists(file.path(exdir, basename(z2$ex), "file1"))) +}) + +test_that("threaded_unzip extracts multiple zips into separate exdirs", { + z1 <- make_a_zip() + z2 <- make_a_zip() + d1 <- test_temp_dir() + d2 <- test_temp_dir() + + threaded_unzip(c(z1$zip, z2$zip), c(d1, d2)) + + expect_true(file.exists(file.path(d1, basename(z1$ex), "file1"))) + expect_false(file.exists(file.path(d1, basename(z2$ex), "file1"))) + expect_true(file.exists(file.path(d2, basename(z2$ex), "file1"))) +}) + +test_that("threaded_unzip works with explicit num_threads = 1", { + z1 <- make_a_zip() + z2 <- make_a_zip() + exdir <- test_temp_dir() + + threaded_unzip(c(z1$zip, z2$zip), exdir, num_threads = 1L) + + expect_true(file.exists(file.path(exdir, basename(z1$ex), "file1"))) + expect_true(file.exists(file.path(exdir, basename(z2$ex), "file1"))) +}) + +test_that("threaded_unzip works with num_threads > number of files", { + z <- make_a_zip() + exdir <- test_temp_dir() + + threaded_unzip(z$zip, exdir, num_threads = 8L) + + expect_true(file.exists(file.path(exdir, basename(z$ex), "file1"))) +}) + +test_that("threaded_unzip errors on a missing zip", { + exdir <- test_temp_dir() + + expect_snapshot( + error = TRUE, + threaded_unzip("/nonexistent/path/file.zip", exdir) + ) +}) + +test_that("threaded_unzip reports all failures when multiple zips fail", { + z <- make_a_zip() + exdir <- test_temp_dir() + + expect_snapshot( + error = TRUE, + threaded_unzip(c(z$zip, "/bad1.zip", "/bad2.zip"), exdir), + transform = transform_tempdir + ) +}) + +test_that("threaded_unzip stops when exdirs length is mismatched", { + z1 <- make_a_zip() + z2 <- make_a_zip() + d1 <- test_temp_dir() + + expect_snapshot( + error = TRUE, + threaded_unzip(c(z1$zip, z2$zip), c(d1, d1, d1)) + ) +}) + +test_that("threaded_unzip stops when passwords length is mismatched", { + z1 <- make_a_zip() + z2 <- make_a_zip() + exdir <- test_temp_dir() + + expect_snapshot( + error = TRUE, + threaded_unzip(c(z1$zip, z2$zip), exdir, passwords = c("a", "b", "c")) + ) +}) + +test_that("get_num_threads uses zip_threads option", { + withr::local_options(zip_threads = 4L) + expect_equal(get_num_threads(), 4L) +}) + +test_that("get_num_threads uses ZIP_THREADS env var", { + withr::with_envvar(c(ZIP_THREADS = "3"), { + withr::local_options(zip_threads = NULL) + expect_equal(get_num_threads(), 3L) + }) +}) + +test_that("get_num_threads option takes precedence over env var", { + withr::with_envvar(c(ZIP_THREADS = "5"), { + withr::local_options(zip_threads = 7L) + expect_equal(get_num_threads(), 7L) + }) +}) + +test_that("get_num_threads defaults to 2", { + withr::with_envvar(c(ZIP_THREADS = NA), { + withr::local_options(zip_threads = NULL) + expect_equal(get_num_threads(), 2L) + }) +}) + +test_that("get_num_threads errors on invalid zip_threads option", { + withr::local_options(zip_threads = 0L) + expect_snapshot(error = TRUE, get_num_threads()) + + withr::local_options(zip_threads = "two") + expect_snapshot(error = TRUE, get_num_threads()) +}) + +test_that("get_num_threads errors on invalid ZIP_THREADS env var", { + withr::with_envvar(c(ZIP_THREADS = "0"), { + withr::local_options(zip_threads = NULL) + expect_snapshot(error = TRUE, get_num_threads()) + }) + + withr::with_envvar(c(ZIP_THREADS = "banana"), { + withr::local_options(zip_threads = NULL) + expect_snapshot(error = TRUE, get_num_threads()) + }) +}) From 604e729f64c60dea540530cdab2996370d5c7927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Wed, 10 Jun 2026 12:21:37 +0200 Subject: [PATCH 3/4] Vectorize unzip() --- R/threaded.R | 8 +++- R/zip.R | 40 ++++++++++++++++++-- man/unzip.Rd | 4 +- tests/testthat/test-unzip.R | 73 +++++++++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 5 deletions(-) diff --git a/R/threaded.R b/R/threaded.R index cbfab5f..4b15abf 100644 --- a/R/threaded.R +++ b/R/threaded.R @@ -42,7 +42,13 @@ threaded_unzip <- function( call. = FALSE ) } - invisible(NULL) + results <- Map(function(zf, ed) { + lst <- zip_list(zf) + lst$path <- file.path(normalizePath(ed), lst$filename) + lst$encryption <- NULL + lst + }, zipfiles, exdirs) + invisible(do.call(rbind, results)) } get_num_threads <- function() { diff --git a/R/zip.R b/R/zip.R index 65a06ff..cff094d 100644 --- a/R/zip.R +++ b/R/zip.R @@ -443,7 +443,9 @@ encryption_types <- c( #' If the zip archive stores permissions and was created on Unix, #' the permissions will be restored. #' -#' @param zipfile Path to the zip file to uncompress. +#' @param zipfile Path to the zip file to uncompress, or a character vector of +#' paths. When multiple paths are given and all other arguments are at their +#' defaults, the files are unzipped concurrently in a thread pool. #' @param files Character vector of files to extract from the archive. #' Files within directories can be specified, but they must use a forward #' slash as path separator, as this is what zip files use internally. @@ -495,17 +497,49 @@ unzip <- function( encoding = NULL, password = NULL ) { - if (startsWith(zipfile, "http://") || startsWith(zipfile, "https://")) { + if ( + length(zipfile) == 1 && + (startsWith(zipfile, "http://") || startsWith(zipfile, "https://")) + ) { return(unzip_url(zipfile, files, overwrite, junkpaths, exdir, encoding)) } stopifnot( - is_string(zipfile), + is_character(zipfile), + length(zipfile) >= 1, is_character_or_null(files), is_flag(overwrite), is_flag(junkpaths), is_string(exdir) ) + if ( + length(zipfile) > 1 && + is.null(files) && + isTRUE(overwrite) && + !isTRUE(junkpaths) && + is.null(encoding) && + !any(startsWith(zipfile, "http://") | startsWith(zipfile, "https://")) + ) { + pw <- resolve_password(password) + passwords <- if (!is.null(pw)) rawToChar(pw) else NULL + return(threaded_unzip(zipfile, exdirs = exdir, passwords = passwords)) + } + + if (length(zipfile) > 1) { + results <- lapply(zipfile, function(zf) { + unzip( + zf, + files = files, + overwrite = overwrite, + junkpaths = junkpaths, + exdir = exdir, + encoding = encoding, + password = password + ) + }) + return(invisible(do.call(rbind, results))) + } + zipfile <- enc2c(normalizePath(zipfile)) if (!is.null(files)) { files <- enc2c(files) diff --git a/man/unzip.Rd b/man/unzip.Rd index 8d95b59..3961857 100644 --- a/man/unzip.Rd +++ b/man/unzip.Rd @@ -15,7 +15,9 @@ unzip( ) } \arguments{ -\item{zipfile}{Path to the zip file to uncompress.} +\item{zipfile}{Path to the zip file to uncompress, or a character vector of +paths. When multiple paths are given and all other arguments are at their +defaults, the files are unzipped concurrently in a thread pool.} \item{files}{Character vector of files to extract from the archive. Files within directories can be specified, but they must use a forward diff --git a/tests/testthat/test-unzip.R b/tests/testthat/test-unzip.R index 402cb6a..626e500 100644 --- a/tests/testthat/test-unzip.R +++ b/tests/testthat/test-unzip.R @@ -269,3 +269,76 @@ test_that("unzip() shows progress bar when zip.progress = TRUE", { expect_true(file.exists(file.path(exdir, basename(z$ex), "file1"))) expect_match(output, "Unzipping", all = FALSE) }) + +test_that("unzip() with vector zipfile extracts all archives (threaded path)", { + z1 <- make_a_zip() + z2 <- make_a_zip() + exdir <- test_temp_dir() + + zip::unzip(c(z1$zip, z2$zip), exdir = exdir) + + expect_true(file.exists(file.path(exdir, basename(z1$ex), "file1"))) + expect_true(file.exists(file.path(exdir, basename(z2$ex), "file1"))) + expect_equal(readLines(file.path(exdir, basename(z1$ex), "file1")), "file1") + expect_equal(readLines(file.path(exdir, basename(z2$ex), "file1")), "file1") +}) + +test_that("unzip() with vector zipfile returns combined data frame", { + z1 <- make_a_zip() + z2 <- make_a_zip() + exdir <- test_temp_dir() + + result <- zip::unzip(c(z1$zip, z2$zip), exdir = exdir) + + expect_s3_class(result, "data.frame") + expect_true(nrow(result) > 0) + expect_true(any(grepl(basename(z1$ex), result$path))) + expect_true(any(grepl(basename(z2$ex), result$path))) +}) + +test_that("unzip() with vector zipfile falls back to sequential when files= is set", { + # Build two zips with a common internal path so files= can filter both + f1 <- test_temp_file() + cat("zip1", file = f1) + f1b <- test_temp_file() + cat("extra1", file = f1b) + z1 <- test_temp_file(".zip", create = FALSE) + zip(z1, c(f1, f1b), keys = c("data.txt", "extra1.txt")) + + f2 <- test_temp_file() + cat("zip2", file = f2) + f2b <- test_temp_file() + cat("extra2", file = f2b) + z2 <- test_temp_file(".zip", create = FALSE) + zip(z2, c(f2, f2b), keys = c("data.txt", "extra2.txt")) + + exdir <- test_temp_dir() + zip::unzip(c(z1, z2), files = "data.txt", exdir = exdir) + + expect_true(file.exists(file.path(exdir, "data.txt"))) + expect_false(file.exists(file.path(exdir, "extra1.txt"))) + expect_false(file.exists(file.path(exdir, "extra2.txt"))) +}) + +test_that("unzip() with vector zipfile falls back to sequential when junkpaths = TRUE", { + z1 <- make_a_zip() + z2 <- make_a_zip() + exdir <- test_temp_dir() + + zip::unzip(c(z1$zip, z2$zip), junkpaths = TRUE, exdir = exdir) + + expect_true(file.exists(file.path(exdir, "file1"))) + expect_true(file.exists(file.path(exdir, "file2"))) + expect_false(file.exists(file.path(exdir, basename(z1$ex)))) +}) + +test_that("unzip() with vector zipfile falls back to sequential when encoding is set", { + z1 <- make_a_zip() + z2 <- make_a_zip() + exdir <- test_temp_dir() + + zip::unzip(c(z1$zip, z2$zip), encoding = "UTF-8", exdir = exdir) + + expect_true(file.exists(file.path(exdir, basename(z1$ex), "file1"))) + expect_true(file.exists(file.path(exdir, basename(z2$ex), "file1"))) +}) From e1d612d3cdd3b7505c7484769a6b5234356f2b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Wed, 10 Jun 2026 12:22:54 +0200 Subject: [PATCH 4/4] air format --- R/threaded.R | 16 ++++++++++------ R/zip.R | 12 ++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/R/threaded.R b/R/threaded.R index 4b15abf..dc573bf 100644 --- a/R/threaded.R +++ b/R/threaded.R @@ -42,12 +42,16 @@ threaded_unzip <- function( call. = FALSE ) } - results <- Map(function(zf, ed) { - lst <- zip_list(zf) - lst$path <- file.path(normalizePath(ed), lst$filename) - lst$encryption <- NULL - lst - }, zipfiles, exdirs) + results <- Map( + function(zf, ed) { + lst <- zip_list(zf) + lst$path <- file.path(normalizePath(ed), lst$filename) + lst$encryption <- NULL + lst + }, + zipfiles, + exdirs + ) invisible(do.call(rbind, results)) } diff --git a/R/zip.R b/R/zip.R index cff094d..aa435fb 100644 --- a/R/zip.R +++ b/R/zip.R @@ -499,7 +499,7 @@ unzip <- function( ) { if ( length(zipfile) == 1 && - (startsWith(zipfile, "http://") || startsWith(zipfile, "https://")) + (startsWith(zipfile, "http://") || startsWith(zipfile, "https://")) ) { return(unzip_url(zipfile, files, overwrite, junkpaths, exdir, encoding)) } @@ -514,11 +514,11 @@ unzip <- function( if ( length(zipfile) > 1 && - is.null(files) && - isTRUE(overwrite) && - !isTRUE(junkpaths) && - is.null(encoding) && - !any(startsWith(zipfile, "http://") | startsWith(zipfile, "https://")) + is.null(files) && + isTRUE(overwrite) && + !isTRUE(junkpaths) && + is.null(encoding) && + !any(startsWith(zipfile, "http://") | startsWith(zipfile, "https://")) ) { pw <- resolve_password(password) passwords <- if (!is.null(pw)) rawToChar(pw) else NULL