diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2300ad2..45d31ae 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -28,7 +28,7 @@ jobs:
 
     steps:
       - uses: actions/checkout@v3
-      - run: sudo apt-get install -y libparted-dev libudev-dev
+      - run: sudo apt-get update && sudo apt-get install -y libparted-dev libudev-dev
       - run: make all extra
 
       - if: matrix.volume == 'relative-path'
@@ -70,8 +70,10 @@ jobs:
       - name: Cleanup loop device
         if: always()
         run: |
-          sudo losetup -d ${{ steps.loop.outputs.dev }}
-          rm dummy.img
+          if [ -n "${{ steps.loop.outputs.dev }}" ]; then
+            sudo losetup -d ${{ steps.loop.outputs.dev }}
+          fi
+          rm -f dummy.img
 
   MacOS:
     strategy:
diff --git a/Makefile b/Makefile
index b62cd7f..f9a5301 100644
--- a/Makefile
+++ b/Makefile
@@ -54,8 +54,8 @@ f3write: libutils.o utils.o libflow.o f3write.o
 f3read: libutils.o utils.o libflow.o f3read.o
 	$(CC) -o $@ $^ $(LDFLAGS) -lm
 
-f3probe: libutils.o libdevs.o libprobe.o f3probe.o
-	$(CC) -o $@ $^ $(LDFLAGS) -ludev
+f3probe: libutils.o libflow.o libdevs.o libprobe.o f3probe.o
+	$(CC) -o $@ $^ $(LDFLAGS) -lm -ludev
 
 f3brew: libutils.o libflow.o libdevs.o f3brew.o
 	$(CC) -o $@ $^ $(LDFLAGS) -lm -ludev
diff --git a/f3brew.c b/f3brew.c
index b04180b..2ddc8da 100644
--- a/f3brew.c
+++ b/f3brew.c
@@ -1,3 +1,6 @@
+#define _POSIX_C_SOURCE 200112L
+#define _XOPEN_SOURCE 600
+
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -323,8 +326,8 @@ static void test_write_blocks(struct device *dev,
 		first_block, last_block);
 	fflush(stdout);
 
-	init_flow(&fw, block_size, total_size, max_write_rate, show_progress,
-		NULL);
+	init_flow(&fw, block_size, total_size, max_write_rate,
+		show_progress ? printf_flush_cb : dummy_cb, 0, NULL);
 
 	assert(!gettimeofday(&t1, NULL));
 	write_blocks(dev, &fw, first_block, last_block);
@@ -498,8 +501,8 @@ static void test_read_blocks(struct device *dev,
 	printf("Reading blocks from 0x%" PRIx64 " to 0x%" PRIx64 ":\n",
 		first_block, last_block);
 
-	init_flow(&fw, block_size, total_size, max_read_rate, show_progress,
-		NULL);
+	init_flow(&fw, block_size, total_size, max_read_rate,
+		show_progress ? printf_flush_cb : dummy_cb, 0, NULL);
 
 	assert(!gettimeofday(&t1, NULL));
 	read_blocks(dev, &fw, first_block, last_block, &stats);
diff --git a/f3fix.c b/f3fix.c
index 061c3a4..4b79293 100644
--- a/f3fix.c
+++ b/f3fix.c
@@ -1,3 +1,6 @@
+#define _POSIX_C_SOURCE 200112L
+#define _XOPEN_SOURCE 600
+
 #include <stdbool.h>
 #include <assert.h>
 #include <argp.h>
diff --git a/f3probe.c b/f3probe.c
index 4533313..5434901 100644
--- a/f3probe.c
+++ b/f3probe.c
@@ -1,14 +1,15 @@
-#define _POSIX_C_SOURCE 200809L
+#define _POSIX_C_SOURCE 200112L
+#define _XOPEN_SOURCE 600
 
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
 #include <argp.h>
 #include <stdbool.h>
 #include <assert.h>
 #include <inttypes.h>
 #include <sys/time.h>
+#include <unistd.h>
 
 #include "version.h"
 #include "libprobe.h"
@@ -53,6 +54,8 @@ static struct argp_option options[] = {
 		"Time reads, writes, and resets",		0},
 	{"verbose",		'v',	NULL,		0,
 		"Show detailed progress",		0},
+	{"show-progress",	'p',	"NUM",		0,
+		"Show progress if NUM is not zero",			0},
 	{ 0 }
 };
 
@@ -69,6 +72,7 @@ struct args {
 	bool		min_mem;
 	bool		time_ops;
 	bool		verbose;
+	bool		show_progress;
 
 	/* Geometry. */
 	uint64_t	real_size_byte;
@@ -165,6 +169,10 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state)
 		args->verbose = true;
 		break;
 
+	case 'p':
+		args->show_progress = !!arg_to_ll_bytes(state, arg);
+		break;
+
 	case ARGP_KEY_INIT:
 		args->filename = NULL;
 		break;
@@ -196,12 +204,6 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state)
 
 static struct argp argp = {options, parse_opt, adoc, doc, NULL, NULL, NULL};
 
-static void dummy_probe_progress(const char *format, ...)
-{
-	/* Do nothing */
-	UNUSED(format);
-}
-
 struct unit_test_item {
 	uint64_t	real_size_byte;
 	uint64_t	fake_size_byte;
@@ -277,8 +279,8 @@ static int unit_test(const char *filename)
 		assert(dev);
 		max_probe_blocks = probe_device_max_blocks(dev);
 		assert(!probe_device(dev, &real_size_byte, &announced_size_byte,
-			&wrap, &cache_size_block, &block_order,
-			dummy_probe_progress));
+			&wrap, &cache_size_block, &block_order, dummy_cb,
+			false));
 		free_device(dev);
 		fake_type = dev_param_to_type(real_size_byte,
 			announced_size_byte, wrap, block_order);
@@ -329,18 +331,18 @@ static int unit_test(const char *filename)
 static inline void report_size(const char *prefix, uint64_t bytes,
 	int block_order)
 {
-	report_probed_size(printf_cb, prefix, bytes, block_order);
+	report_probed_size(0, printf_cb, prefix, bytes, block_order);
 }
 
 static inline void report_order(const char *prefix, int order)
 {
-	report_probed_order(printf_cb, prefix, order);
+	report_probed_order(0, printf_cb, prefix, order);
 }
 
 static inline void report_cache(const char *prefix, uint64_t cache_size_block,
 	int block_order)
 {
-	report_probed_cache(printf_cb, prefix, cache_size_block, block_order);
+	report_probed_cache(0, printf_cb, prefix, cache_size_block, block_order);
 }
 
 static void report_probe_time(const char *prefix, uint64_t usec)
@@ -358,15 +360,6 @@ static void report_ops(const char *op, uint64_t count, uint64_t time_us)
 	printf("%10s: %s / %" PRIu64 " = %s\n", op, str1, count, str2);
 }
 
-static void print_probe_progress(const char *format, ...)
-{
-	va_list args;
-	va_start(args, format);
-	vprintf(format, args);
-	va_end(args);
-	fflush(stdout);
-}
-
 static int test_device(struct args *args)
 {
 	struct timeval t1, t2;
@@ -421,7 +414,8 @@ static int test_device(struct args *args)
 	 */
 	assert(!probe_device(dev, &real_size_byte, &announced_size_byte,
 		&wrap, &cache_size_block, &block_order,
-		args->verbose ? print_probe_progress : dummy_probe_progress));
+		args->verbose ? printf_flush_cb : dummy_cb,
+		args->show_progress));
 	assert(!gettimeofday(&t2, NULL));
 
 	if (args->verbose) {
@@ -518,6 +512,8 @@ int main(int argc, char **argv)
 		.min_mem	= false,
 		.time_ops	= false,
 		.verbose	= false,
+		/* If stdout isn't a terminal, suppress progress. */
+		.show_progress	= isatty(STDOUT_FILENO),
 		.real_size_byte	= 1ULL << 31,
 		.fake_size_byte	= 1ULL << 34,
 		.wrap		= 31,
diff --git a/f3read.c b/f3read.c
index 3f3b28d..e60f801 100644
--- a/f3read.c
+++ b/f3read.c
@@ -327,7 +327,7 @@ static void iterate_files(const char *path, const long *files,
 	UNUSED(end_at);
 
 	init_flow(&fw, get_block_size(path), get_total_size(path, files),
-		max_read_rate, progress, NULL);
+		max_read_rate, progress ? printf_flush_cb : dummy_cb, 0, NULL);
 	printf("                  SECTORS "
 		"     ok/corrupted/changed/overwritten\n");
 
diff --git a/f3write.c b/f3write.c
index 9458342..db2a57c 100644
--- a/f3write.c
+++ b/f3write.c
@@ -306,7 +306,7 @@ static int fill_fs(const char *path, long start_at, long end_at,
 	}
 
 	init_flow(&fw, get_block_size(path), free_space, max_write_rate,
-		progress, flush_chunk);
+		progress ? printf_flush_cb : dummy_cb, 0, flush_chunk);
 	assert(!gettimeofday(&t1, NULL));
 	for (i = start_at; i <= end_at; i++)
 		if (create_and_fill_file(path, i, GIGABYTES,
diff --git a/libdevs.h b/libdevs.h
index e93b6a9..03d4126 100644
--- a/libdevs.h
+++ b/libdevs.h
@@ -56,14 +56,6 @@ const char *dev_get_filename(struct device *dev);
  *	Methods
  */
 
-/* One should use the following constant as the size of the buffer needed to
- * batch writes or reads.
- *
- * It must be a power of 2 greater than, or equal to 2^20.
- * The current value is 1MB.
- */
-#define BIG_BLOCK_SIZE_BYTE (1 << 20)
-
 int dev_read_blocks(struct device *dev, char *buf,
 	uint64_t first_pos, uint64_t last_pos);
 int dev_write_blocks(struct device *dev, const char *buf,
diff --git a/libflow.c b/libflow.c
index f76844e..661bc05 100644
--- a/libflow.c
+++ b/libflow.c
@@ -1,6 +1,8 @@
 #define _POSIX_C_SOURCE 200112L
 #define _XOPEN_SOURCE 600
 
+#include <stdbool.h>
+#include <stddef.h>
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -17,29 +19,29 @@
 #if (__APPLE__ && __MACH__) || defined(__OpenBSD__)
 
 #include <unistd.h>
-static void msleep(double wait_ms)
+static inline void ussleep(double wait_us)
 {
-	assert(!usleep(wait_ms * 1000));
+	assert(!usleep(wait_us));
 }
 
 #else	/* Everyone else */
 
 #include <time.h> /* For clock_gettime() and clock_nanosleep(). */
-static void msleep(double wait_ms)
+static void ussleep(double wait_us)
 {
 	struct timespec req;
 	int ret;
 
 	assert(!clock_gettime(CLOCK_MONOTONIC, &req));
 
-	/* Add @wait_ms to @req. */
-	if (wait_ms > 1000) {
-		time_t sec = wait_ms / 1000;
-		wait_ms -= sec * 1000;
-		assert(wait_ms > 0);
+	/* Add @wait_us to @req. */
+	if (wait_us > 1000000) {
+		time_t sec = wait_us / 1000000;
+		wait_us -= sec * 1000000;
+		assert(wait_us > 0);
 		req.tv_sec += sec;
 	}
-	req.tv_nsec += wait_ms * 1000000;
+	req.tv_nsec += wait_us * 1000;
 
 	/* Round @req up. */
 	if (req.tv_nsec >= 1000000000) {
@@ -56,7 +58,7 @@ static void msleep(double wait_ms)
 	assert(ret == 0);
 }
 
-#endif	/* msleep() */
+#endif	/* ussleep() */
 
 static inline void move_to_inc_at_start(struct flow *fw)
 {
@@ -65,69 +67,98 @@ static inline void move_to_inc_at_start(struct flow *fw)
 }
 
 void init_flow(struct flow *fw, int block_size, uint64_t total_size,
-	long max_process_rate, int progress,
+	long max_process_rate, progress_cb cb, unsigned int indent,
 	flow_func_flush_chunk_t func_flush_chunk)
 {
 	fw->total_size		= total_size;
 	fw->total_processed	= 0;
-	fw->progress		= progress;
+	fw->cb			= cb;
+	fw->indent		= indent;
 	fw->block_size		= block_size; /* Bytes		*/
 	fw->blocks_per_delay	= 1;	/* block_size B/s	*/
-	fw->delay_ms		= 1000;	/* 1s			*/
+	fw->delay_ns		= 1000000000ULL;	/* 1s	*/
 	fw->max_process_rate	= max_process_rate <= 0
 		? DBL_MAX : max_process_rate * 1024.;
 	fw->measured_blocks	= 0;
-	fw->measured_time_ms	= 0;
+	fw->measured_time_ns	= 0;
 	fw->erase		= 0;
 	fw->func_flush_chunk	= func_flush_chunk;
+	fw->has_rem_chunk_size	= false;
+	fw->rem_chunk_size	= 0;
+	fw->rem_chunk_speed	= 0;
 	fw->processed_blocks	= 0;
-	fw->acc_delay_us	= 0;
+	fw->acc_delay_ns	= 0;
 	assert(fw->block_size > 0);
 	assert(fw->block_size % SECTOR_SIZE == 0);
 
 	move_to_inc_at_start(fw);
 }
 
-static inline void repeat_ch(char ch, int count)
+uint64_t get_rem_chunk_size(const struct flow *fw)
 {
-	while (count > 0) {
-		printf("%c", ch);
-		count--;
-	}
+	const int64_t rem_blocks = fw->blocks_per_delay - fw->processed_blocks;
+	const uint64_t rem_size = rem_blocks * fw->block_size;
+	assert(rem_blocks > 0);
+	return fw->has_rem_chunk_size && rem_size >= fw->rem_chunk_size
+		? fw->rem_chunk_size
+		: rem_size;
 }
 
-static void erase(int count)
+static inline unsigned int repeat_ch(char *buf, char ch, int count)
 {
-	if (count <= 0)
-		return;
-	repeat_ch('\b',	count);
-	repeat_ch(' ',	count);
-	repeat_ch('\b',	count);
+	int i;
+
+	for (i = 0; i < count; i++)
+		buf[i] = ch;
+	return count;
 }
 
 void clear_progress(struct flow *fw)
 {
-	if (!fw->progress)
-		return;
-	erase(fw->erase);
+	char buf[512], *at_buf = buf;
+
+	if (fw->erase <= 0) {
+		if (fw->indent > 0) {
+			/* Remove indented empty line. */
+			fw->cb(fw->indent, "\b");
+		}
+		goto out;
+	}
+
+	assert((size_t)fw->erase * 3 + 1 <= sizeof(buf));
+	at_buf += repeat_ch(at_buf, '\b', fw->erase);
+	at_buf += repeat_ch(at_buf, ' ', fw->erase);
+	at_buf += repeat_ch(at_buf, '\b', fw->erase);
+	at_buf[0] = '\0';
+
+	fw->cb(fw->indent, buf);
+out:
 	fw->erase = 0;
-	fflush(stdout);
 }
 
-static int pr_time(double sec)
+#define CHECK_AND_MOVE do {			\
+		assert(c > 0);			\
+		len += c;			\
+		assert((size_t)c < rem_size);	\
+		rem_size -= c;			\
+		at_buf += c;			\
+	} while (0)
+
+static int pr_time(char *buf, const size_t size, double sec)
 {
-	int has_h, has_m;
-	int c, tot;
+	char *at_buf = buf;
+	size_t rem_size = size;
+	bool has_h, has_m;
+	int c, len = 0;
 
-	tot = printf(" -- ");
-	assert(tot > 0);
+	c = snprintf(at_buf, rem_size, " -- ");
+	CHECK_AND_MOVE;
 
 	has_h = sec >= 3600;
 	if (has_h) {
 		double h = floor(sec / 3600);
-		c = printf("%i:", (int)h);
-		assert(c > 0);
-		tot += c;
+		c = snprintf(at_buf, rem_size, "%i:", (int)h);
+		CHECK_AND_MOVE;
 		sec -= h * 3600;
 	}
 
@@ -135,60 +166,78 @@ static int pr_time(double sec)
 	if (has_m) {
 		double m = floor(sec / 60);
 		if (has_h)
-			c = printf("%02i:", (int)m);
+			c = snprintf(at_buf, rem_size, "%02i:", (int)m);
 		else
-			c = printf("%i:", (int)m);
-		assert(c > 0);
-		tot += c;
+			c = snprintf(at_buf, rem_size, "%i:", (int)m);
+		CHECK_AND_MOVE;
 		sec -= m * 60;
 	}
 
 	if (has_m)
-		c = printf("%02i", (int)round(sec));
+		c = snprintf(at_buf, rem_size, "%02i", (int)round(sec));
 	else
-		c = printf("%is", (int)round(sec));
-	assert(c > 0);
-	return tot + c;
+		c = snprintf(at_buf, rem_size, "%is", (int)round(sec));
+	CHECK_AND_MOVE;
+
+	return len;
 }
 
 static inline double get_avg_speed_given_time(const struct flow *fw,
-	uint64_t total_time_ms)
+	uint64_t total_time_ns)
 {
-	return (double)(fw->measured_blocks * fw->block_size * 1000) /
-		total_time_ms;
+	return ((double)(fw->measured_blocks * fw->block_size) * 1000000000.0)
+		/ total_time_ns;
 }
 
 /* Average writing speed in byte/s. */
 static inline double get_avg_speed(const struct flow *fw)
 {
-	return get_avg_speed_given_time(fw, fw->measured_time_ms);
+	return get_avg_speed_given_time(fw, fw->measured_time_ns);
+}
+
+static inline bool has_enough_measurements(const struct flow *fw)
+{
+	return fw->measured_time_ns > fw->delay_ns;
 }
 
 static void report_progress(struct flow *fw, double inst_speed)
 {
 	const char *unit = adjust_unit(&inst_speed);
 	double percent;
+	char buf[256];
+	int c, len = 0;
+
 	/* The following shouldn't be necessary, but sometimes
 	 * the initial free space isn't exactly reported
 	 * by the kernel; this issue has been seen on Macs.
 	 */
 	if (fw->total_size < fw->total_processed)
 		fw->total_size = fw->total_processed;
+
+	clear_progress(fw);
+
 	percent = (double)fw->total_processed * 100 / fw->total_size;
-	erase(fw->erase);
-	fw->erase = printf("%.2f%% -- %.2f %s/s",
+	c = snprintf(buf, sizeof(buf), "%.2f%% -- %.2f %s/s",
 		percent, inst_speed, unit);
-	assert(fw->erase > 0);
-	if (has_enough_measurements(fw))
-		fw->erase += pr_time(
+	assert(c > 0);
+	len += c;
+
+	if (has_enough_measurements(fw)) {
+		c = pr_time(buf + len, sizeof(buf) - len,
 			(fw->total_size - fw->total_processed) /
 			get_avg_speed(fw));
-	fflush(stdout);
+		assert(c > 0);
+		len += c;
+	}
+
+	assert((size_t)len + 1 <= sizeof(buf));
+	fw->erase = len;
+	fw->cb(fw->indent, "%s", buf);
 }
 
 static inline void __start_measurement(struct flow *fw)
 {
-	assert(!gettimeofday(&fw->t1, NULL));
+	assert(!clock_gettime(CLOCK_MONOTONIC, &fw->t1));
 }
 
 void start_measurement(struct flow *fw)
@@ -197,9 +246,9 @@ void start_measurement(struct flow *fw)
 	 * The report below is especially useful when a single measurement spans
 	 * multiple files; this happens when a drive is faster than 1GB/s.
 	 */
-	if (fw->progress)
-		report_progress(fw, fw->blocks_per_delay * fw->block_size *
-			1000.0 / fw->delay_ms);
+	report_progress(fw,
+		fw->blocks_per_delay * fw->block_size * 1000000000.0 /
+			fw->delay_ns);
 	__start_measurement(fw);
 }
 
@@ -253,17 +302,17 @@ static inline void move_to_dec(struct flow *fw)
 }
 
 static inline int is_rate_above(const struct flow *fw,
-	uint64_t delay, double inst_speed)
+	uint64_t delay_ns, double inst_speed)
 {
 	/* We use logical or here to enforce the lowest limit. */
-	return delay > fw->delay_ms || inst_speed > fw->max_process_rate;
+	return delay_ns > fw->delay_ns || inst_speed > fw->max_process_rate;
 }
 
 static inline int is_rate_below(const struct flow *fw,
-	uint64_t delay, double inst_speed)
+	uint64_t delay_ns, double inst_speed)
 {
 	/* We use logical and here to enforce both limits. */
-	return delay <= fw->delay_ms && inst_speed < fw->max_process_rate;
+	return delay_ns <= fw->delay_ns && inst_speed < fw->max_process_rate;
 }
 
 static inline int flush_chunk(const struct flow *fw, int fd)
@@ -273,12 +322,53 @@ static inline int flush_chunk(const struct flow *fw, int fd)
 	return 0;
 }
 
+static bool t1_gt_t2(const struct timespec *t1, const struct timespec *t2)
+{
+	if (t1->tv_sec > t2->tv_sec)
+		return true;
+	if (t1->tv_sec < t2->tv_sec)
+		return false;
+	return t1->tv_nsec > t2->tv_nsec;
+}
+
+static void update_rem_chunk_size(struct flow *fw, const struct timespec *t2)
+{
+	const uint64_t rem_chunk_size = fw->blocks_per_delay * fw->block_size;
+	const struct timespec *t1;
+	double inst_speed;
+
+	if (fw->rem_chunk_size == 0) {
+		/* This is the first time measure() is called. */
+		t1 = &fw->t1;
+	} else if (t1_gt_t2(&fw->t1, &fw->prv_t2)) {
+		/* end_measurement() has already been called, and
+		 * start_measurement() was called again.
+		 */
+		t1 = &fw->t1;
+	} else {
+		/* This is at least the second time measure() is called after
+		 * start_measurement() was called and before end_measurement()
+		 * was called.
+		 */
+		t1 = &fw->prv_t2;
+	}
+	inst_speed = (rem_chunk_size * 1000000000.0) /
+		diff_timespec_ns(t1, t2);
+	fw->prv_t2 = *t2;
+
+	if (fw->rem_chunk_size != 0 && inst_speed < fw->rem_chunk_speed)
+		return;
+
+	fw->rem_chunk_size = rem_chunk_size;
+	fw->rem_chunk_speed = inst_speed;
+}
+
 int measure(int fd, struct flow *fw, long processed)
 {
 	ldiv_t result = ldiv(processed, fw->block_size);
-	struct timeval t2;
-	uint64_t delay;
-	double bytes_k, inst_speed;
+	struct timespec t2;
+	uint64_t delay_ns;
+	double bytes_g, inst_speed;
 
 	assert(result.rem == 0);
 	fw->processed_blocks += result.quot;
@@ -291,62 +381,100 @@ int measure(int fd, struct flow *fw, long processed)
 	if (flush_chunk(fw, fd) < 0)
 		return -1; /* Caller can read errno(3). */
 
-	assert(!gettimeofday(&t2, NULL));
-	delay = (diff_timeval_us(&fw->t1, &t2) + fw->acc_delay_us) / 1000;
+	assert(!clock_gettime(CLOCK_MONOTONIC, &t2));
+	if (!fw->has_rem_chunk_size)
+		update_rem_chunk_size(fw, &t2);
+	delay_ns = diff_timespec_ns(&fw->t1, &t2) + fw->acc_delay_ns;
 
 	/* Instantaneous speed in bytes per second. */
-	bytes_k = fw->blocks_per_delay * fw->block_size * 1000.0;
-	inst_speed = bytes_k / delay;
-
-	if (delay < fw->delay_ms && inst_speed > fw->max_process_rate) {
-		/* Wait until inst_speed == fw->max_process_rate
-		 * (if possible).
+	bytes_g = fw->blocks_per_delay * fw->block_size * 1000000000.0;
+	inst_speed = bytes_g / delay_ns;
+
+	if (delay_ns < fw->delay_ns && inst_speed > fw->max_process_rate) {
+		/* delay_ns should be such that
+		 * inst_speed <= fw->max_process_rate.
+		 * To accomplish this, the code below adds a wait.
+		 *
+		 * inst_speed <= fw->max_process_rate [=>]
+		 * bytes_g / (delay_ns + wait_ns) <= fw->max_process_rate [=>]
+		 * bytes_g / fw->max_process_rate <= delay_ns + wait_ns [=>]
+		 * wait_ns >= bytes_g / fw->max_process_rate - delay_ns
+		 *
+		 * The step below minimizes rounding errors.
+		 *
+		 * wait_ns >= (bytes_g - delay_ns * fw->max_process_rate) /
+		 *	fw->max_process_rate
+		 *
+		 * Round wait_ns so that it represents a whole number of
+		 * nanoseconds.
+		 */
+		double wait_ns = round(
+			(bytes_g - delay_ns * fw->max_process_rate) /
+			fw->max_process_rate);
+
+		/* From the if-test,
+		 * 	inst_speed > fw->max_process_rate [=>]
+		 * 	bytes_g / delay_ns > fw->max_process_rate [=>]
+		 *	bytes_g > delay_ns * fw->max_process_rate
+		 *
+		 * For wait_ns to be negative,
+		 *	wait_ns < 0 [=>]
+		 *	(bytes_g - delay_ns * fw->max_process_rate) /
+		 *		fw->max_process_rate < 0 [=>]
+		 *	bytes_g < delay_ns * fw->max_process_rate
+		 *
+		 * Therefore, wait_ns cannot be negative.
 		 */
-		double wait_ms = round((bytes_k - delay * fw->max_process_rate)
-			/ fw->max_process_rate);
-
-		 if (wait_ms < 0) {
-			/* Wait what is possible. */
-			wait_ms = fw->delay_ms - delay;
-		} else if (delay + wait_ms < fw->delay_ms) {
-			/* wait_ms is not the largest possible value, so
-			 * force the flow algorithm to keep increasing it.
-			 * Otherwise, the delay to print progress may be
-			 * too small.
+		assert(wait_ns >= 0);
+
+		if (delay_ns + wait_ns < fw->delay_ns) {
+			/* In this case, there is a factor f > 1 that
+			 * satisfies the following equation:
+			 *
+			 * (delay_ns + wait_ns) * f = fw->delay_ns
+			 *
+			 * This means that both delay_ns and wait_ns should be
+			 * increased to make f = 1. To signal that to the flow
+			 * algorithm below, wait until fw->delay_ns is reached.
 			 */
-			wait_ms++;
+			wait_ns = fw->delay_ns - delay_ns;
 		}
 
-		if (wait_ms > 0) {
+		if (wait_ns > 0) {
 			/* Slow down. */
-			msleep(wait_ms);
+			ussleep(wait_ns / 1000.);
 
 			/* Adjust measurements. */
-			delay += wait_ms;
-			inst_speed = bytes_k / delay;
+			delay_ns += wait_ns;
+			inst_speed = bytes_g / delay_ns;
 		}
 	}
 
 	/* Update mean. */
 	fw->measured_blocks += fw->processed_blocks;
-	fw->measured_time_ms += delay;
+	fw->measured_time_ns += delay_ns;
 
 	switch (fw->state) {
 	case FW_INC:
-		if (is_rate_above(fw, delay, inst_speed)) {
+		if (is_rate_above(fw, delay_ns, inst_speed)) {
+			if (!fw->has_rem_chunk_size) {
+				/* Recommend a chunk size to caller. */
+				assert(fw->rem_chunk_size != 0);
+				fw->has_rem_chunk_size = true;
+			}
 			move_to_search(fw,
 				fw->blocks_per_delay - fw->step / 2,
 				fw->blocks_per_delay);
-		} else if (is_rate_below(fw, delay, inst_speed)) {
+		} else if (is_rate_below(fw, delay_ns, inst_speed)) {
 			inc_step(fw);
 		} else
 			move_to_steady(fw);
 		break;
 
 	case FW_DEC:
-		if (is_rate_above(fw, delay, inst_speed)) {
+		if (is_rate_above(fw, delay_ns, inst_speed)) {
 			dec_step(fw);
-		} else if (is_rate_below(fw, delay, inst_speed)) {
+		} else if (is_rate_below(fw, delay_ns, inst_speed)) {
 			move_to_search(fw, fw->blocks_per_delay,
 				fw->blocks_per_delay + fw->step / 2);
 		} else
@@ -359,10 +487,10 @@ int measure(int fd, struct flow *fw, long processed)
 			break;
 		}
 
-		if (is_rate_above(fw, delay, inst_speed)) {
+		if (is_rate_above(fw, delay_ns, inst_speed)) {
 			fw->bpd2 = fw->blocks_per_delay;
 			fw->blocks_per_delay = (fw->bpd1 + fw->bpd2) / 2;
-		} else if (is_rate_below(fw, delay, inst_speed)) {
+		} else if (is_rate_below(fw, delay_ns, inst_speed)) {
 			fw->bpd1 = fw->blocks_per_delay;
 			fw->blocks_per_delay = (fw->bpd1 + fw->bpd2) / 2;
 		} else
@@ -370,7 +498,19 @@ int measure(int fd, struct flow *fw, long processed)
 		break;
 
 	case FW_STEADY: {
-		if (delay <= fw->delay_ms) {
+		if (!fw->has_rem_chunk_size) {
+			/* Recommend a chunk size to caller.
+			 * Execution reaches here when fw->max_process_rate is
+			 * throttling the flow.
+			 */
+			assert(fw->rem_chunk_size != 0);
+			fw->has_rem_chunk_size = true;
+			/* Since it's in steady state, go for another round
+			 * before making any change.
+			 */
+			break;
+		}
+		if (delay_ns <= fw->delay_ns) {
 			if (inst_speed < fw->max_process_rate) {
 				move_to_inc(fw);
 			} else if (inst_speed > fw->max_process_rate) {
@@ -386,19 +526,18 @@ int measure(int fd, struct flow *fw, long processed)
 		assert(0);
 	}
 
-	if (fw->progress)
-		report_progress(fw, inst_speed);
+	report_progress(fw, inst_speed);
 
 	/* Reset accumulators. */
 	fw->processed_blocks = 0;
-	fw->acc_delay_us = 0;
+	fw->acc_delay_ns = 0;
 	__start_measurement(fw);
 	return 0;
 }
 
 int end_measurement(int fd, struct flow *fw)
 {
-	struct timeval t2;
+	struct timespec t2;
 	int saved_errno;
 	int ret = 0;
 
@@ -412,14 +551,12 @@ int end_measurement(int fd, struct flow *fw)
 	}
 
 	/* Save time in between closing ongoing file and creating a new file. */
-	assert(!gettimeofday(&t2, NULL));
-	fw->acc_delay_us += diff_timeval_us(&fw->t1, &t2);
+	assert(!clock_gettime(CLOCK_MONOTONIC, &t2));
+	fw->acc_delay_ns += diff_timespec_ns(&fw->t1, &t2);
 
 out:
 	/* Erase progress information. */
-	erase(fw->erase);
-	fw->erase = 0;
-	fflush(stdout);
+	clear_progress(fw);
 
 	if (ret < 0) {
 		/* Propagate errno(3) to caller. */
@@ -453,7 +590,8 @@ void print_measured_speed(const struct flow *fw, const struct timeval *t1,
 		int64_t total_time_ms = delay_ms(t1, t2);
 		if (total_time_ms > 0) {
 			pr_avg_speed(speed_type,
-				get_avg_speed_given_time(fw, total_time_ms));
+				get_avg_speed_given_time(fw, total_time_ms *
+					1000000ULL));
 		} else {
 			assert(strlen(speed_type) > 0);
 			printf("%c%s speed not available\n",
diff --git a/libflow.h b/libflow.h
index 3ee7b8d..8a51394 100644
--- a/libflow.h
+++ b/libflow.h
@@ -5,7 +5,9 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <sys/time.h>
+#include <time.h>
+
+#include "libutils.h"
 
 struct flow;
 
@@ -16,12 +18,14 @@ struct flow {
 	uint64_t	total_size;
 	/* Total number of bytes already processed. */
 	uint64_t	total_processed;
-	/* If true, show progress. */
-	int		progress;
+	/* Callback to show progress. */
+	progress_cb	cb;
+	/* Indentation level for callback. */
+	unsigned int	indent;
 	/* Block size in bytes. */
 	int		block_size;
-	/* Delay intended between measurements in milliseconds. */
-	unsigned int	delay_ms;
+	/* Delay intended between measurements in nanoseconds. */
+	uint64_t	delay_ns;
 	/* Increment to apply to @blocks_per_delay. */
 	int64_t		step;
 	/* Blocks to process before measurement. */
@@ -31,7 +35,7 @@ struct flow {
 	/* Number of measured blocks. */
 	uint64_t	measured_blocks;
 	/* Measured time. */
-	uint64_t	measured_time_ms;
+	uint64_t	measured_time_ns;
 	/* State. */
 	enum {FW_INC, FW_DEC, FW_SEARCH, FW_STEADY} state;
 	/* Number of characters to erase before printing out progress. */
@@ -46,42 +50,52 @@ struct flow {
 	 * Initialized while measuring
 	 */
 
+	/* Has a recommended chunk size? */
+	bool		has_rem_chunk_size;
+	/* Recommended chunk size. */
+	uint64_t	rem_chunk_size;
+	/* Speed of the recommended chunk size in bytes per second. */
+	double		rem_chunk_speed;
+	/* Only used while has_rem_chunk_size is false. */
+	struct timespec	prv_t2;
+
 	/* Number of blocks processed since last measurement. */
 	int64_t		processed_blocks;
 	/*
 	 * Accumulated delay before @processed_blocks reaches @blocks_per_delay
-	 * in microseconds.
+	 * in nanoseconds.
 	 */
-	uint64_t	acc_delay_us;
+	uint64_t	acc_delay_ns;
 	/* Range of blocks_per_delay while in FW_SEARCH state. */
 	int64_t		bpd1, bpd2;
 	/* Time measurements. */
-	struct timeval	t1;
+	struct timespec	t1;
 };
 
 /* If @max_process_rate <= 0, the maximum processing rate is infinity.
  * The unit of @max_process_rate is KB per second.
  */
 void init_flow(struct flow *fw, int block_size, uint64_t total_size,
-	long max_process_rate, int progress,
+	long max_process_rate, progress_cb cb, unsigned int indent,
 	flow_func_flush_chunk_t func_flush_chunk);
 
-void start_measurement(struct flow *fw);
-int measure(int fd, struct flow *fw, long processed);
-void clear_progress(struct flow *fw);
-int end_measurement(int fd, struct flow *fw);
-
-static inline int has_enough_measurements(const struct flow *fw)
+static inline void inc_total_size(struct flow *fw, uint64_t size)
 {
-	return fw->measured_time_ms > fw->delay_ms;
+	fw->total_size = fw->total_processed + size;
 }
 
-static inline uint64_t get_rem_chunk_size(const struct flow *fw)
+static inline void fw_set_indent(struct flow *fw, unsigned int indent)
 {
-	assert(fw->blocks_per_delay > fw->processed_blocks);
-	return (fw->blocks_per_delay - fw->processed_blocks) * fw->block_size;
+	fw->indent = indent;
 }
 
+uint64_t get_rem_chunk_size(const struct flow *fw);
+
+void start_measurement(struct flow *fw);
+int measure(int fd, struct flow *fw, long processed);
+void clear_progress(struct flow *fw);
+int end_measurement(int fd, struct flow *fw);
+
 void print_measured_speed(const struct flow *fw, const struct timeval *t1,
 	const struct timeval *t2, const char *speed_type);
 
diff --git a/libprobe.c b/libprobe.c
index 91f25bc..cb52adc 100644
--- a/libprobe.c
+++ b/libprobe.c
@@ -1,4 +1,6 @@
-#include <stdarg.h>
+#define _POSIX_C_SOURCE 200112L
+#define _XOPEN_SOURCE 600
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdbool.h>
@@ -8,103 +10,274 @@
 #include <inttypes.h>
 
 #include "libutils.h"
+#include "libflow.h"
 #include "libprobe.h"
 
-static int _write_blocks(struct device *dev, char *buf,
-	uint64_t first_pos, uint64_t last_pos, probe_progress_cb cb)
+static int _write_blocks(struct device *dev, const char *buf,
+	uint64_t first_pos, uint64_t last_pos, struct flow *fw,
+	progress_cb cb, unsigned int indent)
 {
 	if (dev_write_blocks(dev, buf, first_pos, last_pos) &&
-		dev_write_blocks(dev, buf, first_pos, last_pos)) {
-		cb("I/O ERROR: Write error at blocks [%" PRIu64 ", %" PRIu64 "]!\n",
+			dev_write_blocks(dev, buf, first_pos, last_pos)) {
+		clear_progress(fw);
+		cb(indent, "I/O ERROR: Write error at blocks [%" PRIu64 ", %" PRIu64 "]!\n",
 			first_pos, last_pos);
 		return true;
 	}
 	return false;
 }
 
-static int write_blocks(struct device *dev,
-	uint64_t first_pos, uint64_t last_pos, uint64_t salt,
-	probe_progress_cb cb)
+/* Some fake drives have a "tiny" (e.g. 8KB) cache for random accesses and
+ * a "large" (e.g. 4MB) cache for sequential accesses.  So, for these
+ * fake drives, a random read may return a bad block, while a sequential
+ * read that includes that block returns it as a good block.
+ * This situation has been verified with the donated drive from
+ * issue #50 (https://github.com/AltraMayor/f3/issues/50).
+ *
+ * The example cache sizes come from the following
+ * discussion among Linux kernel developers:
+ * https://linux-arm-kernel.infradead.narkive.com/h3crV0D3/mmc-quirks-relating-to-performance-lifetime
+ *
+ * To circumvent this problem, the probe must only issue random reads.
+ */
+struct rdwr_info {
+	uint64_t cache_pos;
+	uint64_t cache_size_block;
+	uint64_t salt;
+
+	struct dynamic_buffer seqw_dbuf;
+	struct flow seqw_fw;
+	struct flow randw_fw;
+
+	struct flow randr_fw;
+};
+
+static int write_random_blocks(struct device *dev, const uint64_t pos[],
+	uint32_t n_pos, struct rdwr_info *rwi, progress_cb cb,
+	unsigned int indent)
 {
 	const int block_order = dev_get_block_order(dev);
 	const int block_size = dev_get_block_size(dev);
 	/* Aligning these pointers is necessary to directly read and write
-	 * the block device.
-	 * For the file device, this is superfluous.
+	 * the block device. For the file device, this is superfluous.
 	 */
-	char stack[align_head(block_order) + BIG_BLOCK_SIZE_BYTE];
+	char stack[align_head(block_order) + block_size];
 	char *buffer = align_mem(stack, block_order);
-	char *stamp_blk = buffer;
-	char *flush_blk = buffer + BIG_BLOCK_SIZE_BYTE;
-	uint64_t offset = first_pos << block_order;
-	uint64_t pos, write_pos = first_pos;
-
-	for (pos = first_pos; pos <= last_pos; pos++) {
-		fill_buffer_with_block(stamp_blk, block_order, offset, salt);
-		stamp_blk += block_size;
-		offset += block_size;
-
-		if (stamp_blk == flush_blk || pos == last_pos) {
-			if (_write_blocks(dev, buffer, write_pos, pos, cb))
-				return true;
-			stamp_blk = buffer;
-			write_pos = pos + 1;
-		}
+	uint32_t i;
+
+	if (n_pos == 0)
+		return false;
+
+	inc_total_size(&rwi->randw_fw, n_pos << block_order);
+	fw_set_indent(&rwi->randw_fw, indent);
+
+	start_measurement(&rwi->randw_fw);
+	for (i = 0; i < n_pos; i++) {
+		fill_buffer_with_block(buffer, block_order,
+			pos[i] << block_order, rwi->salt);
+		if (_write_blocks(dev, buffer, pos[i], pos[i], &rwi->randw_fw,
+				cb, indent))
+			return true;
+		measure(0, &rwi->randw_fw, block_size);
 	}
+	end_measurement(0, &rwi->randw_fw);
+	return false;
+}
+
+static int write_blocks(struct device *dev,
+	uint64_t first_block, uint64_t last_block,
+	struct rdwr_info *rwi, progress_cb cb, unsigned int indent)
+{
+	const int block_order = dev_get_block_order(dev);
+	const int block_size = dev_get_block_size(dev);
+	uint64_t offset = first_block << block_order;
+	uint64_t first_pos = first_block;
+
+	if (first_block > last_block)
+		return false;
 
+	inc_total_size(&rwi->seqw_fw,
+		(last_block - first_block + 1) << block_order);
+	fw_set_indent(&rwi->seqw_fw, indent);
+
+	start_measurement(&rwi->seqw_fw);
+	while (first_pos <= last_block) {
+		const uint64_t chunk_bytes = get_rem_chunk_size(&rwi->seqw_fw);
+		const uint64_t needed_size =
+			align_head(block_order) + chunk_bytes;
+		const uint64_t max_blocks_to_write =
+			last_block - first_pos + 1;
+		uint64_t blocks_to_write;
+		int shift;
+		char *buffer, *stamp_blk;
+		size_t buf_len;
+		uint64_t pos, next_pos;
+
+		buffer = align_mem2(dbuf_get_buf(&rwi->seqw_dbuf, needed_size),
+			block_order, &shift);
+		buf_len = dbuf_get_len(&rwi->seqw_dbuf);
+
+		blocks_to_write = buf_len >= needed_size
+			? chunk_bytes >> block_order
+			: (buf_len - shift) >> block_order;
+		if (blocks_to_write > max_blocks_to_write)
+			blocks_to_write = max_blocks_to_write;
+
+		next_pos = first_pos + blocks_to_write - 1;
+
+		stamp_blk = buffer;
+		for (pos = first_pos; pos <= next_pos; pos++) {
+			fill_buffer_with_block(stamp_blk, block_order, offset,
+				rwi->salt);
+			stamp_blk += block_size;
+			offset += block_size;
+		}
+
+		if (_write_blocks(dev, buffer, first_pos, next_pos,
+				&rwi->seqw_fw, cb, indent))
+			return true;
+
+		/* Since parameter func_flush_chunk of init_flow() is NULL,
+		 * the parameter fd of measure() is ignored.
+		 */
+		measure(0, &rwi->seqw_fw, blocks_to_write << block_order);
+		first_pos = next_pos + 1;
+	}
+	end_measurement(0, &rwi->seqw_fw);
 	return false;
 }
 
-static inline int high_level_reset(struct device *dev, uint64_t start_pos,
-	uint64_t cache_size_block, uint64_t salt, probe_progress_cb cb)
+static int overwhelm_cache(struct device *dev,
+	struct rdwr_info *rwi, progress_cb cb, unsigned int indent)
 {
-	return write_blocks(dev, start_pos, start_pos + cache_size_block - 1,
-		salt, cb);
+	if (rwi->cache_size_block == 0)
+		return false;
+	cb(indent, "Overwhelming cache\n");
+	return write_blocks(dev, rwi->cache_pos,
+		rwi->cache_pos + rwi->cache_size_block - 1, rwi, cb, indent);
 }
 
-/* Some fake drives have a "tiny" (e.g. 8KB) cache for random accesses and
- * a "large" (e.g. 4MB) cache for sequential accesses.  So, for these
- * fake drives, a random read may return a bad block, while a sequential
- * read that includes that block returns it as a good block.
- * This situation has been verified with the donated drive from
- * issue #50 (https://github.com/AltraMayor/f3/issues/50).
- *
- * The example cache sizes come from the following
- * discussion among Linux kernel developers:
- * https://linux-arm-kernel.infradead.narkive.com/h3crV0D3/mmc-quirks-relating-to-performance-lifetime
- *
- * To circunvent this problem, the probe must only issue random reads.
- */
-static int read_blocks(struct device *dev, char *buf, uint64_t pos,
-	probe_progress_cb cb)
+static int read_block(struct device *dev, char *buf, uint64_t pos,
+	struct flow *fw, progress_cb cb, unsigned int indent)
 {
 	if (dev_read_blocks(dev, buf, pos, pos) &&
 		dev_read_blocks(dev, buf, pos, pos)) {
-		cb("I/O ERROR: Read error at block %" PRIu64 "!\n", pos);
+		clear_progress(fw);
+		cb(indent, "I/O ERROR: Read error at block %" PRIu64 "!\n",
+			pos);
 		return true;
 	}
 	return false;
 }
 
-static int is_block_good(struct device *dev, uint64_t pos, int *pis_good,
-	uint64_t salt, probe_progress_cb cb)
+static uint64_t bs_to_set(enum block_state bs)
+{
+	switch (bs) {
+	case bs_unknown:
+	case bs_good:
+	case bs_bad:
+	case bs_changed:
+	case bs_overwritten:
+		assert(bs < sizeof(uint64_t) * 8);
+		return 1ULL << bs;
+
+	default:
+		assert(0);
+	}
+}
+
+static uint64_t bss_to_set(const enum block_state bss[], uint32_t n_bs)
+{
+	uint64_t bs_set = 0;
+	uint32_t i;
+
+	for (i = 0; i < n_bs; i++)
+		bs_set |= bs_to_set(bss[i]);
+	return bs_set;
+}
+
+static inline bool in_bs_set(uint64_t bs_set, enum block_state bs)
+{
+	assert(bs < sizeof(bs_set) * 8);
+	return (bs_set >> bs) & 1;
+}
+
+struct def_x_block {
+	uint64_t pos;
+	uint64_t expected_offset;
+};
+
+static int find_first_x_block(struct device *dev,
+	const struct def_x_block x_blocks[], uint32_t n_blocks,
+	uint64_t bs_set, uint32_t *pfirst_x_block_idx,
+	enum block_state *pstate, struct rdwr_info *rwi,
+	progress_cb cb, unsigned int indent)
 {
-	const int block_size = dev_get_block_size(dev);
 	const int block_order = dev_get_block_order(dev);
+	const int block_size = dev_get_block_size(dev);
 	char stack[align_head(block_order) + block_size];
 	char *probe_blk = align_mem(stack, block_order);
-	uint64_t found_offset;
+	uint32_t i;
+
+	if (n_blocks == 0)
+		goto not_found;
+
+	inc_total_size(&rwi->randr_fw, n_blocks << block_order);
+	fw_set_indent(&rwi->randr_fw, indent);
+
+	start_measurement(&rwi->randr_fw);
+	for (i = 0; i < n_blocks; i++) {
+		uint64_t found_offset;
+		enum block_state bs;
+
+		if (read_block(dev, probe_blk, x_blocks[i].pos, &rwi->randr_fw,
+				cb, indent))
+			return true;
+		bs = validate_buffer_with_block(probe_blk, block_order,
+			x_blocks[i].expected_offset, &found_offset, rwi->salt);
+		measure(0, &rwi->randr_fw, block_size);
+
+		if (in_bs_set(bs_set, bs)) {
+			/* Found the first x_block. */
+			*pfirst_x_block_idx = i;
+			*pstate = bs;
+			end_measurement(0, &rwi->randr_fw);
+			return false;
+		}
+	}
+	end_measurement(0, &rwi->randr_fw);
+
+not_found:
+	*pfirst_x_block_idx = n_blocks;
+	return false;
+}
+
+static int find_first_bad_block(struct device *dev, const uint64_t pos[],
+	uint32_t n_pos, bool *pany_bad, uint64_t *pbad_pos,
+	struct rdwr_info *rwi, progress_cb cb, unsigned int indent)
+{
+	const int block_order = dev_get_block_order(dev);
+	/* All but bs_good. */
+	const enum block_state bss[] = {bs_unknown, bs_bad, bs_changed,
+		bs_overwritten};
+	struct def_x_block x_blocks[n_pos];
 	enum block_state bs;
+	uint32_t i;
 
-	if (read_blocks(dev, probe_blk, pos, cb))
-		return true;
+	for (i = 0; i < n_pos; i++) {
+		x_blocks[i].pos = pos[i];
+		x_blocks[i].expected_offset = pos[i] << block_order;
+	}
 
-	bs = validate_buffer_with_block(probe_blk, block_order,
-		(pos << block_order), &found_offset, salt);
-	*pis_good = bs == bs_good;
-	if (!*pis_good) {
-		cb("INFO: Block %" PRIu64 " is %s!\n",
-			pos, block_state_to_str(bs));
+	if (find_first_x_block(dev, x_blocks, n_pos,
+			bss_to_set(bss, DIM(bss)),
+			&i, &bs, rwi, cb, indent))
+		return true;
+	*pany_bad = i < n_pos;
+	if (*pany_bad) {
+		*pbad_pos = x_blocks[i].pos;
+		cb(indent, "INFO: Block %" PRIu64 " is %s!\n",
+			*pbad_pos, block_state_to_str(bs));
 	}
 	return false;
 }
@@ -131,51 +304,106 @@ static uint64_t uint64_rand_range(uint64_t a, uint64_t b)
 	return a + (r % (b - a + 1));
 }
 
-#define N_BLOCK_SAMPLES	64
+/* Since the list size is small, at most SAMPLING_MAX blocks,
+ * the O(n_samples^2) complexity is not a problem.
+ */
+static void fill_with_unique_samples(uint64_t *samples, uint32_t n_samples,
+	uint64_t first_pos, uint64_t last_pos)
+{
+	uint32_t i, j;
+
+	assert(n_samples < last_pos - first_pos + 1);
+	for (i = 0; i < n_samples; ) {
+		uint64_t r = uint64_rand_range(first_pos, last_pos);
+		bool unique = true;
+		for (j = 0; j < i; j++) {
+			if (samples[j] == r) {
+				unique = false;
+				break;
+			}
+		}
+		if (unique) {
+			samples[i] = r;
+			i++;
+		}
+	}
+}
 
+static int uint64_cmp(const void *pa, const void *pb)
+{
+	const uint64_t a = *(const uint64_t *)pa;
+	const uint64_t b = *(const uint64_t *)pb;
+	return (a > b) - (a < b); /* a - b would be truncated to int. */
+}
+
+/* Fill @samples with unique positions from [@first_pos, @last_pos]. If the
+ * range has more than *@pn_samples blocks, pick them at random (sorted if
+ * @sorted is true); otherwise list every block in order and shrink
+ * *@pn_samples. *@pis_linear is true only for in-order lists of 2+ blocks.
+ */
+static void fill_samples(uint64_t *samples, uint32_t *pn_samples,
+	uint64_t first_pos, uint64_t last_pos, bool sorted, bool *pis_linear)
+{
+	const uint64_t gap = last_pos - first_pos + 1;
+	*pis_linear = gap <= *pn_samples;
+	if (*pis_linear) {
+		uint32_t i;
+		*pn_samples = gap;
+		for (i = 0; i < gap; i++)
+			samples[i] = first_pos + i;
+
+		/* Treat single blocks as random reads instead of
+		 * sequential ones.
+		 */
+		*pis_linear = gap > 1;
+	} else {
+		fill_with_unique_samples(samples, *pn_samples, first_pos,
+			last_pos);
+		if (sorted) {
+			qsort(samples, *pn_samples, sizeof(uint64_t),
+				uint64_cmp);
+		}
+	}
+}
+
+/* Let g be the number of good blocks between
+ *	@first_pos and @last_pos including them.
+ * Let b be the number of bad and overwritten blocks between
+ *	@first_pos and @last_pos including them.
+ *
+ * The probability Pr_g of sampling a good block at random between
+ *	@first_pos and @last_pos is Pr_g = g / (g + b), and
+ *	the probability Pr_1b that among k block samples at least
+ *	one block is bad is Pr_1b = 1 - Pr_g^k.
+ *
+ * Assuming Pr_g <= 95% and k = 64, Pr_1b >= 96.2%.
+ *	That is, with high probability (i.e. Pr_1b),
+ *	one can find at least a bad block with k samples
+ *	when most blocks are good (Pr_g).
+ */
 static int probabilistic_test(struct device *dev,
 	uint64_t first_pos, uint64_t last_pos, int *pfound_a_bad_block,
-	uint64_t salt, probe_progress_cb cb)
+	struct rdwr_info *rwi, progress_cb cb, unsigned int indent)
 {
-	uint64_t gap;
-	int i, n, is_linear;
+	uint32_t n_samples = 64;
+	uint64_t samples[n_samples];
+	bool is_linear, any_bad;
+	uint64_t bad_pos;
 
 	if (first_pos > last_pos)
 		goto not_found;
 
-	/* Let g be the number of good blocks between
-	 *   @first_pos and @last_pos including them.
-	 * Let b be the number of bad and overwritten blocks between
-	 *   @first_pos and @last_pos including them.
-	 *
-	 * The probability Pr_g of sampling a good block at random between
-	 *	@first_pos and @last_pos is Pr_g = g / (g + b), and
-	 *	the probability Pr_1b that among k block samples at least
-	 *	one block is bad is Pr_1b = 1 - Pr_g^k.
-	 *
-	 * Assuming Pr_g <= 95% and k = 64, Pr_1b >= 96.2%.
-	 *	That is, with high probability (i.e. Pr_1b),
-	 *	one can find at least a bad block with k samples
-	 *	when most blocks are good (Pr_g).
-	 */
-
-	/* Test @samples. */
-	gap = last_pos - first_pos + 1;
-	is_linear = gap <= N_BLOCK_SAMPLES;
-	n = is_linear ? gap : N_BLOCK_SAMPLES;
-	for (i = 0; i < n; i++) {
-		uint64_t sample_pos = is_linear
-			? first_pos + i
-			: uint64_rand_range(first_pos, last_pos);
-		int is_good;
-
-		if (is_block_good(dev, sample_pos, &is_good, salt, cb))
-			return true;
-		if (!is_good) {
-			/* Found a bad block. */
-			*pfound_a_bad_block = true;
-			return false;
-		}
+	fill_samples(samples, &n_samples, first_pos, last_pos, false,
+		&is_linear);
+	cb(indent, "Sampling %" PRIu32 " blocks from blocks [%" PRIu64 ", %" PRIu64 "]\n",
+		n_samples, first_pos, last_pos);
+	if (find_first_bad_block(dev, samples, n_samples, &any_bad, &bad_pos,
+			rwi, cb, indent))
+		return true;
+	if (any_bad) {
+		/* Found a bad block. */
+		*pfound_a_bad_block = true;
+		return false;
 	}
 
 not_found:
@@ -183,93 +411,62 @@ static int probabilistic_test(struct device *dev,
 	return false;
 }
 
-static int uint64_cmp(const void *pa, const void *pb)
-{
-	const uint64_t *pia = pa;
-	const uint64_t *pib = pb;
-	return *pia - *pib;
-}
-
+/* Find a bad block in the range (left_pos, right_pos) using up to
+ * n_samples random samples.
+ *
+ * If a bad block is found, set *pright_pos to the position of the
+ * leftmost bad block.
+ *
+ * The code relies on the same analytical result derived
+ * in probabilistic_test().
+ */
 static int find_a_bad_block(struct device *dev, uint32_t n_samples,
 	uint64_t left_pos, uint64_t *pright_pos, int *found_a_bad_block,
-	uint64_t reset_pos, uint64_t cache_size_block, uint64_t salt,
-	probe_progress_cb cb)
+	struct rdwr_info *rwi, progress_cb cb, unsigned int indent)
 {
-	/* We need to list all sampled blocks because
-	 * we need a sorted array; read the code to find the why.
-	 * If the sorted array were not needed, one could save the seed
-	 * of the random sequence and repeat the sequence to read the blocks
-	 * after writing them.
-	 */
 	uint64_t samples[n_samples];
-	uint64_t gap, prv_sample;
-	uint32_t i;
-
-	cb("\tSampling %" PRIu32 " blocks from blocks (%" PRIu64 ", %" PRIu64 ")\n",
-		n_samples, left_pos, *pright_pos);
+	bool is_linear, any_bad;
+	uint64_t bad_pos;
 
 	if (n_samples == 0 || *pright_pos <= left_pos + 1) {
 		/* Nothing to sample. */
 		goto not_found;
 	}
 
-	/* The code below relies on the same analytical result derived
-	 * in probabilistic_test().
+	/* Sort entries of samples to minimize reads.
+	 * As soon as one finds a bad block, one can ignore the remaining
+	 * samples because the found bad block is the leftmost bad block.
 	 */
+	fill_samples(samples, &n_samples, left_pos + 1, *pright_pos - 1, true,
+		&is_linear);
+	cb(indent, "## Sampling %" PRIu32 " blocks from blocks (%" PRIu64 ", %" PRIu64 ")\n",
+		n_samples, left_pos, *pright_pos);
 
-	/* Fill up @samples. */
-	gap = *pright_pos - left_pos - 1;
-	if (gap <= n_samples) {
-		n_samples = gap;
-		for (i = 0; i < n_samples; i++)
-			samples[i] = left_pos + 1 + i;
+	cb(indent + 1, "Writing random blocks\n");
 
-		/* Write @samples. */
-		if (write_blocks(dev, left_pos + 1, *pright_pos - 1, salt, cb))
+	if (is_linear) {
+		if (write_blocks(dev, left_pos + 1, *pright_pos - 1, rwi,
+				cb, indent + 1))
 			return true;
 	} else {
-		for (i = 0; i < n_samples; i++)
-			samples[i] = uint64_rand_range(left_pos + 1,
-				*pright_pos - 1);
-
-		/* Sort entries of @samples to minimize reads.
-		 * As soon as one finds a bad block, one can stop and ignore
-		 * the remaining blocks because the found bad block is
-		 * the leftmost bad block.
-		 */
-		qsort(samples, n_samples, sizeof(uint64_t), uint64_cmp);
-
-		/* Write @samples. */
-		prv_sample = left_pos;
-		for (i = 0; i < n_samples; i++) {
-			if (samples[i] == prv_sample)
-				continue;
-			prv_sample = samples[i];
-			if (write_blocks(dev, prv_sample, prv_sample, salt, cb))
-				return true;
-		}
+		if (write_random_blocks(dev, samples, n_samples, rwi,
+				cb, indent + 1))
+			return true;
 	}
 
-	if (high_level_reset(dev, reset_pos, cache_size_block, salt, cb))
+	if (overwhelm_cache(dev, rwi, cb, indent + 1))
 		return true;
 
-	/* Test @samples. */
-	prv_sample = left_pos;
-	for (i = 0; i < n_samples; i++) {
-		int is_good;
-
-		if (samples[i] == prv_sample)
-			continue;
-
-		prv_sample = samples[i];
-		if (is_block_good(dev, prv_sample, &is_good, salt, cb))
-			return true;
-		if (!is_good) {
-			/* Found the leftmost bad block. */
-			*pright_pos = prv_sample;
-			*found_a_bad_block = true;
-			return false;
-		}
+	/* Test samples. */
+	cb(indent + 1, "Reading written blocks\n");
+	if (find_first_bad_block(dev, samples, n_samples, &any_bad, &bad_pos,
+			rwi, cb, indent + 1))
+		return true;
+	if (any_bad) {
+		/* Found the leftmost bad block. */
+		*pright_pos = bad_pos;
+		*found_a_bad_block = true;
+		return false;
 	}
 
 not_found:
@@ -291,18 +488,18 @@ static int find_a_bad_block(struct device *dev, uint32_t n_samples,
  */
 static int sampling_probe(struct device *dev,
 	uint64_t left_pos, uint64_t *pright_pos,
-	uint64_t reset_pos, uint64_t cache_size_block, uint64_t salt,
-	probe_progress_cb cb)
+	struct rdwr_info *rwi, progress_cb cb, unsigned int indent)
 {
 	uint32_t n_samples = SAMPLING_MIN;
 	int found_a_bad_block;
 	bool phase1 = true;
 
 	assert(SAMPLING_MAX >= SAMPLING_MIN);
+	cb(indent, "# Sampling\n");
+
 	while (*pright_pos > left_pos + n_samples + 1) {
 		if (find_a_bad_block(dev, n_samples, left_pos, pright_pos,
-				&found_a_bad_block, reset_pos,
-				cache_size_block, salt, cb))
+				&found_a_bad_block, rwi, cb, indent + 1))
 			return true;
 		if (found_a_bad_block)
 			continue;
@@ -320,18 +517,17 @@ static int sampling_probe(struct device *dev,
 		left_pos = (*pright_pos + left_pos) / 2;
 	}
 	if (find_a_bad_block(dev, n_samples, left_pos, pright_pos,
-			&found_a_bad_block, reset_pos,
-			cache_size_block, salt, cb))
+			&found_a_bad_block, rwi, cb, indent + 1))
 		return true;
 	return false;
 }
 
-static void report_cache_size_test(probe_progress_cb cb,
+static void report_cache_size_test(unsigned int indent, progress_cb cb,
 	const struct device *dev, uint64_t first_pos, uint64_t last_pos)
 {
 	double f_size = (last_pos - first_pos + 1) * dev_get_block_size(dev);
 	const char *unit = adjust_unit(&f_size);
-	cb("\tTesting cache size: %.2f %s; Blocks [%" PRIu64 ", %" PRIu64 "]\n",
+	cb(indent, "## Testing cache size: %.2f %s; Blocks [%" PRIu64 ", %" PRIu64 "]\n",
 		f_size, unit, first_pos, last_pos);
 }
 
@@ -339,8 +535,8 @@ static void report_cache_size_test(probe_progress_cb cb,
 #define MAX_CACHE_SIZE_BYTE	(1ULL << 30)
 
 static int find_cache_size(struct device *dev, const uint64_t left_pos,
-	uint64_t *pright_pos, uint64_t *pcache_size_block, const uint64_t salt,
-	probe_progress_cb cb)
+	uint64_t *pright_pos, struct rdwr_info *rwi, progress_cb cb,
+	unsigned int indent)
 {
 	const int block_order = dev_get_block_order(dev);
 	const uint64_t end_pos = *pright_pos - 1;
@@ -348,7 +544,7 @@ static int find_cache_size(struct device *dev, const uint64_t left_pos,
 	uint64_t final_write_target = MAX_CACHE_SIZE_BYTE >> block_order;
 	uint64_t first_pos = *pright_pos;
 
-	cb("# Find cache size\n");
+	cb(indent, "# Find cache size\n");
 
 	assert(write_target > 0);
 	assert(write_target < final_write_target);
@@ -373,19 +569,22 @@ static int find_cache_size(struct device *dev, const uint64_t left_pos,
 			break;
 		}
 
+		report_cache_size_test(indent + 1, cb, dev, first_pos, end_pos);
+
 		/* Write @write_target blocks before
 		 * the previously written blocks.
 		 */
-		report_cache_size_test(cb, dev, first_pos, end_pos);
-		if (write_blocks(dev, first_pos, last_pos, salt, cb))
+		cb(indent + 2, "Writing blocks [%" PRIu64 ", %" PRIu64 "]\n",
+			first_pos, last_pos);
+		if (write_blocks(dev, first_pos, last_pos, rwi, cb, indent + 2))
 			goto bad;
 
 		if (probabilistic_test(dev, first_pos, end_pos,
-			&found_a_bad_block, salt, cb))
+				&found_a_bad_block, rwi, cb, indent + 2))
 			goto bad;
 		if (found_a_bad_block) {
 			*pright_pos = first_pos;
-			*pcache_size_block = write_target == 1
+			rwi->cache_size_block = write_target == 1
 				? 0 /* There is no cache. */
 				: end_pos - first_pos + 1;
 			return false;
@@ -397,24 +596,48 @@ static int find_cache_size(struct device *dev, const uint64_t left_pos,
 
 	/* Good drive. */
 	*pright_pos = end_pos + 1;
-	*pcache_size_block = 0;
+	rwi->cache_size_block = 0;
 	return false;
 
 bad:
 	/* *pright_pos does not change. */
-	*pcache_size_block = 0;
+	rwi->cache_size_block = 0;
 	return true;
 }
 
 static int find_wrap(struct device *dev,
 	uint64_t left_pos, uint64_t *pright_pos,
-	uint64_t reset_pos, uint64_t cache_size_block, uint64_t salt,
-	probe_progress_cb cb)
+	struct rdwr_info *rwi, progress_cb cb, unsigned int indent)
 {
-	uint64_t offset, high_bit, pos = left_pos + 1;
-	int is_good, block_order;
+	const uint64_t good_block = left_pos + 1;
+	/* The smallest integer m such that 2^m > good_block. */
+	const uint32_t m = ceiling_log2(good_block + 1);
+	/* Let k be the *smallest* integer such that
+	 *	2^(m+k) + good_block >= *pright_pos
+	 *
+	 * Since this function has to test the blocks
+	 * 2^m + good_block, 2^(m+1) + good_block, ..., 2^(m+k-1) + good_block,
+	 * k corresponds to the number of samples to test.
+	 *
+	 * 2^(m+k) + good_block >= *pright_pos [=>]
+	 * 2^(m+k) >= *pright_pos - good_block [=>]
+	 * m + k >= log2(*pright_pos - good_block) [=>]
+	 * k >= log2(*pright_pos - good_block) - m [=>]
+	 * k = ceiling_log2(*pright_pos - good_block) - m
+	 */
+	const uint32_t aux = *pright_pos > good_block
+		? ceiling_log2(*pright_pos - good_block)
+		: 0;
+	const uint32_t n_samples = aux > m ? aux - m : 0;
+	struct def_x_block x_blocks[n_samples];
+	bool any_bad;
+	uint64_t bad_pos;
+	int block_order;
+	uint64_t expected_offset, high_bit;
+	uint32_t i;
+	enum block_state bs;
 
-	cb("# Find module\n");
+	cb(indent, "# Find module\n");
 
 	/*
 	 *	Basis
@@ -424,45 +647,54 @@ static int find_wrap(struct device *dev,
 	 * of the drive.
 	 */
 
-	if (pos >= *pright_pos)
+	if (good_block >= *pright_pos)
 		return false;
 
-	if (write_blocks(dev, pos, pos, salt, cb) ||
-			high_level_reset(dev, reset_pos, cache_size_block,
-				salt, cb) ||
-			is_block_good(dev, pos, &is_good, salt, cb) ||
-			!is_good)
+	cb(indent + 1, "Writing reference block %" PRIu64 "\n", good_block);
+	if (write_random_blocks(dev, &good_block, 1, rwi, cb, indent + 1) ||
+			overwhelm_cache(dev, rwi, cb, indent + 1))
+		return true;
+
+	cb(indent + 1, "Reading reference block\n");
+	if (find_first_bad_block(dev, &good_block, 1, &any_bad, &bad_pos,
+			rwi, cb, indent + 1) || any_bad)
 		return true;
 
 	/*
 	 *	Inductive step
 	 */
 
-	block_order = dev_get_block_order(dev);
-	offset = pos << block_order;
-	high_bit = clp2(pos);
-	if (high_bit <= pos)
-		high_bit <<= 1;
-	pos += high_bit;
-
-	while (pos < *pright_pos) {
-		char stack[align_head(block_order) + (1 << block_order)];
-		char *probe_blk = align_mem(stack, block_order);
-		uint64_t found_offset;
+	cb(indent + 1, "Probing module (reading %" PRIu32 " blocks)\n",
+		n_samples);
 
-		if (read_blocks(dev, probe_blk, pos, cb))
-			return true;
+	block_order = dev_get_block_order(dev);
+	expected_offset = good_block << block_order;
 
-		if (validate_buffer_with_block(probe_blk, block_order,
-				offset, &found_offset, salt) == bs_good) {
-			*pright_pos = high_bit;
-			return false;
-		}
+	/* high_bit starts as the smallest power of 2 greater than
+	 * good_block.
+	 */
+	high_bit = 1ULL << m; /* 2^m */
+	assert(high_bit > good_block);
 
+	/* Fill x_blocks in. */
+	for (i = 0; i < n_samples; i++) {
+		uint64_t pos = high_bit + good_block;
+		assert(pos < *pright_pos);
+		x_blocks[i].pos = pos;
+		x_blocks[i].expected_offset = expected_offset;
 		high_bit <<= 1;
-		pos = high_bit + left_pos + 1;
 	}
+	assert(high_bit + good_block >= *pright_pos);
 
+	if (find_first_x_block(dev, x_blocks, n_samples, bs_to_set(bs_good),
+			&i, &bs, rwi, cb, indent + 1))
+		return true;
+	if (i < n_samples) {
+		assert(bs == bs_good);
+		*pright_pos = x_blocks[i].pos - good_block; /* = high_bit */
+		cb(indent + 1, "INFO: Block %" PRIu64 " overwrites block %" PRIu64 "\n",
+			x_blocks[i].pos, good_block);
+	}
 	return false;
 }
 
@@ -482,52 +714,54 @@ uint64_t probe_device_max_blocks(const struct device *dev)
 		n * SAMPLING_MIN;		/* Upper bound for phase 2. */
 }
 
-void printf_cb(const char *format, ...)
-{
-	va_list args;
-	va_start(args, format);
-	vprintf(format, args);
-	va_end(args);
-}
-
-void report_probed_size(probe_progress_cb cb, const char *prefix,
-	uint64_t bytes, int block_order)
+void report_probed_size(unsigned int indent, progress_cb cb,
+	const char *prefix, uint64_t bytes, int block_order)
 {
 	double f = bytes;
 	const char *unit = adjust_unit(&f);
-	cb("%s %.2f %s (%" PRIu64 " blocks)\n", prefix, f, unit,
-		bytes >> block_order);
+	cb(indent, "%s %.2f %s (%" PRIu64 " blocks)\n",
+		prefix, f, unit, bytes >> block_order);
 }
 
-void report_probed_order(probe_progress_cb cb, const char *prefix, int order)
+void report_probed_order(unsigned int indent, progress_cb cb,
+	const char *prefix, int order)
 {
 	double f = (1ULL << order);
 	const char *unit = adjust_unit(&f);
-	cb("%s %.2f %s (2^%i Bytes)\n", prefix, f, unit, order);
+	cb(indent, "%s %.2f %s (2^%i Bytes)\n", prefix, f, unit, order);
 }
 
-void report_probed_cache(probe_progress_cb cb, const char *prefix,
-	uint64_t cache_size_block, int block_order)
-
+void report_probed_cache(unsigned int indent, progress_cb cb,
+	const char *prefix, uint64_t cache_size_block, int block_order)
 {
 	double f = (cache_size_block << block_order);
 	const char *unit = adjust_unit(&f);
-	cb("%s %.2f %s (%" PRIu64 " blocks)\n",
+	cb(indent, "%s %.2f %s (%" PRIu64 " blocks)\n",
 		prefix, f, unit, cache_size_block);
 }
 
 int probe_device(struct device *dev, uint64_t *preal_size_byte,
 	uint64_t *pannounced_size_byte, int *pwrap, uint64_t *pcache_size_block,
-	int *pblock_order, probe_progress_cb cb)
+	int *pblock_order, progress_cb cb, int show_progress)
 {
 	const uint64_t dev_size_byte = dev_get_size_byte(dev);
 	const int block_order = dev_get_block_order(dev);
-	uint64_t salt, cache_size_block;
-	uint64_t left_pos, right_pos, mid_drive_pos, reset_pos;
+	const int block_size = dev_get_block_size(dev);
+	const progress_cb fw_cb = show_progress ? cb : dummy_cb;
+	uint64_t left_pos, right_pos, mid_drive_pos;
+	struct rdwr_info rwi;
 	int wrap;
 
 	assert(block_order <= 20);
 
+	dbuf_init(&rwi.seqw_dbuf);
+	/* We initialize total_size to 0 because write_blocks() updates it
+	 * before writing.
+	 */
+	init_flow(&rwi.seqw_fw, block_size, 0, 0, fw_cb, 0, NULL);
+	init_flow(&rwi.randw_fw, block_size, 0, 0, fw_cb, 0, NULL);
+	init_flow(&rwi.randr_fw, block_size, 0, 0, fw_cb, 0, NULL);
+
 	/* @left_pos must point to a good block.
 	 * We just point to the last block of the first 1MB of the card
 	 * because this region is reserved for partition tables.
@@ -546,7 +780,7 @@ int probe_device(struct device *dev, uint64_t *preal_size_byte,
 	 * @left_pos points to a good block, and @right_pos to a bad block.
 	 */
 	if (left_pos >= right_pos) {
-		cache_size_block = 0;
+		rwi.cache_size_block = 0;
 		goto bad;
 	}
 
@@ -561,30 +795,26 @@ int probe_device(struct device *dev, uint64_t *preal_size_byte,
 	/* This call is needed due to rand(). */
 	srand(time(NULL));
 
-	salt = uint64_rand();
+	rwi.salt = uint64_rand();
 
-	cb("# Device geometry\n");
-	report_probed_size(cb, "=> Announced size:", dev_size_byte,
+	cb(0, "# Device geometry\n");
+	report_probed_size(0, cb, "=> Announced size:", dev_size_byte,
 		block_order);
-	report_probed_order(cb, "=> Physical block size:", block_order);
+	report_probed_order(0, cb, "=> Physical block size:", block_order);
 
-	if (find_cache_size(dev, mid_drive_pos - 1, &right_pos,
-		&cache_size_block, salt, cb))
+	if (find_cache_size(dev, mid_drive_pos - 1, &right_pos, &rwi, cb, 0))
 		goto bad;
 	assert(mid_drive_pos <= right_pos);
-	reset_pos = right_pos;
-	report_probed_cache(cb, "=> Approximate cache size:",
-		cache_size_block, block_order);
+	rwi.cache_pos = right_pos;
+	report_probed_cache(0, cb, "=> Approximate cache size:",
+		rwi.cache_size_block, block_order);
 
-	if (find_wrap(dev, left_pos, &right_pos,
-		reset_pos, cache_size_block, salt, cb))
+	if (find_wrap(dev, left_pos, &right_pos, &rwi, cb, 0))
 		goto bad;
 	wrap = ceiling_log2(right_pos << block_order);
-	report_probed_order(cb, "=> Module:", wrap);
+	report_probed_order(0, cb, "=> Module:", wrap);
 
-	cb("# Sampling\n");
-	if (sampling_probe(dev, left_pos, &right_pos, reset_pos,
-			cache_size_block, salt, cb))
+	if (sampling_probe(dev, left_pos, &right_pos, &rwi, cb, 0))
 		goto bad;
 
 	if (right_pos == left_pos + 1) {
@@ -601,9 +831,11 @@ int probe_device(struct device *dev, uint64_t *preal_size_byte,
 	*pwrap = ceiling_log2(dev_size_byte);
 
 out:
-	report_probed_size(cb, "=> Usable size:", *preal_size_byte, block_order);
+	dbuf_free(&rwi.seqw_dbuf);
+	report_probed_size(0, cb, "=> Usable size:",
+		*preal_size_byte, block_order);
 	*pannounced_size_byte = dev_size_byte;
-	*pcache_size_block = cache_size_block;
+	*pcache_size_block = rwi.cache_size_block;
 	*pblock_order = block_order;
 	return false;
 }
diff --git a/libprobe.h b/libprobe.h
index 590058d..2fbf6a7 100644
--- a/libprobe.h
+++ b/libprobe.h
@@ -3,25 +3,23 @@
 
 #include <stdint.h>
 
+#include "libutils.h"
 #include "libdevs.h"
 
 uint64_t probe_device_max_blocks(const struct device *dev);
 
-typedef void (*probe_progress_cb)(const char *format, ...);
+void report_probed_size(unsigned int indent, progress_cb cb,
+	const char *prefix, uint64_t bytes, int block_order);
 
-void printf_cb(const char *format, ...);
+void report_probed_order(unsigned int indent, progress_cb cb,
+	const char *prefix, int order);
 
-void report_probed_size(probe_progress_cb cb, const char *prefix,
-	uint64_t bytes, int block_order);
-
-void report_probed_order(probe_progress_cb cb, const char *prefix, int order);
-
-void report_probed_cache(probe_progress_cb cb, const char *prefix,
-	uint64_t cache_size_block, int block_order);
+void report_probed_cache(unsigned int indent, progress_cb cb,
+	const char *prefix, uint64_t cache_size_block, int block_order);
 
 int probe_device(struct device *dev, uint64_t *preal_size_byte,
 	uint64_t *pannounced_size_byte, int *pwrap,
 	uint64_t *pcache_size_block, int *pblock_order,
-	probe_progress_cb cb);
+	progress_cb cb, int show_progress);
 
 #endif	/* HEADER_LIBPROBE_H */
diff --git a/libutils.c b/libutils.c
index 05cb57d..2a00425 100644
--- a/libutils.c
+++ b/libutils.c
@@ -1,8 +1,12 @@
+#define _POSIX_C_SOURCE 200112L
+#define _XOPEN_SOURCE 600
+
 #include <stdio.h>	/* For fprintf().	*/
 #include <stdlib.h>	/* For strtoll().	*/
 #include <stdbool.h>
 #include <assert.h>
 #include <inttypes.h>
+#include <stdarg.h>
 
 #include "libutils.h"
 #include "version.h"
@@ -319,3 +323,50 @@ void print_stats(const struct block_stats *stats, int block_size,
 	print_stat("\tSlightly changed:", stats->changed, block_size, unit_name);
 	print_stat("\t     Overwritten:", stats->overwritten, block_size, unit_name);
 }
+
+static void print_indent(unsigned int indent, const char *indent_str)
+{
+	/* Write indent_str to stdout once per requested level. */
+	while (indent-- > 0)
+		printf("%s", indent_str);
+}
+
+/* Print format/args; a format starting with '\b' is treated as an erase. */
+static void vprintf_cb(unsigned int indent, const char *format, va_list args)
+{
+	static const char pad[] = "        ";
+	static const char erase[] = "\b\b\b\b\b\b\b\b";
+	int erasing;
+
+	assert(format != NULL);
+	erasing = format[0] == '\b';
+	if (!erasing)
+		print_indent(indent, pad);	/* Pad before the text. */
+	vprintf(format, args);
+	if (erasing)
+		print_indent(indent, erase);	/* Backspaces after the text. */
+}
+
+void printf_cb(unsigned int indent, const char *format, ...)
+{
+	va_list ap;	/* Variadic arguments forwarded to vprintf_cb(). */
+	va_start(ap, format);
+	vprintf_cb(indent, format, ap);	/* Output is not flushed here. */
+	va_end(ap);
+}
+
+void printf_flush_cb(unsigned int indent, const char *format, ...)
+{
+	va_list ap;	/* Variadic arguments forwarded to vprintf_cb(). */
+	va_start(ap, format);
+	vprintf_cb(indent, format, ap);
+	va_end(ap);
+	fflush(stdout);	/* Flush stdout after every message. */
+}
+
+void dummy_cb(unsigned int indent, const char *format, ...)
+{
+	/* Silent callback: every argument is deliberately ignored. */
+	UNUSED(format);
+	UNUSED(indent);
+}
diff --git a/libutils.h b/libutils.h
index ab550fc..9cd145a 100644
--- a/libutils.h
+++ b/libutils.h
@@ -3,12 +3,20 @@
 
 #include <stdint.h>
 #include <argp.h>	/* For struct argp_state.	*/
+#include <time.h>	/* For struct timespec.		*/
 #include <sys/time.h>	/* For struct timeval.		*/
 
 #define SECTOR_SIZE (512)
 #define SECTOR_ORDER (9)
 
 #define UNUSED(x)	((void)x)
+#define DIM(x)		(sizeof(x) / sizeof((x)[0]))
+
+typedef void (*progress_cb)(unsigned int indent, const char *format, ...);
+
+void printf_cb(unsigned int indent, const char *format, ...);
+void printf_flush_cb(unsigned int indent, const char *format, ...);
+void dummy_cb(unsigned int indent, const char *format, ...);
 
 int ilog2(uint64_t x);
 
@@ -92,6 +100,13 @@ static inline uint64_t diff_timeval_us(const struct timeval *t1,
 		t2->tv_usec - t1->tv_usec;
 }
 
+static inline uint64_t diff_timespec_ns(const struct timespec *t1,
+	const struct timespec *t2)
+{
+	/* Whole seconds scaled to ns first, then the tv_nsec difference. */
+	uint64_t ns = (t2->tv_sec - t1->tv_sec) * 1000000000ULL;
+	return ns + t2->tv_nsec - t1->tv_nsec;
+}
+
 void print_stats(const struct block_stats *stats, int block_size,
 	const char *unit_name);