diff --git a/README.md b/README.md
index 549321ec..e167f82e 100755
--- a/README.md
+++ b/README.md
@@ -246,6 +246,7 @@ VMAware also has support for a variety of languages, if C++ isn't the language y
+
What about using this for malware?
@@ -256,6 +257,26 @@ VMAware also has support for a variety of languages, if C++ isn't the language y
+
+
+Is a kernel-mode component planned to be developed?
+
+
+> No. A kernel-mode component would require serious auditing and a digitally signed driver. It would also be a dead end for VM bypassing (so it's not fun >:( ).
+>
+> In summary, we can still detect your ass while being completely user-mode.
+
+
+
+
+
+Is it thread-safe?
+
+
+> No. Don't call our library from multiple threads simultaneously; it takes no more than 1 second to run anyway.
+
+
+
I have linker errors when compiling
diff --git a/src/vmaware.hpp b/src/vmaware.hpp
index c5cff6c1..9d0d822b 100644
--- a/src/vmaware.hpp
+++ b/src/vmaware.hpp
@@ -982,14 +982,14 @@ struct VM {
return "Unknown";
}
- alignas(16) char buffer[49]{};
- u32* regs = reinterpret_cast(buffer);
+ u32 regs[12] = { 0 };
- // unrolled calls to fill buffer directly
cpu::cpuid(regs[0], regs[1], regs[2], regs[3], cpu::leaf::brand1);
cpu::cpuid(regs[4], regs[5], regs[6], regs[7], cpu::leaf::brand2);
cpu::cpuid(regs[8], regs[9], regs[10], regs[11], cpu::leaf::brand3);
+ static char buffer[49];
+ memcpy(buffer, regs, sizeof(regs));
buffer[48] = '\0';
// do NOT touch trailing spaces for the AMD_THREAD_MISMATCH technique
@@ -1010,31 +1010,21 @@ struct VM {
[[nodiscard]] static std::string cpu_manufacturer(const u32 leaf_id) {
- alignas(16) char buffer[13]{};
- u32* regs = reinterpret_cast(buffer);
-
- u32 eax = 0;
- u32 ebx = 0;
- u32 ecx = 0;
- u32 edx = 0;
-
+ u32 eax = 0, ebx = 0, ecx = 0, edx = 0;
cpu::cpuid(eax, ebx, ecx, edx, leaf_id);
- if (ebx == 0 && ecx == 0 && edx == 0) {
- return "";
- }
+ if (ebx == 0 && ecx == 0 && edx == 0) return "";
+ u32 regs[3] = { 0 };
if (leaf_id >= 0x40000000) {
- regs[0] = ebx;
- regs[1] = ecx;
- regs[2] = edx;
+ regs[0] = ebx; regs[1] = ecx; regs[2] = edx;
}
else {
- regs[0] = ebx;
- regs[1] = edx;
- regs[2] = ecx;
+ regs[0] = ebx; regs[1] = edx; regs[2] = ecx;
}
+ char buffer[13];
+ memcpy(buffer, regs, sizeof(regs));
buffer[12] = '\0';
return { buffer };
}
@@ -3155,38 +3145,6 @@ struct VM {
dest[i] = '\0';
}
- static void str_cat(char* dest, const char* src, size_t max_len) {
- size_t i = 0;
- while (dest[i] != '\0') {
- i++;
- }
-
- size_t j = 0;
- while (src[j] != '\0' && i < max_len - 1) {
- dest[i++] = src[j++];
- }
- dest[i] = '\0';
- }
-
- static bool str_eq(const char* a, const char* b) {
- if (a == b) {
- return true;
- }
-
- if (!a || !b) {
- return false;
- }
-
- while (*a && *b) {
- if (*a != *b) {
- return false;
- }
-
- a++; b++;
- }
- return *a == *b;
- }
-
// memoization
struct memo {
struct data_t {
@@ -3416,24 +3374,29 @@ struct VM {
#if (WINDOWS)
// timer helper functionalities
struct timer {
- #define VMAWARE_STR2(x) #x
- #define VMAWARE_STR(x) VMAWARE_STR2(x)
+ #if (x86_64)
+ using timer_tick_t = u64;
+ #else
+ using timer_tick_t = u32;
+ #endif
- // prevent false sharing when triggering hypervisor exits with the intentional data race condition
#if (MSVC)
#pragma warning(push)
#pragma warning(disable: 4324)
#endif
+ // align to prevent false sharing when triggering hypervisor exits with the intentional data race condition
struct alignas(64) cache_state {
- alignas(64) volatile u64 counter { 0 };
- alignas(64) std::atomic start_test{ false };
- alignas(64) std::atomic test_done{ false };
+ alignas(64) volatile timer_tick_t counter { 0 };
+ alignas(64) std::atomic start_test { false };
+ alignas(64) std::atomic test_done { false };
};
#if (MSVC)
#pragma warning(pop)
#endif
- static u32 get_ct_seed() {
+ #define VMAWARE_STR2(x) #x
+ #define VMAWARE_STR(x) VMAWARE_STR2(x)
+ [[nodiscard]] static u32 get_ct_seed() {
constexpr char s[] = __DATE__ " " __TIME__ " " __FILE__ " " VMAWARE_STR(__LINE__);
u32 h = 2166136261u;
for (char c : s) {
@@ -3444,8 +3407,28 @@ struct VM {
return h;
}
- // middle available logical CPU
- static DWORD_PTR get_trigger_mask() {
+ /*
+ Golden Rules (must ALWAYS hold; if they cannot be satisfied, the check is aborted):
+ 1. The check needs AT LEAST two different cores, so if only a single core is detected, it returns immediately
+ 2. The counter thread is always pinned to the middle available logical CPU when there are more than 2 cores, and to core 2 (1-indexed) when there are exactly 2 cores
+
+ Silver Rules (in order of priority):
+ 1. The trigger and the counter thread must not be on the same physical core (avoid SMT siblings) WHENEVER POSSIBLE
+ 2. The trigger and the counter thread should be within the same NUMA node or AMD CCD to minimize baseline latency WHENEVER POSSIBLE
+ 3. The counter and trigger thread should not be on the first or last logical CPU WHENEVER POSSIBLE
+ 4. If multiple valid candidates (core indexes) remain after applying the rules above, pick one at random to prevent hypervisors from predicting where the trigger thread is
+
+ Example: Imagine we have a CPU with 2 cores and 4 threads (with SMT enabled), then:
+ 1. We process golden rules first:
+ - CPU has more than 1 core -> OK
+ - Counter thread is pinned to core 2
+
+ 2. We process the silver rules, in order:
+ - Counter and trigger thread must not be in the same physical core -> Trigger thread is pinned to core 4
+ - Trigger and the counter thread should be within the same NUMA node -> FAILED
+ - Counter and trigger thread should not be in the first or last logical CPU -> FAILED, trigger thread had to be put in core 4 due to a silver rule with more priority
+ */
+ [[nodiscard]] static DWORD_PTR getmask(u32 ct_seed, bool trigger) {
const HANDLE current_process = reinterpret_cast(-1LL);
DWORD_PTR proc_mask = 0, sys_mask = 0;
@@ -3461,183 +3444,192 @@ struct VM {
}
}
- if (n < 2) return 0ull; // single-core abort
-
- // (1-indexed)
- // 2 cores -> trigger 1
- // 3 cores -> trigger 1
- // 4 cores -> trigger 1
- // 5 cores -> trigger 4
- // 6 cores -> trigger 5
- // >6 cores -> middle available logical CPU
- size_t trigger_pos0 = 0; // 0-based ordinal
+ if (n < 2) {
+ return 0ull;
+ }
- if (n == 2 || n == 3 || n == 4) {
- trigger_pos0 = 0;
+ // first call with a null buffer to obtain the required size, then call again with a buffer of that size
+ DWORD len = 0;
+ SetLastError(ERROR_SUCCESS);
+ GetLogicalProcessorInformationEx(RelationAll, nullptr, &len);
+ if (GetLastError() != ERROR_INSUFFICIENT_BUFFER || !len) {
+ return 0ull;
}
- else if (n == 5) {
- trigger_pos0 = 3;
+
+ std::vector topo(len);
+ if (!GetLogicalProcessorInformationEx(
+ RelationAll,
+ reinterpret_cast(topo.data()),
+ &len)) {
+ return 0ull;
}
- else if (n == 6) {
- trigger_pos0 = 4;
+
+ constexpr DWORD INVALID_CPU = 0xFFFFFFFFu;
+
+ DWORD logical_to_core[64];
+ DWORD logical_to_numa[64];
+ std::fill_n(logical_to_core, 64, INVALID_CPU);
+ std::fill_n(logical_to_numa, 64, INVALID_CPU);
+
+ DWORD core_count = 0;
+
+ size_t offset = 0;
+ while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX) <= len) {
+ auto* ptr = reinterpret_cast(topo.data() + offset);
+
+ switch (ptr->Relationship) {
+ case RelationProcessorCore: {
+ const DWORD core_id = core_count++;
+
+ for (DWORD g = 0; g < ptr->Processor.GroupCount; ++g) {
+ const KAFFINITY mask = ptr->Processor.GroupMask[g].Mask;
+ for (DWORD bit = 0; bit < 64; ++bit) {
+ if (mask & (1ull << bit)) {
+ logical_to_core[bit] = core_id;
+ }
+ }
+ }
+ break;
+ }
+
+ case RelationNumaNode: {
+ const DWORD node_id = ptr->NumaNode.NodeNumber;
+ const KAFFINITY mask = ptr->NumaNode.GroupMask.Mask;
+ for (DWORD bit = 0; bit < 64; ++bit) {
+ if (mask & (1ull << bit)) {
+ logical_to_numa[bit] = node_id;
+ }
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ if (!ptr->Size) {
+ return 0ull;
+ }
+ offset += ptr->Size;
}
- else {
- trigger_pos0 = n / 2;
+
+ // abort if only one physical core exists in the allowed affinity set
+ {
+ bool seen_core[64]{};
+ DWORD physical_cores = 0;
+
+ for (DWORD i = 0; i < n; ++i) {
+ const DWORD log = idxs[i];
+ const DWORD core = logical_to_core[log];
+ if (core == INVALID_CPU) {
+ return 0ull;
+ }
+ if (!seen_core[core]) {
+ seen_core[core] = true;
+ ++physical_cores;
+ }
+ }
+
+ if (physical_cores < 2) {
+ return 0ull;
+ }
}
- if (trigger_pos0 >= n) return 0ull;
- return 1ull << idxs[trigger_pos0];
- }
+ // counter: middle available logical CPU when >2, otherwise second available logical CPU
+ const DWORD counter_pos0 = (n == 2) ? 1u : (n / 2u);
+ if (counter_pos0 >= n) {
+ return 0ull;
+ }
- // random logical CPU, but exclude the trigger_thread, first, second and last available logical CPUs, avoiding SMT siblings
- static DWORD_PTR get_counter_mask(u32 ct_seed, DWORD_PTR trigger_mask) {
- const HANDLE current_process = reinterpret_cast(-1LL);
+ const DWORD counter_logical = idxs[counter_pos0];
+ const DWORD counter_core = logical_to_core[counter_logical];
+ const DWORD counter_numa = logical_to_numa[counter_logical];
- DWORD_PTR proc_mask = 0, sys_mask = 0;
- if (!GetProcessAffinityMask(current_process, &proc_mask, &sys_mask) || !proc_mask) {
+ if (counter_core == INVALID_CPU || counter_numa == INVALID_CPU) {
return 0ull;
}
- DWORD idxs[64]{};
- DWORD n = 0;
- for (DWORD i = 0; i < 64; ++i) {
- if (proc_mask & (1ull << i)) {
- idxs[n++] = i;
- }
+ if (!trigger) {
+ return 1ull << counter_logical;
}
- if (n < 2) return 0ull; // single-core abort
+ auto is_edge = [&](DWORD logical) -> bool {
+ return logical == idxs[0] || logical == idxs[n - 1];
+ };
- // get topology to identify SMT siblings
- DWORD len = 0;
- GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &len);
+ auto same_numa = [&](DWORD logical) -> bool {
+ return logical_to_numa[logical] != INVALID_CPU && logical_to_numa[logical] == counter_numa;
+ };
- // stack buffer fallback mechanism
- BYTE stack_buf[1024]{};
- std::vector heap_buf;
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = nullptr;
+ auto build_candidates = [&](bool require_same_numa, bool avoid_edges) {
+ std::vector out;
+ out.reserve(n);
- if (len <= sizeof(stack_buf)) {
- info = reinterpret_cast(stack_buf);
- }
- else {
- heap_buf.resize(len);
- info = reinterpret_cast(heap_buf.data());
- }
+ for (DWORD i = 0; i < n; ++i) {
+ const DWORD logical = idxs[i];
+ if (logical == counter_logical) {
+ continue;
+ }
- if (!GetLogicalProcessorInformationEx(RelationProcessorCore, info, &len)) {
- return 0ull; // no valid topology data, fail closed
- }
+ // never the same physical core as the counter thread (this filter is applied in every priority tier)
+ if (logical_to_core[logical] == counter_core) {
+ continue;
+ }
- // logical processor index to its physical core ID
- DWORD logical_to_core[64] = { 0 };
- DWORD_PTR core_mask[64] = { 0 };
- size_t offset = 0;
- DWORD core_idx = 0;
- while (offset < len) {
- auto ptr = reinterpret_cast(reinterpret_cast(info) + offset);
- for (DWORD i = 0; i < ptr->Processor.GroupCount; ++i) {
- KAFFINITY mask = ptr->Processor.GroupMask[i].Mask;
- for (int b = 0; b < 64; ++b) {
- if (mask & (1ull << b)) {
- logical_to_core[b] = core_idx;
- core_mask[core_idx] |= (1ull << b);
- }
+ if (avoid_edges && is_edge(logical)) {
+ continue;
}
+
+ if (require_same_numa && !same_numa(logical)) {
+ continue;
+ }
+
+ out.push_back(logical);
}
- offset += ptr->Size;
- core_idx++;
- }
- auto pick_by_ordinal = [&](size_t ord0) -> DWORD_PTR {
- if (ord0 >= n) return 0ull;
- return 1ull << idxs[ord0];
+ return out;
};
- DWORD_PTR choices = 0ull;
-
- // exact placement rules:
- // 2 cores -> counter 2
- // 3 cores -> counter 3
- // 4 cores -> counter 3
- // 5 cores -> counter 2
- // 6 cores -> counter 3
- if (n == 2) {
- choices = pick_by_ordinal(1);
- }
- else if (n == 3 || n == 4) {
- choices = pick_by_ordinal(2);
- }
- else if (n == 5) {
- choices = pick_by_ordinal(1);
- }
- else if (n == 6) {
- choices = pick_by_ordinal(2);
+ // priority order
+ // 1) different physical core + same NUMA + not first/last
+ // 2) different physical core + same NUMA
+ // 3) different physical core + not first/last
+ // 4) different physical core
+ std::vector candidates = build_candidates(true, true);
+ if (candidates.empty()) candidates = build_candidates(true, false);
+ if (candidates.empty()) candidates = build_candidates(false, true);
+ if (candidates.empty()) candidates = build_candidates(false, false);
+
+ if (candidates.empty()) {
+ return 0ull;
}
- else {
- // > 6 cores:
- // trigger in the middle available logical CPU
- size_t trigger_pos0 = n / 2;
- if (trigger_pos0 >= n) return 0ull;
-
- DWORD trigger_logical = idxs[trigger_pos0];
- DWORD trigger_core_id = logical_to_core[trigger_logical];
-
- choices = proc_mask;
-
- // random so that the hypervisor doesn't know where the counter thread is
- // this will affect latency if cache lines from trigger_thread and counter_thread are separated enough due to cores being too distant
- // however, we do a ratio based detection, so this wont affect the detection accuracy because the cache latency affects both samples
- choices &= ~trigger_mask;
-
- if (trigger_pos0 >= 1)
- choices &= ~(1ull << idxs[trigger_pos0 - 1]); // adjacent left
- if (trigger_pos0 + 1 < n)
- choices &= ~(1ull << idxs[trigger_pos0 + 1]); // adjacent right
-
- choices &= ~(1ull << idxs[0]); // first core
- choices &= ~(1ull << idxs[n - 1]); // last core
- choices &= ~core_mask[trigger_core_id]; // avoid SMT siblings of the trigger core
-
- // if exclusions leave nothing, fail closed
- if (!choices) return 0ull;
-
- DWORD pick[64]{}, m = 0;
- for (DWORD i = 0; i < 64; ++i) {
- if (choices & (1ull << i))
- pick[m++] = i;
- }
- if (!m) return 0ull;
-
- // random so that the hypervisor doesn't know where the counter thread is
- // this will affect latency if cache lines from trigger_thread and counter_thread are separated enough due to cores being too distant
- // however, we do a ratio based detection, so this wont affect the detection accuracy because the cache latency affects both samples
- u64 seed = 0;
- seed ^= static_cast(ct_seed);
- seed ^= static_cast(reinterpret_cast(&current_process));
- seed ^= static_cast(reinterpret_cast(&proc_mask)) << 1;
- seed ^= static_cast(reinterpret_cast(&sys_mask)) << 2;
- seed ^= seed >> 33;
- seed *= 0xff51afd7ed558ccdULL;
- seed ^= seed >> 33;
- seed *= 0xc4ceb9fe1a85ec53ULL;
- seed ^= seed >> 33;
- std::seed_seq seq{
- static_cast(seed), static_cast(seed >> 32), static_cast(seed ^ 0x9e3779b9u), ct_seed
- };
- // std::random_device{}() uses RDRAND/RDSEED which can be intercepted by hypervisors
- // we use our own compile-time seed that cannot be taken by examining PE/Linux binary properties and would need static/dynamic analysis
- // this changes per build and per process session due to hardware ASLR
- std::mt19937 gen(seq);
- return 1ull << pick[std::uniform_int_distribution(0, m - 1)(gen)];
- }
+ u64 seed = 0;
+ seed ^= static_cast(ct_seed);
+ seed ^= static_cast(reinterpret_cast(&proc_mask));
+ seed ^= static_cast(reinterpret_cast(&sys_mask)) << 1;
+ seed ^= static_cast(counter_logical) << 2;
+ seed ^= static_cast(counter_core) << 3;
+ seed ^= seed >> 33;
+ seed *= 0xff51afd7ed558ccdULL;
+ seed ^= seed >> 33;
+ seed *= 0xc4ceb9fe1a85ec53ULL;
+ seed ^= seed >> 33;
- return choices;
+ std::seed_seq seq{
+ static_cast(seed),
+ static_cast(seed >> 32),
+ static_cast(seed ^ 0x9e3779b9u),
+ ct_seed
+ };
+
+ std::mt19937 gen(seq);
+ const DWORD logical = candidates[std::uniform_int_distribution(0, candidates.size() - 1)(gen)];
+ return 1ull << logical;
}
// we dont use cpu::cpuid on purpose
- static VMAWARE_FORCE_INLINE void trigger_vmexit() {
+ static VMAWARE_FORCE_INLINE void vmexit() {
#if (GCC || CLANG)
u32 a = 0, c = 0, d = 0;
#if (x86_64)
@@ -3665,157 +3657,36 @@ struct VM {
#endif
}
- static u64 calculate_latency(const std::vector& samples_in) {
+ [[nodiscard]] static timer_tick_t calculate_latency(const std::vector& samples_in) {
if (samples_in.empty()) return 0;
const size_t N = samples_in.size();
if (N == 1) return samples_in[0];
- // local sorted copy
- std::vector s = samples_in;
- std::sort(s.begin(), s.end()); // ascending
-
- // tiny-sample short-circuits
- if (N <= 4) return s.front();
-
- // median (and works for sorted input)
- auto median_of_sorted = [](const std::vector& v, size_t lo, size_t hi) -> u64 {
- // this is the median of v[lo..hi-1], requires 0 <= lo < hi
- const size_t len = hi - lo;
- if (len == 0) return 0;
- const size_t mid = lo + (len / 2);
- if (len & 1) return v[mid];
- return (v[mid - 1] + v[mid]) / 2;
- };
+ // create a local copy to sort
+ std::vector s = samples_in;
+ std::sort(s.begin(), s.end());
- // the robust center: median M and MAD -> approximate sigma
- const u64 M = median_of_sorted(s, 0, s.size());
+ // discard the lower 25% and upper 25%, leaving the middle 50%
+ const size_t low_idx = N / 4;
+ const size_t high_idx = (3 * N) / 4;
- // select the median deviation in linear time instead of sorting all deviations
- std::vector absdev;
- absdev.resize(N);
- for (size_t i = 0; i < N; ++i) {
- const u64 d = (s[i] > M) ? (s[i] - M) : (M - s[i]);
- absdev[i] = d;
+ double sum = 0;
+ size_t count = 0;
+ for (size_t i = low_idx; i < high_idx; ++i) {
+ sum += s[i];
+ count++;
}
- const size_t mad_mid = N / 2;
- std::nth_element(absdev.begin(), absdev.begin() + mad_mid, absdev.end());
-
- u64 MAD = 0;
- if (N & 1) {
- MAD = absdev[mad_mid];
- }
- else {
- const u64 upper = absdev[mad_mid];
- const u64 lower = *std::max_element(absdev.begin(), absdev.begin() + mad_mid);
- MAD = (lower + upper) / 2;
- }
-
- // convert MAD to an approximate standard-deviation-like measure
- constexpr long double kmad_to_sigma = 1.4826L; // consistent for normal approx
- const long double sigma = (MAD == 0) ? 1.0L : (static_cast(MAD) * kmad_to_sigma);
-
- // find the densest small-valued cluster by sliding a fixed-count window
- // this locates the most concentrated group of samples (likely it would be the true VMEXIT cluster)
- // const size_t frac_win = (N * 8 + 99) / 100; // ceil(N * 0.08)
- // const size_t win = std::min(N, std::max(MIN_WIN, frac_win));
- const size_t MIN_WIN = 10;
- // manual min/max calculation for win size
- const size_t calc_frac = static_cast(std::ceil(static_cast(N) * 0.08));
- const size_t inner_max = (MIN_WIN > calc_frac) ? MIN_WIN : calc_frac;
- const size_t win = (N < inner_max) ? N : inner_max;
-
- size_t best_i = 0;
- u64 best_span = (s.back() - s.front()) + 1; // large initial
- for (size_t i = 0; i + win <= N; ++i) {
- const u64 span = s[i + win - 1] - s[i];
- if (span < best_span) {
- best_span = span;
- best_i = i;
- }
- }
-
- // expand the initial window greedily while staying "tight"
- // allow expansion while adding samples does not more than multiply the span by EXPAND_FACTOR
- constexpr long double EXPAND_FACTOR = 1.5L;
- size_t cluster_lo = best_i;
- size_t cluster_hi = best_i + win; // exclusive
- // expand left
- while (cluster_lo > 0) {
- const u64 new_span = s[cluster_hi - 1] - s[cluster_lo - 1];
- if (static_cast(new_span) <= EXPAND_FACTOR * static_cast(best_span) ||
- (s[cluster_hi - 1] <= (s[cluster_lo - 1] + static_cast(std::ceil(3.0L * sigma))))) {
- --cluster_lo;
- // manual min calculation
- best_span = (best_span < new_span) ? best_span : new_span;
- }
- else break;
- }
- // expand right
- while (cluster_hi < N) {
- const u64 new_span = s[cluster_hi] - s[cluster_lo];
- if (static_cast(new_span) <= EXPAND_FACTOR * static_cast(best_span) ||
- (s[cluster_hi] <= (s[cluster_lo] + static_cast(std::ceil(3.0L * sigma))))) {
- ++cluster_hi;
- best_span = (best_span < new_span) ? best_span : new_span;
- }
- else break;
- }
-
- const size_t cluster_size = (cluster_hi > cluster_lo) ? (cluster_hi - cluster_lo) : 0;
-
- // cluster must be reasonably dense and cover a non-negligible portion of samples, so this is pure sanity checks
- const double fraction_in_cluster = static_cast(cluster_size) / static_cast(N);
-
- // min/max calculation for MIN_CLUSTER
- const int val_n_50 = static_cast(N / 50);
- const size_t val_max = static_cast((5 > val_n_50) ? 5 : val_n_50);
- const size_t MIN_CLUSTER = (val_max < N) ? val_max : N; // at least 2% or 5 elements
-
- if (cluster_size < MIN_CLUSTER || fraction_in_cluster < 0.02) {
- // low-percentile (10th) trimmed median
- // Manual max calculation for fallback_count
- const size_t floor_val = static_cast(std::floor(static_cast(N) * 0.10));
- const size_t fallback_count = (1 > floor_val) ? 1 : floor_val;
-
- // median of lowest fallback_count elements (if fallback_count==1 that's smallest)
- if (fallback_count == 1) return s.front();
- const size_t mid = fallback_count / 2;
- if (fallback_count & 1) return s[mid];
- return (s[mid - 1] + s[mid]) / 2;
- }
-
- // now we try to get a robust estimate inside the cluster, trimmed mean (10% trim) centered on cluster
- const size_t trim_count = static_cast(std::floor(static_cast(cluster_size) * 0.10));
- const size_t lo = cluster_lo + trim_count;
- const size_t hi = cluster_hi - trim_count; // exclusive
- if (hi <= lo) {
- // degenerate -> median of cluster
- return median_of_sorted(s, cluster_lo, cluster_hi);
- }
-
- // sum with long double to avoid overflow and better rounding
- long double sum = 0.0L;
- for (size_t i = lo; i < hi; ++i) sum += static_cast(s[i]);
- const long double avg = sum / static_cast(hi - lo);
- u64 result = static_cast(std::llround(avg));
-
- // final sanity adjustments:
- // if the computed result is suspiciously far from the global median (e.g., > +6*sigma)
- // clamp toward the median to avoid choosing a high noisy cluster by mistake
- const long double diff_from_med = static_cast(result) - static_cast(M);
- if (diff_from_med > 0 && diff_from_med > (6.0L * sigma)) {
- // clamp to median + 4*sigma (conservative)
- result = static_cast(std::llround(static_cast(M) + 4.0L * sigma));
- }
-
- // also, if result is zero (shouldn't be) or extremely small, return a smallest observed sample
- if (result == 0) result = s.front();
+ // fallback to the median if the dataset is too small
+ if (count == 0) return s[N / 2];
- return result;
+ // compute the average of the middle 50% and round to the nearest integer
+ return static_cast((sum / count) + 0.5);
}
- static VMAWARE_FORCE_INLINE void burn_random_cycles(u64 ct_seed, u64 v_post, u64 r_post) {
+ static VMAWARE_FORCE_INLINE void burn_random_cycles(u32 ct_seed, timer_tick_t v_post, timer_tick_t r_post) {
+ // the internal pseudo-random number generator (PRNG) variables like u64 seed and volatile u64 x can be kept as u64
+ // because they are simple register-only PRNG arithmetic and benefit from the extra 64-bit entropy space even on 32-bit platforms
u64 seed = ct_seed;
seed ^= static_cast(reinterpret_cast(&seed));
seed ^= static_cast(reinterpret_cast(&v_post)) << 1;
@@ -3835,9 +3706,7 @@ struct VM {
x ^= x >> 17;
}
- #if (CLANG || GCC)
- __asm__ volatile("" :: "r"(x) : "memory");
- #endif
+ std::atomic_signal_fence(std::memory_order_acq_rel);
}
};
#endif
@@ -5943,13 +5812,13 @@ struct VM {
*/
[[nodiscard]] static bool timer() {
#if (x86 && WINDOWS)
+ using timer = struct timer;
+
if (util::is_running_under_translator()) {
debug("TIMER: Running inside a binary translation layer");
return false;
}
- using timer = struct timer;
-
// calculation of minimum threshold
bool is_intel = cpu::is_intel();
double threshold = 2.5;
@@ -5962,39 +5831,32 @@ struct VM {
}
}
- // Shared state and results
+ // shared state and results
timer::cache_state state;
bool hypervisor_detected = false;
const u32 ct_seed = timer::get_ct_seed();
- const DWORD_PTR trigger_affinity = timer::get_trigger_mask();
- if (!trigger_affinity) {
- return false;
- }
+ const DWORD_PTR trigger_affinity = timer::getmask(ct_seed, true);
+ const DWORD_PTR counter_affinity = timer::getmask(ct_seed, false);
- const DWORD_PTR target_affinity = timer::get_counter_mask(ct_seed, trigger_affinity);
- if (!target_affinity) {
+ if (!trigger_affinity || !counter_affinity) {
return false;
}
- // our software clock, it will count how many cycles a vmexit takes
+ // our software clock
auto counter_thread = [&]() {
const HANDLE current_thread = reinterpret_cast(-2LL);
- SetThreadAffinityMask(current_thread, target_affinity);
+ SetThreadAffinityMask(current_thread, counter_affinity);
SetThreadPriority(current_thread, THREAD_PRIORITY_HIGHEST); // decrease chance of being rescheduled
SetThreadPriorityBoost(current_thread, TRUE); // disable dynamic boosts
while (!state.start_test.load(std::memory_order_acquire)) {}
while (!state.test_done.load(std::memory_order_relaxed)) {
- const u64 current = state.counter; // to silence warnings about incrementing volatile stuff
+ const timer::timer_tick_t current = state.counter; // to silence warnings about incrementing volatile stuff
state.counter = current + 1; // better than calling incq in inline assembly, standard increment forces the correct cache behavior we want
-
- #if (GCC || CLANG)
- // prevents aggressive loop unrolling/batching of volatile stores
- __asm__ volatile("" ::: "memory");
- #endif
+ std::atomic_signal_fence(std::memory_order_seq_cst);
}
};
@@ -6020,7 +5882,7 @@ struct VM {
const DWORD old_process_priority = GetPriorityClass(current_process);
SetPriorityClass(current_process, ABOVE_NORMAL_PRIORITY_CLASS); // ABOVE_NORMAL_PRIORITY_CLASS + THREAD_PRIORITY_HIGHEST = 12 base priority
SetThreadPriority(current_thread, THREAD_PRIORITY_HIGHEST);
- SetThreadPriorityBoost(current_thread, TRUE); // disable dynamic boosts
+ SetThreadPriorityBoost(current_thread, TRUE); // disable dynamic thread priority adjustments by Windows, not turbo boosts by the hardware itself
// important so that hypervisor can't predict how many samples we will collect
// stack-only / ASLR-derived component (no APIs, no rdtsc)
@@ -6052,144 +5914,149 @@ struct VM {
std::mt19937 gen(seq);
std::uniform_int_distribution batch_dist(30000, 70000);
const size_t BATCH_SIZE = batch_dist(gen);
- size_t valid = 0; // end of setup phase
SleepEx(0, FALSE); // try to get fresh quantum before starting warm-up phase, give time to kernel to setup priorities
- std::vector vm_samples(BATCH_SIZE), ref_samples(BATCH_SIZE); // pre page-fault MMU, wwe wont warm-up cpuid samples for the P-states intentionally
- VirtualLock(vm_samples.data(), BATCH_SIZE * sizeof(u64)); // lock the memory for the samples to prevent page faults if permissions are enough
- VirtualLock(ref_samples.data(), BATCH_SIZE * sizeof(u64));
+ std::vector vm_samples(BATCH_SIZE), ref_samples(BATCH_SIZE); // pre page-fault MMU, we intentionally won't warm up cpuid samples for the P-states
+ VirtualLock(vm_samples.data(), BATCH_SIZE * sizeof(timer::timer_tick_t)); // lock the memory for the samples to prevent page faults if permissions are enough
+ VirtualLock(ref_samples.data(), BATCH_SIZE * sizeof(timer::timer_tick_t));
#define LFENCE_8 _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence();
state.start_test.store(true, std::memory_order_release); // _mm_pause can be vm-exited conditionally, spam hit L3
- // warm-up to settle caches and scheduler, P-states are already not enforced with the SetThreadPriorityBoost call of before
- for (int i = 0; i < 1000; ++i) {
- // serialize is a good candidate as it's the closest architectural match to CPUID's pipeline-stall behavior AND can't be intercepted in VCMB/VMCS
- // in AMD the serialize intrinsic triggers a illegal instruction exception, so the closest AMD native substitute that is not one of the standard direct instruction exit controls is LFENCE
- // when AMD has configured it to be dispatch-serializing via MSR C001_1029[1]=1 (or when LFenceAlwaysSerializing is set)
- if (is_intel) _serialize();
- else LFENCE_8
- timer::trigger_vmexit();
- }
+ // cache and cpu scheduler warm-up won't affect anything in the measurement loop, so ramp up frequency/P-states to a high non-AVX Turbo/P-state without vmexits
+ u64 val = static_cast(seed) ^ 0x5a5a5a5a5a5a5a5aULL;
+
+ for (u32 i = 0; i < 12'000'000; ++i) {
+ val = (val ^ i) * 6364136223846793005ULL + 1442695040888963407ULL;
+ }
+
+ volatile u64 compiler_sink = val;
+ VMAWARE_UNUSED(compiler_sink);
+
+ // independent multi-trial state initialization
+ timer::timer_tick_t best_cpuid_l = (std::numeric_limits::max)();
+ timer::timer_tick_t best_ref_l = (std::numeric_limits::max)();
+ constexpr int TRIALS = 3;
+
+ for (int trial = 0; trial < TRIALS; ++trial) {
+ size_t valid = 0; // end of setup phase
+
+ // inside the timing windows, there must be zero memory output (no stack arrays can be written to), zero conditional branches and zero stack spilling (no register push/pops)
+ if (is_intel) {
+ while (valid < BATCH_SIZE) {
+ // cpuid and serialize/lfence are interleaved so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc. affects both sample sets equally
+ timer::timer_tick_t r_pre, r_post, v_pre, v_post, sync;
+
+ // this is done as a counter to both legitimate and malicious hypervisors interrupts that may pause the counter thread while we measure
+ sync = state.counter;
+ while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled)
+
+ // SERIALIZE/LFENCE check is before CPUID on purpose, so that possible pauses when cpuid is executed do not affect SERIALIZE/LFENCE too. The hv needs to wait for cpuid to pause the thread
+ // the amount of instructions (8 in case of LFENCE) are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race so that the counter thread sees an increment
+ sync = state.counter;
+ while (state.counter == sync); // fastest busy-waiting strategy, PAUSE affects cache, calling APIs like SwitchToThread() would be even worse
+ r_pre = state.counter;
+ std::atomic_signal_fence(std::memory_order_seq_cst); // ensure compiler-level ordering
+ _serialize();
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ r_post = state.counter;
+
+ sync = state.counter;
+ while (state.counter == sync); // sync to our counter tick again
+ sync = state.counter;
+ while (state.counter == sync); // and again
+
+ v_pre = state.counter;
+ std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences
+
+ // the only way a legitimate interrupt can make the check false flag is if most of the samples were contaminated just in the cpuid samples but not in the serialize/lfence samples
+ // still possible though, but it's as accurate as we can get in user-mode without relying on any other hardware clock or cross-referencing with the counter thread mid-execution
+ // this is why the score of this technique is not enough to determine a VM
+ timer::vmexit();
+
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ v_post = state.counter;
+
+ // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use TSC or any other clock
+ if (v_post > v_pre && r_post > r_pre) {
+ vm_samples[valid] = v_post - v_pre;
+ ref_samples[valid] = r_post - r_pre;
+ valid++;
+ }
- // inside the timing windows, there must be zero memory output (no stack arrays can be written to), zero conditional branches and zero stack spilling (no register push/pops)
- if (is_intel) {
- while (valid < BATCH_SIZE) {
- // cpuid and serialize/lfence interpolated so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally
- u64 r_pre, r_post, v_pre, v_post, sync;
-
- // this is done as a counter to both legitimate and malicious hypervisors interrupts that may pause the counter thread while we measure
- sync = state.counter;
- while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled)
-
- // SERIALIZE/LFENCE check is before CPUID on purpose, so that possible pauses when cpuid is executed do not affect SERIALIZE/LFENCE too. The hv needs to wait for cpuid to pause the thread
- // the amount of instructions (8 in case of LFENCE) are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race so that the counter thread sees an increment
- sync = state.counter;
- while (state.counter == sync); // fastest busy-waiting strategy, PAUSE affects cache, calling APIs like SwitchToThread() would be even worse
- r_pre = state.counter;
- std::atomic_signal_fence(std::memory_order_seq_cst); // ensure compiler-level ordering
- _serialize();
- std::atomic_signal_fence(std::memory_order_seq_cst);
- r_post = state.counter;
-
- sync = state.counter;
- while (state.counter == sync); // sync to our counter tick again
- sync = state.counter;
- while (state.counter == sync); // and again
-
- v_pre = state.counter;
- std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences
-
- // the only way a legitimate interrupt can make the check false flag is if most of the samples were contaminated just in the cpuid samples but not in the serialize/lfence samples
- // still possible tho, but it's as accurate we can get on user-mode without relying on any other hardware clock or cross-referencing with the counter thread mid-execution
- // this is why the score of this technique is not enough to determine a VM
- timer::trigger_vmexit();
-
- std::atomic_signal_fence(std::memory_order_seq_cst);
- v_post = state.counter;
-
- // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use TSC or any other clock
- if (v_post > v_pre && r_post > r_pre) {
- vm_samples[valid] = v_post - v_pre;
- ref_samples[valid] = r_post - r_pre;
- valid++;
+ // burn cycles executing a random number of instructions in each loop iteration, so that the hypervisor doesn't know when to pause the counter thread
+ timer::burn_random_cycles(ct_seed, v_post, r_post);
}
-
- // burn cycles executing a random number of instructions in each loop iteration, so that the hypervisor doesn't know when to pause the counter thread
- timer::burn_random_cycles(ct_seed, v_post, r_post);
-
- #if (CLANG || GCC)
- __asm__ volatile("" :: "r"(x) : "memory");
- #endif
}
- }
- else {
- while (valid < BATCH_SIZE) {
- // cpuid and serialize/lfence interpolated so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally
- u64 r_pre, r_post, v_pre, v_post, sync;
-
- // this is done as a counter to both legitimate and malicious hypervisors interrupts that may pause the counter thread while we measure
- sync = state.counter;
- while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled)
-
- // SERIALIZE/LFENCE check is before CPUID on purpose, so that possible pauses when cpuid is executed do not affect SERIALIZE/LFENCE too. The hv needs to wait for cpuid to pause the thread
- // the amount of instructions (8 in case of LFENCE) are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race so that the counter thread sees an increment
- sync = state.counter;
- while (state.counter == sync);
- r_pre = state.counter;
- std::atomic_signal_fence(std::memory_order_seq_cst);
- LFENCE_8
- std::atomic_signal_fence(std::memory_order_seq_cst);
- r_post = state.counter;
-
- sync = state.counter;
- while (state.counter == sync); // sync to our counter tick again
- sync = state.counter;
- while (state.counter == sync); // and again
-
- v_pre = state.counter;
- std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences
-
- // the only way a legitimate interrupt can make the check false flag is if most of the samples were contaminated just in the cpuid samples but not in the serialize/lfence samples
- // still possible tho, but it's as accurate we can get on user-mode without relying on any other hardware clock or cross-referencing with the counter thread mid-execution
- // this is why the score of this technique is not enough to determine a VM
- timer::trigger_vmexit();
-
- std::atomic_signal_fence(std::memory_order_seq_cst);
- v_post = state.counter;
-
- // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use TSC or any other clock
- if (v_post > v_pre && r_post > r_pre) {
- vm_samples[valid] = v_post - v_pre;
- ref_samples[valid] = r_post - r_pre;
- valid++;
+ else {
+ while (valid < BATCH_SIZE) {
+ // cpuid and serialize/lfence are interleaved so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc. affects both sample sets equally
+ timer::timer_tick_t r_pre, r_post, v_pre, v_post, sync;
+
+ // this is done as a counter to both legitimate and malicious hypervisors interrupts that may pause the counter thread while we measure
+ sync = state.counter;
+ while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled)
+
+ // SERIALIZE/LFENCE check is before CPUID on purpose, so that possible pauses when cpuid is executed do not affect SERIALIZE/LFENCE too. The hv needs to wait for cpuid to pause the thread
+ // the amount of instructions (8 in case of LFENCE) are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race so that the counter thread sees an increment
+ sync = state.counter;
+ while (state.counter == sync);
+ r_pre = state.counter;
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ LFENCE_8
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ r_post = state.counter;
+
+ sync = state.counter;
+ while (state.counter == sync); // sync to our counter tick again
+ sync = state.counter;
+ while (state.counter == sync); // and again
+
+ v_pre = state.counter;
+ std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences
+
+ // the only way a legitimate interrupt can make the check false flag is if most of the samples were contaminated just in the cpuid samples but not in the serialize/lfence samples
+ // still possible though, but it's as accurate as we can get in user-mode without relying on any other hardware clock or cross-referencing with the counter thread mid-execution
+ // this is why the score of this technique is not enough to determine a VM
+ timer::vmexit();
+
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ v_post = state.counter;
+
+ // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use TSC or any other clock
+ if (v_post > v_pre && r_post > r_pre) {
+ vm_samples[valid] = v_post - v_pre;
+ ref_samples[valid] = r_post - r_pre;
+ valid++;
+ }
+
+ // burn cycles executing a random number of instructions in each loop iteration, so that the hypervisor doesn't know when to pause the counter thread
+ timer::burn_random_cycles(ct_seed, v_post, r_post);
}
+ }
- // burn cycles executing a random number of instructions in each loop iteration, so that the hypervisor doesn't know when to pause the counter thread
- timer::burn_random_cycles(ct_seed, v_post, r_post);
+ const timer::timer_tick_t cpuid_l = timer::calculate_latency(vm_samples); // check for lowest dense cluster with no interrupt spikes, filter noise we can't detect (SMIs, NMIs, etc)
+ const timer::timer_tick_t ref_l = timer::calculate_latency(ref_samples);
- #if (CLANG || GCC)
- __asm__ volatile("" :: "r"(x) : "memory");
- #endif
- }
+ // record the cleanest/lowest latency observed across the independent trials
+ if (cpuid_l < best_cpuid_l) best_cpuid_l = cpuid_l;
+ if (ref_l < best_ref_l) best_ref_l = ref_l;
}
state.test_done.store(true, std::memory_order_release);
- const u64 cpuid_l = timer::calculate_latency(vm_samples); // check for lowest dense cluster with no interrupt spikes, filter noise we can't detect (SMIs, NMIs, etc)
- const u64 ref_l = timer::calculate_latency(ref_samples);
- const double latency_ratio = ref_l ? (double)cpuid_l / (double)ref_l : 0;
+ const double latency_ratio = best_ref_l ? (double)best_cpuid_l / (double)best_ref_l : 0;
// VMM = Time spent in hypervisor; nVMM = Time spent in baremetal
- debug("TIMER: VMM -> ", cpuid_l, " | nVMM -> ", ref_l, " | Ratio -> ", latency_ratio); // those are NOT cycles
+ debug("TIMER: VMM -> ", best_cpuid_l, " | nVMM -> ", best_ref_l, " | Ratio -> ", latency_ratio); // these ARE NOT cycles
if (latency_ratio >= threshold) hypervisor_detected = true;
// Detect IPI-based counter pausing bypasses
// For the median itself to exceed baremetal limits (which rarely pass 1000), an interrupt must be occurring on almost EVERY single loop iteration
- // This is the footprint of a hypervisor continuously spamming cross-core IPIs to try and pause the counter thread (or the trigger_thread to make SERIALIZE/LFENCE take a lot of time)
- if (cpuid_l > 1000 || ref_l > 1000 || cpuid_l == 1 || ref_l == 1) {
- debug("TIMER: Detected artificial interrupt delivery to VMAware's threads");
+ // This is the footprint of a hypervisor continuously spamming cross-core IPIs to try and pause our threads
+ if (best_cpuid_l > 1000 || best_ref_l > 1000 || best_cpuid_l == 1 || best_ref_l == 1) {
+ debug("TIMER: Detected artificial interrupt delivery to timing threads");
hypervisor_detected = true;
}
@@ -6198,8 +6065,8 @@ struct VM {
SetThreadPriority(current_thread, old_thread_priority);
SetPriorityClass(current_process, old_process_priority);
SetThreadAffinityMask(current_thread, old_affinity);
- VirtualUnlock(vm_samples.data(), BATCH_SIZE * sizeof(u64));
- VirtualUnlock(ref_samples.data(), BATCH_SIZE * sizeof(u64));
+ VirtualUnlock(vm_samples.data(), BATCH_SIZE * sizeof(timer::timer_tick_t));
+ VirtualUnlock(ref_samples.data(), BATCH_SIZE * sizeof(timer::timer_tick_t));
};
std::thread t1(counter_thread);
@@ -6391,8 +6258,8 @@ struct VM {
const struct ifreq* end = it + (ifc.ifc_len / sizeof(struct ifreq));
for (; it != end; ++it) {
- std::size_t const name_len = std::min(sizeof(ifr.ifr_name) - 1, strlen(it->ifr_name));
- std::memcpy(ifr.ifr_name, it->ifr_name, name_len);
+ std::size_t const name_len = std::min(sizeof(ifr.ifr_name) - 1, strnlen(it->ifr_name, sizeof(it->ifr_name)));
+ memcpy(ifr.ifr_name, it->ifr_name, name_len);
*(ifr.ifr_name + name_len) = '\0';
if (ioctl(sockGuard.get(), SIOCGIFFLAGS, &ifr) != 0) {
@@ -6408,7 +6275,7 @@ struct VM {
}
if (success) {
- std::memcpy(mac, ifr.ifr_hwaddr.sa_data, 6);
+ memcpy(mac, ifr.ifr_hwaddr.sa_data, 6);
}
else {
debug("MAC: ", "not successful");
@@ -8436,16 +8303,21 @@ struct VM {
void* functions[1] = { nullptr };
util::get_function_address(ntdll, function_names, functions, 1);
- using NtQuerySysInfo_t = NTSTATUS(__stdcall*)(SYSTEM_INFORMATION_CLASS, PVOID, ULONG, PULONG);
- NtQuerySysInfo_t nt_query = reinterpret_cast(functions[0]);
+ using nt_query_sysinfo_t = NTSTATUS(__stdcall*)(SYSTEM_INFORMATION_CLASS, PVOID, ULONG, PULONG);
+ nt_query_sysinfo_t nt_query = reinterpret_cast(functions[0]);
if (!nt_query)
return false;
+ // parse header to locate the bitmap
+ struct boot_logo_info { ULONG flags, bitmap_offset; };
+
// determine required buffer size
const SYSTEM_INFORMATION_CLASS sys_boot_info = static_cast(140);
ULONG needed = 0;
NTSTATUS st = nt_query(sys_boot_info, nullptr, 0, &needed);
- if (st != static_cast(0xC0000023) && st != static_cast(0x80000005) && st != static_cast(0xC0000004))
+ if (st != static_cast(0xC0000023) &&
+ st != static_cast(0x80000005) &&
+ st != static_cast(0xC0000004))
return false;
std::vector buffer(needed);
@@ -8455,10 +8327,13 @@ struct VM {
if (!NT_SUCCESS(st))
return false;
- // parse header to locate the bitmap
- struct boot_logo_info { ULONG flags, bitmap_offset; };
- const auto* info = reinterpret_cast(buffer.data());
- if (info->bitmap_offset >= needed) return false;
+ if (needed < sizeof(boot_logo_info))
+ return false;
+
+ const auto* info = reinterpret_cast(buffer.data());
+ if (info->bitmap_offset >= needed)
+ return false;
+
const u8* bmp = buffer.data() + info->bitmap_offset;
const size_t size = static_cast(needed) - info->bitmap_offset;
#else
@@ -8571,27 +8446,14 @@ struct VM {
// helper to detect QEMU instances based on default hard drive serial patterns
// QEMU drives often start with "QM000" followed by digits
- auto is_qemu_serial = [](const char* str) noexcept -> bool {
- if (!str) {
- return false;
- }
- for (int i = 0; i < 6; ++i) {
- if (str[i] == '\0') {
- return false;
- }
- }
-
- if ((str[0] & 0xDF) != 'Q') {
+ auto is_qemu_serial = [](const char* str, size_t len) noexcept -> bool {
+ if (!str || len < 6) {
return false;
}
- if ((str[1] & 0xDF) != 'M') {
- return false;
- }
+ if ((str[0] & 0xDF) != 'Q') return false;
+ if ((str[1] & 0xDF) != 'M') return false;
- // we check byte-by-byte to be safe regarding alignment,
- // though a 32-bit integer check (0x30303030) could be used if alignment is guaranteed
- // we also essentially check for null termination safety here because '\0' != '0'
return str[2] == '0' && str[3] == '0' && str[4] == '0' && str[5] == '0';
};
@@ -8792,7 +8654,7 @@ struct VM {
debug("DISK_SERIAL: ", serial);
// Check the retrieved serial number against known VM artifacts
- if (is_qemu_serial(serial) || is_vbox_serial(serial, serialLen)) {
+ if (is_qemu_serial(serial, serialLen) || is_vbox_serial(serial, serialLen)) {
if (allocated_buffer) {
PVOID free_base = reinterpret_cast(allocated_buffer);
SIZE_T free_size = 0;
@@ -8859,7 +8721,7 @@ struct VM {
}
debug("DISK_SERIAL: ", (const char*)serial);
- if (is_qemu_serial(serial) || is_vbox_serial(serial, rsize)) {
+ if (is_qemu_serial(serial, static_cast(rsize)) || is_vbox_serial(serial, static_cast(rsize))) {
result = true;
}
}
@@ -8913,7 +8775,7 @@ struct VM {
*/
[[nodiscard]] static bool hwmodel() {
- //hw.model strings are short (like for example MacBookPro16,1), 128 bytes is plenty
+ // hw.model strings are short (like for example MacBookPro16,1), 128 bytes is plenty
char buffer[128] = { 0 };
size_t size = sizeof(buffer);
@@ -8924,6 +8786,8 @@ struct VM {
return false;
}
+ buffer[127] = '\0';
+
// sysctlbyname returns the raw value (usually without a trailing newline),
// so no trimming is required
debug("HWMODEL: ", "output = ", buffer);
@@ -12411,165 +12275,110 @@ struct VM {
return false;
}
- // Surface Pro models typically do not have PIT, some devices might have it but not expose it due to firmware bugs (i.e. Lenovo 83AG)
- {
- const char* manufacturer = nullptr;
- const char* model = nullptr;
- if (util::get_manufacturer_model(&manufacturer, &model)) {
- auto ci_contains = [](const char* hay, const char* needle) noexcept -> bool {
- if (!hay || !needle || !*hay || !*needle) return false;
-
- const unsigned char* h =
- reinterpret_cast(hay);
- const unsigned char* n =
- reinterpret_cast(needle);
-
- for (; *h; ++h) {
- size_t i = 0;
- for (;; ++i) {
- unsigned char hc = h[i];
- unsigned char nc = n[i];
-
- if (!nc) return true;
- if (!hc) break;
-
- if (hc >= 'A' && hc <= 'Z') hc += 32;
- if (nc >= 'A' && nc <= 'Z') nc += 32;
-
- if (hc != nc) break;
- }
- }
- return false;
- };
-
- const bool model_has_surface = ci_contains(model, "surface");
- const bool model_has_pro = ci_contains(model, "pro");
- const bool man_is_microsoft = ci_contains(manufacturer, "microsoft");
+ // Surface Pro models typically do not have a PIT; some devices might have it but not expose it due to firmware bugs (e.g. Lenovo 83AG)
+ const char* manufacturer = nullptr;
+ const char* model = nullptr;
- if (model_has_surface && (model_has_pro || man_is_microsoft)) {
- return false;
- }
- }
- }
+ if (util::get_manufacturer_model(&manufacturer, &model)) {
+ auto ci_contains = [](const char* hay, const char* needle) noexcept -> bool {
+ if (!hay || !needle || !*hay || !*needle) return false;
- // The RTC (ACPI/CMOS RTC) timer can't be always detected via SetupAPI, it needs AML decode of the DSDT firmware table
- // The HPET (PNP0103) timer presence check was removed, more info at: https://github.com/kernelwernel/VMAware/pull/616
- // Here, we check for the PIT/AT timer (PC-class System Timer)
- constexpr wchar_t pattern[] = L"pnp0100";
- constexpr size_t patLen = (sizeof(pattern) / sizeof(wchar_t)) - 1;
-
- auto wcsstr_ci_ascii = [&](const wchar_t* hay) noexcept -> const wchar_t* {
- if (!hay) return nullptr;
+ for (const char* h = hay; *h; ++h) {
+ const char* a = h;
+ const char* b = needle;
- for (; *hay; ++hay) {
- wchar_t h = *hay;
- if (h >= L'A' && h <= L'Z') h += 32;
+ while (*a && *b) {
+ unsigned char ca = static_cast(*a);
+ unsigned char cb = static_cast(*b);
- if (h != pattern[0]) continue;
+ if (ca >= 'A' && ca <= 'Z') ca += 32;
+ if (cb >= 'A' && cb <= 'Z') cb += 32;
- size_t i = 1;
- for (; i < patLen; ++i) {
- wchar_t next_h = hay[i];
+ if (ca != cb) break;
+ ++a;
+ ++b;
+ }
- if (next_h == L'\0') return nullptr;
+ if (!*b)
+ return true;
+ }
- if (next_h >= L'A' && next_h <= L'Z') next_h += 32;
+ return false;
+ };
- if (next_h != pattern[i]) break;
- }
+ const bool is_surface_pro = ci_contains(model, "surface pro");
+ const bool is_microsoft = ci_contains(manufacturer, "microsoft");
- if (i == patLen) return hay;
+ if (is_surface_pro && is_microsoft) {
+ return false;
}
- return nullptr;
- };
-
- const HDEVINFO devs =
- SetupDiGetClassDevsW(nullptr, nullptr, nullptr,
- DIGCF_PRESENT | DIGCF_ALLCLASSES);
+ }
+
+ // The RTC (ACPI/CMOS RTC) timer can't be always detected via SetupAPI, it needs AML decode of the DSDT firmware table
+ // The HPET (PNP0103) timer presence check was removed, more info at: https://github.com/kernelwernel/VMAware/pull/616
+ // Here, we check for the PIT/AT timer (PC-class System Timer)
+ const HDEVINFO devs = SetupDiGetClassDevsW(
+ nullptr, nullptr, nullptr, DIGCF_PRESENT | DIGCF_ALLCLASSES);
if (devs == INVALID_HANDLE_VALUE)
return false;
SP_DEVINFO_DATA dev_info{};
- dev_info.cbSize = sizeof(SP_DEVINFO_DATA);
-
- DWORD alloc_size = 4096 + 4;
- BYTE* buffer = static_cast(malloc(alloc_size));
-
- if (!buffer) {
- SetupDiDestroyDeviceInfoList(devs);
- return false;
- }
+ dev_info.cbSize = sizeof(dev_info);
+ BYTE* buffer = nullptr;
+ DWORD buffer_size = 0;
bool found = false;
- for (DWORD idx = 0; SetupDiEnumDeviceInfo(devs, idx, &dev_info); ++idx) {
- DWORD property_type = 0;
- DWORD required = 0;
+ for (DWORD i = 0; SetupDiEnumDeviceInfo(devs, i, &dev_info); ++i) {
+ DWORD type = 0;
+ DWORD needed = 0;
- if (!SetupDiGetDeviceRegistryPropertyW(
- devs,
- &dev_info,
- SPDRP_HARDWAREID,
- &property_type,
- buffer,
- alloc_size > 4 ? alloc_size - 4 : 0,
- &required))
+ if (SetupDiGetDeviceRegistryPropertyW(
+ devs, &dev_info, SPDRP_HARDWAREID,
+ &type, nullptr, 0, &needed))
{
- const DWORD err = GetLastError();
-
- if (err == ERROR_INSUFFICIENT_BUFFER) {
- const DWORD needed_size = required + 4;
-
- if (needed_size > alloc_size) {
- BYTE* new_buffer =
- static_cast(realloc(buffer, needed_size));
+ continue;
+ }
- if (!new_buffer) {
- found = false;
- break;
- }
+ if (GetLastError() != ERROR_INSUFFICIENT_BUFFER || needed == 0)
+ continue;
- buffer = new_buffer;
- alloc_size = needed_size;
- }
+ if (needed + sizeof(wchar_t) > buffer_size) { // always keep room for the extra L'\0' appended after the read, even when a previously allocated buffer is reused
+ BYTE* new_buffer = static_cast(
+ realloc(buffer, needed + sizeof(wchar_t)));
- if (!SetupDiGetDeviceRegistryPropertyW(
- devs,
- &dev_info,
- SPDRP_HARDWAREID,
- &property_type,
- buffer,
- alloc_size > 4 ? alloc_size - 4 : 0,
- &required))
- {
- continue;
- }
- }
- else {
- continue;
+ if (!new_buffer) {
+ free(buffer);
+ SetupDiDestroyDeviceInfoList(devs);
+ return false;
}
+
+ buffer = new_buffer;
+ buffer_size = needed + sizeof(wchar_t);
}
- if (property_type != REG_MULTI_SZ)
+ if (!SetupDiGetDeviceRegistryPropertyW(
+ devs, &dev_info, SPDRP_HARDWAREID,
+ &type, buffer, buffer_size, &needed))
+ {
continue;
-
- if (required + 4 <= alloc_size) {
- buffer[required + 0] = 0;
- buffer[required + 1] = 0;
- buffer[required + 2] = 0;
- buffer[required + 3] = 0;
}
- wchar_t* cur = reinterpret_cast(buffer);
+ if (type != REG_MULTI_SZ)
+ continue;
+
+ reinterpret_cast(buffer)[needed / sizeof(wchar_t)] = L'\0';
- while (*cur) {
- if (wcsstr_ci_ascii(cur)) {
+ for (const wchar_t* s = reinterpret_cast(buffer); *s;
+ s += wcslen(s) + 1)
+ {
+ if (_wcsicmp(s, L"ACPI\\PNP0100") == 0 ||
+ _wcsicmp(s, L"PNP0100") == 0)
+ {
found = true;
break;
}
-
- cur += wcslen(cur) + 1;
}
if (found)
@@ -12578,7 +12387,6 @@ struct VM {
free(buffer);
SetupDiDestroyDeviceInfoList(devs);
-
return !found;
#endif
}
@@ -13487,7 +13295,7 @@ struct VM {
}
else {
debug("SVM_EXCEPTIONS: Detected SVM hypervisor hiding CPU capabilities");
- core::add(brand_enum::NULL_BRAND, 150);
+ return core::add(brand_enum::NULL_BRAND, 150);
}
return true;
@@ -14773,8 +14581,8 @@ std::array VM::core::technique_table = [
// START OF TECHNIQUE TABLE
#if (WINDOWS)
{VM::TRAP, {100, VM::trap}},
- {VM::KVM_INTERCEPTION, {100, VM::kvm_interception}},
- {VM::SVM_EXCEPTIONS, {100, VM::svm_exceptions}},
+ {VM::KVM_INTERCEPTION, {150, VM::kvm_interception}},
+ {VM::SVM_EXCEPTIONS, {150, VM::svm_exceptions}},
{VM::INTERRUPT_SHADOW, {100, VM::interrupt_shadow}},
{VM::EIP_OVERFLOW, {100, VM::eip_overflow}},
{VM::HYPERVISOR_HOOK, {100, VM::hypervisor_hook}},