diff --git a/README.md b/README.md index 549321ec..e167f82e 100755 --- a/README.md +++ b/README.md @@ -246,6 +246,7 @@ VMAware also has support for a variety of languages, if C++ isn't the language y +
What about using this for malware?
@@ -256,6 +257,26 @@ VMAware also has support for a variety of languages, if C++ isn't the language y
+ +
+Is a kernel-mode component planned to be developed? +
+ +> No. A kernel-mode component would require serious auditing and a digitally signed driver. It would also be a dead end for VM bypassing (so it's not fun >:( ). +> +> In summary, we can still detect your ass while staying completely in user mode. + +
+ + +
+Is it thread-safe? +
+ +> No. Don't call our library from multiple threads simultaneously; it doesn't take more than 1s to run, so there's no need to. + +
+
I have linker errors when compiling
diff --git a/src/vmaware.hpp b/src/vmaware.hpp index c5cff6c1..9d0d822b 100644 --- a/src/vmaware.hpp +++ b/src/vmaware.hpp @@ -982,14 +982,14 @@ struct VM { return "Unknown"; } - alignas(16) char buffer[49]{}; - u32* regs = reinterpret_cast(buffer); + u32 regs[12] = { 0 }; - // unrolled calls to fill buffer directly cpu::cpuid(regs[0], regs[1], regs[2], regs[3], cpu::leaf::brand1); cpu::cpuid(regs[4], regs[5], regs[6], regs[7], cpu::leaf::brand2); cpu::cpuid(regs[8], regs[9], regs[10], regs[11], cpu::leaf::brand3); + static char buffer[49]; + memcpy(buffer, regs, sizeof(regs)); buffer[48] = '\0'; // do NOT touch trailing spaces for the AMD_THREAD_MISMATCH technique @@ -1010,31 +1010,21 @@ struct VM { [[nodiscard]] static std::string cpu_manufacturer(const u32 leaf_id) { - alignas(16) char buffer[13]{}; - u32* regs = reinterpret_cast(buffer); - - u32 eax = 0; - u32 ebx = 0; - u32 ecx = 0; - u32 edx = 0; - + u32 eax = 0, ebx = 0, ecx = 0, edx = 0; cpu::cpuid(eax, ebx, ecx, edx, leaf_id); - if (ebx == 0 && ecx == 0 && edx == 0) { - return ""; - } + if (ebx == 0 && ecx == 0 && edx == 0) return ""; + u32 regs[3] = { 0 }; if (leaf_id >= 0x40000000) { - regs[0] = ebx; - regs[1] = ecx; - regs[2] = edx; + regs[0] = ebx; regs[1] = ecx; regs[2] = edx; } else { - regs[0] = ebx; - regs[1] = edx; - regs[2] = ecx; + regs[0] = ebx; regs[1] = edx; regs[2] = ecx; } + char buffer[13]; + memcpy(buffer, regs, sizeof(regs)); buffer[12] = '\0'; return { buffer }; } @@ -3155,38 +3145,6 @@ struct VM { dest[i] = '\0'; } - static void str_cat(char* dest, const char* src, size_t max_len) { - size_t i = 0; - while (dest[i] != '\0') { - i++; - } - - size_t j = 0; - while (src[j] != '\0' && i < max_len - 1) { - dest[i++] = src[j++]; - } - dest[i] = '\0'; - } - - static bool str_eq(const char* a, const char* b) { - if (a == b) { - return true; - } - - if (!a || !b) { - return false; - } - - while (*a && *b) { - if (*a != *b) { - return false; - } - - a++; b++; - } - return *a == *b; - } - // 
memoization struct memo { struct data_t { @@ -3416,24 +3374,29 @@ struct VM { #if (WINDOWS) // timer helper functionalities struct timer { - #define VMAWARE_STR2(x) #x - #define VMAWARE_STR(x) VMAWARE_STR2(x) + #if (x86_64) + using timer_tick_t = u64; + #else + using timer_tick_t = u32; + #endif - // prevent false sharing when triggering hypervisor exits with the intentional data race condition #if (MSVC) #pragma warning(push) #pragma warning(disable: 4324) #endif + // align to prevent false sharing when triggering hypervisor exits with the intentional data race condition struct alignas(64) cache_state { - alignas(64) volatile u64 counter { 0 }; - alignas(64) std::atomic start_test{ false }; - alignas(64) std::atomic test_done{ false }; + alignas(64) volatile timer_tick_t counter { 0 }; + alignas(64) std::atomic start_test { false }; + alignas(64) std::atomic test_done { false }; }; #if (MSVC) #pragma warning(pop) #endif - static u32 get_ct_seed() { + #define VMAWARE_STR2(x) #x + #define VMAWARE_STR(x) VMAWARE_STR2(x) + [[nodiscard]] static u32 get_ct_seed() { constexpr char s[] = __DATE__ " " __TIME__ " " __FILE__ " " VMAWARE_STR(__LINE__); u32 h = 2166136261u; for (char c : s) { @@ -3444,8 +3407,28 @@ struct VM { return h; } - // middle available logical CPU - static DWORD_PTR get_trigger_mask() { + /* + Golden Rules (must happen ALWAYS; if they don't happen the check should be aborted): + 1. The check needs AT LEAST two different cores, so if one single core is detected, returns + 2. The counter thread should always be in the middle available logical CPU when there's more than 2 cores, and in the core 2 (1-indexed) when there's 2 cores + + Silver Rules (in order of priority): + 1. The trigger and the counter thread must not be in the same physical core (avoid SMT siblings) WHENEVER POSSIBLE + 2. The trigger and the counter thread should be within the same NUMA node or AMD CCD to minimize baseline latency WHENEVER POSSIBLE + 3. 
The counter and trigger thread should not be in the first or last logical CPU WHENEVER POSSIBLE + 4. If after reaching here, there are multiple valid candidates (core indexes), then randomize it to avoid hypervisors from predicting where the trigger thread is + + Example: Imagine we have a CPU with 2 cores and 4 threads (with SMT enabled), then: + 1. We process golden rules first: + - CPU has more than 1 core -> OK + - Counter thread is pinned to core 2 + + 2. We process the silver rules, in order: + - Counter and trigger thread must not be in the same physical core -> Trigger thread is pinned to core 4 + - Trigger and the counter thread should be within the same NUMA node -> FAILED + - Counter and trigger thread should not be in the first or last logical CPU -> FAILED, trigger thread had to be put in core 4 due to a silver rule with more priority + */ + [[nodiscard]] static DWORD_PTR getmask(u32 ct_seed, bool trigger) { const HANDLE current_process = reinterpret_cast(-1LL); DWORD_PTR proc_mask = 0, sys_mask = 0; @@ -3461,183 +3444,192 @@ struct VM { } } - if (n < 2) return 0ull; // single-core abort - - // (1-indexed) - // 2 cores -> trigger 1 - // 3 cores -> trigger 1 - // 4 cores -> trigger 1 - // 5 cores -> trigger 4 - // 6 cores -> trigger 5 - // >6 cores -> middle available logical CPU - size_t trigger_pos0 = 0; // 0-based ordinal + if (n < 2) { + return 0ull; + } - if (n == 2 || n == 3 || n == 4) { - trigger_pos0 = 0; + // first null buffer then use size + DWORD len = 0; + SetLastError(ERROR_SUCCESS); + GetLogicalProcessorInformationEx(RelationAll, nullptr, &len); + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER || !len) { + return 0ull; } - else if (n == 5) { - trigger_pos0 = 3; + + std::vector topo(len); + if (!GetLogicalProcessorInformationEx( + RelationAll, + reinterpret_cast(topo.data()), + &len)) { + return 0ull; } - else if (n == 6) { - trigger_pos0 = 4; + + constexpr DWORD INVALID_CPU = 0xFFFFFFFFu; + + DWORD logical_to_core[64]; + DWORD 
logical_to_numa[64]; + std::fill_n(logical_to_core, 64, INVALID_CPU); + std::fill_n(logical_to_numa, 64, INVALID_CPU); + + DWORD core_count = 0; + + size_t offset = 0; + while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX) <= len) { + auto* ptr = reinterpret_cast(topo.data() + offset); + + switch (ptr->Relationship) { + case RelationProcessorCore: { + const DWORD core_id = core_count++; + + for (DWORD g = 0; g < ptr->Processor.GroupCount; ++g) { + const KAFFINITY mask = ptr->Processor.GroupMask[g].Mask; + for (DWORD bit = 0; bit < 64; ++bit) { + if (mask & (1ull << bit)) { + logical_to_core[bit] = core_id; + } + } + } + break; + } + + case RelationNumaNode: { + const DWORD node_id = ptr->NumaNode.NodeNumber; + const KAFFINITY mask = ptr->NumaNode.GroupMask.Mask; + for (DWORD bit = 0; bit < 64; ++bit) { + if (mask & (1ull << bit)) { + logical_to_numa[bit] = node_id; + } + } + break; + } + + default: + break; + } + + if (!ptr->Size) { + return 0ull; + } + offset += ptr->Size; } - else { - trigger_pos0 = n / 2; + + // abort if only one physical core exists in the allowed affinity set + { + bool seen_core[64]{}; + DWORD physical_cores = 0; + + for (DWORD i = 0; i < n; ++i) { + const DWORD log = idxs[i]; + const DWORD core = logical_to_core[log]; + if (core == INVALID_CPU) { + return 0ull; + } + if (!seen_core[core]) { + seen_core[core] = true; + ++physical_cores; + } + } + + if (physical_cores < 2) { + return 0ull; + } } - if (trigger_pos0 >= n) return 0ull; - return 1ull << idxs[trigger_pos0]; - } + // counter: middle available logical CPU when >2, otherwise second available logical CPU + const DWORD counter_pos0 = (n == 2) ? 
1u : (n / 2u); + if (counter_pos0 >= n) { + return 0ull; + } - // random logical CPU, but exclude the trigger_thread, first, second and last available logical CPUs, avoiding SMT siblings - static DWORD_PTR get_counter_mask(u32 ct_seed, DWORD_PTR trigger_mask) { - const HANDLE current_process = reinterpret_cast(-1LL); + const DWORD counter_logical = idxs[counter_pos0]; + const DWORD counter_core = logical_to_core[counter_logical]; + const DWORD counter_numa = logical_to_numa[counter_logical]; - DWORD_PTR proc_mask = 0, sys_mask = 0; - if (!GetProcessAffinityMask(current_process, &proc_mask, &sys_mask) || !proc_mask) { + if (counter_core == INVALID_CPU || counter_numa == INVALID_CPU) { return 0ull; } - DWORD idxs[64]{}; - DWORD n = 0; - for (DWORD i = 0; i < 64; ++i) { - if (proc_mask & (1ull << i)) { - idxs[n++] = i; - } + if (!trigger) { + return 1ull << counter_logical; } - if (n < 2) return 0ull; // single-core abort + auto is_edge = [&](DWORD logical) -> bool { + return logical == idxs[0] || logical == idxs[n - 1]; + }; - // get topology to identify SMT siblings - DWORD len = 0; - GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &len); + auto same_numa = [&](DWORD logical) -> bool { + return logical_to_numa[logical] != INVALID_CPU && logical_to_numa[logical] == counter_numa; + }; - // stack buffer fallback mechanism - BYTE stack_buf[1024]{}; - std::vector heap_buf; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = nullptr; + auto build_candidates = [&](bool require_same_numa, bool avoid_edges) { + std::vector out; + out.reserve(n); - if (len <= sizeof(stack_buf)) { - info = reinterpret_cast(stack_buf); - } - else { - heap_buf.resize(len); - info = reinterpret_cast(heap_buf.data()); - } + for (DWORD i = 0; i < n; ++i) { + const DWORD logical = idxs[i]; + if (logical == counter_logical) { + continue; + } - if (!GetLogicalProcessorInformationEx(RelationProcessorCore, info, &len)) { - return 0ull; // no valid topology data, fail closed - } + // never 
same physical core when possible + if (logical_to_core[logical] == counter_core) { + continue; + } - // logical processor index to its physical core ID - DWORD logical_to_core[64] = { 0 }; - DWORD_PTR core_mask[64] = { 0 }; - size_t offset = 0; - DWORD core_idx = 0; - while (offset < len) { - auto ptr = reinterpret_cast(reinterpret_cast(info) + offset); - for (DWORD i = 0; i < ptr->Processor.GroupCount; ++i) { - KAFFINITY mask = ptr->Processor.GroupMask[i].Mask; - for (int b = 0; b < 64; ++b) { - if (mask & (1ull << b)) { - logical_to_core[b] = core_idx; - core_mask[core_idx] |= (1ull << b); - } + if (avoid_edges && is_edge(logical)) { + continue; } + + if (require_same_numa && !same_numa(logical)) { + continue; + } + + out.push_back(logical); } - offset += ptr->Size; - core_idx++; - } - auto pick_by_ordinal = [&](size_t ord0) -> DWORD_PTR { - if (ord0 >= n) return 0ull; - return 1ull << idxs[ord0]; + return out; }; - DWORD_PTR choices = 0ull; - - // exact placement rules: - // 2 cores -> counter 2 - // 3 cores -> counter 3 - // 4 cores -> counter 3 - // 5 cores -> counter 2 - // 6 cores -> counter 3 - if (n == 2) { - choices = pick_by_ordinal(1); - } - else if (n == 3 || n == 4) { - choices = pick_by_ordinal(2); - } - else if (n == 5) { - choices = pick_by_ordinal(1); - } - else if (n == 6) { - choices = pick_by_ordinal(2); + // priority order + // 1) different physical core + same NUMA + not first/last + // 2) different physical core + same NUMA + // 3) different physical core + not first/last + // 4) different physical core + std::vector candidates = build_candidates(true, true); + if (candidates.empty()) candidates = build_candidates(true, false); + if (candidates.empty()) candidates = build_candidates(false, true); + if (candidates.empty()) candidates = build_candidates(false, false); + + if (candidates.empty()) { + return 0ull; } - else { - // > 6 cores: - // trigger in the middle available logical CPU - size_t trigger_pos0 = n / 2; - if (trigger_pos0 >= n) 
return 0ull; - - DWORD trigger_logical = idxs[trigger_pos0]; - DWORD trigger_core_id = logical_to_core[trigger_logical]; - - choices = proc_mask; - - // random so that the hypervisor doesn't know where the counter thread is - // this will affect latency if cache lines from trigger_thread and counter_thread are separated enough due to cores being too distant - // however, we do a ratio based detection, so this wont affect the detection accuracy because the cache latency affects both samples - choices &= ~trigger_mask; - - if (trigger_pos0 >= 1) - choices &= ~(1ull << idxs[trigger_pos0 - 1]); // adjacent left - if (trigger_pos0 + 1 < n) - choices &= ~(1ull << idxs[trigger_pos0 + 1]); // adjacent right - - choices &= ~(1ull << idxs[0]); // first core - choices &= ~(1ull << idxs[n - 1]); // last core - choices &= ~core_mask[trigger_core_id]; // avoid SMT siblings of the trigger core - - // if exclusions leave nothing, fail closed - if (!choices) return 0ull; - - DWORD pick[64]{}, m = 0; - for (DWORD i = 0; i < 64; ++i) { - if (choices & (1ull << i)) - pick[m++] = i; - } - if (!m) return 0ull; - - // random so that the hypervisor doesn't know where the counter thread is - // this will affect latency if cache lines from trigger_thread and counter_thread are separated enough due to cores being too distant - // however, we do a ratio based detection, so this wont affect the detection accuracy because the cache latency affects both samples - u64 seed = 0; - seed ^= static_cast(ct_seed); - seed ^= static_cast(reinterpret_cast(¤t_process)); - seed ^= static_cast(reinterpret_cast(&proc_mask)) << 1; - seed ^= static_cast(reinterpret_cast(&sys_mask)) << 2; - seed ^= seed >> 33; - seed *= 0xff51afd7ed558ccdULL; - seed ^= seed >> 33; - seed *= 0xc4ceb9fe1a85ec53ULL; - seed ^= seed >> 33; - std::seed_seq seq{ - static_cast(seed), static_cast(seed >> 32), static_cast(seed ^ 0x9e3779b9u), ct_seed - }; - // std::random_device{}() uses RDRAND/RDSEED which can be intercepted by 
hypervisors - // we use our own compile-time seed that cannot be taken by examining PE/Linux binary properties and would need static/dynamic analysis - // this changes per build and per process session due to hardware ASLR - std::mt19937 gen(seq); - return 1ull << pick[std::uniform_int_distribution(0, m - 1)(gen)]; - } + u64 seed = 0; + seed ^= static_cast(ct_seed); + seed ^= static_cast(reinterpret_cast(&proc_mask)); + seed ^= static_cast(reinterpret_cast(&sys_mask)) << 1; + seed ^= static_cast(counter_logical) << 2; + seed ^= static_cast(counter_core) << 3; + seed ^= seed >> 33; + seed *= 0xff51afd7ed558ccdULL; + seed ^= seed >> 33; + seed *= 0xc4ceb9fe1a85ec53ULL; + seed ^= seed >> 33; - return choices; + std::seed_seq seq{ + static_cast(seed), + static_cast(seed >> 32), + static_cast(seed ^ 0x9e3779b9u), + ct_seed + }; + + std::mt19937 gen(seq); + const DWORD logical = candidates[std::uniform_int_distribution(0, candidates.size() - 1)(gen)]; + return 1ull << logical; } // we dont use cpu::cpuid on purpose - static VMAWARE_FORCE_INLINE void trigger_vmexit() { + static VMAWARE_FORCE_INLINE void vmexit() { #if (GCC || CLANG) u32 a = 0, c = 0, d = 0; #if (x86_64) @@ -3665,157 +3657,36 @@ struct VM { #endif } - static u64 calculate_latency(const std::vector& samples_in) { + [[nodiscard]] static timer_tick_t calculate_latency(const std::vector& samples_in) { if (samples_in.empty()) return 0; const size_t N = samples_in.size(); if (N == 1) return samples_in[0]; - // local sorted copy - std::vector s = samples_in; - std::sort(s.begin(), s.end()); // ascending - - // tiny-sample short-circuits - if (N <= 4) return s.front(); - - // median (and works for sorted input) - auto median_of_sorted = [](const std::vector& v, size_t lo, size_t hi) -> u64 { - // this is the median of v[lo..hi-1], requires 0 <= lo < hi - const size_t len = hi - lo; - if (len == 0) return 0; - const size_t mid = lo + (len / 2); - if (len & 1) return v[mid]; - return (v[mid - 1] + v[mid]) / 2; - }; 
+ // create a local copy to sort + std::vector s = samples_in; + std::sort(s.begin(), s.end()); - // the robust center: median M and MAD -> approximate sigma - const u64 M = median_of_sorted(s, 0, s.size()); + // discard the lower 25% and upper 25%, leaving the middle 50% + const size_t low_idx = N / 4; + const size_t high_idx = (3 * N) / 4; - // select the median deviation in linear time instead of sorting all deviations - std::vector absdev; - absdev.resize(N); - for (size_t i = 0; i < N; ++i) { - const u64 d = (s[i] > M) ? (s[i] - M) : (M - s[i]); - absdev[i] = d; + double sum = 0; + size_t count = 0; + for (size_t i = low_idx; i < high_idx; ++i) { + sum += s[i]; + count++; } - const size_t mad_mid = N / 2; - std::nth_element(absdev.begin(), absdev.begin() + mad_mid, absdev.end()); - - u64 MAD = 0; - if (N & 1) { - MAD = absdev[mad_mid]; - } - else { - const u64 upper = absdev[mad_mid]; - const u64 lower = *std::max_element(absdev.begin(), absdev.begin() + mad_mid); - MAD = (lower + upper) / 2; - } - - // convert MAD to an approximate standard-deviation-like measure - constexpr long double kmad_to_sigma = 1.4826L; // consistent for normal approx - const long double sigma = (MAD == 0) ? 1.0L : (static_cast(MAD) * kmad_to_sigma); - - // find the densest small-valued cluster by sliding a fixed-count window - // this locates the most concentrated group of samples (likely it would be the true VMEXIT cluster) - // const size_t frac_win = (N * 8 + 99) / 100; // ceil(N * 0.08) - // const size_t win = std::min(N, std::max(MIN_WIN, frac_win)); - const size_t MIN_WIN = 10; - // manual min/max calculation for win size - const size_t calc_frac = static_cast(std::ceil(static_cast(N) * 0.08)); - const size_t inner_max = (MIN_WIN > calc_frac) ? MIN_WIN : calc_frac; - const size_t win = (N < inner_max) ? 
N : inner_max; - - size_t best_i = 0; - u64 best_span = (s.back() - s.front()) + 1; // large initial - for (size_t i = 0; i + win <= N; ++i) { - const u64 span = s[i + win - 1] - s[i]; - if (span < best_span) { - best_span = span; - best_i = i; - } - } - - // expand the initial window greedily while staying "tight" - // allow expansion while adding samples does not more than multiply the span by EXPAND_FACTOR - constexpr long double EXPAND_FACTOR = 1.5L; - size_t cluster_lo = best_i; - size_t cluster_hi = best_i + win; // exclusive - // expand left - while (cluster_lo > 0) { - const u64 new_span = s[cluster_hi - 1] - s[cluster_lo - 1]; - if (static_cast(new_span) <= EXPAND_FACTOR * static_cast(best_span) || - (s[cluster_hi - 1] <= (s[cluster_lo - 1] + static_cast(std::ceil(3.0L * sigma))))) { - --cluster_lo; - // manual min calculation - best_span = (best_span < new_span) ? best_span : new_span; - } - else break; - } - // expand right - while (cluster_hi < N) { - const u64 new_span = s[cluster_hi] - s[cluster_lo]; - if (static_cast(new_span) <= EXPAND_FACTOR * static_cast(best_span) || - (s[cluster_hi] <= (s[cluster_lo] + static_cast(std::ceil(3.0L * sigma))))) { - ++cluster_hi; - best_span = (best_span < new_span) ? best_span : new_span; - } - else break; - } - - const size_t cluster_size = (cluster_hi > cluster_lo) ? (cluster_hi - cluster_lo) : 0; - - // cluster must be reasonably dense and cover a non-negligible portion of samples, so this is pure sanity checks - const double fraction_in_cluster = static_cast(cluster_size) / static_cast(N); - - // min/max calculation for MIN_CLUSTER - const int val_n_50 = static_cast(N / 50); - const size_t val_max = static_cast((5 > val_n_50) ? 5 : val_n_50); - const size_t MIN_CLUSTER = (val_max < N) ? 
val_max : N; // at least 2% or 5 elements - - if (cluster_size < MIN_CLUSTER || fraction_in_cluster < 0.02) { - // low-percentile (10th) trimmed median - // Manual max calculation for fallback_count - const size_t floor_val = static_cast(std::floor(static_cast(N) * 0.10)); - const size_t fallback_count = (1 > floor_val) ? 1 : floor_val; - - // median of lowest fallback_count elements (if fallback_count==1 that's smallest) - if (fallback_count == 1) return s.front(); - const size_t mid = fallback_count / 2; - if (fallback_count & 1) return s[mid]; - return (s[mid - 1] + s[mid]) / 2; - } - - // now we try to get a robust estimate inside the cluster, trimmed mean (10% trim) centered on cluster - const size_t trim_count = static_cast(std::floor(static_cast(cluster_size) * 0.10)); - const size_t lo = cluster_lo + trim_count; - const size_t hi = cluster_hi - trim_count; // exclusive - if (hi <= lo) { - // degenerate -> median of cluster - return median_of_sorted(s, cluster_lo, cluster_hi); - } - - // sum with long double to avoid overflow and better rounding - long double sum = 0.0L; - for (size_t i = lo; i < hi; ++i) sum += static_cast(s[i]); - const long double avg = sum / static_cast(hi - lo); - u64 result = static_cast(std::llround(avg)); - - // final sanity adjustments: - // if the computed result is suspiciously far from the global median (e.g., > +6*sigma) - // clamp toward the median to avoid choosing a high noisy cluster by mistake - const long double diff_from_med = static_cast(result) - static_cast(M); - if (diff_from_med > 0 && diff_from_med > (6.0L * sigma)) { - // clamp to median + 4*sigma (conservative) - result = static_cast(std::llround(static_cast(M) + 4.0L * sigma)); - } - - // also, if result is zero (shouldn't be) or extremely small, return a smallest observed sample - if (result == 0) result = s.front(); + // fallback to the median if the dataset is too small + if (count == 0) return s[N / 2]; - return result; + // compute the average of the middle 
50% and round to the nearest integer + return static_cast((sum / count) + 0.5); } - static VMAWARE_FORCE_INLINE void burn_random_cycles(u64 ct_seed, u64 v_post, u64 r_post) { + static VMAWARE_FORCE_INLINE void burn_random_cycles(u32 ct_seed, timer_tick_t v_post, timer_tick_t r_post) { + // the internal pseudo-random number generator (PRNG) variables like u64 seed and volatile u64 x can be kept as u64 + // because they are simple register-only PRNG arithmetic and benefit from the extra 64-bit entropy space even on 32-bit platforms u64 seed = ct_seed; seed ^= static_cast(reinterpret_cast(&seed)); seed ^= static_cast(reinterpret_cast(&v_post)) << 1; @@ -3835,9 +3706,7 @@ struct VM { x ^= x >> 17; } - #if (CLANG || GCC) - __asm__ volatile("" :: "r"(x) : "memory"); - #endif + std::atomic_signal_fence(std::memory_order_acq_rel); } }; #endif @@ -5943,13 +5812,13 @@ struct VM { */ [[nodiscard]] static bool timer() { #if (x86 && WINDOWS) + using timer = struct timer; + if (util::is_running_under_translator()) { debug("TIMER: Running inside a binary translation layer"); return false; } - using timer = struct timer; - // calculation of minimum threshold bool is_intel = cpu::is_intel(); double threshold = 2.5; @@ -5962,39 +5831,32 @@ struct VM { } } - // Shared state and results + // shared state and results timer::cache_state state; bool hypervisor_detected = false; const u32 ct_seed = timer::get_ct_seed(); - const DWORD_PTR trigger_affinity = timer::get_trigger_mask(); - if (!trigger_affinity) { - return false; - } + const DWORD_PTR trigger_affinity = timer::getmask(ct_seed, true); + const DWORD_PTR counter_affinity = timer::getmask(ct_seed, false); - const DWORD_PTR target_affinity = timer::get_counter_mask(ct_seed, trigger_affinity); - if (!target_affinity) { + if (!trigger_affinity || !counter_affinity) { return false; } - // our software clock, it will count how many cycles a vmexit takes + // our software clock auto counter_thread = [&]() { const HANDLE current_thread = 
reinterpret_cast(-2LL); - SetThreadAffinityMask(current_thread, target_affinity); + SetThreadAffinityMask(current_thread, counter_affinity); SetThreadPriority(current_thread, THREAD_PRIORITY_HIGHEST); // decrease chance of being rescheduled SetThreadPriorityBoost(current_thread, TRUE); // disable dynamic boosts while (!state.start_test.load(std::memory_order_acquire)) {} while (!state.test_done.load(std::memory_order_relaxed)) { - const u64 current = state.counter; // to silence warnings about incrementing volatile stuff + const timer::timer_tick_t current = state.counter; // to silence warnings about incrementing volatile stuff state.counter = current + 1; // better than calling incq in inline assembly, standard increment forces the correct cache behavior we want - - #if (GCC || CLANG) - // prevents aggressive loop unrolling/batching of volatile stores - __asm__ volatile("" ::: "memory"); - #endif + std::atomic_signal_fence(std::memory_order_seq_cst); } }; @@ -6020,7 +5882,7 @@ struct VM { const DWORD old_process_priority = GetPriorityClass(current_process); SetPriorityClass(current_process, ABOVE_NORMAL_PRIORITY_CLASS); // ABOVE_NORMAL_PRIORITY_CLASS + THREAD_PRIORITY_HIGHEST = 12 base priority SetThreadPriority(current_thread, THREAD_PRIORITY_HIGHEST); - SetThreadPriorityBoost(current_thread, TRUE); // disable dynamic boosts + SetThreadPriorityBoost(current_thread, TRUE); // disable dynamic thread priority adjustments by Windows, not turbo boosts by the hardware itself // important so that hypervisor can't predict how many samples we will collect // stack-only / ASLR-derived component (no APIs, no rdtsc) @@ -6052,144 +5914,149 @@ struct VM { std::mt19937 gen(seq); std::uniform_int_distribution batch_dist(30000, 70000); const size_t BATCH_SIZE = batch_dist(gen); - size_t valid = 0; // end of setup phase SleepEx(0, FALSE); // try to get fresh quantum before starting warm-up phase, give time to kernel to setup priorities - std::vector vm_samples(BATCH_SIZE), 
ref_samples(BATCH_SIZE); // pre page-fault MMU, wwe wont warm-up cpuid samples for the P-states intentionally - VirtualLock(vm_samples.data(), BATCH_SIZE * sizeof(u64)); // lock the memory for the samples to prevent page faults if permissions are enough - VirtualLock(ref_samples.data(), BATCH_SIZE * sizeof(u64)); + std::vector vm_samples(BATCH_SIZE), ref_samples(BATCH_SIZE); // pre page-fault MMU, wwe wont warm-up cpuid samples for the P-states intentionally + VirtualLock(vm_samples.data(), BATCH_SIZE * sizeof(timer::timer_tick_t)); // lock the memory for the samples to prevent page faults if permissions are enough + VirtualLock(ref_samples.data(), BATCH_SIZE * sizeof(timer::timer_tick_t)); #define LFENCE_8 _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence(); state.start_test.store(true, std::memory_order_release); // _mm_pause can be vm-exited conditionally, spam hit L3 - // warm-up to settle caches and scheduler, P-states are already not enforced with the SetThreadPriorityBoost call of before - for (int i = 0; i < 1000; ++i) { - // serialize is a good candidate as it's the closest architectural match to CPUID's pipeline-stall behavior AND can't be intercepted in VCMB/VMCS - // in AMD the serialize intrinsic triggers a illegal instruction exception, so the closest AMD native substitute that is not one of the standard direct instruction exit controls is LFENCE - // when AMD has configured it to be dispatch-serializing via MSR C001_1029[1]=1 (or when LFenceAlwaysSerializing is set) - if (is_intel) _serialize(); - else LFENCE_8 - timer::trigger_vmexit(); - } + // cache and cpu scheduler warm-up won't affect anything in the measurement loop, so ramp up frequency/P-states to a high non-AVX Turbo/P-state without vmexits + u64 val = static_cast(seed) ^ 0x5a5a5a5a5a5a5a5aULL; + + for (u32 i = 0; i < 12'000'000; ++i) { + val = (val ^ i) * 6364136223846793005ULL + 1442695040888963407ULL; + } + + volatile u64 
compiler_sink = val; + VMAWARE_UNUSED(compiler_sink); + + // independent multi-trial state initialization + timer::timer_tick_t best_cpuid_l = (std::numeric_limits::max)(); + timer::timer_tick_t best_ref_l = (std::numeric_limits::max)(); + constexpr int TRIALS = 3; + + for (int trial = 0; trial < TRIALS; ++trial) { + size_t valid = 0; // end of setup phase + + // inside the timing windows, there must be zero memory output (no stack arrays can be written to), zero conditional branches and zero stack spilling (no register push/pops) + if (is_intel) { + while (valid < BATCH_SIZE) { + // cpuid and serialize/lfence interpolated so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally + timer::timer_tick_t r_pre, r_post, v_pre, v_post, sync; + + // this is done as a counter to both legitimate and malicious hypervisors interrupts that may pause the counter thread while we measure + sync = state.counter; + while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled) + + // SERIALIZE/LFENCE check is before CPUID on purpose, so that possible pauses when cpuid is executed do not affect SERIALIZE/LFENCE too. 
The hv needs to wait for cpuid to pause the thread + // the amount of instructions (8 in case of LFENCE) are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race so that the counter thread sees an increment + sync = state.counter; + while (state.counter == sync); // fastest busy-waiting strategy, PAUSE affects cache, calling APIs like SwitchToThread() would be even worse + r_pre = state.counter; + std::atomic_signal_fence(std::memory_order_seq_cst); // ensure compiler-level ordering + _serialize(); + std::atomic_signal_fence(std::memory_order_seq_cst); + r_post = state.counter; + + sync = state.counter; + while (state.counter == sync); // sync to our counter tick again + sync = state.counter; + while (state.counter == sync); // and again + + v_pre = state.counter; + std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences + + // the only way a legitimate interrupt can make the check false flag is if most of the samples were contaminated just in the cpuid samples but not in the serialize/lfence samples + // still possible tho, but it's as accurate we can get on user-mode without relying on any other hardware clock or cross-referencing with the counter thread mid-execution + // this is why the score of this technique is not enough to determine a VM + timer::vmexit(); + + std::atomic_signal_fence(std::memory_order_seq_cst); + v_post = state.counter; + + // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use TSC or any other clock + if (v_post > v_pre && r_post > r_pre) { + vm_samples[valid] = v_post - v_pre; + ref_samples[valid] = r_post - r_pre; + valid++; + } - // inside the timing windows, there must be zero memory output (no stack arrays can be written to), zero conditional branches and zero stack spilling (no register push/pops) - if (is_intel) { - while (valid < BATCH_SIZE) { - // cpuid and serialize/lfence 
interpolated so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally - u64 r_pre, r_post, v_pre, v_post, sync; - - // this is done as a counter to both legitimate and malicious hypervisors interrupts that may pause the counter thread while we measure - sync = state.counter; - while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled) - - // SERIALIZE/LFENCE check is before CPUID on purpose, so that possible pauses when cpuid is executed do not affect SERIALIZE/LFENCE too. The hv needs to wait for cpuid to pause the thread - // the amount of instructions (8 in case of LFENCE) are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race so that the counter thread sees an increment - sync = state.counter; - while (state.counter == sync); // fastest busy-waiting strategy, PAUSE affects cache, calling APIs like SwitchToThread() would be even worse - r_pre = state.counter; - std::atomic_signal_fence(std::memory_order_seq_cst); // ensure compiler-level ordering - _serialize(); - std::atomic_signal_fence(std::memory_order_seq_cst); - r_post = state.counter; - - sync = state.counter; - while (state.counter == sync); // sync to our counter tick again - sync = state.counter; - while (state.counter == sync); // and again - - v_pre = state.counter; - std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences - - // the only way a legitimate interrupt can make the check false flag is if most of the samples were contaminated just in the cpuid samples but not in the serialize/lfence samples - // still possible tho, but it's as accurate we can get on user-mode without relying on any other hardware clock or cross-referencing with the counter thread mid-execution - // this is why the score of this technique is not enough to determine a VM - timer::trigger_vmexit(); - - 
std::atomic_signal_fence(std::memory_order_seq_cst); - v_post = state.counter; - - // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use TSC or any other clock - if (v_post > v_pre && r_post > r_pre) { - vm_samples[valid] = v_post - v_pre; - ref_samples[valid] = r_post - r_pre; - valid++; + // burn cycles executing a random number of instructions in each loop iteration, so that the hypervisor doesn't know when to pause the counter thread + timer::burn_random_cycles(ct_seed, v_post, r_post); } - - // burn cycles executing a random number of instructions in each loop iteration, so that the hypervisor doesn't know when to pause the counter thread - timer::burn_random_cycles(ct_seed, v_post, r_post); - - #if (CLANG || GCC) - __asm__ volatile("" :: "r"(x) : "memory"); - #endif } - } - else { - while (valid < BATCH_SIZE) { - // cpuid and serialize/lfence interpolated so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally - u64 r_pre, r_post, v_pre, v_post, sync; - - // this is done as a counter to both legitimate and malicious hypervisors interrupts that may pause the counter thread while we measure - sync = state.counter; - while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled) - - // SERIALIZE/LFENCE check is before CPUID on purpose, so that possible pauses when cpuid is executed do not affect SERIALIZE/LFENCE too. 
The hv needs to wait for cpuid to pause the thread - // the amount of instructions (8 in case of LFENCE) are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race so that the counter thread sees an increment - sync = state.counter; - while (state.counter == sync); - r_pre = state.counter; - std::atomic_signal_fence(std::memory_order_seq_cst); - LFENCE_8 - std::atomic_signal_fence(std::memory_order_seq_cst); - r_post = state.counter; - - sync = state.counter; - while (state.counter == sync); // sync to our counter tick again - sync = state.counter; - while (state.counter == sync); // and again - - v_pre = state.counter; - std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences - - // the only way a legitimate interrupt can make the check false flag is if most of the samples were contaminated just in the cpuid samples but not in the serialize/lfence samples - // still possible tho, but it's as accurate we can get on user-mode without relying on any other hardware clock or cross-referencing with the counter thread mid-execution - // this is why the score of this technique is not enough to determine a VM - timer::trigger_vmexit(); - - std::atomic_signal_fence(std::memory_order_seq_cst); - v_post = state.counter; - - // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use TSC or any other clock - if (v_post > v_pre && r_post > r_pre) { - vm_samples[valid] = v_post - v_pre; - ref_samples[valid] = r_post - r_pre; - valid++; + else { + while (valid < BATCH_SIZE) { + // cpuid and serialize/lfence interpolated so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally + timer::timer_tick_t r_pre, r_post, v_pre, v_post, sync; + + // this is done as a counter to both legitimate and malicious hypervisors interrupts that may pause the 
counter thread while we measure + sync = state.counter; + while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled) + + // SERIALIZE/LFENCE check is before CPUID on purpose, so that possible pauses when cpuid is executed do not affect SERIALIZE/LFENCE too. The hv needs to wait for cpuid to pause the thread + // the amount of instructions (8 in case of LFENCE) are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race so that the counter thread sees an increment + sync = state.counter; + while (state.counter == sync); + r_pre = state.counter; + std::atomic_signal_fence(std::memory_order_seq_cst); + LFENCE_8 + std::atomic_signal_fence(std::memory_order_seq_cst); + r_post = state.counter; + + sync = state.counter; + while (state.counter == sync); // sync to our counter tick again + sync = state.counter; + while (state.counter == sync); // and again + + v_pre = state.counter; + std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences + + // the only way a legitimate interrupt can make the check false flag is if most of the samples were contaminated just in the cpuid samples but not in the serialize/lfence samples + // still possible tho, but it's as accurate we can get on user-mode without relying on any other hardware clock or cross-referencing with the counter thread mid-execution + // this is why the score of this technique is not enough to determine a VM + timer::vmexit(); + + std::atomic_signal_fence(std::memory_order_seq_cst); + v_post = state.counter; + + // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use TSC or any other clock + if (v_post > v_pre && r_post > r_pre) { + vm_samples[valid] = v_post - v_pre; + ref_samples[valid] = r_post - r_pre; + valid++; + } + + // burn cycles executing a random number of instructions in each loop iteration, so that the hypervisor 
doesn't know when to pause the counter thread + timer::burn_random_cycles(ct_seed, v_post, r_post); } + } - // burn cycles executing a random number of instructions in each loop iteration, so that the hypervisor doesn't know when to pause the counter thread - timer::burn_random_cycles(ct_seed, v_post, r_post); + const timer::timer_tick_t cpuid_l = timer::calculate_latency(vm_samples); // check for lowest dense cluster with no interrupt spikes, filter noise we can't detect (SMIs, NMIs, etc) + const timer::timer_tick_t ref_l = timer::calculate_latency(ref_samples); - #if (CLANG || GCC) - __asm__ volatile("" :: "r"(x) : "memory"); - #endif - } + // record the cleanest/lowest latency observed across the independent trials + if (cpuid_l < best_cpuid_l) best_cpuid_l = cpuid_l; + if (ref_l < best_ref_l) best_ref_l = ref_l; } state.test_done.store(true, std::memory_order_release); - const u64 cpuid_l = timer::calculate_latency(vm_samples); // check for lowest dense cluster with no interrupt spikes, filter noise we can't detect (SMIs, NMIs, etc) - const u64 ref_l = timer::calculate_latency(ref_samples); - const double latency_ratio = ref_l ? (double)cpuid_l / (double)ref_l : 0; + const double latency_ratio = best_ref_l ? 
(double)best_cpuid_l / (double)best_ref_l : 0; // VMM = Time spent in hypervisor; nVMM = Time spent in baremetal - debug("TIMER: VMM -> ", cpuid_l, " | nVMM -> ", ref_l, " | Ratio -> ", latency_ratio); // those are NOT cycles + debug("TIMER: VMM -> ", best_cpuid_l, " | nVMM -> ", best_ref_l, " | Ratio -> ", latency_ratio); // these ARE NOT cycles if (latency_ratio >= threshold) hypervisor_detected = true; // Detect IPI-based counter pausing bypasses // For the median itself to exceed baremetal limits (which rarely pass 1000), an interrupt must be occurring on almost EVERY single loop iteration - // This is the footprint of a hypervisor continuously spamming cross-core IPIs to try and pause the counter thread (or the trigger_thread to make SERIALIZE/LFENCE take a lot of time) - if (cpuid_l > 1000 || ref_l > 1000 || cpuid_l == 1 || ref_l == 1) { - debug("TIMER: Detected artificial interrupt delivery to VMAware's threads"); + // This is the footprint of a hypervisor continuously spamming cross-core IPIs to try and pause our threads + if (best_cpuid_l > 1000 || best_ref_l > 1000 || best_cpuid_l == 1 || best_ref_l == 1) { + debug("TIMER: Detected artificial interrupt delivery to timing threads"); hypervisor_detected = true; } @@ -6198,8 +6065,8 @@ struct VM { SetThreadPriority(current_thread, old_thread_priority); SetPriorityClass(current_process, old_process_priority); SetThreadAffinityMask(current_thread, old_affinity); - VirtualUnlock(vm_samples.data(), BATCH_SIZE * sizeof(u64)); - VirtualUnlock(ref_samples.data(), BATCH_SIZE * sizeof(u64)); + VirtualUnlock(vm_samples.data(), BATCH_SIZE * sizeof(timer::timer_tick_t)); + VirtualUnlock(ref_samples.data(), BATCH_SIZE * sizeof(timer::timer_tick_t)); }; std::thread t1(counter_thread); @@ -6391,8 +6258,8 @@ struct VM { const struct ifreq* end = it + (ifc.ifc_len / sizeof(struct ifreq)); for (; it != end; ++it) { - std::size_t const name_len = std::min(sizeof(ifr.ifr_name) - 1, strlen(it->ifr_name)); - 
std::memcpy(ifr.ifr_name, it->ifr_name, name_len); + std::size_t const name_len = std::min(sizeof(ifr.ifr_name) - 1, strnlen(it->ifr_name, sizeof(it->ifr_name))); + memcpy(ifr.ifr_name, it->ifr_name, name_len); *(ifr.ifr_name + name_len) = '\0'; if (ioctl(sockGuard.get(), SIOCGIFFLAGS, &ifr) != 0) { @@ -6408,7 +6275,7 @@ struct VM { } if (success) { - std::memcpy(mac, ifr.ifr_hwaddr.sa_data, 6); + memcpy(mac, ifr.ifr_hwaddr.sa_data, 6); } else { debug("MAC: ", "not successful"); @@ -8436,16 +8303,21 @@ struct VM { void* functions[1] = { nullptr }; util::get_function_address(ntdll, function_names, functions, 1); - using NtQuerySysInfo_t = NTSTATUS(__stdcall*)(SYSTEM_INFORMATION_CLASS, PVOID, ULONG, PULONG); - NtQuerySysInfo_t nt_query = reinterpret_cast(functions[0]); + using nt_query_sysinfo_t = NTSTATUS(__stdcall*)(SYSTEM_INFORMATION_CLASS, PVOID, ULONG, PULONG); + nt_query_sysinfo_t nt_query = reinterpret_cast(functions[0]); if (!nt_query) return false; + // parse header to locate the bitmap + struct boot_logo_info { ULONG flags, bitmap_offset; }; + // determine required buffer size const SYSTEM_INFORMATION_CLASS sys_boot_info = static_cast(140); ULONG needed = 0; NTSTATUS st = nt_query(sys_boot_info, nullptr, 0, &needed); - if (st != static_cast(0xC0000023) && st != static_cast(0x80000005) && st != static_cast(0xC0000004)) + if (st != static_cast(0xC0000023) && + st != static_cast(0x80000005) && + st != static_cast(0xC0000004)) return false; std::vector buffer(needed); @@ -8455,10 +8327,13 @@ struct VM { if (!NT_SUCCESS(st)) return false; - // parse header to locate the bitmap - struct boot_logo_info { ULONG flags, bitmap_offset; }; - const auto* info = reinterpret_cast(buffer.data()); - if (info->bitmap_offset >= needed) return false; + if (needed < sizeof(boot_logo_info)) + return false; + + const auto* info = reinterpret_cast(buffer.data()); + if (info->bitmap_offset >= needed) + return false; + const u8* bmp = buffer.data() + info->bitmap_offset; const 
size_t size = static_cast(needed) - info->bitmap_offset; #else @@ -8571,27 +8446,14 @@ struct VM { // helper to detect QEMU instances based on default hard drive serial patterns // QEMU drives often start with "QM000" followed by digits - auto is_qemu_serial = [](const char* str) noexcept -> bool { - if (!str) { - return false; - } - for (int i = 0; i < 6; ++i) { - if (str[i] == '\0') { - return false; - } - } - - if ((str[0] & 0xDF) != 'Q') { + auto is_qemu_serial = [](const char* str, size_t len) noexcept -> bool { + if (!str || len < 6) { return false; } - if ((str[1] & 0xDF) != 'M') { - return false; - } + if ((str[0] & 0xDF) != 'Q') return false; + if ((str[1] & 0xDF) != 'M') return false; - // we check byte-by-byte to be safe regarding alignment, - // though a 32-bit integer check (0x30303030) could be used if alignment is guaranteed - // we also essentially check for null termination safety here because '\0' != '0' return str[2] == '0' && str[3] == '0' && str[4] == '0' && str[5] == '0'; }; @@ -8792,7 +8654,7 @@ struct VM { debug("DISK_SERIAL: ", serial); // Check the retrieved serial number against known VM artifacts - if (is_qemu_serial(serial) || is_vbox_serial(serial, serialLen)) { + if (is_qemu_serial(serial, serialLen) || is_vbox_serial(serial, serialLen)) { if (allocated_buffer) { PVOID free_base = reinterpret_cast(allocated_buffer); SIZE_T free_size = 0; @@ -8859,7 +8721,7 @@ struct VM { } debug("DISK_SERIAL: ", (const char*)serial); - if (is_qemu_serial(serial) || is_vbox_serial(serial, rsize)) { + if (is_qemu_serial(serial, static_cast(rsize)) || is_vbox_serial(serial, static_cast(rsize))) { result = true; } } @@ -8913,7 +8775,7 @@ struct VM { */ [[nodiscard]] static bool hwmodel() { - //hw.model strings are short (like for example MacBookPro16,1), 128 bytes is plenty + // hw.model strings are short (like for example MacBookPro16,1), 128 bytes is plenty char buffer[128] = { 0 }; size_t size = sizeof(buffer); @@ -8924,6 +8786,8 @@ struct VM { return 
false; } + buffer[127] = '\0'; + // sysctlbyname returns the raw value (usually without a trailing newline), // so no trimming is required debug("HWMODEL: ", "output = ", buffer); @@ -12411,165 +12275,110 @@ struct VM { return false; } - // Surface Pro models typically do not have PIT, some devices might have it but not expose it due to firmware bugs (i.e. Lenovo 83AG) - { - const char* manufacturer = nullptr; - const char* model = nullptr; - if (util::get_manufacturer_model(&manufacturer, &model)) { - auto ci_contains = [](const char* hay, const char* needle) noexcept -> bool { - if (!hay || !needle || !*hay || !*needle) return false; - - const unsigned char* h = - reinterpret_cast(hay); - const unsigned char* n = - reinterpret_cast(needle); - - for (; *h; ++h) { - size_t i = 0; - for (;; ++i) { - unsigned char hc = h[i]; - unsigned char nc = n[i]; - - if (!nc) return true; - if (!hc) break; - - if (hc >= 'A' && hc <= 'Z') hc += 32; - if (nc >= 'A' && nc <= 'Z') nc += 32; - - if (hc != nc) break; - } - } - return false; - }; - - const bool model_has_surface = ci_contains(model, "surface"); - const bool model_has_pro = ci_contains(model, "pro"); - const bool man_is_microsoft = ci_contains(manufacturer, "microsoft"); + // Surface Pro models typically do not have PIT, some devices might have it but not expose it due to firmware bugs (i.e. 
Lenovo 83AG) + const char* manufacturer = nullptr; + const char* model = nullptr; - if (model_has_surface && (model_has_pro || man_is_microsoft)) { - return false; - } - } - } + if (util::get_manufacturer_model(&manufacturer, &model)) { + auto ci_contains = [](const char* hay, const char* needle) noexcept -> bool { + if (!hay || !needle || !*hay || !*needle) return false; - // The RTC (ACPI/CMOS RTC) timer can't be always detected via SetupAPI, it needs AML decode of the DSDT firmware table - // The HPET (PNP0103) timer presence check was removed, more info at: https://github.com/kernelwernel/VMAware/pull/616 - // Here, we check for the PIT/AT timer (PC-class System Timer) - constexpr wchar_t pattern[] = L"pnp0100"; - constexpr size_t patLen = (sizeof(pattern) / sizeof(wchar_t)) - 1; - - auto wcsstr_ci_ascii = [&](const wchar_t* hay) noexcept -> const wchar_t* { - if (!hay) return nullptr; + for (const char* h = hay; *h; ++h) { + const char* a = h; + const char* b = needle; - for (; *hay; ++hay) { - wchar_t h = *hay; - if (h >= L'A' && h <= L'Z') h += 32; + while (*a && *b) { + unsigned char ca = static_cast(*a); + unsigned char cb = static_cast(*b); - if (h != pattern[0]) continue; + if (ca >= 'A' && ca <= 'Z') ca += 32; + if (cb >= 'A' && cb <= 'Z') cb += 32; - size_t i = 1; - for (; i < patLen; ++i) { - wchar_t next_h = hay[i]; + if (ca != cb) break; + ++a; + ++b; + } - if (next_h == L'\0') return nullptr; + if (!*b) + return true; + } - if (next_h >= L'A' && next_h <= L'Z') next_h += 32; + return false; + }; - if (next_h != pattern[i]) break; - } + const bool is_surface_pro = ci_contains(model, "surface pro"); + const bool is_microsoft = ci_contains(manufacturer, "microsoft"); - if (i == patLen) return hay; + if (is_surface_pro && is_microsoft) { + return false; } - return nullptr; - }; - - const HDEVINFO devs = - SetupDiGetClassDevsW(nullptr, nullptr, nullptr, - DIGCF_PRESENT | DIGCF_ALLCLASSES); + } + + // The RTC (ACPI/CMOS RTC) timer can't be always 
detected via SetupAPI, it needs AML decode of the DSDT firmware table + // The HPET (PNP0103) timer presence check was removed, more info at: https://github.com/kernelwernel/VMAware/pull/616 + // Here, we check for the PIT/AT timer (PC-class System Timer) + const HDEVINFO devs = SetupDiGetClassDevsW( + nullptr, nullptr, nullptr, DIGCF_PRESENT | DIGCF_ALLCLASSES); if (devs == INVALID_HANDLE_VALUE) return false; SP_DEVINFO_DATA dev_info{}; - dev_info.cbSize = sizeof(SP_DEVINFO_DATA); - - DWORD alloc_size = 4096 + 4; - BYTE* buffer = static_cast(malloc(alloc_size)); - - if (!buffer) { - SetupDiDestroyDeviceInfoList(devs); - return false; - } + dev_info.cbSize = sizeof(dev_info); + BYTE* buffer = nullptr; + DWORD buffer_size = 0; bool found = false; - for (DWORD idx = 0; SetupDiEnumDeviceInfo(devs, idx, &dev_info); ++idx) { - DWORD property_type = 0; - DWORD required = 0; + for (DWORD i = 0; SetupDiEnumDeviceInfo(devs, i, &dev_info); ++i) { + DWORD type = 0; + DWORD needed = 0; - if (!SetupDiGetDeviceRegistryPropertyW( - devs, - &dev_info, - SPDRP_HARDWAREID, - &property_type, - buffer, - alloc_size > 4 ? alloc_size - 4 : 0, - &required)) + if (SetupDiGetDeviceRegistryPropertyW( + devs, &dev_info, SPDRP_HARDWAREID, + &type, nullptr, 0, &needed)) { - const DWORD err = GetLastError(); - - if (err == ERROR_INSUFFICIENT_BUFFER) { - const DWORD needed_size = required + 4; - - if (needed_size > alloc_size) { - BYTE* new_buffer = - static_cast(realloc(buffer, needed_size)); + continue; + } - if (!new_buffer) { - found = false; - break; - } + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER || needed == 0) + continue; - buffer = new_buffer; - alloc_size = needed_size; - } + if (needed > buffer_size) { + BYTE* new_buffer = static_cast( + realloc(buffer, needed + sizeof(wchar_t))); - if (!SetupDiGetDeviceRegistryPropertyW( - devs, - &dev_info, - SPDRP_HARDWAREID, - &property_type, - buffer, - alloc_size > 4 ? 
alloc_size - 4 : 0, - &required)) - { - continue; - } - } - else { - continue; + if (!new_buffer) { + free(buffer); + SetupDiDestroyDeviceInfoList(devs); + return false; } + + buffer = new_buffer; + buffer_size = needed + sizeof(wchar_t); } - if (property_type != REG_MULTI_SZ) + if (!SetupDiGetDeviceRegistryPropertyW( + devs, &dev_info, SPDRP_HARDWAREID, + &type, buffer, buffer_size, &needed)) + { continue; - - if (required + 4 <= alloc_size) { - buffer[required + 0] = 0; - buffer[required + 1] = 0; - buffer[required + 2] = 0; - buffer[required + 3] = 0; } - wchar_t* cur = reinterpret_cast(buffer); + if (type != REG_MULTI_SZ) + continue; + + reinterpret_cast(buffer)[needed / sizeof(wchar_t)] = L'\0'; - while (*cur) { - if (wcsstr_ci_ascii(cur)) { + for (const wchar_t* s = reinterpret_cast(buffer); *s; + s += wcslen(s) + 1) + { + if (_wcsicmp(s, L"ACPI\\PNP0100") == 0 || + _wcsicmp(s, L"PNP0100") == 0) + { found = true; break; } - - cur += wcslen(cur) + 1; } if (found) @@ -12578,7 +12387,6 @@ struct VM { free(buffer); SetupDiDestroyDeviceInfoList(devs); - return !found; #endif } @@ -13487,7 +13295,7 @@ struct VM { } else { debug("SVM_EXCEPTIONS: Detected SVM hypervisor hiding CPU capabilities"); - core::add(brand_enum::NULL_BRAND, 150); + return core::add(brand_enum::NULL_BRAND, 150); } return true; @@ -14773,8 +14581,8 @@ std::array VM::core::technique_table = [ // START OF TECHNIQUE TABLE #if (WINDOWS) {VM::TRAP, {100, VM::trap}}, - {VM::KVM_INTERCEPTION, {100, VM::kvm_interception}}, - {VM::SVM_EXCEPTIONS, {100, VM::svm_exceptions}}, + {VM::KVM_INTERCEPTION, {150, VM::kvm_interception}}, + {VM::SVM_EXCEPTIONS, {150, VM::svm_exceptions}}, {VM::INTERRUPT_SHADOW, {100, VM::interrupt_shadow}}, {VM::EIP_OVERFLOW, {100, VM::eip_overflow}}, {VM::HYPERVISOR_HOOK, {100, VM::hypervisor_hook}},