cpp-presentation/lcg.cpp at main · sitrin/cpp-presentation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
// compile:
// g++ -std=c++17 -O3 -Wall -pedantic -o lcg lcg.cpp -march=native
//
// Note: -march=native turns out to be AVX2 on EC2 m5 instance
// Benchmarks:
//    100'000'000:   1.04 sec (normal),  0.36 sec (vectorized)
//  1'000'000'000:  10.4  sec (normal),  3.6  sec (vectorized)
// 10'000'000'000: 104    sec (normal), 36    sec (vectorized)

#include <array>
#include <chrono>
#include <immintrin.h>
#include <iostream>
#include <iomanip>

// output only the high 16 bits to avoid "eliminate the
// short period problem in the low-order bits" (Wikipedia)
inline uint16_t high_transform(uint32_t n) {
    return (n >> 16) & 0xffff;
}

// this is the essential operation of an LCG
// no modulus is neccessary because overflow acts
// as the implied modulus for VSC++
// (nice explanation at https://stackoverflow.com/a/14788283)
inline void update_state(uint32_t & state, uint32_t a, uint32_t c) {
    state = (a * state) + c ;
}

// virtually the same as update status for non-vectorized args (as above)
inline void update_state(__m256i & state_v, __m256i a_v, __m256i c_v) {
        state_v = _mm256_mullo_epi32(state_v, a_v);
        state_v = _mm256_add_epi32(state_v, c_v);
}

// print out a fancy hex dump of an array (or any container)
template<typename T>
void print_state(const T& state) {
    for(auto s : state) {
         std::cout << high_transform(s) << " ";
    }
    std::cout << "]";
}

int main() {

    // VS C++ LCG params (Wikipedia)
    const uint32_t A = 0x343FD;
    const uint32_t C = 0x269EC3;

    const size_t NUM_UPDATES = 1'000'000'00;

    constexpr size_t SEED_SIZE = 8; // number of 32-bit seed values to seed with
    const std::array<uint32_t, SEED_SIZE> SEED {{ 'C','O','L','U','M','B','I','A' }};

    // initialize state to SEED
    auto state = SEED;

    auto start = std::chrono::system_clock::now();
    for(size_t i=0; i<SEED_SIZE; ++i) {
        for(size_t j = 0; j < NUM_UPDATES; ++j) {
            update_state(state[i], A, C);
        }
    }
    std::chrono::duration<double> diff = std::chrono::system_clock::now() - start;

    // output non-vectorized results
    std::cout << std::fixed << std::setprecision(9) << std::left;
    std::cout << "Perform " << (int)NUM_UPDATES << " loops for each (of 8) bytes of seed     (" << diff.count() << " sec): [ ";
    print_state(state);
    std::cout << "\n";

    // Now do vectorized
    const auto SEED_v = _mm256_stream_load_si256((__m256i const *)(SEED.data()));
    const auto    A_v = _mm256_stream_load_si256((__m256i const *)(std::array<uint32_t, SEED_SIZE>{{A,A,A,A,A,A,A,A}}.data()));
    const auto    C_v = _mm256_stream_load_si256((__m256i const *)(std::array<uint32_t, SEED_SIZE>{{C,C,C,C,C,C,C,C}}.data()));

    // as above for non-vectorized, initialize state to SEED
    auto state_v = SEED_v;

    start = std::chrono::system_clock::now();
    for(size_t i = 0; i < NUM_UPDATES; ++i) {
        update_state(state_v, A_v, C_v);
    }
    diff = std::chrono::system_clock::now() - start;

    std::array<uint32_t,SEED_SIZE> state_v_as_array;
    _mm256_storeu_si256((__m256i*) state_v_as_array.data(), state_v);

    // output result of vectorized approach
    std::cout << "Perform " << (int)NUM_UPDATES << " loops for all bytes of seed in parallel (" << diff.count() << " sec): [ ";
    print_state(state_v_as_array);
    std::cout << "\n";
}