From 89f12ecc0a3e884ef63894303db6966987265bbc Mon Sep 17 00:00:00 2001 From: scchan Date: Tue, 16 Aug 2016 03:21:31 +0000 Subject: [PATCH 1/3] allocate array and launch kernel on the selected device --- HCCStream.cpp | 21 +++++++++++++-------- HCCStream.h | 3 +++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/HCCStream.cpp b/HCCStream.cpp index ff8de0d0..8c58d5f5 100644 --- a/HCCStream.cpp +++ b/HCCStream.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "HCCStream.h" @@ -49,9 +50,10 @@ void listDevices(void) template HCCStream::HCCStream(const unsigned int ARRAY_SIZE, const int device_index): array_size(ARRAY_SIZE), - d_a(ARRAY_SIZE), - d_b(ARRAY_SIZE), - d_c(ARRAY_SIZE) + accelerator(hc::accelerator::get_all()[device_index]), + d_a(ARRAY_SIZE,accelerator.get_default_view()), + d_b(ARRAY_SIZE,accelerator.get_default_view()), + d_c(ARRAY_SIZE,accelerator.get_default_view()) { // The array size must be divisible by TBSIZE for kernel launches @@ -67,7 +69,6 @@ HCCStream::HCCStream(const unsigned int ARRAY_SIZE, const int device_index): auto current = accs[device_index]; std::cout << "Using HCC device " << getDeviceName(current) << std::endl; - // Check buffers fit on the device // TODO: unclear how to do that!! @@ -102,7 +103,8 @@ void HCCStream::copy() { try{ // launch a GPU kernel to compute the saxpy in parallel - hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view() + , hc::extent<1>(array_size) , [&](hc::index<1> i) __attribute((hc)) { d_c[i] = d_a[i]; }); @@ -120,7 +122,8 @@ void HCCStream::mul() const T scalar = 0.3; try{ // launch a GPU kernel to compute the saxpy in parallel - hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view() + , hc::extent<1>(array_size) , [&](hc::index<1> i) __attribute((hc)) { d_b[i] = scalar*d_c[i]; }); @@ -137,7 +140,8 @@ void HCCStream::add() { try{ // launch a GPU kernel to compute the saxpy in parallel - hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view() + , hc::extent<1>(array_size) , [&](hc::index<1> i) __attribute((hc)) { d_c[i] = d_a[i]+d_b[i]; }); @@ -155,7 +159,8 @@ void HCCStream::triad() const T scalar = 0.3; try{ // launch a GPU kernel to compute the saxpy in parallel - hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view() + , hc::extent<1>(array_size) , [&](hc::index<1> i) __attribute((hc)) { d_a[i] = d_b[i] + scalar*d_c[i]; }); diff --git a/HCCStream.h b/HCCStream.h index 818d2438..e8a056f4 100644 --- a/HCCStream.h +++ b/HCCStream.h @@ -23,6 +23,9 @@ class HCCStream : public Stream // Size of arrays unsigned int array_size; + // Selected device + hc::accelerator accelerator; + // Device side pointers to arrays hc::array d_a; hc::array d_b; From 2c73ad1257e3c4ce18c4236f35bae2aa8ef27eda Mon Sep 17 00:00:00 2001 From: scchan Date: Mon, 29 Aug 2016 22:41:56 +0000 Subject: [PATCH 2/3] changing references hc::array to local references in order to avoid the capturing the this pointer into the kernel, experimenting with placement new into coarse grained memory --- HCCStream.cpp | 90 +++++++++++++++++++++++++++++++++++++++++++++++++-- main.cpp | 23 +++++++++++-- 2 files changed, 108 insertions(+), 5 deletions(-) diff --git a/HCCStream.cpp b/HCCStream.cpp index 8c58d5f5..efd22e5e 100644 --- a/HCCStream.cpp +++ b/HCCStream.cpp @@ -46,6 +46,8 @@ void listDevices(void) } +#define SCC_VERIFY (1) + template HCCStream::HCCStream(const unsigned int ARRAY_SIZE, const int device_index): @@ -86,6 +88,45 @@ void HCCStream::write_arrays(const std::vector& a, const std::vector& b hc::copy(a.cbegin(),a.cend(),d_a); hc::copy(b.cbegin(),b.cend(),d_b); hc::copy(c.cbegin(),c.cend(),d_c); + + + +#if (SCC_VERIFY!=0) +{ + hc::array_view av_d_a(d_a); + int errors = 0; + int avi = 0; + for (auto i = a.begin(); i != a.end(); i++,avi++) { + if (av_d_a[avi] != *i) + errors++; + } + printf("%d errors in d_a\n",errors); +} + +{ + hc::array_view av_d_b(d_b); + int errors = 0; + int avi = 0; + for (auto i = b.begin(); i != b.end(); i++,avi++) { + if (av_d_b[avi] != *i) + errors++; + } + printf("%d errors in d_b\n",errors); +} + +{ + hc::array_view av_d_c(d_c); + int errors = 0; + int avi = 0; + for (auto i = c.begin(); i != c.end(); i++,avi++) { + if (av_d_c[avi] != *i) + errors++; + } + printf("%d errors in d_c\n",errors); +} +#endif + + } template @@ -97,18 +138,44 @@ void HCCStream::read_arrays(std::vector& a, std::vector& b, std::vector hc::copy(d_c,c.begin()); } - template void HCCStream::copy() { + + std::cout << "In " << __FUNCTION__ << std::endl; + +#if 1 + hc::array &d_a = this->d_a; + hc::array &d_c = this->d_c; +#endif + try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view() , hc::extent<1>(array_size) , [&](hc::index<1> i) __attribute((hc)) { - d_c[i] = d_a[i]; + + d_c[i] = d_a[i]; }); future_kernel.wait(); + + +#if (SCC_VERIFY!=0) +{ + hc::array_view av_d_c(d_c); + hc::array_view av_d_a(d_a); + int errors = 0; + for (int i = 0; i < array_size; i++) { + if (av_d_c[i]!=av_d_a[i]) { + errors++; + } + } + printf("%s %d errors\n",__FUNCTION__,errors); +} +#endif + + + } catch(std::exception& e){ std::cout << e.what() << std::endl; @@ -119,6 +186,11 @@ void HCCStream::copy() template void HCCStream::mul() { + std::cout << "In " << __FUNCTION__ << std::endl; + + hc::array &d_b = this->d_b; + hc::array &d_c = this->d_c; + const T scalar = 0.3; try{ // launch a GPU kernel to compute the saxpy in parallel @@ -138,6 +210,13 @@ void HCCStream::mul() template void HCCStream::add() { + + std::cout << "In " << __FUNCTION__ << std::endl; + + hc::array &d_a = this->d_a; + hc::array &d_b = this->d_b; + hc::array &d_c = this->d_c; + try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view() @@ -156,6 +235,13 @@ void HCCStream::add() template void HCCStream::triad() { + + std::cout << "In " << __FUNCTION__ << std::endl; + + hc::array &d_a = this->d_a; + hc::array &d_b = this->d_b; + hc::array &d_c = this->d_c; + const T scalar = 0.3; try{ // launch a GPU kernel to compute the saxpy in parallel diff --git a/main.cpp b/main.cpp index b4f5abff..3bd0326e 100644 --- a/main.cpp +++ b/main.cpp @@ -39,7 +39,14 @@ #endif // Default size of 2^25 -unsigned int ARRAY_SIZE = 33554432; +//unsigned int ARRAY_SIZE = 33554432; + +//unsigned int ARRAY_SIZE = 1024 * 1024; + +unsigned int ARRAY_SIZE = 1024; + + + unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; @@ -131,8 +138,13 @@ void run() stream = new OMP45Stream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); #elif defined(HCC) + + HCCStream* gpu_buffer = (HCCStream*) hc::allocate_coarsed_grain_system_memory(sizeof(HCCStream)); + stream = new(gpu_buffer) HCCStream(ARRAY_SIZE, deviceIndex); + + // Use the "reference" OpenMP 3 implementation - stream = new HCCStream(ARRAY_SIZE, deviceIndex); + //stream = new HCCStream(ARRAY_SIZE, deviceIndex); #endif @@ -171,6 +183,7 @@ void run() t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); + } // Check solutions @@ -214,8 +227,12 @@ void run() } +#ifdef HCC + stream->~Stream(); + hc::free_system_memory((void*)gpu_buffer); +#else delete stream; - +#endif } template From f16f7bde5bdd681527a602b4ccb935b4cb286063 Mon Sep 17 00:00:00 2001 From: scchan Date: Fri, 2 Sep 2016 02:36:32 -0400 Subject: [PATCH 3/3] remove debug code and messages --- HCCStream.cpp | 74 --------------------------------------------------- main.cpp | 24 +++-------------- 2 files changed, 3 insertions(+), 95 deletions(-) diff --git a/HCCStream.cpp b/HCCStream.cpp index efd22e5e..57d4f39e 100644 --- a/HCCStream.cpp +++ b/HCCStream.cpp @@ -45,10 +45,6 @@ void listDevices(void) } } - -#define SCC_VERIFY (1) - - template HCCStream::HCCStream(const unsigned int ARRAY_SIZE, const int device_index): array_size(ARRAY_SIZE), @@ -88,45 +84,6 @@ void HCCStream::write_arrays(const std::vector& a, const std::vector& b hc::copy(a.cbegin(),a.cend(),d_a); hc::copy(b.cbegin(),b.cend(),d_b); hc::copy(c.cbegin(),c.cend(),d_c); - - - -#if (SCC_VERIFY!=0) -{ - hc::array_view av_d_a(d_a); - int errors = 0; - int avi = 0; - for (auto i = a.begin(); i != a.end(); i++,avi++) { - if (av_d_a[avi] != *i) - errors++; - } - printf("%d errors in d_a\n",errors); -} - -{ - hc::array_view av_d_b(d_b); - int errors = 0; - int avi = 0; - for (auto i = b.begin(); i != b.end(); i++,avi++) { - if (av_d_b[avi] != *i) - errors++; - } - printf("%d errors in d_b\n",errors); -} - -{ - hc::array_view av_d_c(d_c); - int errors = 0; - int avi = 0; - for (auto i = c.begin(); i != c.end(); i++,avi++) { - if (av_d_c[avi] != *i) - errors++; - } - printf("%d errors in d_c\n",errors); -} -#endif - - } template @@ -141,13 +98,8 @@ void HCCStream::read_arrays(std::vector& a, std::vector& b, std::vector template void HCCStream::copy() { - - std::cout << "In " << __FUNCTION__ << std::endl; - -#if 1 hc::array &d_a = this->d_a; hc::array &d_c = this->d_c; -#endif try{ // launch a GPU kernel to compute the saxpy in parallel @@ -158,24 +110,6 @@ void HCCStream::copy() d_c[i] = d_a[i]; }); future_kernel.wait(); - - -#if (SCC_VERIFY!=0) -{ - hc::array_view av_d_c(d_c); - hc::array_view av_d_a(d_a); - int errors = 0; - for (int i = 0; i < array_size; i++) { - if (av_d_c[i]!=av_d_a[i]) { - errors++; - } - } - printf("%s %d errors\n",__FUNCTION__,errors); -} -#endif - - - } catch(std::exception& e){ std::cout << e.what() << std::endl; @@ -186,8 +120,6 @@ void HCCStream::copy() template void HCCStream::mul() { - std::cout << "In " << __FUNCTION__ << std::endl; - hc::array &d_b = this->d_b; hc::array &d_c = this->d_c; @@ -210,9 +142,6 @@ void HCCStream::mul() template void HCCStream::add() { - - std::cout << "In " << __FUNCTION__ << std::endl; - hc::array &d_a = this->d_a; hc::array &d_b = this->d_b; hc::array &d_c = this->d_c; @@ -235,9 +164,6 @@ void HCCStream::add() template void HCCStream::triad() { - - std::cout << "In " << __FUNCTION__ << std::endl; - hc::array &d_a = this->d_a; hc::array &d_b = this->d_b; hc::array &d_c = this->d_c; diff --git a/main.cpp b/main.cpp index 3bd0326e..7aad2154 100644 --- a/main.cpp +++ b/main.cpp @@ -39,13 +39,7 @@ #endif // Default size of 2^25 -//unsigned int ARRAY_SIZE = 33554432; - -//unsigned int ARRAY_SIZE = 1024 * 1024; - -unsigned int ARRAY_SIZE = 1024; - - +unsigned int ARRAY_SIZE = 33554432; unsigned int num_times = 100; unsigned int deviceIndex = 0; @@ -138,14 +132,9 @@ void run() stream = new OMP45Stream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); #elif defined(HCC) - - HCCStream* gpu_buffer = (HCCStream*) hc::allocate_coarsed_grain_system_memory(sizeof(HCCStream)); - stream = new(gpu_buffer) HCCStream(ARRAY_SIZE, deviceIndex); - - // Use the "reference" OpenMP 3 implementation - //stream = new HCCStream(ARRAY_SIZE, deviceIndex); - + stream = new HCCStream(ARRAY_SIZE, deviceIndex); + #endif stream->write_arrays(a, b, c); @@ -183,7 +172,6 @@ void run() t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); - } // Check solutions @@ -226,13 +214,7 @@ void run() << std::endl; } - -#ifdef HCC - stream->~Stream(); - hc::free_system_memory((void*)gpu_buffer); -#else delete stream; -#endif } template