From 89f12ecc0a3e884ef63894303db6966987265bbc Mon Sep 17 00:00:00 2001
From: scchan <siuchi.chan@amd.com>
Date: Tue, 16 Aug 2016 03:21:31 +0000
Subject: [PATCH 1/3] allocate array and launch kernel on the selected device

---
 HCCStream.cpp | 21 +++++++++++++--------
 HCCStream.h   |  3 +++
 2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/HCCStream.cpp b/HCCStream.cpp
index ff8de0d0..8c58d5f5 100644
--- a/HCCStream.cpp
+++ b/HCCStream.cpp
@@ -8,6 +8,7 @@
 #include <codecvt>
 #include <vector>
 #include <locale>
+#include <cstdio>
 
 #include "HCCStream.h"
 
@@ -49,9 +50,10 @@ void listDevices(void)
 template <class T>
 HCCStream<T>::HCCStream(const unsigned int ARRAY_SIZE, const int device_index):
   array_size(ARRAY_SIZE),
-  d_a(ARRAY_SIZE),
-  d_b(ARRAY_SIZE),
-  d_c(ARRAY_SIZE)
+  accelerator(hc::accelerator::get_all()[device_index]),
+  d_a(ARRAY_SIZE,accelerator.get_default_view()),
+  d_b(ARRAY_SIZE,accelerator.get_default_view()),
+  d_c(ARRAY_SIZE,accelerator.get_default_view())
 {
 
   // The array size must be divisible by TBSIZE for kernel launches
@@ -67,7 +69,6 @@ HCCStream<T>::HCCStream(const unsigned int ARRAY_SIZE, const int device_index):
   auto current = accs[device_index];
   
   std::cout << "Using HCC device " << getDeviceName(current) << std::endl;
-  
 
   // Check buffers fit on the device
   // TODO: unclear how to do that!!
@@ -102,7 +103,8 @@ void HCCStream<T>::copy()
 {
   try{
   // launch a GPU kernel to compute the saxpy in parallel 
-    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
+    hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view()
+                , hc::extent<1>(array_size)
 								, [&](hc::index<1> i) __attribute((hc)) {
 								  d_c[i] = d_a[i];
 								});
@@ -120,7 +122,8 @@ void HCCStream<T>::mul()
   const T scalar = 0.3;
   try{
   // launch a GPU kernel to compute the saxpy in parallel 
-    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
+    hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view()
+                , hc::extent<1>(array_size)
 								, [&](hc::index<1> i) __attribute((hc)) {
 								  d_b[i] = scalar*d_c[i];
 								});
@@ -137,7 +140,8 @@ void HCCStream<T>::add()
 {
   try{
     // launch a GPU kernel to compute the saxpy in parallel 
-    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
+    hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view()
+                , hc::extent<1>(array_size)
 								, [&](hc::index<1> i) __attribute((hc)) {
 								  d_c[i] = d_a[i]+d_b[i];
 								});
@@ -155,7 +159,8 @@ void HCCStream<T>::triad()
   const T scalar = 0.3;
   try{
     // launch a GPU kernel to compute the saxpy in parallel 
-    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
+    hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view()
+                , hc::extent<1>(array_size)
 								, [&](hc::index<1> i) __attribute((hc)) {
 								  d_a[i] = d_b[i] + scalar*d_c[i];
 								});
diff --git a/HCCStream.h b/HCCStream.h
index 818d2438..e8a056f4 100644
--- a/HCCStream.h
+++ b/HCCStream.h
@@ -23,6 +23,9 @@ class HCCStream : public Stream<T>
   // Size of arrays
   unsigned int array_size;
 
+  // Selected device
+  hc::accelerator accelerator;
+
   // Device side pointers to arrays
   hc::array<T,1> d_a;
   hc::array<T,1> d_b;

From 2c73ad1257e3c4ce18c4236f35bae2aa8ef27eda Mon Sep 17 00:00:00 2001
From: scchan <siuchi.chan@amd.com>
Date: Mon, 29 Aug 2016 22:41:56 +0000
Subject: [PATCH 2/3] change hc::array member references to local references
 in order to avoid capturing the this pointer in the kernel; experiment
 with placement new into coarse-grained memory

---
 HCCStream.cpp | 90 +++++++++++++++++++++++++++++++++++++++++++++++++--
 main.cpp      | 23 +++++++++++--
 2 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/HCCStream.cpp b/HCCStream.cpp
index 8c58d5f5..efd22e5e 100644
--- a/HCCStream.cpp
+++ b/HCCStream.cpp
@@ -46,6 +46,8 @@ void listDevices(void)
 }
 
 
+#define SCC_VERIFY (1)
+
 
 template <class T>
 HCCStream<T>::HCCStream(const unsigned int ARRAY_SIZE, const int device_index):
@@ -86,6 +88,45 @@ void HCCStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b
   hc::copy(a.cbegin(),a.cend(),d_a);
   hc::copy(b.cbegin(),b.cend(),d_b);
   hc::copy(c.cbegin(),c.cend(),d_c);
+
+
+
+#if (SCC_VERIFY!=0) 
+{
+   hc::array_view<T> av_d_a(d_a);
+   int errors = 0;
+   int avi = 0;
+   for (auto i = a.begin(); i != a.end(); i++,avi++) {
+     if (av_d_a[avi] != *i)
+       errors++;
+   }
+   printf("%d errors in d_a\n",errors);
+}
+
+{
+   hc::array_view<T> av_d_b(d_b);
+   int errors = 0;
+   int avi = 0;
+   for (auto i = b.begin(); i != b.end(); i++,avi++) {
+     if (av_d_b[avi] != *i)
+       errors++;
+   }
+   printf("%d errors in d_b\n",errors);
+}
+
+{
+   hc::array_view<T> av_d_c(d_c);
+   int errors = 0;
+   int avi = 0;
+   for (auto i = c.begin(); i != c.end(); i++,avi++) {
+     if (av_d_c[avi] != *i)
+       errors++;
+   }
+   printf("%d errors in d_c\n",errors);
+}
+#endif
+
+
 }
 
 template <class T>
@@ -97,18 +138,44 @@ void HCCStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
   hc::copy(d_c,c.begin());
 }
 
-
 template <class T>
 void HCCStream<T>::copy()
 {
+
+  std::cout << "In " << __FUNCTION__ << std::endl;
+
+#if 1
+  hc::array<T> &d_a = this->d_a;
+  hc::array<T> &d_c = this->d_c;
+#endif
+
   try{
   // launch a GPU kernel to compute the saxpy in parallel 
     hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view()
                 , hc::extent<1>(array_size)
 								, [&](hc::index<1> i) __attribute((hc)) {
-								  d_c[i] = d_a[i];
+
+								 d_c[i] = d_a[i];
 								});
     future_kernel.wait();
+
+
+#if (SCC_VERIFY!=0) 
+{
+   hc::array_view<T> av_d_c(d_c);
+   hc::array_view<T> av_d_a(d_a);
+   int errors = 0;
+   for (int i = 0; i < array_size; i++) {
+     if (av_d_c[i]!=av_d_a[i]) {
+       errors++;
+     }
+   }
+   printf("%s %d errors\n",__FUNCTION__,errors);
+}
+#endif
+
+
+
   }
   catch(std::exception& e){
     std::cout << e.what() << std::endl;
@@ -119,6 +186,11 @@ void HCCStream<T>::copy()
 template <class T>
 void HCCStream<T>::mul()
 {
+  std::cout << "In " << __FUNCTION__ << std::endl;
+
+  hc::array<T> &d_b = this->d_b;
+  hc::array<T> &d_c = this->d_c;
+
   const T scalar = 0.3;
   try{
   // launch a GPU kernel to compute the saxpy in parallel 
@@ -138,6 +210,13 @@ void HCCStream<T>::mul()
 template <class T>
 void HCCStream<T>::add()
 {
+
+  std::cout << "In " << __FUNCTION__ << std::endl;
+
+  hc::array<T> &d_a = this->d_a;
+  hc::array<T> &d_b = this->d_b;
+  hc::array<T> &d_c = this->d_c;
+
   try{
     // launch a GPU kernel to compute the saxpy in parallel 
     hc::completion_future future_kernel = hc::parallel_for_each(accelerator.get_default_view()
@@ -156,6 +235,13 @@ void HCCStream<T>::add()
 template <class T>
 void HCCStream<T>::triad()
 {
+
+  std::cout << "In " << __FUNCTION__ << std::endl;
+
+  hc::array<T> &d_a = this->d_a;
+  hc::array<T> &d_b = this->d_b;
+  hc::array<T> &d_c = this->d_c;
+
   const T scalar = 0.3;
   try{
     // launch a GPU kernel to compute the saxpy in parallel 
diff --git a/main.cpp b/main.cpp
index b4f5abff..3bd0326e 100644
--- a/main.cpp
+++ b/main.cpp
@@ -39,7 +39,14 @@
 #endif
 
 // Default size of 2^25
-unsigned int ARRAY_SIZE = 33554432;
+//unsigned int ARRAY_SIZE = 33554432;
+
+//unsigned int ARRAY_SIZE = 1024 * 1024;
+
+unsigned int ARRAY_SIZE = 1024;
+
+
+
 unsigned int num_times = 100;
 unsigned int deviceIndex = 0;
 bool use_float = false;
@@ -131,8 +138,13 @@ void run()
   stream = new OMP45Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
 
 #elif defined(HCC)
+
+  HCCStream<T>* gpu_buffer =  (HCCStream<T>*)  hc::allocate_coarsed_grain_system_memory(sizeof(HCCStream<T>));
+  stream = new(gpu_buffer) HCCStream<T>(ARRAY_SIZE, deviceIndex);
+  
+
   // Use the "reference" OpenMP 3 implementation
-  stream = new HCCStream<T>(ARRAY_SIZE, deviceIndex);
+  //stream = new HCCStream<T>(ARRAY_SIZE, deviceIndex);
   
 #endif
 
@@ -171,6 +183,7 @@ void run()
     t2 = std::chrono::high_resolution_clock::now();
     timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
+
   }
 
   // Check solutions
@@ -214,8 +227,12 @@ void run()
 
   }
 
+#ifdef HCC
+  stream->~Stream();
+  hc::free_system_memory((void*)gpu_buffer);
+#else
   delete stream;
-
+#endif
 }
 
 template <typename T>

From f16f7bde5bdd681527a602b4ccb935b4cb286063 Mon Sep 17 00:00:00 2001
From: scchan <siuchi.chan@amd.com>
Date: Fri, 2 Sep 2016 02:36:32 -0400
Subject: [PATCH 3/3] remove debug code and messages

---
 HCCStream.cpp | 74 ---------------------------------------------------
 main.cpp      | 24 +++--------------
 2 files changed, 3 insertions(+), 95 deletions(-)

diff --git a/HCCStream.cpp b/HCCStream.cpp
index efd22e5e..57d4f39e 100644
--- a/HCCStream.cpp
+++ b/HCCStream.cpp
@@ -45,10 +45,6 @@ void listDevices(void)
   }
 }
 
-
-#define SCC_VERIFY (1)
-
-
 template <class T>
 HCCStream<T>::HCCStream(const unsigned int ARRAY_SIZE, const int device_index):
   array_size(ARRAY_SIZE),
@@ -88,45 +84,6 @@ void HCCStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b
   hc::copy(a.cbegin(),a.cend(),d_a);
   hc::copy(b.cbegin(),b.cend(),d_b);
   hc::copy(c.cbegin(),c.cend(),d_c);
-
-
-
-#if (SCC_VERIFY!=0) 
-{
-   hc::array_view<T> av_d_a(d_a);
-   int errors = 0;
-   int avi = 0;
-   for (auto i = a.begin(); i != a.end(); i++,avi++) {
-     if (av_d_a[avi] != *i)
-       errors++;
-   }
-   printf("%d errors in d_a\n",errors);
-}
-
-{
-   hc::array_view<T> av_d_b(d_b);
-   int errors = 0;
-   int avi = 0;
-   for (auto i = b.begin(); i != b.end(); i++,avi++) {
-     if (av_d_b[avi] != *i)
-       errors++;
-   }
-   printf("%d errors in d_b\n",errors);
-}
-
-{
-   hc::array_view<T> av_d_c(d_c);
-   int errors = 0;
-   int avi = 0;
-   for (auto i = c.begin(); i != c.end(); i++,avi++) {
-     if (av_d_c[avi] != *i)
-       errors++;
-   }
-   printf("%d errors in d_c\n",errors);
-}
-#endif
-
-
 }
 
 template <class T>
@@ -141,13 +98,8 @@ void HCCStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
 template <class T>
 void HCCStream<T>::copy()
 {
-
-  std::cout << "In " << __FUNCTION__ << std::endl;
-
-#if 1
   hc::array<T> &d_a = this->d_a;
   hc::array<T> &d_c = this->d_c;
-#endif
 
   try{
   // launch a GPU kernel to compute the saxpy in parallel 
@@ -158,24 +110,6 @@ void HCCStream<T>::copy()
 								 d_c[i] = d_a[i];
 								});
     future_kernel.wait();
-
-
-#if (SCC_VERIFY!=0) 
-{
-   hc::array_view<T> av_d_c(d_c);
-   hc::array_view<T> av_d_a(d_a);
-   int errors = 0;
-   for (int i = 0; i < array_size; i++) {
-     if (av_d_c[i]!=av_d_a[i]) {
-       errors++;
-     }
-   }
-   printf("%s %d errors\n",__FUNCTION__,errors);
-}
-#endif
-
-
-
   }
   catch(std::exception& e){
     std::cout << e.what() << std::endl;
@@ -186,8 +120,6 @@ void HCCStream<T>::copy()
 template <class T>
 void HCCStream<T>::mul()
 {
-  std::cout << "In " << __FUNCTION__ << std::endl;
-
   hc::array<T> &d_b = this->d_b;
   hc::array<T> &d_c = this->d_c;
 
@@ -210,9 +142,6 @@ void HCCStream<T>::mul()
 template <class T>
 void HCCStream<T>::add()
 {
-
-  std::cout << "In " << __FUNCTION__ << std::endl;
-
   hc::array<T> &d_a = this->d_a;
   hc::array<T> &d_b = this->d_b;
   hc::array<T> &d_c = this->d_c;
@@ -235,9 +164,6 @@ void HCCStream<T>::add()
 template <class T>
 void HCCStream<T>::triad()
 {
-
-  std::cout << "In " << __FUNCTION__ << std::endl;
-
   hc::array<T> &d_a = this->d_a;
   hc::array<T> &d_b = this->d_b;
   hc::array<T> &d_c = this->d_c;
diff --git a/main.cpp b/main.cpp
index 3bd0326e..7aad2154 100644
--- a/main.cpp
+++ b/main.cpp
@@ -39,13 +39,7 @@
 #endif
 
 // Default size of 2^25
-//unsigned int ARRAY_SIZE = 33554432;
-
-//unsigned int ARRAY_SIZE = 1024 * 1024;
-
-unsigned int ARRAY_SIZE = 1024;
-
-
+unsigned int ARRAY_SIZE = 33554432;
 
 unsigned int num_times = 100;
 unsigned int deviceIndex = 0;
@@ -138,14 +132,9 @@ void run()
   stream = new OMP45Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
 
 #elif defined(HCC)
-
-  HCCStream<T>* gpu_buffer =  (HCCStream<T>*)  hc::allocate_coarsed_grain_system_memory(sizeof(HCCStream<T>));
-  stream = new(gpu_buffer) HCCStream<T>(ARRAY_SIZE, deviceIndex);
-  
-
   // Use the "reference" OpenMP 3 implementation
-  //stream = new HCCStream<T>(ARRAY_SIZE, deviceIndex);
-  
+  stream = new HCCStream<T>(ARRAY_SIZE, deviceIndex);
+
 #endif
 
   stream->write_arrays(a, b, c);
@@ -183,7 +172,6 @@ void run()
     t2 = std::chrono::high_resolution_clock::now();
     timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
-
   }
 
   // Check solutions
@@ -226,13 +214,7 @@ void run()
       << std::endl;
 
   }
-
-#ifdef HCC
-  stream->~Stream();
-  hc::free_system_memory((void*)gpu_buffer);
-#else
   delete stream;
-#endif
 }
 
 template <typename T>