// Source: parallelcomp/squbitsim2.cu (abhinav110595/parallelcomp, main branch)
#include <stdio.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>

#include <cuda_runtime.h>
using namespace std;

/**
 * Applies a 2x2 gate to bit position `nq` of a state vector.
 *
 * Every index i is paired with the index that differs from it only in bit
 * `nq`.  With lo = i & ~(1 << nq) and hi = i | (1 << nq) the kernel computes
 *
 *     output[lo] = input_B[0] * input_A[lo] + input_B[1] * input_A[hi]
 *     output[hi] = input_B[2] * input_A[lo] + input_B[3] * input_A[hi]
 *
 * i.e. input_B is read as a row-major 2x2 matrix.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element; surplus
 * threads (i >= numElements) do nothing.  No shared memory is used.
 *
 * @param input_A      input values, numElements floats (read-only)
 * @param output       result values, numElements floats; must not alias input_A
 *                     because other threads read their partner from input_A
 * @param input_B      four gate coefficients (read-only)
 * @param nq           bit position to act on; must satisfy 0 <= nq <= 30
 * @param numElements  number of elements in input_A / output
 */
__global__ void
squbitsim(const float *input_A, float *output, const float *input_B, int nq, int numElements)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= numElements)
        return;

    // Exact integer mask.  The previous (int)powf(2, nq) relied on a
    // floating-point result that is not guaranteed to be exact on the device,
    // so truncation could silently produce 2^nq - 1.
    const int mask = 1 << nq;

    if (i & mask)
    {
        // Bit set: the partner has the bit cleared and is always < i, so it is in range.
        const int lo = i & ~mask;
        output[i] = input_B[2] * input_A[lo] + input_B[3] * input_A[i];
    }
    else
    {
        // Bit clear: the partner has the bit set and may lie past the end of
        // the array when numElements is not a multiple of 2^(nq+1).  Treat a
        // missing partner as 0 instead of reading out of bounds.
        const int hi = i | mask;
        const float partner = (hi < numElements) ? input_A[hi] : 0.0f;
        output[i] = input_B[0] * input_A[i] + input_B[1] * partner;
    }
}

/**
 * Exits with a diagnostic on stderr when a CUDA runtime call did not succeed.
 *
 * @param err   status returned by the CUDA runtime call
 * @param what  short description of the attempted action, used in the message
 */
static void
checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to %s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine
 *
 * Reads "input.txt", which must contain whitespace-separated numbers in this
 * order: four gate coefficients, the state values, and finally the bit
 * position (qubit index) the gate acts on.  Runs the squbitsim kernel on the
 * state values and prints one result per line, followed by "Done".
 *
 * Exits with EXIT_FAILURE if the file is missing or malformed, if the bit
 * position does not fit the number of state values, or if any CUDA call fails.
 */
int
main(void)
{
    const int gateSize = 4;

    std::ifstream f_in("input.txt");
    if (!f_in.is_open())
    {
        fprintf(stderr, "Failed to open input.txt!\n");
        exit(EXIT_FAILURE);
    }

    // The first four numbers are the gate coefficients.
    float gate[gateSize];
    for (int k = 0; k < gateSize; ++k)
    {
        if (!(f_in >> gate[k]))
        {
            fprintf(stderr, "input.txt must start with %d gate coefficients!\n", gateSize);
            exit(EXIT_FAILURE);
        }
    }

    // Everything after the gate: the state values followed by the bit position.
    std::vector<float> value;
    for (float f; f_in >> f;)
        value.push_back(f);
    f_in.close();

    if (value.size() < 2)
    {
        fprintf(stderr, "input.txt must contain state values followed by the qubit index!\n");
        exit(EXIT_FAILURE);
    }

    // The last number is the bit position; range-check it as a float first so
    // the conversion to int is always well defined (this also rejects NaN).
    const float nqValue = value.back();
    value.pop_back();
    if (!(nqValue >= 0.0f && nqValue <= 30.0f))
    {
        fprintf(stderr, "Qubit index %f is out of range!\n", nqValue);
        exit(EXIT_FAILURE);
    }
    const int nq = (int)nqValue;

    // Element count comes from the container; the old code incremented an
    // uninitialized counter instead.
    const int numElements = (int)value.size();
    const size_t size = value.size() * sizeof(float);

    // Bit nq must address a partner element that actually exists.
    if ((1 << nq) >= numElements)
    {
        fprintf(stderr, "Qubit index %d is out of range for %d amplitudes!\n", nq, numElements);
        exit(EXIT_FAILURE);
    }

    // Managed allocations are accessible from both host and device.
    float *input_A = NULL;
    float *output = NULL;
    float *input_B = NULL;
    checkCuda(cudaMallocManaged(&input_A, size), "allocate managed input_A");
    checkCuda(cudaMallocManaged(&output, size), "allocate managed output");
    // Four coefficients, not one: the old sizeof(float) allocation was overrun.
    checkCuda(cudaMallocManaged(&input_B, gateSize * sizeof(float)), "allocate managed input_B");

    // Populate the gate and the input array.
    for (int k = 0; k < gateSize; ++k)
        input_B[k] = gate[k];
    for (int i = 0; i < numElements; ++i)
        input_A[i] = value[i];

    // One thread per element, rounded up to whole blocks.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    squbitsim<<<blocksPerGrid, threadsPerBlock>>>(input_A, output, input_B, nq, numElements);
    // Launch-configuration errors surface here ...
    checkCuda(cudaGetLastError(), "launch squbitsim kernel");
    // ... and execution errors at the synchronization point, which is also
    // required before the host reads the managed output buffer.
    checkCuda(cudaDeviceSynchronize(), "run squbitsim kernel");

    for (int i = 0; i < numElements; ++i)
    {
        printf(" %0.3f \n", output[i]);
    }

    // Free managed memory
    checkCuda(cudaFree(input_A), "free input_A");
    checkCuda(cudaFree(output), "free output");
    checkCuda(cudaFree(input_B), "free input_B");

    cudaError_t err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}