cnn_proj_cpu/hidden_layer.cpp at master · chitadi/cnn_proj_cpu · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#include <cstdint>
#include <vector>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <limits>
#include <cmath>
#include <omp.h>
#include <openacc.h>
#include <chrono>
#include <iostream>
#include "consts.h"
#include "hidden_layer.h"

using namespace std;

// #define window 2 //this is to check how small the max pooling window should be this will result in a 2x2 matrix
// #define hidden_layers 2

// float kernel_list [dim][dim][index];

// depth is the 3rd dimension of the kernel (its number of input channels) - 3 for the first layer and 16 for the next
// dim is the side length of the square kernel (each kernel slice is dim x dim)
// index is the number of kernels
// Builds `index` convolution kernels of shape depth x dim x dim with random weights and a
// zero-filled velocity buffer of exactly the same shape.
//
// velocity_kernels - output; replaced by an all-zero tensor [index][depth][dim][dim]
// kernel_list      - output; replaced by a tensor [index][depth][dim][dim] whose weights are
//                    drawn uniformly from [-limit, limit], limit = sqrt(2 / fan_in),
//                    fan_in = depth * dim * dim (the scheme this file calls He initialisation)
// depth            - number of channels per kernel
// dim              - side length of the square kernel
// index            - number of kernels
//
// Returns 0 on success, or -1 (leaving both outputs untouched) if any size is negative.
// Weights come from rand(); this function does not seed the generator.
int initialise_kernel(std::vector<std::vector<std::vector<std::vector<float>>>> &velocity_kernels, std::vector<std::vector<std::vector<std::vector<float>>>> &kernel_list, int depth, int dim, int index) {
    using Kernel = std::vector<std::vector<std::vector<float>>>;

    // A negative int would silently become a huge size_t inside the vector calls.
    if (depth < 0 || dim < 0 || index < 0) {
        return -1;
    }

    // assign() (not resize()) so that containers reused from an earlier call are rebuilt with
    // the requested inner shape and the velocities really start from zero.
    const Kernel zero_kernel(depth, std::vector<std::vector<float>>(dim, std::vector<float>(dim, 0.0f)));
    kernel_list.assign(static_cast<std::size_t>(index), zero_kernel);
    velocity_kernels.assign(static_cast<std::size_t>(index), zero_kernel);

    const int fan_in = depth * dim * dim;
    if (fan_in == 0) {
        return 0; // no weights to draw; also avoids dividing by zero below
    }

    // Alternative (Xavier): limit = sqrt(6 / (fan_in + fan_out)) with fan_out = index * dim * dim.
    const float limit = std::sqrt(2.0f / fan_in);

    // Visit weights in [kernel][channel][row][col] order so the rand() sequence is consumed
    // in a fixed, reproducible order.
    for (auto &kernel : kernel_list) {
        for (auto &channel : kernel) {
            for (auto &row : channel) {
                for (float &weight : row) {
                    weight = (static_cast<float>(rand()) / RAND_MAX) * 2 * limit - limit;
                }
            }
        }
    }
    return 0;
}

// index is the number of biases: one per kernel (apply_kernel reads bias_list[i] for kernel i)
// Creates `index` biases with small random values and a matching zero-filled velocity buffer.
//
// velocity_bias_list - output; replaced by `index` zeros
// bias_list          - output; replaced by `index` values drawn uniformly from [-0.1, 0.1]
// index              - number of biases to create
//
// Returns 0 on success, or -1 (leaving both outputs untouched) if index is negative.
// Values come from rand(); this function does not seed the generator.
int initialise_bias(std::vector<float> &velocity_bias_list, std::vector<float> &bias_list, int index) {
    // A negative int would silently become a huge size_t inside the vector calls.
    if (index < 0) {
        return -1;
    }

    // assign() (not resize()) so velocities left over from a previous run are cleared too.
    bias_list.assign(static_cast<std::size_t>(index), 0.0f);
    velocity_bias_list.assign(static_cast<std::size_t>(index), 0.0f);

    for (float &bias : bias_list) {
        bias = 0.2f * (static_cast<float>(rand()) / RAND_MAX) - 0.1f;
    }
    return 0;
}
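// Original idea: start with a center c; rows go from c-1 to c+1 and columns go from c-1 to c+1 as well because dim is 3, so maybe go dim/2.
// Then multiply with the kernel, add all values and divide by dim squared (unsure about this).
// Move c by one and keep repeating.
// NOTE: the implementation below anchors the window at its top-left corner (row..row+dim-1, col..col+dim-1) and does not divide by dim squared.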
    int apply_kernel(Image &input_image,
        vector<vector<vector<vector<float>>>> &kernel_list,
        Image &output_map,
        vector<float> &bias_list) {

        int kernel_dim = kernel_list[0][0].size();
        int num_kernels = kernel_list.size();

        int input_channels = input_image.rgb.size();
        int input_height = input_image.rgb[0].size();
        int input_width = input_image.rgb[0][0].size();


        int output_height = input_height - kernel_dim + 1;
        int output_width = input_width - kernel_dim + 1;

    //     #pragma omp parallel
    // {
    //     #pragma omp single
    //     std::cout << "OpenMP thread count: " << omp_get_num_threads() << std::endl;
    // }

        // Resize output map
        output_map.rgb.resize(num_kernels, vector<vector<float>>(output_height, vector<float>(output_width, 0.0f)));
        output_map.pre_activation.resize(num_kernels, vector<vector<float>>(output_height, vector<float>(output_width, 0.0f)));

        // cout << "Inside apply_kernel: "
        //      << "input_channels=" << input_channels
        //      << ", num_kernels=" << num_kernels
        //      << ", output dims: " << output_height << "x" << output_width << endl;
        // #pragma omp parallel for collapse(3)
        #pragma omp parallel for collapse(2)
        // #pragma acc parallel loop collapse(2) present(input_image, kernel_list, bias_list, output_map)
        for (int i = 0; i < num_kernels; i++) {
            for (int row = 0; row < output_height; row++) {
                for (int col = 0; col < output_width; col++) {
                    float sum = 0.0f;
                    for (int c = 0; c < input_channels; c++) {
                        for (int a = 0; a < kernel_dim; a++) {
                            for (int b = 0; b < kernel_dim; b++) {
                                sum += input_image.rgb[c][row + a][col + b] * kernel_list[i][c][a][b];
                            }
                        }
                    }
                    float z = sum + bias_list[i]; // this is pre-activation
                    output_map.pre_activation[i][row][col] = z;
                    // if (i == 0 && row == 0 && col == 0)
                    //     cout << "[Debug] pre-activation = " << z << endl;
                    output_map.rgb[i][row][col] = max(0.0f, z); // ReLU

                }
            }
        }

        // dead relu checking
        // int total = 0, zero_count = 0;
        // for (const auto& channel : output_map.rgb) {
        //     for (const auto& row : channel) {
        //         for (float val : row) {
        //             total++;
        //             if (val == 0.0f) zero_count++;
        //         }
        //     }
        // }
        // float zero_ratio = 100.0f * zero_count / total;
        // cout << "ReLU Zero Count: " << zero_count << " / " << total
        //      << " (" << zero_ratio << "% zeros)" << endl;

        // Copy label
        for (int i = 0; i < 10; i++) {
            output_map.label[i] = input_image.label[i];
        }

        return 0;
    }

// pool is an output parameter: max_pool resizes pool.rgb itself, so the caller only needs
// to pass an Image whose label can hold the 10 entries copied at the end.
// Max-pools every channel of image_map into pool.
//
// The window (`window` x `window`) is anchored at multiples of `stride` and clipped at the
// bottom/right edges, so each pooled channel has ceil(H / stride) x ceil(W / stride) cells.
// All channels are assumed to have the size of channel 0.
//
// image_map - input maps, read as rgb[channel][row][col]
// pool      - output; pool.rgb is resized and filled, and the first 10 label entries are
//             copied from image_map
//
// Always returns 0. An image with no channels or no rows yields an empty pool.rgb.
int max_pool(Image &image_map, Image &pool){
    const int pool_step = stride;
    const int pool_window = window;

    const int num_channels = static_cast<int>(image_map.rgb.size());
    // Only look at element [0] once we know it exists.
    const int in_rows = num_channels > 0 ? static_cast<int>(image_map.rgb[0].size()) : 0;
    const int in_cols = in_rows > 0 ? static_cast<int>(image_map.rgb[0][0].size()) : 0;

    const int pooled_rows = (in_rows + pool_step - 1) / pool_step;
    const int pooled_cols = (in_cols + pool_step - 1) / pool_step;

    // Resize level by level so a reused pool with a different inner shape is corrected too.
    pool.rgb.resize(num_channels);
    for (auto &channel : pool.rgb) {
        channel.resize(pooled_rows);
        for (auto &pooled_row : channel) {
            pooled_row.resize(pooled_cols);
        }
    }

    // The pooled indices are the loop variables themselves: each iteration is then
    // independent (no counters shared between threads) and the two outer loops are
    // perfectly nested, as collapse(2) requires.
    #pragma omp parallel for collapse(2)
    for (int k = 0; k < num_channels; k++) {
        for (int row_pool = 0; row_pool < pooled_rows; row_pool++) {
            const int row_begin = row_pool * pool_step;
            const int row_end = std::min(row_begin + pool_window, in_rows);
            for (int col_pool = 0; col_pool < pooled_cols; col_pool++) {
                const int col_begin = col_pool * pool_step;
                const int col_end = std::min(col_begin + pool_window, in_cols);

                float best = -std::numeric_limits<float>::infinity();
                for (int i = row_begin; i < row_end; i++) {
                    for (int j = col_begin; j < col_end; j++) {
                        if (image_map.rgb[k][i][j] > best) {
                            best = image_map.rgb[k][i][j];
                        }
                    }
                }
                pool.rgb[k][row_pool][col_pool] = best;
            }
        }
    }

    for (int i = 0; i < 10; i++) {
        pool.label[i] = image_map.label[i];
    }
    return 0;
}