-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy patharraycopy.c
More file actions
143 lines (119 loc) · 3.69 KB
/
arraycopy.c
File metadata and controls
143 lines (119 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include "source.h"
// Buffer geometry. Every expansion is parenthesized so the macros stay correct
// when used inside larger expressions (e.g. `x % NUM_ELEM_IN_BUFFER`).
#define ELEM_SIZE 8 // 8 bytes for uint64_t
#define NUM_ELEM_IN_BUFFER (1024*1024*2) // 2 M elements in buffer
#define BUFFER_SIZE (NUM_ELEM_IN_BUFFER*ELEM_SIZE) // 16 MiB in buffer
#if !defined(VSX)
void arraycopy(uint64_t *dst, uint64_t *src, size_t n)
{
size_t i;
size_t remainder;
// Bulk rd/wr size is 4, ie 4 x 8 bytes, or
// 4 x 64-bit elements rd/wr "at once".
remainder = n % 4;
i = n / 4;
#if defined(VERBOSE)
printf("Copying %ld 64-bit element(s), " \
"with %ld block iteration(s) " \
"and %ld byte(s) as remainder(s)\n", n, i, remainder);
#endif
asm (
" li 3, 7+8 \n\t"
" mtspr 3, 3 \n\t"
" cmpldi %2, 0 \n\t"
" beq 2f \n\t"
" mtctr %2 \n\t"
/********* Main Code ********/
"1: ld 3, 0(%1) \n\t"
" ld 4, 8(%1) \n\t"
" ld 5,16(%1) \n\t"
" ld 6,24(%1) \n\t"
" std 3, 0(%0) \n\t"
" std 4, 8(%0) \n\t"
" std 5,16(%0) \n\t"
" std 6,24(%0) \n\t"
/****************************/
" addi %1, %1, 32 \n\t"
" addi %0, %0, 32 \n\t"
" bdnz+ 1b \n\t"
"2: nop \n\t"
:
: "r"(dst), "r"(src), "r"(i)
: "memory", "r3", "r4", "r5", "r6"
);
for (int j = i*4; j < n; ++j)
dst[j] = src[j];
}
#else
void arraycopy(uint64_t *dst, uint64_t *src, size_t n)
{
size_t i = 0;
size_t remainder = 0;
// Bulk rd/wr size is 8, ie 8 x 8 bytes, or
// 8 x 64-bit elements rd/wr "at once".
remainder = n % 4;
i = n / 4;
#if defined(VERBOSE)
printf("VSX copying %ld 64-bit element(s), " \
"with %ld block iteration(s) " \
"and %ld byte(s) as remainder(s)\n", n, i, remainder);
#endif
asm (
" cmpldi %2, 0 \n\t"
" beq 2f \n\t"
" li 6, 16 \n\t"
" li 7, 32 \n\t"
" li 8, 48 \n\t"
" mtctr %2 \n\t"
"1: lxvd2x 6, 0, %1 \n\t"
" lxvd2x 7, %1, 6 \n\t"
" stxvd2x 6, 0, %0 \n\t"
" stxvd2x 7, %0, 6 \n\t"
" addi %1, %1, 32 \n\t"
" addi %0, %0, 32 \n\t"
" bdnz+ 1b \n\t"
"2: nop \n\t"
:
: "r"(dst), "r"(src), "r"(i)
: "memory", "3", "4", "5", "6", "7"
);
for (int j = i*4; j < n; ++j)
dst[j] = src[j];
}
#endif
/*
 * Test driver: repeatedly copies the `source` array (declared in source.h)
 * into a freshly allocated 16 MiB buffer, using either libc memcpy() (when
 * MEMCPY is defined) or arraycopy(). When CHECKCOPY is defined, the start of
 * the destination is compared against the source afterwards.
 *
 * Returns 0 on success; exits with status 1 if the allocation fails or a
 * mismatch is found.
 */
int main(void)
{
    printf("** Inline ASM for VSX test **\n\n");

    // uint64_t *source => from source.h
    // NOTE(review): assumes source holds at least NUM_ELEM_IN_BUFFER
    // elements -- confirm against source.h.
    uint64_t *destination = malloc(BUFFER_SIZE); // 16 MiB, or 2 M 64-bit elements.
    if (destination == NULL) {
        perror("malloc");
        exit(1);
    }

    printf("1. Exercising...\n");

    // Waste some time here: repeat the copy so the run is long enough to time.
    for (int p = 0; p < 2500; ++p) {
#if defined(MEMCPY) // Use libc memcpy().
        memcpy(destination, source, BUFFER_SIZE);
#else // use our crafted VSX copy.
        arraycopy(destination, source, NUM_ELEM_IN_BUFFER);
#endif
    }

    printf("1. Done.\n");

#if defined(CHECKCOPY)
    printf("2. Verifying if copy is ok...\n");

    // Only the first 16*4+2 = 66 elements are compared, not the whole buffer.
    for (int p = 0; p < 16*4+2; ++p) {
        if (destination[p] != source[p]) {
            // Cast to unsigned long long so the format matches on every ABI.
            printf(">> Mismatch @%d: %#llx != %#llx\n", p,
                   (unsigned long long)destination[p],
                   (unsigned long long)source[p]);
            exit(1);
        }
    }

    printf("2. Done.\n");
#endif

    free(destination);
    return 0;
}