// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016
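//
// Device-side BLAKE2b for the Equihash solver: blake2b_gpu_hash() takes a
// precomputed hash state, appends a 32-bit index to its buffered block, pads it,
// and runs the final BLAKE2b compression for that block.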
/**
* 64-bit operations on uint2 values, defined via C++ operator overloads
*/
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
return make_uint2(a.x ^ b.x, a.y ^ b.y);
}
// uint2 ROR/ROL methods
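// ROR2 rotates a 64-bit value, stored as a uint2 (x = low word, y = high word),
// right by `offset` bits. On sm_32+ it uses the funnel-shift instruction
// shf.r.wrap.b32; older architectures fall back to plain 32-bit shifts.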
__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) {
uint2 result;
#if __CUDA_ARCH__ > 300
if (offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
} else /* if (offset < 64) */ {
/* offset SHOULD BE < 64 ! */
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
}
#else
if (!offset)
result = a;
else if (offset < 32) {
result.y = ((a.y >> offset) | (a.x << (32 - offset)));
result.x = ((a.x >> offset) | (a.y << (32 - offset)));
} else if (offset == 32) {
result.y = a.x;
result.x = a.y;
} else {
result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
}
#endif
return result;
}
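// Swapping the two 32-bit halves of a uint2 is a 64-bit rotation by 32.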
__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) {
return make_uint2(value.y, value.x);
}
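// Rotations by 24 and 16 bits (multiples of 8) can be done with byte permutations.
// __byte_perm is a device-only intrinsic, so these versions exist only when
// compiling device code; otherwise ROR24/ROR16 fall back to the generic ROR2.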
#ifdef __CUDA_ARCH__
__device__ __inline__ uint2 ROR24(const uint2 a) {
uint2 result;
result.x = __byte_perm(a.y, a.x, 0x2107);
result.y = __byte_perm(a.y, a.x, 0x6543);
return result;
}
__device__ __inline__ uint2 ROR16(const uint2 a) {
uint2 result;
result.x = __byte_perm(a.y, a.x, 0x1076);
result.y = __byte_perm(a.y, a.x, 0x5432);
return result;
}
#else
#define ROR24(u) ROR2(u,24)
#define ROR16(u) ROR2(u,16)
#endif
typedef uint64_t u64;
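// BLAKE2b message schedule: blake2b_sigma[r][i] selects which of the 16 message
// words feeds the i-th G input in round r. Rounds 10 and 11 repeat rounds 0 and 1.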
static __constant__ const int8_t blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
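// G is the BLAKE2b mixing function. The 64-bit rotations by 32, 24 and 16 are done
// on uint2 halves (SWAPUINT2, ROR24, ROR16); only the rotation by 63 uses the
// generic ROR2.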
__device__ __forceinline__
static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) {
a = a + b + m[ blake2b_sigma[r][2*i] ];
((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
a = a + b + m[ blake2b_sigma[r][2*i+1] ];
((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
}
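// One full round: mix the four columns of the 4x4 state v, then the four diagonals.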
#define ROUND(r) \
G(r, 0, v[0], v[4], v[ 8], v[12], m); \
G(r, 1, v[1], v[5], v[ 9], v[13], m); \
G(r, 2, v[2], v[6], v[10], v[14], m); \
G(r, 3, v[3], v[7], v[11], v[15], m); \
G(r, 4, v[0], v[5], v[10], v[15], m); \
G(r, 5, v[1], v[6], v[11], v[12], m); \
G(r, 6, v[2], v[7], v[ 8], v[13], m); \
G(r, 7, v[3], v[4], v[ 9], v[14], m);
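// Finalize the hash for one index: append the 32-bit little-endian index to the
// partially filled block buffered in `state`, zero-pad to a full block, run the
// last compression with the final-block flag set, and copy the first `outlen`
// bytes of the resulting chaining value to `hash`.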
__device__ void blake2b_gpu_hash(blake2b_state *state, u32 idx, uchar *hash, u32 outlen) {
const u32 leb = idx; // CUDA is little endian, so no need for htole32(idx)
memcpy(state->buf + state->buflen, &leb, sizeof(u32));
state->buflen += sizeof(u32);
state->counter += state->buflen;
memset(state->buf + state->buflen, 0, BLAKE2B_BLOCKBYTES - state->buflen);
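// Read the padded 128-byte block as 16 little-endian 64-bit message words.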
u64 *d_data = (u64 *)state->buf;
u64 m[16];
m[0] = d_data[0];
m[1] = d_data[1];
m[2] = d_data[2];
m[3] = d_data[3];
m[4] = d_data[4];
m[5] = d_data[5];
m[6] = d_data[6];
m[7] = d_data[7];
m[8] = d_data[8];
m[9] = d_data[9];
m[10] = d_data[10];
m[11] = d_data[11];
m[12] = d_data[12];
m[13] = d_data[13];
m[14] = d_data[14];
m[15] = d_data[15];
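// Working state: low half from the chaining value h, high half from the BLAKE2b IV;
// fold the byte counter into v[12] and set the final-block flag by inverting v[14].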
u64 v[16];
v[0] = state->h[0];
v[1] = state->h[1];
v[2] = state->h[2];
v[3] = state->h[3];
v[4] = state->h[4];
v[5] = state->h[5];
v[6] = state->h[6];
v[7] = state->h[7];
v[8] = 0x6a09e667f3bcc908;
v[9] = 0xbb67ae8584caa73b;
v[10] = 0x3c6ef372fe94f82b;
v[11] = 0xa54ff53a5f1d36f1;
v[12] = 0x510e527fade682d1 ^ state->counter;
v[13] = 0x9b05688c2b3e6c1f;
v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
v[15] = 0x5be0cd19137e2179;
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
ROUND( 10 );
ROUND( 11 );
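// Feed-forward: fold both halves of the working state back into the chaining value.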
state->h[0] ^= v[0] ^ v[ 8];
state->h[1] ^= v[1] ^ v[ 9];
state->h[2] ^= v[2] ^ v[10];
state->h[3] ^= v[3] ^ v[11];
state->h[4] ^= v[4] ^ v[12];
state->h[5] ^= v[5] ^ v[13];
state->h[6] ^= v[6] ^ v[14];
state->h[7] ^= v[7] ^ v[15];
memcpy(hash, (uchar *)state->h, outlen);
}
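// Illustrative only, not part of the original file: a minimal sketch of how
// blake2b_gpu_hash might be driven from a kernel. The kernel name, the idea of a
// single precomputed midstate in `base` (whose buffer must leave room for the
// 4-byte index), and the output layout are assumptions made for illustration; the
// actual Equihash solver integrates this function differently.
__global__ void blake2b_hash_indices(const blake2b_state *base, uchar *out, u32 outlen, u32 nidx) {
const u32 idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= nidx) return;
blake2b_state st = *base; // per-thread copy: blake2b_gpu_hash modifies the state in place
blake2b_gpu_hash(&st, idx, out + (size_t)idx * outlen, outlen);
}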