Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions dev/fips202/armv81m/mve.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,18 @@

/* Part of backend API */
#define MLD_USE_FIPS202_X4_NATIVE
#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE
#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE
/* Guard for assembly file */
#define MLD_FIPS202_ARMV81M_NEED_X4

#if !defined(__ASSEMBLER__)
#include "../api.h"

/*
 * Native x4 permutation
 * State is kept in bit-interleaved format.
 *
 * Only the declaration lives in this header. The symbol is placed in the
 * project namespace through MLD_NAMESPACE, takes a pointer to the 64-bit
 * state words, and reports its outcome as an int status code.
 */
#define mld_keccak_f1600_x4_native_impl \
MLD_NAMESPACE(keccak_f1600_x4_native_impl)
int mld_keccak_f1600_x4_native_impl(uint64_t *state);
Expand All @@ -27,6 +33,48 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
return mld_keccak_f1600_x4_native_impl(state);
}

/*
 * Native x4 XOR bytes (with on-the-fly bit interleaving)
 *
 * The macro maps this name onto the namespaced symbol
 * keccak_f1600_x4_state_xor_bytes_asm.
 *
 * state:        the x4 state to update (passed as an untyped pointer)
 * data0..data3: one read-only input buffer per state
 * offset:       presumably the byte position within each state at which
 *               XORing starts - TODO confirm against the assembly
 * length:       presumably the number of bytes consumed from each buffer
 *               - TODO confirm against the assembly
 *
 * The routine returns void, so no error can be reported to the caller.
 */
#define mld_keccak_f1600_x4_state_xor_bytes \
MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm)
void mld_keccak_f1600_x4_state_xor_bytes(void *state, const uint8_t *data0,
const uint8_t *data1,
const uint8_t *data2,
const uint8_t *data3, unsigned offset,
unsigned length);

/*
 * Backend hook for XORing four input buffers into the x4 state.
 *
 * All arguments are forwarded unchanged to
 * mld_keccak_f1600_x4_state_xor_bytes. That routine returns void, so this
 * wrapper always reports MLD_NATIVE_FUNC_SUCCESS.
 */
MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native(
    uint64_t *state, const uint8_t *data0, const uint8_t *data1,
    const uint8_t *data2, const uint8_t *data3, unsigned offset,
    unsigned length)
{
  /* The callee takes the state as an untyped pointer. */
  void *const raw_state = state;

  mld_keccak_f1600_x4_state_xor_bytes(raw_state, data0, data1, data2, data3,
                                      offset, length);

  return MLD_NATIVE_FUNC_SUCCESS;
}

/*
 * Native x4 extract bytes (with on-the-fly bit de-interleaving)
 *
 * The macro maps this name onto the namespaced symbol
 * keccak_f1600_x4_state_extract_bytes_asm.
 *
 * state:        the x4 state to read from (passed as an untyped pointer)
 * data0..data3: one writable output buffer per state
 * offset:       presumably the byte position within each state at which
 *               extraction starts - TODO confirm against the assembly
 * length:       presumably the number of bytes written to each buffer
 *               - TODO confirm against the assembly
 *
 * The routine returns void, so no error can be reported to the caller.
 */
#define mld_keccak_f1600_x4_state_extract_bytes \
MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
void mld_keccak_f1600_x4_state_extract_bytes(void *state, uint8_t *data0,
uint8_t *data1, uint8_t *data2,
uint8_t *data3, unsigned offset,
unsigned length);

/*
 * Backend hook for extracting bytes from the x4 state into four output
 * buffers.
 *
 * All arguments are forwarded unchanged to
 * mld_keccak_f1600_x4_state_extract_bytes. That routine returns void, so
 * this wrapper always reports MLD_NATIVE_FUNC_SUCCESS.
 */
MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native(
    uint64_t *state, uint8_t *data0, uint8_t *data1, uint8_t *data2,
    uint8_t *data3, unsigned offset, unsigned length)
{
  /* The callee takes the state as an untyped pointer. */
  void *const raw_state = state;

  mld_keccak_f1600_x4_state_extract_bytes(raw_state, data0, data1, data2,
                                          data3, offset, length);

  return MLD_NATIVE_FUNC_SUCCESS;
}

#endif /* !__ASSEMBLER__ */

#endif /* !MLD_DEV_FIPS202_ARMV81M_MVE_H */
16 changes: 16 additions & 0 deletions dev/fips202/armv81m/src/fips202_native_armv81m.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,20 @@ MLD_INTERNAL_DATA_DECLARATION const uint32_t
/*
 * Assembly routine invoked by the C wrapper to run the x4 permutation.
 *
 * state:    100 64-bit words holding the x4 state
 * tmpstate: 100 64-bit words of caller-provided scratch space
 * rc:       table of 48 32-bit round-constant words
 */
void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100],
const uint32_t rc[48]);

/*
 * XOR-bytes routine for the x4 state (name suggests an assembly
 * implementation). Takes the state as an untyped pointer, four read-only
 * input buffers (one per state), and an offset/length pair; returns nothing.
 */
#define mld_keccak_f1600_x4_state_xor_bytes_asm \
MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm)
void mld_keccak_f1600_x4_state_xor_bytes_asm(void *state, const uint8_t *d0,
const uint8_t *d1,
const uint8_t *d2,
const uint8_t *d3, unsigned offset,
unsigned length);

/*
 * Extract-bytes routine for the x4 state (name suggests an assembly
 * implementation). Takes the state as an untyped pointer, four writable
 * output buffers (one per state), and an offset/length pair; returns nothing.
 */
#define mld_keccak_f1600_x4_state_extract_bytes_asm \
MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
void mld_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0,
uint8_t *data1, uint8_t *data2,
uint8_t *data3,
unsigned offset,
unsigned length);

#endif /* !MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */
105 changes: 5 additions & 100 deletions dev/fips202/armv81m/src/keccak_f1600_x4_mve.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,114 +12,19 @@

#include "fips202_native_armv81m.h"

/*
* TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
* TODO: Replace with optimized MVE assembly implementations
* (as a part of XORBytes and ExtractBytes)
*/

/*
 * Gather the even-numbered bits of x (bits 0, 2, ..., 62) into a 32-bit
 * word: bit 2*k of the input ends up at bit k of the result.
 */
static uint32_t bitinterleave_even(uint64_t x)
{
  /* Masks applied after folding by 1, 2, 4, 8 and 16 bits, in that order. */
  static const uint64_t fold_mask[5] = {
      0x3333333333333333ULL, 0x0f0f0f0f0f0f0f0fULL, 0x00ff00ff00ff00ffULL,
      0x0000ffff0000ffffULL, 0x00000000ffffffffULL};
  /* Drop the odd-numbered bits first. */
  uint64_t acc = x & 0x5555555555555555ULL;
  unsigned step;

  /* Each pass doubles the width of the already-compacted bit groups. */
  for (step = 0; step < 5; step++)
  {
    acc = (acc | (acc >> (1u << step))) & fold_mask[step];
  }

  return (uint32_t)acc;
}

/*
 * Gather the odd-numbered bits of x (bits 1, 3, ..., 63) into a 32-bit
 * word. Shifting right by one moves every odd bit onto an even position,
 * after which the even-bit extraction does the work.
 */
static uint32_t bitinterleave_odd(uint64_t x)
{
  const uint64_t odd_on_even = x >> 1;

  return bitinterleave_even(odd_on_even);
}

/*
 * Scatter the 32 bits of x onto the even bit positions of a 64-bit word:
 * bit k of the input ends up at bit 2*k of the result and every odd bit of
 * the result is zero.
 */
static uint64_t spread_even(uint32_t x)
{
  /* Masks applied after spreading by 16, 8, 4, 2 and 1 bits, in that order. */
  static const uint64_t spread_mask[5] = {
      0x0000ffff0000ffffULL, 0x00ff00ff00ff00ffULL, 0x0f0f0f0f0f0f0f0fULL,
      0x3333333333333333ULL, 0x5555555555555555ULL};
  uint64_t acc = x;
  unsigned step;

  /* Each pass halves the width of the bit groups being pulled apart. */
  for (step = 0; step < 5; step++)
  {
    acc = (acc | (acc << (16u >> step))) & spread_mask[step];
  }

  return acc;
}

/*
 * Rebuild a 64-bit word from its two bit-interleaved halves: the bits of
 * `even` land on positions 0, 2, ..., 62 and the bits of `odd` on
 * positions 1, 3, ..., 63.
 */
static uint64_t bitdeinterleave(uint32_t even, uint32_t odd)
{
  const uint64_t even_bits = spread_even(even);
  const uint64_t odd_bits = spread_even(odd) << 1;

  return even_bits | odd_bits;
}

/*
* TEMPORARY: Naive C interleaving functions.
* These will be replaced with optimized MVE assembly implementations.
* Keccak-f1600 x4 permutation (on bit-interleaved state)
* State is expected to already be in bit-interleaved format.
*/
/*
 * Pack four separate 25-word states into one x4 buffer.
 *
 * The buffer is addressed as 32-bit words. For source word i of state k,
 * the even-bit half goes to word 4*i + k and the odd-bit half goes to word
 * 100 + 4*i + k.
 *
 * NOTE(review): state_4x is declared as uint64_t but written through a
 * uint32_t pointer; this relies on the build tolerating that aliasing -
 * confirm.
 */
static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0,
                             const uint64_t *state1, const uint64_t *state2,
                             const uint64_t *state3)
{
  const uint64_t *const src[4] = {state0, state1, state2, state3};
  uint32_t *const even_words = (uint32_t *)state_4x;
  uint32_t *const odd_words = (uint32_t *)state_4x + 100;

  for (size_t i = 0; i < 25; i++)
  {
    uint32_t *const even_out = &even_words[i * 4];
    uint32_t *const odd_out = &odd_words[i * 4];

    /* Even halves of all four states first, then the odd halves. */
    for (size_t k = 0; k < 4; k++)
    {
      even_out[k] = bitinterleave_even(src[k][i]);
    }
    for (size_t k = 0; k < 4; k++)
    {
      odd_out[k] = bitinterleave_odd(src[k][i]);
    }
  }
}

/*
 * Unpack an x4 buffer into four separate 25-word states.
 *
 * The buffer is addressed as 32-bit words. Word i of state k is rebuilt
 * from the even-bit half at word 4*i + k and the odd-bit half at word
 * 100 + 4*i + k.
 *
 * NOTE(review): state_4x is declared as uint64_t but read through a
 * uint32_t pointer; this relies on the build tolerating that aliasing -
 * confirm.
 */
static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0,
                               uint64_t *state1, uint64_t *state2,
                               uint64_t *state3)
{
  uint64_t *const dst[4] = {state0, state1, state2, state3};
  uint32_t *const even_words = (uint32_t *)state_4x;
  uint32_t *const odd_words = (uint32_t *)state_4x + 100;

  for (size_t i = 0; i < 25; i++)
  {
    for (size_t k = 0; k < 4; k++)
    {
      const size_t slot = i * 4 + k;

      dst[k][i] = bitdeinterleave(even_words[slot], odd_words[slot]);
    }
  }
}

/*
 * C entry point for the x4 Keccak-f1600 permutation. It hands the state to
 * mld_keccak_f1600_x4_mve_asm together with a 100-word scratch buffer and
 * the round-constant table, zeroizes its stack buffers, and returns
 * MLD_NATIVE_FUNC_SUCCESS.
 *
 * NOTE(review): the body below appears to contain two variants at once -
 * one that interleaves into a local state_4x before the permutation and
 * deinterleaves afterwards, and one that passes `state` straight through
 * with a state_tmp scratch buffer. Confirm which statements are live.
 */
#define mld_keccak_f1600_x4_native_impl \
MLD_NAMESPACE(keccak_f1600_x4_native_impl)
int mld_keccak_f1600_x4_native_impl(uint64_t *state)
{
/*
* TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
* TODO: Replace with optimized MVE assembly implementations
* (as a part of XORBytes and ExtractBytes)
*/
MLD_ALIGN uint64_t state_4x[100];
MLD_ALIGN uint64_t state_4x_tmp[100];

/* Interleave the 4 states into bit-interleaved format */
interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);

/* Run the permutation */
mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp,
MLD_ALIGN uint64_t state_tmp[100];
mld_keccak_f1600_x4_mve_asm(state, state_tmp,
mld_keccakf1600_round_constants);

/* Deinterleave back to 4 separate states */
deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);

mld_zeroize(state_4x, sizeof(state_4x));
mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp));
mld_zeroize(state_tmp, sizeof(state_tmp));
return MLD_NATIVE_FUNC_SUCCESS;
}

Expand Down
Loading
Loading