Brief: Binary ANDs two vector registers.

Call signature:
binary_and<Vec>(const typename Vec::register_type a, const typename Vec::register_type b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Binary ORs two vector registers.

Call signature:
binary_or<Vec>(const typename Vec::register_type a, const typename Vec::register_type b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Binary XORs two vector registers.

Call signature:
binary_xor<Vec>(const typename Vec::register_type a, const typename Vec::register_type b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Arithmetic shift of data to the left by n bits.

Call signature:
shift_left<Vec>(const typename Vec::register_type data, const unsigned int shift) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	-	+	-	-	-	-	-	-
double	-	+	-	-	-	-	-	-

Brief: Shifts data to the left by n bits (shifting in 0).

Call signature:
shift_left<Vec>(const typename Vec::register_type data, const typename Vec::register_type shift) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Arithmetic shift of data to the right by n bits.

Call signature:
shift_right<Vec>(const typename Vec::register_type data, const unsigned int shift) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Arithmetic shift of data to the right by n bits.

Call signature:
shift_right<Vec>(const typename Vec::register_type data, const typename Vec::register_type shift) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Arithmetic shift of data to the right by n bits.

Call signature:
shift_right<Vec>(const typename Vec::imask_type data, const unsigned int shift) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Population counter.

Call signature:
popcnt<Vec>(const typename Vec::register_type data) -> typename Vec::offset_base_register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Leading zeros counter for a single value of the base type.

Call signature:
lzc<Vec>(const typename Vec::base_type data) -> typename Vec::offset_base_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Leading zeros counter.

Call signature:
lzc<Vec>(const typename Vec::register_type data) -> typename Vec::offset_base_register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	+	-
int8_t	+	-	-	+	+	+	+	-
uint16_t	+	-	-	+	+	+	+	-
int16_t	+	-	-	+	+	+	+	-
uint32_t	+	-	-	+	+	+	+	+
int32_t	+	-	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	-
int64_t	+	-	-	+	+	+	+	-
float	-	-	-	-	-	-	+	+
double	-	-	-	-	-	-	+	-

Brief: Leading zeros counter.

Call signature:
lzc_alt<Vec>(const typename Vec::register_type data) -> typename Vec::offset_base_register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	+	-
int8_t	-	-	-	-	-	-	+	-
uint16_t	-	-	-	-	-	-	+	-
int16_t	-	-	-	-	-	-	+	-
uint32_t	-	-	-	-	-	-	+	-
int32_t	-	-	-	-	-	-	+	-
uint64_t	-	-	-	-	-	-	+	-
int64_t	-	-	-	-	-	-	+	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Leading zeros counter.

Call signature:
lzc_alt1<Vec>(const typename Vec::register_type data) -> typename Vec::offset_base_register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	+	-
int8_t	-	-	-	-	-	-	+	-
uint16_t	-	-	-	-	-	-	+	-
int16_t	-	-	-	-	-	-	+	-
uint32_t	-	-	-	-	-	-	+	-
int32_t	-	-	-	-	-	-	+	-
uint64_t	-	-	-	-	-	-	+	-
int64_t	-	-	-	-	-	-	+	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Leading zeros counter.

Call signature:
lzc_alt2<Vec>(const typename Vec::register_type data) -> typename Vec::offset_base_register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	+	-
int8_t	-	-	-	-	-	-	+	-
uint16_t	-	-	-	-	-	-	+	-
int16_t	-	-	-	-	-	-	+	-
uint32_t	-	-	-	-	-	-	+	-
int32_t	-	-	-	-	-	-	+	-
uint64_t	-	-	-	-	-	-	+	-
int64_t	-	-	-	-	-	-	+	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Leading zeros counter.

Call signature:
lzc_alt3<Vec>(const typename Vec::register_type data) -> typename Vec::offset_base_register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	+	-
int8_t	-	-	-	-	-	-	+	-
uint16_t	-	-	-	-	-	-	+	-
int16_t	-	-	-	-	-	-	+	-
uint32_t	-	-	-	-	-	-	+	-
int32_t	-	-	-	-	-	-	+	-
uint64_t	-	-	-	-	-	-	+	-
int64_t	-	-	-	-	-	-	+	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Leading zeros counter for an integral mask.

Call signature:
lzc<Vec>(const typename Vec::imask_type data) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: todo (presumably a trailing zeros counter for an integral mask; to be confirmed).

Call signature:
tzc<Vec>(const typename Vec::imask_type data) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Performs a horizontal OR on a vector register.

Call signature:
hor<Vec>(const typename Vec::register_type vec) -> typename Vec::base_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	+	+
int8_t	+	-	-	+	+	+	+	+
uint16_t	+	-	-	+	+	+	+	+
int16_t	+	-	-	+	+	+	+	+
uint32_t	+	-	-	+	+	+	+	+
int32_t	+	-	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	+
int64_t	+	-	-	+	+	+	+	+
float	+	-	-	+	+	+	+	+
double	+	-	-	+	+	+	+	+

Brief: Bitwise inversion of the values in a vector register.

Call signature:
inv<Vec>(const typename Vec::register_type vec) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	+	+
int8_t	+	-	-	+	+	+	+	+
uint16_t	+	-	-	+	+	+	+	+
int16_t	+	-	-	+	+	+	+	+
uint32_t	+	-	-	+	+	+	+	+
int32_t	+	-	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	+
int64_t	+	-	-	+	+	+	+	+
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Adds two vector registers.

Call signature:
add<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	+	+	+	+	+	+
int8_t	+	+	+	+	+	+	+	+
uint16_t	+	+	+	+	+	+	+	+
int16_t	+	+	+	+	+	+	+	+
uint32_t	+	+	+	+	+	+	+	+
int32_t	+	+	+	+	+	+	+	+
uint64_t	+	+	+	+	+	+	+	+
int64_t	+	+	+	+	+	+	+	+
float	+	+	+	+	+	+	+	+
double	+	+	+	+	+	+	+	+

Brief: Subtracts two vector registers.

Call signature:
sub<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	+	+	+	+	+	+
int8_t	+	+	+	+	+	+	+	+
uint16_t	+	+	+	+	+	+	+	+
int16_t	+	+	+	+	+	+	+	+
uint32_t	+	+	+	+	+	+	+	+
int32_t	+	+	+	+	+	+	+	+
uint64_t	+	+	+	+	+	+	+	+
int64_t	+	+	+	+	+	+	+	+
float	+	+	+	+	+	+	+	+
double	+	+	+	+	+	+	+	+

Brief: Adds two vector registers, depending on a mask: result[*] = (m[*])? vec_a[*]+vec_b[*] : vec_a[*].

Call signature:
add<Vec>(const typename Vec::mask_type mask, const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Adds two vector registers, depending on a mask: result[*] = (m[*])? vec_a[*]+vec_b[*] : vec_a[*].

Call signature:
add<Vec>(const typename Vec::imask_type mask, const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Subtracts two vector registers, depending on a mask: result[*] = (m[*])? vec_a[*]-vec_b[*] : vec_a[*].

Call signature:
sub<Vec>(const typename Vec::mask_type mask, const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Subtracts two vector registers, depending on a mask: result[*] = (m[*])? vec_a[*]-vec_b[*] : vec_a[*].

Call signature:
sub<Vec>(const typename Vec::imask_type mask, const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Multiplies two vector registers.

Call signature:
mul<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Multiplies a vector register with a constant.

Call signature:
mul<Vec>(const typename Vec::register_type vec_a, const typename Vec::base_type mul_var) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	+	+
int8_t	+	-	-	+	+	+	+	+
uint16_t	+	-	-	+	+	+	+	+
int16_t	+	-	-	+	+	+	+	+
uint32_t	+	-	-	+	+	+	+	+
int32_t	+	-	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	+
int64_t	+	-	-	+	+	+	+	+
float	+	-	-	+	+	+	+	+
double	+	-	-	+	+	+	+	+

Brief: Reduces the elements to a sum.

Call signature:
hadd<Vec>(const typename Vec::register_type value) -> typename Vec::base_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Compares the values of two vectors and returns a vector with the minimum of each pair of corresponding values.

Call signature:
min<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	-	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	+
int64_t	+	-	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Divides two vector registers.

Call signature:
div<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Applies the modulo operation to one data vector, using another data vector as the modulus.

Call signature:
mod<Vec>(const typename Vec::register_type vec_data, const typename Vec::register_type vec_mod) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	+	+
int8_t	+	-	-	+	+	+	+	+
uint16_t	+	-	-	+	+	+	+	+
int16_t	+	-	-	+	+	+	+	+
uint32_t	+	-	-	+	+	+	+	+
int32_t	+	-	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	+
int64_t	+	-	-	+	+	+	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Applies the modulo operation to a data vector, using a single input value as the modulus.

Call signature:
mod<Vec>(const typename Vec::register_type vec, const typename Vec::base_type val) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	+	+
int8_t	+	-	-	+	+	+	+	+
uint16_t	+	-	-	+	+	+	+	+
int16_t	+	-	-	+	+	+	+	+
uint32_t	+	-	-	+	+	+	+	+
int32_t	+	-	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	+
int64_t	+	-	-	+	+	+	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Applies the modulo operation to a data vector, using a single input value as the modulus.

Call signature:
mod_safe<Vec>(const typename Vec::register_type vec, const typename Vec::base_type val) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Reduces the elements to the maximum value.

Call signature:
hmax<Vec>(const typename Vec::register_type data) -> typename Vec::base_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	+	+
int8_t	+	-	-	+	+	+	+	+
uint16_t	+	-	-	+	+	+	+	+
int16_t	+	-	-	+	+	+	+	+
uint32_t	+	-	-	+	+	+	+	+
int32_t	+	-	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	+
int64_t	+	-	-	+	+	+	+	+
float	+	-	-	+	+	+	+	+
double	+	-	-	+	+	+	+	+

Brief: Reduces the elements to the minimum value.

Call signature:
hmin<Vec>(const typename Vec::register_type data) -> typename Vec::base_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	+	+
int8_t	+	-	-	+	+	+	+	+
uint16_t	+	-	-	+	+	+	+	+
int16_t	+	-	-	+	+	+	+	+
uint32_t	+	-	-	+	+	+	+	+
int32_t	+	-	-	+	+	+	+	+
uint64_t	+	-	-	+	+	+	+	+
int64_t	+	-	-	+	+	+	+	+
float	+	-	-	+	+	+	+	+
double	+	-	-	+	+	+	+	+

Brief: Compares the values of two vectors and returns a vector with the maximum of each pair of corresponding values.

Call signature:
max<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compares two vector registers for equality.

Call signature:
equal<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Compares two vector registers for equality.

Call signature:
equal<Vec>(const typename Vec::mask_type mask, const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compares two imasks for equality.

Call signature:
equal<Vec>(const typename Vec::imask_type mask_a, const typename Vec::imask_type mask_b) -> bool

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compares two registers for equality and returns an integral mask.

Call signature:
equal_as_imask<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compares two registers for equality and returns an integral mask.

Call signature:
equal_as_imask<Vec>(const typename Vec::imask_type mask, const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compares two vector registers for inequality.

Call signature:
nequal<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	-	+	+	+	+
double	+	+	-	-	+	+	+	+

Brief: Compares two vector registers for inequality.

Call signature:
nequal<Vec>(const typename Vec::mask_type mask, const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compares two imasks for inequality.

Call signature:
nequal<Vec>(const typename Vec::imask_type mask_a, const typename Vec::imask_type mask_b) -> bool

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compares two registers for inequality and returns an integral mask.

Call signature:
nequal_as_imask<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compares two registers for inequality and returns an integral mask.

Call signature:
nequal_as_imask<Vec>(const typename Vec::imask_type mask, const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Tests whether left elements are smaller than the corresponding right ones.

Call signature:
less_than<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Tests whether left elements are larger than the corresponding right ones.

Call signature:
greater_than<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Tests whether left elements are smaller than or equal to the corresponding right ones.

Call signature:
less_than_or_equal<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Tests whether left elements are larger than or equal to the corresponding right ones.

Call signature:
greater_than_or_equal<Vec>(const typename Vec::register_type vec_a, const typename Vec::register_type vec_b) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Checks if the values of a vector are in a specific range (min[*] <= d[*] <= max[*]).

Call signature:
between_inclusive<Vec>(const typename Vec::register_type vec_data, const typename Vec::register_type vec_min, typename Vec::register_type vec_max) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Checks if the vector register contains at least one value not equal to zero.

Call signature:
unequal_zero<Vec>(const typename Vec::register_type vec) -> bool

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Counts number of matches of a chosen value within a vector register.

Call signature:
count_matches<Vec>(const typename Vec::register_type vec, const typename Vec::base_type val) -> typename Vec::base_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Unpacks a coherent AC_INT.

Call signature:
unpack_acint<Vec>(ac_int<Vec::vector_size_b(), false> data) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
unpack_merge_acint<Vec>(ac_int<Vec::vector_size_b(), false> source, ac_int<Vec::vector_size_b(), false> const data) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Loads consecutive data from memory into a coherent AC_INT.

Call signature:
load_acint<Vec>(const typename Vec::base_type* memory) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: todo.

Call signature:
append_acint<Vec>(ac_int<Vec::vector_size_b(), false> source, ac_int<Vec::vector_size_b(), false> const data) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Loads data from memory and packs n-bits from all elements within a vector together.

Call signature:
packed_load_acint<Vec>(const typename Vec::base_type* memory) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Stores N bits of data. N must be a multiple of sizeof(Vec::base_type).

Call signature:
packed_store_acint<Vec>(typename Vec::base_type* memory, ac_int<Vec::vector_size_b(), false> data) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
store_acint<Vec>(typename Vec::base_type* memory, ac_int<Vec::vector_size_b(), false> data) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
packed_shift_left_acint<Vec>(ac_int<Vec::vector_size_b(), false> data, int shift_value) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
slice_packed_shift_left_acint_by_N<Vec>(ac_int<Vec::vector_size_b(), false> data) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
packed_shift_right_acint<Vec>(ac_int<Vec::vector_size_b(), false> data, int shift_value) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
slice_packed_shift_right_acint_by_N<Vec>(ac_int<Vec::vector_size_b(), false> data) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
packed_or_acint<Vec>(ac_int<Vec::vector_size_b(), false> a, ac_int<Vec::vector_size_b(), false> b) -> ac_int<Vec::vector_size_b(), false>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Packs elements from a vector together using a fixed bitwidth.

Call signature:
pack_bits_linear<Vec>(const typename Vec::register_type data, const unsigned bitwidth) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	+	+
int8_t	-	-	-	-	-	-	+	+
uint16_t	-	-	-	-	-	-	+	+
int16_t	-	-	-	-	-	-	+	+
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Packs elements from a vector together using a fixed bitwidth.

Call signature:
pack_bits_treelike<Vec>(const typename Vec::register_type data, const unsigned bitwidth) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	+	+
int8_t	-	-	-	-	-	-	+	+
uint16_t	-	-	-	-	-	-	+	+
int16_t	-	-	-	-	-	-	+	+
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
pack_bits_linear_merge<Vec>(const typename Vec::register_type src, const unsigned bit_offset, const typename Vec::register_type data, const unsigned bitwidth) -> std::tuple<typename Vec::register_type, int, typename Vec::register_type>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	+	+
int8_t	-	-	-	-	-	-	+	+
uint16_t	-	-	-	-	-	-	+	+
int16_t	-	-	-	-	-	-	+	+
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief:

Call signature:
pack_bits_treelike_merge<Vec>(const typename Vec::register_type src, const unsigned bit_offset, const typename Vec::register_type data, const unsigned bitwidth) -> std::tuple<typename Vec::register_type, int, typename Vec::register_type>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	+	+
int8_t	-	-	-	-	-	-	+	+
uint16_t	-	-	-	-	-	-	+	+
int16_t	-	-	-	-	-	-	+	+
uint32_t	-	-	-	-	-	-	+	+
int32_t	-	-	-	-	-	-	+	+
uint64_t	-	-	-	-	-	-	+	+
int64_t	-	-	-	-	-	-	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: todo.

Call signature:
reinterpret<Vec, ToType>(const typename Vec::register_type data) -> typename ToType::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: todo.

Call signature:
cast<Vec, ToType>(const typename Vec::register_type data) -> typename ToType::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	+	+	-	-	-
int8_t	-	-	-	+	+	-	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: todo.

Call signature:
split<Vec, ToType>(const typename Vec::register_type data) -> std::array<typename ToType::register_type, sizeof(typename ToType::base_type)/sizeof(typename Vec::base_type) * Vec::vector_element_count() / ToType::vector_element_count()>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	-	+	-	-
int32_t	-	-	-	-	-	+	-	-
uint64_t	-	-	-	-	-	-	-	-
int64_t	-	-	-	-	-	-	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: todo.

Call signature:
merge<Vec, ToType>(std::array<typename Vec::register_type, sizeof(typename Vec::base_type)/sizeof(typename ToType::base_type) * ToType::vector_element_count() / Vec::vector_element_count()> data) -> typename ToType::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	-	-	-	-	-	-	-
int16_t	-	-	-	-	-	-	-	-
uint32_t	-	-	-	-	+	-	-	-
int32_t	-	-	-	-	+	-	-	-
uint64_t	-	-	-	-	-	-	-	-
int64_t	-	-	-	-	-	-	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Converts a SIMD register to an array of SIMD registers with a larger base type.

Call signature:
convert_up<Vec, ToType>(const typename Vec::register_type data) -> std::array<typename ToType::register_type, Vec::vector_element_count() / ToType::vector_element_count()>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	-	-	-
int8_t	+	+	-	+	+	-	-	-
uint16_t	+	+	-	+	+	-	-	-
int16_t	+	+	-	+	+	-	-	-
uint32_t	+	+	-	+	+	-	-	-
int32_t	+	+	-	+	+	-	-	-
uint64_t	-	-	-	-	+	-	-	-
int64_t	-	-	-	-	+	-	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: todo.

Call signature:
convert_down<Vec, ToType>(std::array<typename Vec::register_type, sizeof(typename Vec::base_type)/sizeof(typename ToType::base_type)> data) -> typename ToType::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	-	-	-	-
int8_t	-	-	-	-	-	-	-	-
uint16_t	-	+	-	-	-	-	-	-
int16_t	-	+	-	-	-	-	-	-
uint32_t	-	+	-	-	+	-	-	-
int32_t	-	+	-	-	+	-	-	-
uint64_t	-	+	-	-	+	-	-	-
int64_t	-	+	-	-	+	-	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Writes the contents of a vector register to an output stream.

Call signature:
to_ostream<Vec>(std::ostream & out, typename Vec::register_type const data, modifier ostream_modifier) -> std::ostream &

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	+	+	+	-	-
int8_t	-	-	-	+	+	+	-	-
uint16_t	-	-	-	+	+	+	-	-
int16_t	-	-	-	+	+	+	-	-
uint32_t	-	-	-	+	+	+	-	-
int32_t	-	-	-	+	+	+	-	-
uint64_t	-	-	-	+	+	+	-	-
int64_t	-	+	-	+	+	+	-	-
float	-	-	-	+	+	+	-	-
double	-	-	-	+	+	+	-	-

Brief: Loads data from aligned memory into a vector register.

Call signature:
load<Vec>(const typename Vec::base_type* memory) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Loads data from (un)aligned memory into a vector register.

Call signature:
loadu<Vec>(const typename Vec::base_type* memory) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Stores data from a vector register to aligned memory.

Call signature:
store<Vec>(typename Vec::base_type* memory, const typename Vec::register_type data) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Stores data from a vector register to (un)aligned memory.

Call signature:
storeu<Vec>(typename Vec::base_type* memory, const typename Vec::register_type data) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Stores SIMD register to array.

Call signature:
to_array<Vec>(const typename Vec::register_type data) -> __attribute__((__aligned__(Vec::vector_alignment()))) std::array<typename Vec::base_type, Vec::vector_element_count()>

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Broadcasts a single value into all lanes of a vector register.

Call signature:
set1<Vec>(const typename Vec::base_type value) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Set all lanes to zero.

Call signature:
set_zero<Vec>() -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	+	+	-	+	+	+	+	+
double	+	+	-	+	+	+	+	+

Brief: Transfers the provided elements into a vector register. Note: the current implementation (erroneously) stores the elements in reverse order.

Call signature:
set<Vec>(Ts args) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Creates the sequence 0, 1, ..., SIMD-Reg-Element-Count - 1.

Call signature:
sequence<Vec>() -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Creates a sequence.

Call signature:
custom_sequence<Vec>(typename Vec::base_type start, typename Vec::base_type stepwidth) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Transfers data from arbitrary locations into a vector register.

Call signature:
gather<Vec, IndicesType>(const void* memory, const typename IndicesType::register_type index, std::integral_constant<int, N> scale) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Transfers data from a vector register to arbitrary locations.

Call signature:
scatter<Vec>(const typename Vec::register_type data, void* memory, const typename Vec::offset_base_register_type index, std::integral_constant<int, N> scale) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	-	-	-
int8_t	+	+	-	+	+	-	-	-
uint16_t	+	+	-	+	+	-	-	-
int16_t	+	+	-	+	+	-	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Stores elements from data consecutively, if the corresponding bit in mask is set to 1.

Call signature:
compress_store<Vec>(const typename Vec::imask_type mask, typename Vec::base_type* memory, const typename Vec::register_type data) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Loads contiguous data from a specified memory location and places the elements according to the write mask.

Call signature:
expand_load<Vec>(const typename Vec::imask_type mask, const typename Vec::register_type src, typename Vec::base_type* memory) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	+	+	-	-
int8_t	-	-	-	-	+	+	-	-
uint16_t	-	-	-	-	+	+	-	-
int16_t	-	-	-	-	+	+	-	-
uint32_t	-	-	-	-	+	+	-	-
int32_t	-	-	-	-	+	+	-	-
uint64_t	-	-	-	-	+	+	-	-
int64_t	-	-	-	-	+	+	-	-
float	-	-	-	-	+	+	-	-
double	-	-	-	-	+	+	-	-

Brief: todo.

Call signature:
load_convert_up<Vec, ToType>(typename Vec::base_type const * memory) -> typename ToType::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	+	-	-	-
int8_t	-	-	-	-	+	-	-	-
uint16_t	-	-	-	-	+	-	-	-
int16_t	-	-	-	-	+	-	-	-
uint32_t	-	-	-	-	+	-	-	-
int32_t	-	-	-	-	+	-	-	-
uint64_t	-	-	-	-	-	-	-	-
int64_t	-	-	-	-	-	-	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Extracts the value at the given index.

Call signature:
extract_value<Vec>(const typename Vec::register_type data) -> typename Vec::base_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Forms a mask type from an integral.

Call signature:
to_mask<Vec>(const typename Vec::imask_type mask) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Binary NOT of a vector mask type.

Call signature:
mask_binary_not<Vec>(const typename Vec::mask_type mask) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	+	+	-	-
int8_t	-	-	-	-	+	+	-	-
uint16_t	-	-	-	-	+	+	-	-
int16_t	-	-	-	-	+	+	-	-
uint32_t	-	-	-	-	+	+	-	-
int32_t	-	-	-	-	+	+	-	-
uint64_t	-	-	-	-	+	+	-	-
int64_t	-	-	-	-	+	+	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Binary NOT of a vector integral mask type.

Call signature:
mask_binary_not<Vec>(const typename Vec::imask_type mask) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Binary AND of two vector mask types.

Call signature:
mask_binary_and<Vec>(const typename Vec::mask_type first, const typename Vec::mask_type second) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Binary AND of two vector integral mask types.

Call signature:
mask_binary_and<Vec>(const typename Vec::imask_type first, const typename Vec::imask_type second) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Binary OR of two vector mask types.

Call signature:
mask_binary_or<Vec>(const typename Vec::mask_type first, const typename Vec::mask_type second) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	+	-	+	+	+	-	-
int8_t	-	+	-	+	+	+	-	-
uint16_t	-	+	-	+	+	+	-	-
int16_t	-	+	-	+	+	+	-	-
uint32_t	-	+	-	+	+	+	-	-
int32_t	-	+	-	+	+	+	-	-
uint64_t	-	+	-	+	+	+	-	-
int64_t	-	+	-	+	+	+	-	-
float	-	+	-	+	+	+	-	-
double	-	+	-	+	+	+	-	-

Brief: Binary OR of two vector integral mask types.

Call signature:
mask_binary_or<Vec>(const typename Vec::imask_type first, const typename Vec::imask_type second) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	+	-	+	+	-	-	-
int8_t	-	+	-	+	+	-	-	-
uint16_t	-	+	-	+	+	-	-	-
int16_t	-	+	-	+	+	-	-	-
uint32_t	-	+	-	+	+	-	-	-
int32_t	-	+	-	+	+	-	-	-
uint64_t	-	+	-	+	+	-	-	-
int64_t	-	+	-	+	+	-	-	-
float	-	+	-	+	+	-	-	-
double	-	+	-	+	+	-	-	-

Brief: Binary XOR of two vector mask types.

Call signature:
mask_binary_xor<Vec>(const typename Vec::mask_type first, const typename Vec::mask_type second) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	+	-	+	+	+	-	-
int8_t	-	+	-	+	+	+	-	-
uint16_t	-	+	-	+	+	+	-	-
int16_t	-	+	-	+	+	+	-	-
uint32_t	-	+	-	+	+	+	-	-
int32_t	-	+	-	+	+	+	-	-
uint64_t	-	+	-	+	+	+	-	-
int64_t	-	+	-	+	+	+	-	-
float	-	+	-	+	+	+	-	-
double	-	+	-	+	+	+	-	-

Brief: Binary XOR of two vector integral mask types.

Call signature:
mask_binary_xor<Vec>(const typename Vec::imask_type first, const typename Vec::imask_type second) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	+	-	+	+	-	-	-
int8_t	-	+	-	+	+	-	-	-
uint16_t	-	+	-	+	+	-	-	-
int16_t	-	+	-	+	+	-	-	-
uint32_t	-	+	-	+	+	-	-	-
int32_t	-	+	-	+	+	-	-	-
uint64_t	-	+	-	+	+	-	-	-
int64_t	-	+	-	+	+	-	-	-
float	-	+	-	+	+	-	-	-
double	-	+	-	+	+	-	-	-

Brief: todo.

Call signature:
mask_population_count<Vec>(const typename Vec::imask_type mask) -> unsigned int

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: todo.

Call signature:
integral_all_true<Vec>() -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: todo.

Call signature:
integral_all_false<Vec>() -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Tests whether a specific bit is set to 1.

Call signature:
test_mask<Vec>(typename Vec::imask_type mask, int position) -> bool

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Merges two masks. result[0:i-1] = mask_a[0:i-1]; result[i:N-1] = mask_b[0:N-1] where N is the number of effective bits in mask.

Call signature:
insert_mask<Vec>(typename Vec::imask_type mask_a, typename Vec::imask_type mask_b, int position) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Extracts one mask from another.

Call signature:
extract_mask<Vec>(typename Vec::imask_type mask, int position) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Forms an integral value from the most significant bits of every lane in a vector mask register.

Call signature:
to_integral<Vec>(const typename Vec::mask_type vec_mask) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Forms a vector register from an integral, where all bits are set in a lane if the corresponding mask bit is set to 1.

Call signature:
to_vector<Vec>(const typename Vec::mask_type mask) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	+	+	+	+	+
int8_t	-	-	-	+	+	+	+	+
uint16_t	-	-	-	+	+	+	+	+
int16_t	-	-	-	+	+	+	+	+
uint32_t	-	-	-	+	+	+	+	+
int32_t	-	-	-	+	+	+	+	+
uint64_t	-	-	-	+	+	+	+	+
int64_t	-	+	-	+	+	+	+	+
float	-	-	-	+	+	+	+	+
double	-	-	-	+	+	+	+	+

Brief: Stores data from a vector register to (un)aligned memory.

Call signature:
storeu<Vec>(const typename Vec::mask_type mask, typename Vec::base_type* memory, const typename Vec::register_type data) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Stores data from a vector register to (un)aligned memory.

Call signature:
storeu<Vec>(const typename Vec::imask_type mask, typename Vec::base_type* memory, const typename Vec::register_type data) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: If mask[i] is 1, load memory[index[i] * scale], otherwise use source[i]

Call signature:
gather<Vec, IndicesType>(const typename Vec::mask_type mask, const typename Vec::register_type source, const void* memory, const typename IndicesType::offset_base_register_type index, std::integral_constant<int, N> scale) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	-	-	-	-
int8_t	+	+	-	+	-	-	-	-
uint16_t	+	+	-	+	-	-	-	-
int16_t	+	+	-	+	-	-	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Transfers data from a vector register to arbitrary locations.

Call signature:
scatter<Vec>(const typename Vec::mask_type mask, const typename Vec::register_type data, void* memory, const typename Vec::offset_base_register_type index, std::integral_constant<int, N> scale) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	-	-	-
int8_t	+	+	-	+	+	-	-	-
uint16_t	+	+	-	+	+	-	-	-
int16_t	+	+	-	+	+	-	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Partially override a Vector with a single value.

Call signature:
masked_set1<Vec>(const typename Vec::register_type src, const typename Vec::imask_type mask, const typename Vec::base_type value) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Loads data from memory to a mask.

Call signature:
load_mask<Vec>(typename Vec::imask_type const* memory) -> typename Vec::mask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Loads data from memory to an integral mask.

Call signature:
load_imask<Vec>(typename Vec::imask_type const* memory) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Stores an integral mask to memory.

Call signature:
store_imask<Vec>(typename Vec::imask_type * memory, typename Vec::imask_type mask) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Allocates (unaligned) contiguous memory.

Call signature:
allocate<Vec>(std::size_t count_bytes) -> typename Vec::base_type*

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Allocates aligned contiguous memory.

Call signature:
allocate_aligned<Vec>(std::size_t count_bytes, std::size_t alignment) -> typename Vec::base_type*

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Deallocates (possibly aligned) contiguous memory.

Call signature:
deallocate<Vec>(typename Vec::base_type* ptr) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Copy memory.

Call signature:
memory_cp<Vec>(typename Vec::base_type* dst, typename Vec::base_type const* src, std::size_t count_bytes, int copy_kind) -> void

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Checks whether all elements are unique in a register.

Call signature:
conflict<Vec>(const typename Vec::register_type data) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	+	+
int8_t	+	+	-	+	+	+	+	+
uint16_t	+	+	-	+	+	+	+	+
int16_t	+	+	-	+	+	+	+	+
uint32_t	+	+	-	+	+	+	+	+
int32_t	+	+	-	+	+	+	+	+
uint64_t	+	+	-	+	+	+	+	+
int64_t	+	+	-	+	+	+	+	+
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Checks whether all elements are unique in a register and returns a mask indicating which elements don't have preceding conflicts.

Call signature:
conflict_free<Vec>(const typename Vec::imask_type mask, const typename Vec::register_type data) -> typename Vec::imask_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Blends two registers using provided bitmask.

Call signature:
blend<Vec>(const typename Vec::mask_type control, const typename Vec::register_type left, const typename Vec::register_type right) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-

Brief: Blends or adds two registers using the provided bitmask.

Call signature:
blend_add<Vec>(const typename Vec::mask_type control, const typename Vec::register_type left, const typename Vec::register_type right, const typename Vec::register_type adder) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	-	+	+	-	-
int8_t	-	-	-	-	+	+	-	-
uint16_t	-	-	-	-	+	+	-	-
int16_t	-	-	-	-	+	+	-	-
uint32_t	-	-	-	-	+	+	-	-
int32_t	-	-	-	-	+	+	-	-
uint64_t	-	-	-	-	+	+	-	-
int64_t	-	-	-	-	+	+	-	-
float	-	-	-	-	-	-	-	-
double	-	-	-	-	-	-	-	-

Brief: Returns a vector register with undefined data inside.

Call signature:
undefined<Vec>() -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	+	+	+	-	-
int8_t	-	-	-	+	+	+	-	-
uint16_t	-	-	-	+	+	+	-	-
int16_t	-	-	-	+	+	+	-	-
uint32_t	-	-	-	+	+	+	-	-
int32_t	-	-	-	+	+	+	-	-
uint64_t	-	-	-	+	+	+	-	-
int64_t	-	-	-	+	+	+	-	-
float	-	-	-	+	+	+	-	-
double	-	-	-	+	+	+	-	-

Brief: Copies elements from a vector where the mask bit is set; otherwise writes zero.

Call signature:
maskz_mov<Vec>(const typename Vec::mask_type mask, const typename Vec::register_type src) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	-	-	-	+	+	-	-	-
int8_t	-	-	-	+	+	-	-	-
uint16_t	-	-	-	+	+	-	-	-
int16_t	-	-	-	+	+	-	-	-
uint32_t	-	-	-	+	+	-	-	-
int32_t	-	-	-	+	+	-	-	-
uint64_t	-	-	-	+	+	-	-	-
int64_t	-	-	-	+	+	-	-	-
float	-	-	-	+	+	-	-	-
double	-	-	-	+	+	-	-	-

Brief: Copies elements from a vector where the mask bit is set; otherwise writes zero.

Call signature:
maskz_mov<Vec>(const typename Vec::imask_type mask, const typename Vec::register_type src) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Merges two vectors, picking the source of each element based on the corresponding mask bit.

Call signature:
mask_mov<Vec>(const typename Vec::register_type src, const typename Vec::imask_type mask, const typename Vec::register_type data) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	-	-	+	+	+	-	-
int8_t	+	-	-	+	+	+	-	-
uint16_t	+	-	-	+	+	+	-	-
int16_t	+	-	-	+	+	+	-	-
uint32_t	+	-	-	+	+	+	-	-
int32_t	+	-	-	+	+	+	-	-
uint64_t	+	-	-	+	+	+	-	-
int64_t	+	-	-	+	+	+	-	-
float	+	-	-	+	+	+	-	-
double	+	-	-	+	+	+	-	-

Brief: Compress valid elements in a register (if the corresponding bit in mask is set to 1).

Call signature:
compress<Vec>(const typename Vec::imask_type mask, const typename Vec::register_type data) -> typename Vec::register_type

	scalar	neon	cuda	sse	avx2	avx512	oneAPIfpga	oneAPIfpgaRTL
uint8_t	+	+	-	+	+	+	-	-
int8_t	+	+	-	+	+	+	-	-
uint16_t	+	+	-	+	+	+	-	-
int16_t	+	+	-	+	+	+	-	-
uint32_t	+	+	-	+	+	+	-	-
int32_t	+	+	-	+	+	+	-	-
uint64_t	+	+	-	+	+	+	-	-
int64_t	+	+	-	+	+	+	-	-
float	+	+	-	+	+	+	-	-
double	+	+	-	+	+	+	-	-