|
void | load_lut_2x_float (int *lut1, int *lut2, v16int32 offset, v32bfloat16 &v1, v32bfloat16 &v2) |
|
void | load_lut_2x_float (int *lut1, int *lut2, v16uint32 offset, v32bfloat16 &v1, v32bfloat16 &v2) |
|
void | load_lut_2x_int16 (int *lut1, int *lut2, v16int32 offset, v32int16 &v1, v32int16 &v2) |
|
void | load_lut_2x_int16 (int *lut1, int *lut2, v16uint32 offset, v32int16 &v1, v32int16 &v2) |
|
void | load_lut_2x_int32 (int *lut1, int *lut2, v16int32 offset, v16int32 &v1, v16int32 &v2) |
|
void | load_lut_2x_int32 (int *lut1, int *lut2, v16uint32 offset, v16int32 &v1, v16int32 &v2) |
|
void | load_lut_2x_int8 (int *lut1, int *lut2, v16int32 offset, v64int8 &v1, v64int8 &v2) |
|
void | load_lut_2x_int8 (int *lut1, int *lut2, v16uint32 offset, v64int8 &v1, v64int8 &v2) |
|
void | load_lut_float (int *lut1, int *lut2, v16int32 offset, v32bfloat16 &v1) |
|
void | load_lut_float (int *lut1, int *lut2, v16uint32 offset, v32bfloat16 &v1) |
|
void | load_lut_int16 (int *lut1, int *lut2, v16int32 offset, v32int16 &v1) |
|
void | load_lut_int16 (int *lut1, int *lut2, v16uint32 offset, v32int16 &v1) |
|
void | load_lut_int32 (int *lut1, int *lut2, v16int32 offset, v16int32 &v1) |
|
void | load_lut_int32 (int *lut1, int *lut2, v16uint32 offset, v16int32 &v1) |
|
void | load_lut_int8 (int *lut1, int *lut2, v16int32 offset, v64int8 &v1) |
|
void | load_lut_int8 (int *lut1, int *lut2, v16uint32 offset, v64int8 &v1) |
|
Load 4x intrinsics perform four 64-bit loads from data memory into a vector register. The four 64-bit values are loaded from the addresses pointed to by four pointers stored in a W register. The four 64-bit values are concatenated and placed in the destination register. This is done using indirect addressing on each of the four 64-bit lanes.
This instruction occupies VLIW slot B but uses all memory interfaces; thus, no other load operation can take place in parallel in VLIW slot A.
The pseudo-code of the operation is as follows:
// VLDB.<4xmode>.<ptr_offset> <wdst>, <wsrc>
// Extract pointers
if ptr_offset == lo:
ptr0 = wsrc[31..0]
ptr1 = wsrc[63..32]
ptr2 = wsrc[95..64]
ptr3 = wsrc[127..96]
else:
ptr0 = wsrc[159..128]
ptr1 = wsrc[191..160]
ptr2 = wsrc[223..192]
ptr3 = wsrc[255..224]
// Load data
data0 = DM[ptr0 & 0xfffc0] // even bank (256 bits)
data1 = DM[(ptr1 & 0xfffc0) | 0x20] // odd bank (256 bits)
data2 = DM[ptr2 & 0xfffc0] // even bank (256 bits)
data3 = DM[(ptr3 & 0xfffc0) | 0x20] // odd bank (256 bits)
// take lsbs of pointers and shift by 2
switch 4x_mode:
case 4x16: ptr_mask = 0x3c
case 4x32: ptr_mask = 0x38
case 4x64: ptr_mask = 0x30
shift0 = (ptr0 & ptr_mask) >> 2
shift1 = (ptr1 & ptr_mask) >> 2
shift2 = (ptr2 & ptr_mask) >> 2
shift3 = (ptr3 & ptr_mask) >> 2
// shift data in increments of 16 bits
data0 = data0 >> (shift0*16)
data1 = data1 >> (shift1*16)
data2 = data2 >> (shift2*16)
data3 = data3 >> (shift3*16)
// Concatenate outputs
wdst = data3[63:0]::data2[63:0]::data1[63:0]::data0[63:0]
◆ load_lut_2x_float() [1/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
v2 | Output vector 2 |
◆ load_lut_2x_float() [2/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
v2 | Output vector 2 |
◆ load_lut_2x_int16() [1/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
v2 | Output vector 2 |
◆ load_lut_2x_int16() [2/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
v2 | Output vector 2 |
◆ load_lut_2x_int32() [1/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
v2 | Output vector 2 |
◆ load_lut_2x_int32() [2/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
v2 | Output vector 2 |
◆ load_lut_2x_int8() [1/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
v2 | Output vector 2 |
◆ load_lut_2x_int8() [2/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
v2 | Output vector 2 |
◆ load_lut_float() [1/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
◆ load_lut_float() [2/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
◆ load_lut_int16() [1/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
◆ load_lut_int16() [2/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
◆ load_lut_int32() [1/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
◆ load_lut_int32() [2/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
◆ load_lut_int8() [1/2]
void load_lut_int8 (int *lut1, int *lut2, v16int32 offset, v64int8 &v1)
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |
◆ load_lut_int8() [2/2]
- Parameters
-
lut1 | Pointer input 1 (even bank) |
lut2 | Pointer input 2 (odd bank) |
offset | Offset for the input pointers |
v1 | Output vector 1 |