AI Engine-ML v2 Intrinsics User Guide  v2025.1
Loading...
Searching...
No Matches
Load 4x Operations

Functions

void load_lut_2x_float (int *lut1, int *lut2, v16int32 offset, v32bfloat16 &v1, v32bfloat16 &v2)
 
void load_lut_2x_float (int *lut1, int *lut2, v16uint32 offset, v32bfloat16 &v1, v32bfloat16 &v2)
 
void load_lut_2x_int16 (int *lut1, int *lut2, v16int32 offset, v32int16 &v1, v32int16 &v2)
 
void load_lut_2x_int16 (int *lut1, int *lut2, v16uint32 offset, v32int16 &v1, v32int16 &v2)
 
void load_lut_2x_int32 (int *lut1, int *lut2, v16int32 offset, v16int32 &v1, v16int32 &v2)
 
void load_lut_2x_int32 (int *lut1, int *lut2, v16uint32 offset, v16int32 &v1, v16int32 &v2)
 
void load_lut_2x_int8 (int *lut1, int *lut2, v16int32 offset, v64int8 &v1, v64int8 &v2)
 
void load_lut_2x_int8 (int *lut1, int *lut2, v16uint32 offset, v64int8 &v1, v64int8 &v2)
 
void load_lut_float (int *lut1, int *lut2, v16int32 offset, v32bfloat16 &v1)
 
void load_lut_float (int *lut1, int *lut2, v16uint32 offset, v32bfloat16 &v1)
 
void load_lut_int16 (int *lut1, int *lut2, v16int32 offset, v32int16 &v1)
 
void load_lut_int16 (int *lut1, int *lut2, v16uint32 offset, v32int16 &v1)
 
void load_lut_int32 (int *lut1, int *lut2, v16int32 offset, v16int32 &v1)
 
void load_lut_int32 (int *lut1, int *lut2, v16uint32 offset, v16int32 &v1)
 
void load_lut_int8 (int *lut1, int *lut2, v16int32 offset, v64int8 &v1)
 
void load_lut_int8 (int *lut1, int *lut2, v16uint32 offset, v64int8 &v1)
 

Detailed Description

Load 4x intrinsics perform four 64-bit value loads from data memory into a vector register. The four 64-bit values are loaded from the addresses pointed to by four pointers stored in a W register. The four 64-bit values are concatenated and placed in the destination register. This is done using indirect addressing on each of the four 64-bit lanes.

This instruction occupies VLIW slot B but uses all memory interfaces; thus, no other load operation can take place in parallel in VLIW slot A.

The pseudo-code of the operation is as follows:

// VLDB.<4xmode>.<ptr_offset> <wdst>, <wsrc>
// Extract pointers
if ptr_offset == lo:
    ptr0 = wsrc[31..0]
    ptr1 = wsrc[63..32]
    ptr2 = wsrc[95..64]
    ptr3 = wsrc[127..96]
else:
    ptr0 = wsrc[159..128]
    ptr1 = wsrc[191..160]
    ptr2 = wsrc[223..192]
    ptr3 = wsrc[255..224]
// Load data
data0 = DM[ptr0 & 0xfffc0]          // even bank (256 bits)
data1 = DM[(ptr1 & 0xfffc0) | 0x20] // odd bank (256 bits)
data2 = DM[ptr2 & 0xfffc0]          // even bank (256 bits)
data3 = DM[(ptr3 & 0xfffc0) | 0x20] // odd bank (256 bits)
// Mask the LSBs of each pointer (the mask depends on the 4x mode) and shift right by 2
switch 4x_mode:
    case 4x16: ptr_mask = 0x3c
    case 4x32: ptr_mask = 0x38
    case 4x64: ptr_mask = 0x30
shift0 = (ptr0 & ptr_mask) >> 2
shift1 = (ptr1 & ptr_mask) >> 2
shift2 = (ptr2 & ptr_mask) >> 2
shift3 = (ptr3 & ptr_mask) >> 2
// shift data in increments of 16 bits
data0 = data0 >> (shift0*16)
data1 = data1 >> (shift1*16)
data2 = data2 >> (shift2*16)
data3 = data3 >> (shift3*16)
// Concatenate outputs
wdst = data3[63:0]::data2[63:0]::data1[63:0]::data0[63:0]

Function Documentation

◆ load_lut_2x_float() [1/2]

void load_lut_2x_float ( int * lut1,
int * lut2,
v16int32 offset,
v32bfloat16 & v1,
v32bfloat16 & v2 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1
v2: Output vector 2

◆ load_lut_2x_float() [2/2]

void load_lut_2x_float ( int * lut1,
int * lut2,
v16uint32 offset,
v32bfloat16 & v1,
v32bfloat16 & v2 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1
v2: Output vector 2

◆ load_lut_2x_int16() [1/2]

void load_lut_2x_int16 ( int * lut1,
int * lut2,
v16int32 offset,
v32int16 & v1,
v32int16 & v2 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1
v2: Output vector 2

◆ load_lut_2x_int16() [2/2]

void load_lut_2x_int16 ( int * lut1,
int * lut2,
v16uint32 offset,
v32int16 & v1,
v32int16 & v2 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1
v2: Output vector 2

◆ load_lut_2x_int32() [1/2]

void load_lut_2x_int32 ( int * lut1,
int * lut2,
v16int32 offset,
v16int32 & v1,
v16int32 & v2 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1
v2: Output vector 2

◆ load_lut_2x_int32() [2/2]

void load_lut_2x_int32 ( int * lut1,
int * lut2,
v16uint32 offset,
v16int32 & v1,
v16int32 & v2 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1
v2: Output vector 2

◆ load_lut_2x_int8() [1/2]

void load_lut_2x_int8 ( int * lut1,
int * lut2,
v16int32 offset,
v64int8 & v1,
v64int8 & v2 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1
v2: Output vector 2

◆ load_lut_2x_int8() [2/2]

void load_lut_2x_int8 ( int * lut1,
int * lut2,
v16uint32 offset,
v64int8 & v1,
v64int8 & v2 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1
v2: Output vector 2

◆ load_lut_float() [1/2]

void load_lut_float ( int * lut1,
int * lut2,
v16int32 offset,
v32bfloat16 & v1 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1

◆ load_lut_float() [2/2]

void load_lut_float ( int * lut1,
int * lut2,
v16uint32 offset,
v32bfloat16 & v1 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1

◆ load_lut_int16() [1/2]

void load_lut_int16 ( int * lut1,
int * lut2,
v16int32 offset,
v32int16 & v1 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1

◆ load_lut_int16() [2/2]

void load_lut_int16 ( int * lut1,
int * lut2,
v16uint32 offset,
v32int16 & v1 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1

◆ load_lut_int32() [1/2]

void load_lut_int32 ( int * lut1,
int * lut2,
v16int32 offset,
v16int32 & v1 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1

◆ load_lut_int32() [2/2]

void load_lut_int32 ( int * lut1,
int * lut2,
v16uint32 offset,
v16int32 & v1 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1

◆ load_lut_int8() [1/2]

void load_lut_int8 ( int * lut1,
int * lut2,
v16int32 offset,
v64int8 & v1 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1

◆ load_lut_int8() [2/2]

void load_lut_int8 ( int * lut1,
int * lut2,
v16uint32 offset,
v64int8 & v1 )
Parameters
lut1: Pointer input 1 (even bank)
lut2: Pointer input 2 (odd bank)
offset: Offset for the input pointers
v1: Output vector 1