AI Engine (AIE) r2p15.2
Advanced Float Vector Operations. This page contains the fully configurable fpmac_conf and some convenient wrappers to it. The lane selection scheme is explained after each intrinsic definition.
Some of these floating point operations can generate exceptions; for more information, see the documentation on floating point exceptions.
Fully configurable multiply-accumulate functions | |
v8float | fpmac_conf (v8float acc, v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v8float | fpmac_conf (v8float acc, v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v8float | fpmac_conf (v8float acc, v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v8float | fpmac_conf (v8float acc, v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v32float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v16float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v32float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v16float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v8float | fpmul_conf (v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v8float | fpmul_conf (v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v8float | fpmul_conf (v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v8float | fpmul_conf (v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v16cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v8cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v16cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v8cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v32float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v16float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v32float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v16float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v16cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v8cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v16cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v8cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v8float | fpmac_conf (v8float acc, v32float xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v8float | fpmac_conf (v8float acc, v16float xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v8float | fpmac_conf (v8float acc, v32float xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v8float | fpmac_conf (v8float acc, v16float xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v4cfloat | fpmac_conf (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiply-accumulate for single precision floating point vectors. | |
v8float | fpmul_conf (v32float xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v8float | fpmul_conf (v16float xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v8float | fpmul_conf (v32float xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v8float | fpmul_conf (v16float xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v16cfloat xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v8cfloat xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v16cfloat xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
v4cfloat | fpmul_conf (v8cfloat xbuf, int xstart, unsigned int xoffs, int zstart, unsigned int zoffs, bool ones, bool abs, unsigned int addmode, unsigned int addmask, unsigned int cmpmode, unsigned int &cmp) |
Fully configurable real multiplication for single precision floating point vectors. | |
Multiply-accumulate functions | |
v8float | fpmac (v8float acc, v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-accumulate for single precision real floating point vectors. | |
v8float | fpmac (v8float acc, v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-accumulate for single precision real floating point vectors. | |
v8float | fpmac_abs (v8float acc, v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply, take absolute value and accumulate for single precision real floating point vectors. | |
v8float | fpmac_abs (v8float acc, v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply, take absolute value and accumulate for single precision real floating point vectors. | |
v4cfloat | fpmac (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-accumulate for complex times real single precision floating point vectors. | |
v4cfloat | fpmac (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-accumulate for complex times real single precision floating point vectors. | |
v4cfloat | fpmac (v4cfloat acc, v32float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-accumulate for real times complex single precision floating point vectors. | |
v4cfloat | fpmac (v4cfloat acc, v16float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-accumulate for real times complex single precision floating point vectors. | |
v4cfloat | fpmac (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-accumulate for complex single precision floating point vectors. | |
v4cfloat | fpmac (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-accumulate for complex single precision floating point vectors. | |
Multiply-subtract functions | |
v8float | fpmsc (v8float acc, v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-subtract for single precision real floating point vectors. | |
v8float | fpmsc (v8float acc, v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-subtract for single precision real floating point vectors. | |
v8float | fpmsc_abs (v8float acc, v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply, take absolute value and subtract for single precision real floating point vectors. | |
v8float | fpmsc_abs (v8float acc, v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply, take absolute value and subtract for single precision real floating point vectors. | |
v4cfloat | fpmsc (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-subtract for complex times real single precision floating point vectors. | |
v4cfloat | fpmsc (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-subtract for complex times real single precision floating point vectors. | |
v4cfloat | fpmsc (v4cfloat acc, v32float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-subtract for real times complex single precision floating point vectors. | |
v4cfloat | fpmsc (v4cfloat acc, v16float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-subtract for real times complex single precision floating point vectors. | |
v4cfloat | fpmsc (v4cfloat acc, v16cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-subtract for complex single precision floating point vectors. | |
v4cfloat | fpmsc (v4cfloat acc, v8cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-subtract for complex single precision floating point vectors. | |
Multiplication functions | |
v8float | fpmul (v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiplication for single precision real floating point vectors. | |
v8float | fpmul (v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiplication for single precision real floating point vectors. | |
v4cfloat | fpmul (v16cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiplication for complex times real single precision floating point vectors. | |
v4cfloat | fpmul (v8cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiplication for complex times real single precision floating point vectors. | |
v4cfloat | fpmul (v32float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiplication for real times complex single precision floating point vectors. | |
v4cfloat | fpmul (v16float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiplication for real times complex single precision floating point vectors. | |
v4cfloat | fpmul (v16cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiplication for complex single precision floating point vectors. | |
v4cfloat | fpmul (v8cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiplication for complex single precision floating point vectors. | |
v8float | fpabs_mul (v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply and take absolute value for single precision real floating point vectors. | |
v8float | fpabs_mul (v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply and take absolute value for single precision real floating point vectors. | |
v8float | fpneg_mul (v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-negate for single precision real floating point vectors. | |
v8float | fpneg_mul (v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-negate for single precision real floating point vectors. | |
v4cfloat | fpneg_mul (v16cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-negate for complex times real single precision floating point vectors. | |
v4cfloat | fpneg_mul (v8cfloat xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply-negate for complex times real single precision floating point vectors. | |
v4cfloat | fpneg_mul (v32float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-negate for real times complex single precision floating point vectors. | |
v4cfloat | fpneg_mul (v16float xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-negate for real times complex single precision floating point vectors. | |
v4cfloat | fpneg_mul (v16cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-negate for complex single precision floating point vectors. | |
v4cfloat | fpneg_mul (v8cfloat xbuf, int xstart, unsigned int xoffs, v4cfloat zbuf, int zstart, unsigned int zoffs) |
Multiply-negate for complex single precision floating point vectors. | |
v8float | fpneg_abs_mul (v32float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply, take absolute value and negate for single precision real floating point vectors. | |
v8float | fpneg_abs_mul (v16float xbuf, int xstart, unsigned int xoffs, v8float zbuf, int zstart, unsigned int zoffs) |
Multiply, take absolute value and negate for single precision real floating point vectors. | |
v8float fpabs_mul | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply and take absolute value for single precision real floating point vectors.
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpabs_mul | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply and take absolute value for single precision real floating point vectors.
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpmac | ( | v8float | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-accumulate for single precision real floating point vectors.
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpmac | ( | v8float | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-accumulate for single precision real floating point vectors.
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmac | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-accumulate for complex times real single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * z
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmac | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-accumulate for complex times real single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * z
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmac | ( | v4cfloat | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-accumulate for real times complex single precision floating point vectors.
Where the product corresponds to x * (z.re + j(z.im))
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
v4cfloat fpmac | ( | v4cfloat | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-accumulate for real times complex single precision floating point vectors.
Where the product corresponds to x * (z.re + j(z.im))
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
v4cfloat fpmac | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-accumulate for complex single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * (z.re + j(z.im))
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
v4cfloat fpmac | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-accumulate for complex single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * (z.re + j(z.im))
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
v8float fpmac_abs | ( | v8float | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply, take absolute value and accumulate for single precision real floating point vectors.
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpmac_abs | ( | v8float | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply, take absolute value and accumulate for single precision real floating point vectors.
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmac_conf | ( | v8float | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmac_conf | ( | v8float | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmac_conf | ( | v8float | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmac_conf | ( | v8float | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmac_conf | ( | v8float | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmac_conf | ( | v8float | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the result of the cmp operation (less than / greater than or equal) is relative to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v8float fpmac_conf | ( | v8float | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_ge" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_lt" for the maximum (both with addmode = fpadd_sub, as in the example above) and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if the multiplication result was chosen over the accumulator (per lane), i.e. cmp[i] in the pseudo-code above. This parameter is optional. |
v8float fpmac_conf | ( | v8float | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with the different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with the different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with the different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with the different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with the different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmac_conf | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with the different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmsc | ( | v8float | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-subtract for single precision real floating point vectors.
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpmsc | ( | v8float | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-subtract for single precision real floating point vectors.
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmsc | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-subtract for complex times real single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * z
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmsc | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-subtract for complex times real single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * z
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmsc | ( | v4cfloat | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-subtract for real times complex single precision floating point vectors.
Where the product corresponds to x * (z.re + j(z.im))
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile-time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
v4cfloat fpmsc | ( | v4cfloat | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-subtract for real times complex single precision floating point vectors.
Where the product corresponds to x * (z.re + j(z.im))
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile-time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
v4cfloat fpmsc | ( | v4cfloat | acc, |
v16cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-subtract for complex single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * (z.re + j(z.im))
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile-time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
v4cfloat fpmsc | ( | v4cfloat | acc, |
v8cfloat | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-subtract for complex single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * (z.re + j(z.im))
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile-time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
v8float fpmsc_abs | ( | v8float | acc, |
v32float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply, take absolute value and subtract for single precision real floating point vectors.
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpmsc_abs | ( | v8float | acc, |
v16float | xbuf, | ||
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply, take absolute value and subtract for single precision real floating point vectors.
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpmul | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiplication for single precision real floating point vectors.
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpmul | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiplication for single precision real floating point vectors.
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmul | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiplication for complex times real single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * z
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmul | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiplication for complex times real single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * z
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpmul | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiplication for real times complex single precision floating point vectors.
Where the product corresponds to x * (z.re + j(z.im))
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile-time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile-time constant. |
v4cfloat fpmul | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiplication for real times complex single precision floating point vectors.
Where the product corresponds to x * (z.re + j(z.im))
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). This must be a compile-time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code, this should be a compile-time constant. |
v4cfloat fpmul | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiplication for complex single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * (z.re + j(z.im))
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code, this should be a compile-time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). This must be a compile-time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code, this should be a compile-time constant. |
v4cfloat fpmul | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiplication for complex single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * (z.re + j(z.im))
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code, this should be a compile-time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first real and imaginary values). This must be a compile-time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code, this should be a compile-time constant. |
v4cfloat fpmul_conf | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat, the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and to implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
    m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
    n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
    o[i] = acc[i] + n[i]
    if cmpmode == fpcmp_nrm :
        cmp[i] = sgn(o[i])
        ret[i] = o[i]
    elif cmpmode == fpcmp_lt :
        cmp[i] = sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
    elif cmpmode == fpcmp_ge :
        cmp[i] = ~sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] > 0) = (acc[i] > m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly, a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat, the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and to implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
    m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
    n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
    o[i] = acc[i] + n[i]
    if cmpmode == fpcmp_nrm :
        cmp[i] = sgn(o[i])
        ret[i] = o[i]
    elif cmpmode == fpcmp_lt :
        cmp[i] = sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
    elif cmpmode == fpcmp_ge :
        cmp[i] = ~sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] > 0) = (acc[i] > m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly, a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmul_conf | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat, the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and to implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
    m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
    n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
    o[i] = acc[i] + n[i]
    if cmpmode == fpcmp_nrm :
        cmp[i] = sgn(o[i])
        ret[i] = o[i]
    elif cmpmode == fpcmp_lt :
        cmp[i] = sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
    elif cmpmode == fpcmp_ge :
        cmp[i] = ~sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] > 0) = (acc[i] > m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly, a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmul_conf | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat, the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and to implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
    m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
    n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
    o[i] = acc[i] + n[i]
    if cmpmode == fpcmp_nrm :
        cmp[i] = sgn(o[i])
        ret[i] = o[i]
    elif cmpmode == fpcmp_lt :
        cmp[i] = sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
    elif cmpmode == fpcmp_ge :
        cmp[i] = ~sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] > 0) = (acc[i] > m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly, a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat, the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and to implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
    m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
    n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
    o[i] = acc[i] + n[i]
    if cmpmode == fpcmp_nrm :
        cmp[i] = sgn(o[i])
        ret[i] = o[i]
    elif cmpmode == fpcmp_lt :
        cmp[i] = sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
    elif cmpmode == fpcmp_ge :
        cmp[i] = ~sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] > 0) = (acc[i] > m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly, a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat, the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and to implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
    m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
    n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
    o[i] = acc[i] + n[i]
    if cmpmode == fpcmp_nrm :
        cmp[i] = sgn(o[i])
        ret[i] = o[i]
    elif cmpmode == fpcmp_lt :
        cmp[i] = sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
    elif cmpmode == fpcmp_ge :
        cmp[i] = ~sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] > 0) = (acc[i] > m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly, a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmul_conf | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat, the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and to implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
    m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
    n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
    o[i] = acc[i] + n[i]
    if cmpmode == fpcmp_nrm :
        cmp[i] = sgn(o[i])
        ret[i] = o[i]
    elif cmpmode == fpcmp_lt :
        cmp[i] = sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
    elif cmpmode == fpcmp_ge :
        cmp[i] = ~sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] > 0) = (acc[i] > m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly, a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmul_conf | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated differently. For a v4cfloat, the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and to implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
    m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
    n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
    o[i] = acc[i] + n[i]
    if cmpmode == fpcmp_nrm :
        cmp[i] = sgn(o[i])
        ret[i] = o[i]
    elif cmpmode == fpcmp_lt :
        cmp[i] = sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
    elif cmpmode == fpcmp_ge :
        cmp[i] = ~sgn(o[i])
        ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] > 0) = (acc[i] > m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly, a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs     = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i]   = -1 * m[i]
o[i]   = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmul_conf | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmul_conf | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmul_conf | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply-accumulate for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add   ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub   ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. A v4cfloat will have the loop iterating over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it gets discarded.</em>
Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpmul_conf | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em> Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em> Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em> Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em> Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em> Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode | ||
) |
Fully configurable real multiply for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em> Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v4cfloat fpmul_conf | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs, | ||
bool | ones, | ||
bool | abs, | ||
unsigned int | addmode, | ||
unsigned int | addmask, | ||
unsigned int | cmpmode, | ||
unsigned int & | cmp | ||
) |
Fully configurable real multiply for single precision floating point vectors.
~~~~~~~~~~~~~~~~~~~
if (addmode == fpadd_add ) neg = addmask ^ 0x00;
if (addmode == fpadd_sub ) neg = addmask ^ 0xFF;
if (addmode == fpadd_mixadd) neg = addmask ^ 0xAA;
if (addmode == fpadd_mixsub) neg = addmask ^ 0x55;
~~~~~~~~~~~~~~~~~~~
The output can be considered to always have 8 values because each part of the complex float is treated separately. For a v4cfloat the loop iterates over real0 - imag0 - real1 - imag1 ...
This capability is introduced to allow flexibility and implement operations on conjugates.
~~~~~~~~~~~~~~~~~~~
osz = 8;
for (i = 0 ; i < osz ; i++)
  m[i] = xbuf[xstart + xoffs[i]] * (ones ? 1.0 : (zbuf exists ? zbuf : xbuf)[zstart + zoffs[i]])
  n[i] = (-1)^neg[i] * (abs ? |m[i]| : m[i])
  o[i] = acc[i] + n[i]
  if cmpmode == fpcmp_nrm :
    cmp[i] = sgn(o[i])
    ret[i] = o[i]
  elif cmpmode == fpcmp_lt :
    cmp[i] = sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
  elif cmpmode == fpcmp_ge :
    cmp[i] = ~sgn(o[i])
    ret[i] = cmp[i] ? -n[i] : acc[i]
~~~~~~~~~~~~~~~~~~~
<em>If cmp is not a parameter then it is discarded.</em> Note that the return value of the cmp operation (less than / greater than or equal) is related to the acc value. To make a less-than / minimum operation a user could do:
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_ge
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = ~sgn(o[i]) = (o[i] >= 0) = (acc[i] >= m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = min(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
Similarly a greater-or-equal / maximum operation would take the same inputs with a different mode (fpcmp_lt):
~~~~~~~~~~~~~~~~~~~
abs = 0
addmode = fpadd_sub
addmask = 0
cmpmode = fpcmp_lt
~~~~~~~~~~~~~~~~~~~
Which would then result in (for a given lane i):
~~~~~~~~~~~~~~~~~~~
n[i] = -1 * m[i]
o[i] = acc[i] - m[i]
cmp[i] = sgn(o[i]) = (o[i] < 0) = (acc[i] < m[i])
ret[i] = cmp[i] ? -n[i] : acc[i] = max(acc[i],m[i])
~~~~~~~~~~~~~~~~~~~
acc | Incoming accumulation vector. |
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
ones | If true all lanes from Z are replaced with 1.0. |
abs | If true the absolute value is taken before accumulation. |
addmode | Select one of fpadd_add, fpadd_sub, fpadd_mixadd or fpadd_mixsub. This must be a compile time constant. |
addmask | 8 x 1 LSB bits: Corresponding lane is negated if bit is set (depending on addmode). |
cmpmode | Use "fpcmp_lt" to select the minimum between accumulator and result of multiplication per lane, "fpcmp_ge" for the maximum and "fpcmp_nrm" for the usual sum. |
cmp | 8 x 1 LSB bits: When using fpcmp_ge or fpcmp_lt in "cmpmode", it sets a bit if accumulator was chosen (per lane). This parameter is optional. |
v8float fpneg_abs_mul | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply, take absolute value and negate for single precision real floating point vectors.
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpneg_abs_mul | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply, take absolute value and negate for single precision real floating point vectors.
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpneg_mul | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-negate for single precision real floating point vectors.
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v8float fpneg_mul | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-negate for single precision real floating point vectors.
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpneg_mul | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-negate for complex times real single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * z
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpneg_mul | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v8float | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-negate for complex times real single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * z
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. |
v4cfloat fpneg_mul | ( | v32float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-negate for real times complex single precision floating point vectors.
Where the product corresponds to x * (z.re + j(z.im))
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
v4cfloat fpneg_mul | ( | v16float | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-negate for real times complex single precision floating point vectors.
Where the product corresponds to x * (z.re + j(z.im))
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
v4cfloat fpneg_mul | ( | v16cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-negate for complex single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * (z.re + j(z.im))
xbuf | First multiplication input buffer. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
v4cfloat fpneg_mul | ( | v8cfloat | xbuf, |
int | xstart, | ||
unsigned int | xoffs, | ||
v4cfloat | zbuf, | ||
int | zstart, | ||
unsigned int | zoffs | ||
) |
Multiply-negate for complex single precision floating point vectors.
Where the product corresponds to (x.re + j(x.im)) * (z.re + j(z.im))
xbuf | First multiplication input buffer. Small buffer variant. |
xstart | Starting offset for all lanes of X. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). |
xoffs | 4 bits per lane: Additional lane-dependent offset for X. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |
zbuf | Second multiplication input buffer. |
zstart | Starting offset for all lanes of Z. The offsets refer to complex lanes (lane 0 corresponds to the first pair of real and imaginary values). This must be a compile time constant. |
zoffs | 4 bits per lane: Additional lane-dependent offset for Z. The offsets refer to complex lanes. For optimized code this should be a compile time constant. |