



 +-----------------+------------------------------------------------------------- WLEN=64 (2048/32) ---------------------------------------------------------------+ 
 |  MARGIN         +-------------  512 bits -----------+-------------  512 bits -----------+-------------  512 bits -----------+-------------  512 bits -----------+
 +-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
 |  256b/8samples  |  256b/8samples  |  256b/8samples  |  256b/8samples  |  256b/8samples  |  256b/8samples  |  256b/8samples  |  256b/8samples  |  256b/8samples  |          
 +-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
       MARGIN
 +----d0 to d7 ----+----d8 to d15----+----d16 to d23---+----d24 to d31---+----d32 to d39---+----d40 to d47---+----d48 to d55---+----d56 to d63---+----d64 to d71---+       
                     Data starts from here
                   +----d0 to d7 ----+----d8 to d15----+----d16 to d23---+----d24 to d31---+----d32 to d39---+----d40 to d47---+----d48 to d55---+----d56 to d63---+
 +------------- Loop 1 --------------------------------+      
 |     Data 0      |      Data 1     |     Data 2      |
 |  data.insert(0) |  data.insert(1) |  data.insert(0) |
 +-----------------------------------+------------------------ Loop 2 ---------------------+    
 									 |     Data 2      |      Data 3     |     Data 4      |
									 |  data.insert(0) |  data.insert(1) |  data.insert(0) |
									 +-----------------------------------+------------------------ Loop 3 ---------------------+
									 									 |     Data 4      |      Data 5     |     Data 6      |
																		 |  data.insert(0) |  data.insert(1) |  data.insert(0) |
																		 +-----------------------------------+------------------------ Loop 4 ---------------------+
																		 	                                 |     Data 6      |      Data 7     |     Data 8      |
																		 	                                 |  data.insert(0) |  data.insert(1) |  data.insert(0) |
																		 	                                 +-----------------------------------+-----------------+
Number of loops = 64 samples / 16 samples per loop = 4

In each loop, 16 output samples are processed. Producing 16 output samples with 8 coefficients (c0 to c7) requires 16 + 8 - 1 = 23 input samples (d0 to d22).

+-------------------------+---------------------------+----------------------------+----------------------------+
|1st mul call             |2nd mul call               |3rd mul call                |4th mul call                |
|(d0 to d10 - 4 samples)  |(d4 to d14 - 4 samples)    |(d8 to d18 - 4 samples)     |(d12 to d22 - 4 samples)    |
+-------------------------+---------------------------+----------------------------+----------------------------+
| o0  = c0*(d0) + c1*(d1) | o4  = c0*(d4) + c1*(d5)   | o8  = c0*(d8) + c1*(d9)    | o12  = c0*(d12) + c1*(d13) |
| o1  = c0*(d1) + c1*(d2) | o5  = c0*(d5) + c1*(d6)   | o9  = c0*(d9) + c1*(d10)   | o13  = c0*(d13) + c1*(d14) |
| o2  = c0*(d2) + c1*(d3) | o6  = c0*(d6) + c1*(d7)   | o10 = c0*(d10) + c1*(d11)  | o14  = c0*(d14) + c1*(d15) |
| o3  = c0*(d3) + c1*(d4) | o7  = c0*(d7) + c1*(d8)   | o11 = c0*(d11) + c1*(d12)  | o15  = c0*(d15) + c1*(d16) |
|                         |                           |                            |                            |
| o0 += c2*(d2) + c3*(d3) | o4 += c2*(d6) + c3*(d7)   | o8  += c2*(d10) + c3*(d11) | o12 += c2*(d14) + c3*(d15) |
| o1 += c2*(d3) + c3*(d4) | o5 += c2*(d7) + c3*(d8)   | o9  += c2*(d11) + c3*(d12) | o13 += c2*(d15) + c3*(d16) |
| o2 += c2*(d4) + c3*(d5) | o6 += c2*(d8) + c3*(d9)   | o10 += c2*(d12) + c3*(d13) | o14 += c2*(d16) + c3*(d17) |
| o3 += c2*(d5) + c3*(d6) | o7 += c2*(d9) + c3*(d10)  | o11 += c2*(d13) + c3*(d14) | o15 += c2*(d17) + c3*(d18) |
|                         |                           |                            |                            |
| o0 += c4*(d4) + c5*(d5) | o4 += c4*(d8) + c5*(d9)   | o8  += c4*(d12) + c5*(d13) | o12 += c4*(d16) + c5*(d17) |
| o1 += c4*(d5) + c5*(d6) | o5 += c4*(d9) + c5*(d10)  | o9  += c4*(d13) + c5*(d14) | o13 += c4*(d17) + c5*(d18) |
| o2 += c4*(d6) + c5*(d7) | o6 += c4*(d10) + c5*(d11) | o10 += c4*(d14) + c5*(d15) | o14 += c4*(d18) + c5*(d19) |
| o3 += c4*(d7) + c5*(d8) | o7 += c4*(d11) + c5*(d12) | o11 += c4*(d15) + c5*(d16) | o15 += c4*(d19) + c5*(d20) |
|                         |                           |                            |                            |
| o0 += c6*(d6) + c7*(d7) | o4 += c6*(d10) + c7*(d11) | o8  += c6*(d14) + c7*(d15) | o12 += c6*(d18) + c7*(d19) | <= 16 samples
| o1 += c6*(d7) + c7*(d8) | o5 += c6*(d11) + c7*(d12) | o9  += c6*(d15) + c7*(d16) | o13 += c6*(d19) + c7*(d20) | <=  OUTPUT
| o2 += c6*(d8) + c7*(d9) | o6 += c6*(d12) + c7*(d13) | o10 += c6*(d16) + c7*(d17) | o14 += c6*(d20) + c7*(d21) | <= 
| o3 += c6*(d9) + c7*(d10)| o7 += c6*(d13) + c7*(d14) | o11 += c6*(d17) + c7*(d18) | o15 += c6*(d21) + c7*(d22) | <= 
+-------------------------+---------------------------+----------------------------+----------------------------+
 
 
 For AI Engine API details:
 https://www.xilinx.com/html_docs/xilinx2021_2/aiengine_api/aie_api/doc/group__group__mul__special.html#structaie_1_1sliding__mul__ops

 template<unsigned Lanes, unsigned Points, int CoeffStep, int DataStepX, int DataStepY, ElemBaseType CoeffType, ElemBaseType DataType, AccumElemBaseType AccumTag = detail::default_accum_tag_t<CoeffType, DataType>>
 struct aie::sliding_mul_ops< Lanes, Points, CoeffStep, DataStepX, DataStepY, CoeffType, DataType, AccumTag >

 This type provides a parameterized multiplication that implements the following compute pattern:

 DSX = DataStepX
 DSY = DataStepY
 CS  = CoeffStep
 P   = Points
 L   = Lanes
 c_s = coeff_start
 d_s = data_start
 
 out[0]   = coeff[c_s] * data[d_s +           0] + coeff[c_s + CS] * data[d_s +               DSX] + ... + coeff[c_s + (P-1) * CS] * data[d_s +               (P-1) * DSX]
 out[1]   = coeff[c_s] * data[d_s +         DSY] + coeff[c_s + CS] * data[d_s +         DSY + DSX] + ... + coeff[c_s + (P-1) * CS] * data[d_s +         DSY + (P-1) * DSX]
 ...
 out[L-1] = coeff[c_s] * data[d_s + (L-1) * DSY] + coeff[c_s + CS] * data[d_s + (L-1) * DSY + DSX] + ... + coeff[c_s + (P-1) * CS] * data[d_s + (L-1) * DSY + (P-1) * DSX]


