#ifndef __WRAPPER_PERMUTE_CC__
#define __WRAPPER_PERMUTE_CC__

#include <adf.h>
#include <aie_api/aie.hpp>
#include "aie_api/utils.hpp"



/// Rearranges channel ("depth") data into spatial blocks (DepthToSpace).
///
/// The indexing below treats `input` as a dense
/// [batch][height][width][depth] array and `output` as a dense
/// [batch][height * blockSize][width * blockSize][depth / blockSize^2] array.
/// The buffers must not overlap: elements are copied one by one without any
/// staging.
///
/// @param input     Source elements (only read).
/// @param output    Destination elements; every element of the output shape
///                  described above is written exactly once.
/// @param batch     Number of batches.
/// @param depth     Input channel count. If it is not a multiple of
///                  blockSize^2 the quotient is truncated and the trailing
///                  input channels are never read.
/// @param height    Input height.
/// @param width     Input width.
/// @param blockSize Edge length of the spatial block. Values <= 0 make the
///                  call a no-op (0 would otherwise divide by zero).
/// @param DCR       Selects how the source channel is derived for output
///                  channel d at in-block offset (offsetH, offsetW):
///                    true  -> d * blockSize^2 + offsetH * blockSize + offsetW
///                    false -> d + newDepth * (offsetH * blockSize + offsetW)
///                  NOTE(review): the ONNX DepthToSpace definition appears to
///                  attach these two formulas to the opposite mode names
///                  (DCR <-> CRD). Confirm against the producer of this flag
///                  before renaming or swapping anything here.
void depthToSpace(uint16_t* input,
                  uint16_t* output,
                  int batch, int depth, int height, int width, int blockSize,
                  bool DCR = true) {//default is DCR(true), CRD(false)
    // A zero block size would divide by zero below; a negative one yields an
    // empty output shape. Either way there is nothing to copy.
    if (blockSize <= 0) {
        return;
    }

    // Output dimensions.
    const int blockArea = blockSize * blockSize;
    const int newHeight = height * blockSize;
    const int newWidth = width * blockSize;
    const int newDepth = depth / blockArea;

    // Element strides of the input and output layouts.
    const int inRowStride = width * depth;
    const int inBatchStride = height * inRowStride;
    const int outRowStride = newWidth * newDepth;
    const int outBatchStride = newHeight * outRowStride;

    // Distance between consecutive source channels for one output pixel.
    const int depthStep = DCR ? blockArea : 1;

    for (int b = 0; b < batch; ++b) {
        for (int h = 0; h < newHeight; ++h) {
            // Everything that depends only on (b, h) is computed once per row
            // so the inner loops avoid repeated division/modulo.
            const int offsetH = h % blockSize;
            const int inRowBase = b * inBatchStride + (h / blockSize) * inRowStride;
            const int outRowBase = b * outBatchStride + h * outRowStride;

            for (int w = 0; w < newWidth; ++w) {
                // Position of this output pixel inside its blockSize x blockSize tile.
                const int blockOffset = offsetH * blockSize + (w % blockSize);
                const int depthBase = DCR ? blockOffset : newDepth * blockOffset;

                const int inPixelBase = inRowBase + (w / blockSize) * depth + depthBase;
                const int outPixelBase = outRowBase + w * newDepth;

                for (int d = 0; d < newDepth; ++d) {
                    output[outPixelBase + d] = input[inPixelBase + d * depthStep];
                }
            }
        }
    }
}

/// Kernel entry point: decodes the DepthToSpace configuration from the
/// parameter buffer and runs depthToSpace() on the channel-0 buffers.
///
/// Parameter words, read as uint16_t from args.params_data:
///   [0] batch   [1] depth   [2] height   [3] width   [4] blockSize
///   [5] mode flag, forwarded as the DCR argument (non-zero -> true)
///
/// Elements are read from args.s2mm_ch0_data and written to
/// args.mm2s_ch0_data, both accessed as uint16_t arrays.
void run_int16_permute(KernelArgs& args)
{
    // Typed views of the three buffers carried by the argument bundle.
    auto const* params = static_cast<uint16_t const*>(args.params_data);
    auto* src = static_cast<uint16_t*>(args.s2mm_ch0_data);
    auto* dst = static_cast<uint16_t*>(args.mm2s_ch0_data);

    // Shape description of the input tensor.
    int const batch = params[0];
    int const depth = params[1];
    int const height = params[2];
    int const width = params[3];
    int const blockSize = params[4];

    // Any non-zero word selects the `true` branch of depthToSpace's DCR switch.
    bool const useDcr = (params[5] != 0);

    depthToSpace(src, dst, batch, depth, height, width, blockSize, useDcr);

    // Disabled alternative kept for reference: a plain 32-lane vector copy of
    // the input to the output (no permutation).
    //
    // int loop_count = subv_elements / 32;
    // v32uint16*   v_in  = (v32uint16*)(matin);
    // v32uint16*   v_out = (v32uint16*)(matout);
    // for (int i = 0; i < loop_count; ++i)
    // chess_loop_range(6,)
    // //chess_no_hw_loop
    // chess_prepare_for_pipelining
    // {
    //     *v_out++ = *v_in++;
    // }
}





#endif
