/*
 * Copyright (C) 2019-2022, Xilinx, Inc.
 * Copyright (C) 2022-2025, Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef _DSPLIB_FFT_IFFT_1D_GRAPH_HPP_
#define _DSPLIB_FFT_IFFT_1D_GRAPH_HPP_

#include <adf.h>
#include <vector>
#include <adf/arch/aie_arch_properties.hpp>
#include "graph_utils.hpp"
#include "fft_ifft_dit_1ch_graph.hpp"
#include "twiddle_rotator.hpp"

using namespace adf;
using namespace xf::dsp::aie::fft::dit_1ch;
using namespace xf::dsp::aie::fft::twidRot;

namespace xf {
namespace dsp {
namespace aie {
namespace fft {
namespace vss_1d {

/**
 * @brief Compile-time test for the power-of-two values accepted by this graph.
 *
 * Only the values 2, 4, 8, 16, 32 and 64 are recognised. Any other value,
 * including 0, 1 and powers of two above 64, yields 0.
 *
 * @tparam num value to classify.
 * @return 1 when num is one of the recognised powers of two, 0 otherwise.
 */
template <unsigned int num>
static constexpr unsigned int isPowerOf2() {
    // (num & (num - 1)) == 0 holds only when a single bit is set; the range
    // guard restricts the accepted set to 2..64.
    return (num >= 2 && num <= 64 && (num & (num - 1)) == 0) ? 1u : 0u;
}

template <typename TT_DATA,
          typename TT_TWIDDLE,
          unsigned int TP_WINDOW_SIZE_CALC,
          unsigned int TP_DIM,
          unsigned int TP_SSR,
          unsigned int TP_PT_SIZE_D1,
          unsigned int TP_PT_SIZE_D2,
          unsigned int TP_FFT_NIFFT,
          unsigned int TP_DYN_PT_SIZE,
          unsigned int TP_RND = 0,
          unsigned int TP_SAT = 1,
          unsigned int TP_PT_SIZE_D2_CEIL = TP_PT_SIZE_D2,
          unsigned int TP_ROT_FAN_SIZE = 8>
class create_par_kernels_vss_decomp {
   public:
    static void create(kernel (&m_fftTwRotKernels)[TP_SSR],
                       std::array<std::array<TT_TWIDDLE, TP_PT_SIZE_D2_CEIL / TP_SSR * TP_ROT_FAN_SIZE>, TP_SSR>& twRot,
                       std::array<std::array<TT_TWIDDLE, TP_PT_SIZE_D2_CEIL / TP_SSR * TP_PT_SIZE_D1 / TP_ROT_FAN_SIZE>,
                                  TP_SSR>& twMain) {
        std::array<TT_TWIDDLE, TP_PT_SIZE_D2_CEIL / TP_SSR * TP_ROT_FAN_SIZE> twRotKernel;
        std::array<TT_TWIDDLE, TP_PT_SIZE_D2_CEIL / TP_SSR * TP_PT_SIZE_D1 / TP_ROT_FAN_SIZE> twMainKernel;
        memcpy(&twRotKernel, &twRot[TP_DIM], TP_PT_SIZE_D2_CEIL / TP_SSR * TP_ROT_FAN_SIZE * sizeof(TT_TWIDDLE));
        memcpy(&twMainKernel, &twMain[TP_DIM],
               TP_PT_SIZE_D2_CEIL / TP_SSR * TP_PT_SIZE_D1 / TP_ROT_FAN_SIZE * sizeof(TT_TWIDDLE));
        m_fftTwRotKernels[TP_DIM] =
            kernel::create_object<twiddleRotator<TT_DATA, TT_TWIDDLE, TP_WINDOW_SIZE_CALC, TP_PT_SIZE_D1, TP_PT_SIZE_D2,
                                                 TP_SSR, TP_FFT_NIFFT, TP_DIM> >(twRotKernel, twMainKernel);
        runtime<ratio>(m_fftTwRotKernels[TP_DIM]) = 0.2;
        // Source files
        source(m_fftTwRotKernels[TP_DIM]) = "twiddle_rotator.cpp";
        headers(m_fftTwRotKernels[TP_DIM]) = {"twiddle_rotator.hpp"};
        if
            constexpr(TP_DIM != 0) {
                create_par_kernels_vss_decomp<TT_DATA, TT_TWIDDLE, TP_WINDOW_SIZE_CALC, TP_DIM - 1, TP_SSR,
                                              TP_PT_SIZE_D1, TP_PT_SIZE_D2, TP_FFT_NIFFT, TP_DYN_PT_SIZE, TP_RND,
                                              TP_SAT, TP_PT_SIZE_D2_CEIL, TP_ROT_FAN_SIZE>::create(m_fftTwRotKernels,
                                                                                                   twRot, twMain);
            }
    }
};

/**
  * @endcond
  */

//--------------------------------------------------------------------------------------------------
// vss_fft_ifft_1d template
//--------------------------------------------------------------------------------------------------
/**
 * @ingroup fft_graphs
 *
 * @brief vss_fft_ifft_1d is a single-channel, decomposed FFT that contains the AIE sub-part of the VSS FFT Mode 1
 *offering.
 *
 * These are the templates to configure the single-channel decimation-in-time class.
 * @tparam TT_DATA describes the type of individual data samples input to and
 *         output from the transform function. \n
 *         This is a typename and must be one of the following: \n
 *         cint32, cfloat, cint16.
 * @tparam TT_TWIDDLE describes the type of twiddle factors of the transform. \n
 *         It must be one of the following: cint16, cint32, cfloat
 *         and must also satisfy the following rules:
 *         - 32 bit types are only supported when TT_DATA is also a 32 bit type,
 *         - TT_TWIDDLE must be an integer type if TT_DATA is an integer type
 *         - TT_TWIDDLE must be cfloat type if TT_DATA is a float type.
 * @tparam TP_POINT_SIZE is an unsigned integer which describes the number of samples in
 *         the transform. \n This must be 2^N where N is an integer in the range
 *         8 to 16 inclusive for AIE devices and 10 to 16 inclusive for AIE-ML devices.
 * @tparam TP_FFT_NIFFT selects whether the transform to perform is an FFT (1) or IFFT (0).
 * @tparam TP_SHIFT selects the power of 2 to scale the result by prior to output.
 * @tparam TP_API is an unsigned integer to select window (0) or stream (1) interfaces.
 *         When stream I/O is selected, one sample is taken from, or output to, a stream and the next sample
 *         from, or to, the next stream. Two streams minimum are used. In this example, even samples are
 *         read from input stream[0] and odd samples from input stream[1].
 * @tparam TP_SSR is an unsigned integer to describe the number of parallel computational paths into which the
 *implementation will be split to improve the performance.
 *         Higher SSR relates to higher performance.
 * @tparam TP_RND describes the selection of rounding to be applied during the
 *         shift down stage of processing. \n
 *         Although, TP_RND accepts unsigned integer values descriptive macros are recommended where
 *         - rnd_floor      = Truncate LSB, always round down (towards negative infinity).
 *         - rnd_ceil       = Always round up (towards positive infinity).
 *         - rnd_sym_floor  = Truncate LSB, always round towards 0.
 *         - rnd_sym_ceil   = Always round up towards infinity.
 *         - rnd_pos_inf    = Round halfway towards positive infinity.
 *         - rnd_neg_inf    = Round halfway towards negative infinity.
 *         - rnd_sym_inf    = Round halfway towards infinity (away from zero).
 *         - rnd_sym_zero   = Round halfway towards zero (away from infinity).
 *         - rnd_conv_even  = Round halfway towards nearest even number.
 *         - rnd_conv_odd   = Round halfway towards nearest odd number. \n
 *         No rounding is performed on ceil or floor mode variants. \n
 *         Other modes round to the nearest integer. They differ only in how
 *         they round for values of 0.5. \n
 *         \n
 *         Note: Rounding modes ``rnd_sym_floor`` and ``rnd_sym_ceil`` are only supported on AIE-ML and AIE-MLv2 device.
 *\n
 * @tparam TP_SAT describes the selection of saturation to be applied during the shift down stage of processing. \n
 *         TP_SAT accepts unsigned integer values, where:
 *         - 0: none           = No saturation is performed and the value is truncated on the MSB side.
 *         - 1: saturate       = Default. Saturation rounds an n-bit signed value
 *         in the range [- ( 2^(n-1) ) : +2^(n-1) - 1 ].
 *         - 3: symmetric      = Controls symmetric saturation. Symmetric saturation rounds an n-bit signed value in the
 *range [- ( 2^(n-1) -1 ) : +2^(n-1) - 1 ]. \n
 * @tparam TP_TWIDDLE_MODE describes the magnitude of integer twiddles. It has no effect for cfloat. \n
 *         - 0: Max amplitude. Values at 2^15 (for TT_TWIDDLE=cint16) and 2^31 (TT_TWIDDLE=cint32) will saturate and so
 *introduce errors
 *         - 1: 0.5 amplitude. Twiddle values are 1/2 that of mode 0 so as to avoid twiddle saturation. However,
 *twiddles are one bit less precise versus mode 0.
 *
  **/
template <typename TT_DATA,
          typename TT_TWIDDLE,
          unsigned int TP_POINT_SIZE,
          unsigned int TP_FFT_NIFFT = 1,
          unsigned int TP_SHIFT = 0,
          unsigned int TP_API = 0,
          unsigned int TP_SSR = 0,
          unsigned int TP_RND = 4,
          unsigned int TP_SAT = 1,
          unsigned int TP_TWIDDLE_MODE = 0>
class vss_fft_ifft_1d_graph : public graph {
    // TP_SSR sizes every array in this class and is used as a divisor below, so the
    // default value of 0 cannot be instantiated. Report that clearly instead of
    // through a divide-by-zero or zero-length-array diagnostic.
    static_assert(TP_SSR > 0, "TP_SSR must be greater than 0.");

   public:
    // FFT twiddle rotation kernels that follow the first set of FFT operations.
    kernel m_fftTwRotKernels[TP_SSR];
    // This is a port that interfaces with a PL kernel internal to the VSS.
    // It feeds the front FFT graphs.
    port_array<input, TP_SSR> front_i;
    // This is a port that interfaces with a PL kernel internal to the VSS.
    // It feeds the back FFT graphs.
    port_array<input, TP_SSR> back_i;
    // This is a port that interfaces with a PL kernel internal to the VSS.
    // It carries the output of the back FFT graphs.
    port_array<output, TP_SSR> back_o;
    // This is a port that interfaces with a PL kernel internal to the VSS.
    // It carries the output of the twiddle rotation kernels.
    port_array<output, TP_SSR> front_o;

#if __HAS_MEM_TILE__ == 1
    // Shared buffers used when the data re-ordering is performed in a memory tile.
    adf::shared_buffer<TT_DATA> memTileFrontIn[TP_SSR];
    adf::shared_buffer<TT_DATA> memTileFrontOut[TP_SSR];
    adf::shared_buffer<TT_DATA> memTileBackIn[TP_SSR];
    adf::shared_buffer<TT_DATA> memTileBackOut[TP_SSR];
#endif

   private:
    // Fixed configuration passed to the internal FFT sub-graphs.
    static constexpr unsigned int kIntDynPtSize = 0;
    static constexpr unsigned int kIntParPow = 0;
    static constexpr unsigned int kIntCascLen = 1;
    static constexpr unsigned int kIntUseWidg = 0;
    static constexpr unsigned int kHeaderBytes = kIntDynPtSize > 0 ? 32 : 0;
    // The point size is decomposed into a front FFT of kPtSizeD1 points and a back FFT of kPtSizeD2 points.
    static constexpr unsigned int kPtSizeD1 = fnPtSizeD1<TP_POINT_SIZE, modeAIEffts, TP_SSR>();
    static constexpr unsigned int kPtSizeD2 = TP_POINT_SIZE / kPtSizeD1;
    static constexpr unsigned int kPtSizeD2Ceil = fnCeil<kPtSizeD2, TP_SSR>();
    // TP_SHIFT is split between the two FFT stages; the back stage takes the odd remainder.
    static constexpr unsigned int kFirstFFTShift = TP_SHIFT / 2;
    static constexpr unsigned int kSecondFFTShift = TP_SHIFT - TP_SHIFT / 2;
    // Window sizes (in samples) of the front (1) and back (2) stages. The "Calc" value is the raw size
    // when two buffers of that size fit in __DATA_MEM_BYTES__, otherwise the largest size that does.
    static constexpr unsigned int kWindowSizeRaw1 = (kPtSizeD2Ceil * kPtSizeD1) / TP_SSR;
    static constexpr unsigned int kWindowSizeCalc1 = kWindowSizeRaw1 * 2 * sizeof(TT_DATA) <= __DATA_MEM_BYTES__
                                                         ? kWindowSizeRaw1
                                                         : __DATA_MEM_BYTES__ / (2 * sizeof(TT_DATA));
    static constexpr unsigned int kWindowSizeRaw2 = (kPtSizeD2 * fnCeil<kPtSizeD1, TP_SSR>()) / TP_SSR;
    static constexpr unsigned int kWindowSizeCalc2 = kWindowSizeRaw2 * 2 * sizeof(TT_DATA) <= __DATA_MEM_BYTES__
                                                         ? kWindowSizeRaw2
                                                         : __DATA_MEM_BYTES__ / (2 * sizeof(TT_DATA));
    // Number of twiddles in each rotation fan: 1 when a per-lane table of TP_POINT_SIZE / TP_SSR twiddles
    // fits in __DATA_MEM_BYTES__, otherwise the lane count reported by fnNumLanes.
    static constexpr unsigned int kRotFanSize =
        TP_POINT_SIZE / TP_SSR * sizeof(TT_TWIDDLE) <= __DATA_MEM_BYTES__ ? 1 : fnNumLanes<TT_TWIDDLE, TT_TWIDDLE>();
    // Sign applied to the twiddle angle: -1 for FFT (TP_FFT_NIFFT == 1), +1 for IFFT.
    static constexpr int kInv = TP_FFT_NIFFT == 1 ? -1 : 1;
    static constexpr unsigned int kSamplesPerRead =
        128 / (sizeof(TT_DATA) * 8); // 128 is the width of data read by subsequent PL kernels
    // Selection of where the transpose / tiling access patterns are applied.
    static constexpr bool kUseBDTranspose =
        (sizeof(TT_DATA) <= __MAX_BD_DSIZE_TPOSE__) && (kWindowSizeRaw1 * 2 * sizeof(TT_DATA) <= __DATA_MEM_BYTES__);
    static constexpr bool kUseMemTileTranspose =
        __HAS_MEM_TILE__ == 1 && (kWindowSizeRaw2 * 2 * sizeof(TT_DATA) > __DATA_MEM_BYTES__);
    static constexpr bool kUseBDTiling = sizeof(TT_DATA) <= __MAX_BD_DSIZE_TILING__;
    static constexpr bool kUseMemTileTiling = __HAS_MEM_TILE__ == 1;

    // This static assert may trigger only for point sizes that are not perfect square numbers.
    // The window sizes for the front and back FFTs need to be equal and they also need to be divisible by the
    // front and back point sizes.
    // For some values of SSR, choosing a common window size would mean that the twiddle rotator kernels would not
    // start and end their operation in a deterministic way and would need runtime checks to determine the
    // operation.
    // The front AIE kernels form the throughput bottleneck for the system, so rather than reducing the overall
    // throughput with run-time code, the user can choose a different value of SSR for better efficiency.
    static_assert((kPtSizeD1 == kPtSizeD2) || ((kWindowSizeRaw1 * 2 * sizeof(TT_DATA) <= __DATA_MEM_BYTES__) &&
                                               (kWindowSizeRaw2 * 2 * sizeof(TT_DATA) <= __DATA_MEM_BYTES__)) ||
                      isPowerOf2<TP_SSR>(),
                  "The given combination of point size and SSR is not supported. Please use a different value of SSR.");

    /**
     * @brief Computes the per-lane twiddle tables and creates the twiddle rotation kernels.
     *
     * For every row rr of the back dimension, two tables are filled:
     *  - twRot: the first kRotFanSize twiddles, exp(j * kInv * 2 * pi * rr * ii / TP_POINT_SIZE);
     *  - twMain: kPtSizeD1 / kRotFanSize twiddles stepping by kRotFanSize.
     * Row rr is stored in lane (rr % TP_SSR) at slot (rr / TP_SSR).
     */
    void createTwidRotKernels() {
        constexpr unsigned int kMainFanLen = kPtSizeD1 / kRotFanSize;

        // Value-initialised so that the padding rows (present when kPtSizeD2Ceil > kPtSizeD2), which the
        // loop below never writes, hold defined values when the tables are copied into the kernels.
        std::array<std::array<TT_TWIDDLE, kPtSizeD2Ceil / TP_SSR * kRotFanSize>, TP_SSR> twRot{};
        std::array<std::array<TT_TWIDDLE, kPtSizeD2Ceil / TP_SSR * kPtSizeD1 / kRotFanSize>, TP_SSR> twMain{};

        // Integer twiddles are scaled to full amplitude: 2^31 - 1 for cint32 and 2^15 - 1 for cint16.
        // Literals are used because (1 << 31) - 1 overflows a signed int.
        const int32 kScaleFactor = std::is_same<TT_TWIDDLE, cfloat>::value
                                       ? 1
                                       : std::is_same<TT_TWIDDLE, cint32>::value ? 0x7FFFFFFF : 0x7FFF;

        TT_TWIDDLE val;
        // calculate all fans
        for (unsigned int rr = 0; rr < kPtSizeD2; rr++) {
            const unsigned int lane = rr % TP_SSR;
            const unsigned int slot = rr / TP_SSR;

            for (unsigned int ii = 0; ii < kRotFanSize; ii++) {
                const double angle = kInv * (2 * M_PI * rr * ii) / TP_POINT_SIZE;
                val.real = cos(angle) * kScaleFactor;
                val.imag = sin(angle) * kScaleFactor;
                twRot[lane][slot * kRotFanSize + ii] = val;
            }

            for (unsigned int ii = 0; ii < kMainFanLen; ii++) {
                const double angle = kInv * (2 * M_PI * rr * ii * kRotFanSize) / TP_POINT_SIZE;
                val.real = cos(angle) * kScaleFactor;
                val.imag = sin(angle) * kScaleFactor;
                twMain[lane][slot * kMainFanLen + ii] = val;
            }
        }

        create_par_kernels_vss_decomp<TT_DATA, TT_TWIDDLE, kWindowSizeCalc1, TP_SSR - 1, TP_SSR, kPtSizeD1, kPtSizeD2,
                                      TP_FFT_NIFFT, kIntDynPtSize, TP_RND, TP_SAT, kPtSizeD2Ceil,
                                      kRotFanSize>::create(m_fftTwRotKernels, twRot, twMain);
    }

   public:
    // FFT graph that performs the initial set of FFT calculations
    fft_ifft_dit_1ch_graph<TT_DATA,
                           TT_TWIDDLE,
                           kPtSizeD1,
                           TP_FFT_NIFFT,
                           kFirstFFTShift,
                           kIntCascLen,
                           kIntDynPtSize,
                           kWindowSizeCalc1,
                           TP_API,
                           kIntParPow,
                           kIntUseWidg,
                           TP_RND,
                           TP_SAT,
                           TP_TWIDDLE_MODE>
        frontFFTGraph[TP_SSR];

    // FFT graph that performs the final set of FFT calculations
    fft_ifft_dit_1ch_graph<TT_DATA,
                           TT_TWIDDLE,
                           kPtSizeD2,
                           TP_FFT_NIFFT,
                           kSecondFFTShift,
                           kIntCascLen,
                           kIntDynPtSize,
                           kWindowSizeCalc2,
                           TP_API,
                           kIntParPow,
                           kIntUseWidg,
                           TP_RND,
                           TP_SAT,
                           TP_TWIDDLE_MODE>
        backFFTGraph[TP_SSR];

    /**
     * @brief This is the constructor function for the AIE sub-portion of the VSS FFT IP.
     *
     * Creates the twiddle rotation kernels and, for every SSR lane, connects
     * front_i -> front FFT -> twiddle rotator -> front_o and back_i -> back FFT -> back_o,
     * applying the transpose / tiling access patterns either on the kernel buffers or
     * through memory-tile shared buffers, as selected by the kUse* constants.
     **/
    vss_fft_ifft_1d_graph() {
        createTwidRotKernels();
        for (unsigned int ss = 0; ss < TP_SSR; ss++) {
            connect<>(frontFFTGraph[ss].out[0], m_fftTwRotKernels[ss].in[0]);
            dimensions(m_fftTwRotKernels[ss].in[0]) = {kWindowSizeCalc1 + kHeaderBytes / sizeof(TT_DATA)};

            // ---- Input of the front FFT and output of the back FFT ----
            if constexpr (kUseBDTranspose) {
                // Transposing access patterns are applied directly on the kernel buffers.
                connect<>(front_i[ss], frontFFTGraph[ss].in[0]);
                connect<>(backFFTGraph[ss].out[0], back_o[ss]);
                write_access(frontFFTGraph[ss].getKernels()->in[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD1), (kPtSizeD2 / TP_SSR)},
                                 .tiling_dimension = {1, (1)},
                                 .offset = {0, 0},
                                 .tile_traversal = {{.dimension = 1, .stride = 1, .wrap = (kPtSizeD2 / TP_SSR)},
                                                    {.dimension = 0, .stride = 1, .wrap = (kPtSizeD1)}}});
                read_access(backFFTGraph[ss].getKernels()->out[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD2), (kPtSizeD1 / TP_SSR)},
                                 .tiling_dimension = {1, (1)},
                                 .offset = {0, 0},
                                 .tile_traversal = {{.dimension = 1, .stride = 1, .wrap = (kPtSizeD1 / TP_SSR)},
                                                    {.dimension = 0, .stride = 1, .wrap = kPtSizeD2}}});
            }
#if __HAS_MEM_TILE__ == 1
            else if constexpr (kUseMemTileTranspose) {
                // Connect through memtile for transpose operation when MEM runs out of space
                memTileFrontIn[ss] = adf::shared_buffer<TT_DATA>::create({(kPtSizeD1), (kPtSizeD2 / TP_SSR)}, 1, 1);
                num_buffers(memTileFrontIn[ss]) = 2;
                write_access(memTileFrontIn[ss].in[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD1), (kPtSizeD2 / TP_SSR)},
                                 .tiling_dimension = {1, (1)},
                                 .offset = {0, 0},
                                 .tile_traversal = {{.dimension = 1, .stride = 1, .wrap = (kPtSizeD2 / TP_SSR)},
                                                    {.dimension = 0, .stride = 1, .wrap = (kPtSizeD1)}}});
                read_access(memTileFrontIn[ss].out[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD1), (kPtSizeD2 / TP_SSR)},
                                 .tiling_dimension = {(kPtSizeD1), (kPtSizeD2 / TP_SSR)},
                                 .offset = {0, 0}});
                connect<>(front_i[ss], memTileFrontIn[ss].in[0]);
                connect<>(memTileFrontIn[ss].out[0], frontFFTGraph[ss].in[0]);

                memTileBackOut[ss] = adf::shared_buffer<TT_DATA>::create({(kPtSizeD2), (kPtSizeD1 / TP_SSR)}, 1, 1);
                num_buffers(memTileBackOut[ss]) = 2;
                // The whole buffer is written as a single tile; the dimensions must describe the buffer
                // created above ({kPtSizeD2, kPtSizeD1 / TP_SSR}), not the front-side buffer.
                write_access(memTileBackOut[ss].in[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD2), (kPtSizeD1 / TP_SSR)},
                                 .tiling_dimension = {(kPtSizeD2), (kPtSizeD1 / TP_SSR)},
                                 .offset = {0, 0}});
                read_access(memTileBackOut[ss].out[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD2), (kPtSizeD1 / TP_SSR)},
                                 .tiling_dimension = {1, (1)},
                                 .offset = {0, 0},
                                 .tile_traversal = {{.dimension = 1, .stride = 1, .wrap = (kPtSizeD1 / TP_SSR)},
                                                    {.dimension = 0, .stride = 1, .wrap = kPtSizeD2}}});
                connect<>(backFFTGraph[ss].out[0], memTileBackOut[ss].in[0]);
                connect<>(memTileBackOut[ss].out[0], back_o[ss]);
            }
#endif
            else {
                // No transposing access pattern is applied on this side.
                connect<>(front_i[ss], frontFFTGraph[ss].in[0]);
                connect<>(backFFTGraph[ss].out[0], back_o[ss]);
            }

            // ---- Output of the twiddle rotator and input of the back FFT ----
            // NOTE(review): the back FFT input is sized with kWindowSizeCalc1 although backFFTGraph is
            // configured with kWindowSizeCalc2 - confirm that the two are always equal here.
            if constexpr (kUseBDTiling) {
                read_access(m_fftTwRotKernels[ss].out[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD1), (kPtSizeD2 / TP_SSR)},
                                 .tiling_dimension = {1, kSamplesPerRead},
                                 .offset = {0, 0},
                                 .tile_traversal = {
                                     {.dimension = 0, .stride = 1, .wrap = kPtSizeD1},
                                     {.dimension = 1,
                                      .stride = kSamplesPerRead,
                                      .wrap = ((kPtSizeD2 / TP_SSR / kSamplesPerRead))},
                                 }});

                write_access(backFFTGraph[ss].getKernels()->in[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD2), (kPtSizeD1 / TP_SSR)},
                                 .tiling_dimension = {1, kSamplesPerRead},
                                 .offset = {0, 0},
                                 .tile_traversal = {{.dimension = 0, .stride = 1, .wrap = kPtSizeD2},
                                                    {.dimension = 1,
                                                     .stride = kSamplesPerRead,
                                                     .wrap = ((kPtSizeD1 / TP_SSR) / kSamplesPerRead)}}});
                dimensions(m_fftTwRotKernels[ss].out[0]) = {kWindowSizeCalc1 + kHeaderBytes / sizeof(TT_DATA)};
                connect<>(m_fftTwRotKernels[ss].out[0], front_o[ss]);
                dimensions(backFFTGraph[ss].in[0]) = {kWindowSizeCalc1 + kHeaderBytes / sizeof(TT_DATA)};
                connect<>(back_i[ss], backFFTGraph[ss].in[0]);
            }
#if __HAS_MEM_TILE__ == 1
            else if constexpr (kUseMemTileTiling) {
                // Connect through memtile where available and when DMA BDs cannot do the trick
                // Front AIEs to Middle transpose
                memTileFrontOut[ss] = adf::shared_buffer<TT_DATA>::create({(kPtSizeD1), (kPtSizeD2 / TP_SSR)}, 1, 1);
                num_buffers(memTileFrontOut[ss]) = 2;
                connect<>(m_fftTwRotKernels[ss].out[0], memTileFrontOut[ss].in[0]);
                connect<>(memTileFrontOut[ss].out[0], front_o[ss]);
                dimensions(m_fftTwRotKernels[ss].out[0]) = {kWindowSizeCalc1 + kHeaderBytes / sizeof(TT_DATA)};
                write_access(memTileFrontOut[ss].in[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD1), (kPtSizeD2 / TP_SSR)},
                                 .tiling_dimension = {(kPtSizeD1), (kPtSizeD2 / TP_SSR)},
                                 .offset = {0, 0}});
                read_access(memTileFrontOut[ss].out[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD1), (kPtSizeD2 / TP_SSR)},
                                 .tiling_dimension = {1, kSamplesPerRead},
                                 .offset = {0, 0},
                                 .tile_traversal = {
                                     {.dimension = 0, .stride = 1, .wrap = kPtSizeD1},
                                     {.dimension = 1,
                                      .stride = kSamplesPerRead,
                                      .wrap = ((kPtSizeD2 / TP_SSR / kSamplesPerRead))},
                                 }});

                // Middle transpose to back BDs
                memTileBackIn[ss] = adf::shared_buffer<TT_DATA>::create({(kPtSizeD2), (kPtSizeD1 / TP_SSR)}, 1, 1);
                num_buffers(memTileBackIn[ss]) = 2;
                write_access(memTileBackIn[ss].in[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD2), (kPtSizeD1 / TP_SSR)},
                                 .tiling_dimension = {1, kSamplesPerRead},
                                 .offset = {0, 0},
                                 .tile_traversal = {
                                     {.dimension = 0, .stride = 1, .wrap = kPtSizeD2},
                                     {.dimension = 1,
                                      .stride = kSamplesPerRead,
                                      .wrap = ((kPtSizeD1 / TP_SSR / kSamplesPerRead))},
                                 }});
                read_access(memTileBackIn[ss].out[0]) =
                    adf::tiling({.buffer_dimension = {(kPtSizeD2), (kPtSizeD1 / TP_SSR)},
                                 .tiling_dimension = {(kPtSizeD2), (kPtSizeD1 / TP_SSR)},
                                 .offset = {0, 0}});
                connect<>(back_i[ss], memTileBackIn[ss].in[0]);
                connect<>(memTileBackIn[ss].out[0], backFFTGraph[ss].in[0]);
                dimensions(backFFTGraph[ss].in[0]) = {kWindowSizeCalc1 + kHeaderBytes / sizeof(TT_DATA)};
            }
#endif
            else {
                // No tiling access pattern is applied; connect the ports directly.
                connect<>(m_fftTwRotKernels[ss].out[0], front_o[ss]);
                dimensions(m_fftTwRotKernels[ss].out[0]) = {kWindowSizeCalc1 + kHeaderBytes / sizeof(TT_DATA)};
                connect<>(back_i[ss], backFFTGraph[ss].in[0]);
                dimensions(backFFTGraph[ss].in[0]) = {kWindowSizeCalc1 + kHeaderBytes / sizeof(TT_DATA)};
            }
        }
    }
};

} // namespace vss_1d
} // namespace fft
} // namespace aie
} // namespace dsp
} // namespace xf

#endif // _DSPLIB_FFT_IFFT_1D_GRAPH_HPP_
